Add external control interface for tokenizer.

Stephen Chung 2021-04-04 23:08:27 +08:00
parent aacb7f0b24
commit bc9c1ab850
3 changed files with 58 additions and 92 deletions


@@ -9,7 +9,6 @@ use crate::parser::ParseState;
 use crate::stdlib::{
     any::{type_name, TypeId},
     boxed::Box,
-    num::NonZeroUsize,
     string::String,
 };
 use crate::{
@@ -1158,16 +1157,8 @@ impl Engine {
         scripts: &[&str],
         optimization_level: OptimizationLevel,
     ) -> Result<AST, ParseError> {
-        let (stream, buffer) = self.lex_raw(scripts, None);
-        let mut state = ParseState::new(
-            self,
-            buffer,
-            #[cfg(not(feature = "unchecked"))]
-            NonZeroUsize::new(self.max_expr_depth()),
-            #[cfg(not(feature = "unchecked"))]
-            #[cfg(not(feature = "no_function"))]
-            NonZeroUsize::new(self.max_function_expr_depth()),
-        );
+        let (stream, tokenizer_control) = self.lex_raw(scripts, None);
+        let mut state = ParseState::new(self, tokenizer_control);
         self.parse(
             &mut stream.peekable(),
             &mut state,
@@ -1347,7 +1338,7 @@ impl Engine {
             .into());
         };

-        let (stream, buffer) = self.lex_raw(
+        let (stream, tokenizer_control) = self.lex_raw(
             &scripts,
             Some(if has_null {
                 |token| match token {
@@ -1360,15 +1351,7 @@ impl Engine {
             }),
         );

-        let mut state = ParseState::new(
-            self,
-            buffer,
-            #[cfg(not(feature = "unchecked"))]
-            NonZeroUsize::new(self.max_expr_depth()),
-            #[cfg(not(feature = "unchecked"))]
-            #[cfg(not(feature = "no_function"))]
-            NonZeroUsize::new(self.max_function_expr_depth()),
-        );
+        let mut state = ParseState::new(self, tokenizer_control);

         let ast = self.parse_global_expr(
             &mut stream.peekable(),
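For reference, the `map` argument threaded through `lex_raw` above is a plain `fn(Token) -> Token`; the concrete rewriting applied for `null` is elided in this hunk. The following is a hypothetical mapper of the same shape (the `Token` variants here are illustrative stand-ins, not the crate's):

#[derive(Debug, PartialEq)]
enum Token {
    Identifier(String),
    Reserved(String),
}

// Same shape as the mapper installed above: rewrite selected tokens,
// pass everything else through untouched.
fn map_token(token: Token) -> Token {
    match token {
        Token::Reserved(s) if s == "null" => Token::Identifier("null".into()),
        t => t,
    }
}

fn main() {
    let mapped = map_token(Token::Reserved("null".into()));
    assert_eq!(mapped, Token::Identifier("null".into()));
}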
@@ -1454,18 +1437,10 @@ impl Engine {
         script: &str,
     ) -> Result<AST, ParseError> {
         let scripts = [script];
-        let (stream, buffer) = self.lex_raw(&scripts, None);
+        let (stream, tokenizer_control) = self.lex_raw(&scripts, None);
         let mut peekable = stream.peekable();
-        let mut state = ParseState::new(
-            self,
-            buffer,
-            #[cfg(not(feature = "unchecked"))]
-            NonZeroUsize::new(self.max_expr_depth()),
-            #[cfg(not(feature = "unchecked"))]
-            #[cfg(not(feature = "no_function"))]
-            NonZeroUsize::new(self.max_function_expr_depth()),
-        );
+        let mut state = ParseState::new(self, tokenizer_control);
         self.parse_global_expr(&mut peekable, &mut state, scope, self.optimization_level)
     }

     /// Evaluate a script file.
@@ -1624,16 +1599,8 @@ impl Engine {
         script: &str,
     ) -> Result<T, Box<EvalAltResult>> {
         let scripts = [script];
-        let (stream, buffer) = self.lex_raw(&scripts, None);
-        let mut state = ParseState::new(
-            self,
-            buffer,
-            #[cfg(not(feature = "unchecked"))]
-            NonZeroUsize::new(self.max_expr_depth()),
-            #[cfg(not(feature = "unchecked"))]
-            #[cfg(not(feature = "no_function"))]
-            NonZeroUsize::new(self.max_function_expr_depth()),
-        );
+        let (stream, tokenizer_control) = self.lex_raw(&scripts, None);
+        let mut state = ParseState::new(self, tokenizer_control);

         // No need to optimize a lone expression
         let ast = self.parse_global_expr(
@@ -1779,16 +1746,8 @@ impl Engine {
         script: &str,
     ) -> Result<(), Box<EvalAltResult>> {
         let scripts = [script];
-        let (stream, buffer) = self.lex_raw(&scripts, None);
-        let mut state = ParseState::new(
-            self,
-            buffer,
-            #[cfg(not(feature = "unchecked"))]
-            NonZeroUsize::new(self.max_expr_depth()),
-            #[cfg(not(feature = "unchecked"))]
-            #[cfg(not(feature = "no_function"))]
-            NonZeroUsize::new(self.max_function_expr_depth()),
-        );
+        let (stream, tokenizer_control) = self.lex_raw(&scripts, None);
+        let mut state = ParseState::new(self, tokenizer_control);

         let ast = self.parse(
             &mut stream.peekable(),


@@ -11,7 +11,6 @@ use crate::optimize::optimize_into_ast;
 use crate::optimize::OptimizationLevel;
 use crate::stdlib::{
     boxed::Box,
-    cell::Cell,
     collections::BTreeMap,
     format,
     hash::{Hash, Hasher},
@@ -22,7 +21,9 @@ use crate::stdlib::{
     vec::Vec,
 };
 use crate::syntax::{CustomSyntax, MARKER_BLOCK, MARKER_EXPR, MARKER_IDENT};
-use crate::token::{is_keyword_function, is_valid_identifier, Token, TokenStream};
+use crate::token::{
+    is_keyword_function, is_valid_identifier, Token, TokenStream, TokenizerControl,
+};
 use crate::utils::{get_hasher, IdentifierBuilder};
 use crate::{
     calc_fn_hash, Dynamic, Engine, Identifier, LexError, ParseError, ParseErrorType, Position,
@@ -45,7 +46,7 @@ pub struct ParseState<'e> {
     /// Reference to the scripting [`Engine`].
     engine: &'e Engine,
     /// Input stream buffer containing the next character to read.
-    buffer: Shared<Cell<Option<char>>>,
+    tokenizer_control: TokenizerControl,
     /// Interned strings.
     interned_strings: IdentifierBuilder,
     /// Encapsulates a local stack with variable names to simulate an actual runtime scope.
@@ -76,22 +77,15 @@ pub struct ParseState<'e> {
 impl<'e> ParseState<'e> {
     /// Create a new [`ParseState`].
     #[inline(always)]
-    pub fn new(
-        engine: &'e Engine,
-        buffer: Shared<Cell<Option<char>>>,
-        #[cfg(not(feature = "unchecked"))] max_expr_depth: Option<NonZeroUsize>,
-        #[cfg(not(feature = "unchecked"))]
-        #[cfg(not(feature = "no_function"))]
-        max_function_expr_depth: Option<NonZeroUsize>,
-    ) -> Self {
+    pub fn new(engine: &'e Engine, tokenizer_control: TokenizerControl) -> Self {
         Self {
             engine,
-            buffer,
+            tokenizer_control,
             #[cfg(not(feature = "unchecked"))]
-            max_expr_depth,
+            max_expr_depth: NonZeroUsize::new(engine.max_expr_depth()),
             #[cfg(not(feature = "unchecked"))]
             #[cfg(not(feature = "no_function"))]
-            max_function_expr_depth,
+            max_function_expr_depth: NonZeroUsize::new(engine.max_function_expr_depth()),
             #[cfg(not(feature = "no_closure"))]
             external_vars: Default::default(),
             #[cfg(not(feature = "no_closure"))]
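Call sites shrink because the depth limits are now derived inside the constructor instead of being threaded through as `cfg`-gated parameters. A small, self-contained illustration of that refactoring pattern, using hypothetical simplified types rather than the crate's own:

use std::num::NonZeroUsize;

struct Engine {
    max_expr_depth: usize,
}

struct ParseState<'e> {
    engine: &'e Engine,
    max_expr_depth: Option<NonZeroUsize>,
}

impl<'e> ParseState<'e> {
    // The limit is computed from the engine inside `new`, so no call
    // site can pass a value that disagrees with the engine's setting.
    fn new(engine: &'e Engine) -> Self {
        Self {
            max_expr_depth: NonZeroUsize::new(engine.max_expr_depth),
            engine,
        }
    }
}

fn main() {
    let engine = Engine { max_expr_depth: 64 };
    let state = ParseState::new(&engine);
    assert_eq!(state.max_expr_depth, NonZeroUsize::new(64));
    let _ = state.engine; // silence the unused-field lint in this sketch
}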
@@ -982,14 +976,8 @@ fn parse_primary(
         // | ...
         #[cfg(not(feature = "no_function"))]
         Token::Pipe | Token::Or if settings.allow_anonymous_fn => {
-            let mut new_state = ParseState::new(
-                state.engine,
-                state.buffer.clone(),
-                #[cfg(not(feature = "unchecked"))]
-                state.max_function_expr_depth,
-                #[cfg(not(feature = "unchecked"))]
-                state.max_function_expr_depth,
-            );
+            let mut new_state = ParseState::new(state.engine, state.tokenizer_control.clone());
+            new_state.max_expr_depth = new_state.max_function_expr_depth;

             let settings = ParseSettings {
                 allow_if_expr: true,
@@ -1034,7 +1022,9 @@ fn parse_primary(
             segments.push(expr);

             // Make sure to parse the following as text
-            state.buffer.set(Some('`'));
+            let mut control = state.tokenizer_control.get();
+            control.is_within_text = true;
+            state.tokenizer_control.set(control);

             match input.next().unwrap() {
                 (Token::StringConstant(s), pos) => {
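This is the string-interpolation case: once the parser has consumed the `${ ... }` expression, the rest of the back-tick literal must be lexed as text again, and the `is_within_text` flag is how the parser requests that switch. A usage example from the script side, assuming the back-tick interpolation syntax this work enables (standard rhai `Engine::eval` API):

use rhai::{Engine, EvalAltResult};

fn main() -> Result<(), Box<EvalAltResult>> {
    let engine = Engine::new();

    // After `${x + 2}` is parsed, the tokenizer drops back into "text"
    // mode to lex the remainder of the back-tick literal; the shared
    // `is_within_text` flag is what arranges that.
    let result: String = engine.eval("let x = 40; `value = ${x + 2}`")?;
    assert_eq!(result, "value = 42");
    Ok(())
}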
@@ -2540,14 +2530,9 @@ fn parse_stmt(
         match input.next().unwrap() {
             (Token::Fn, pos) => {
-                let mut new_state = ParseState::new(
-                    state.engine,
-                    state.buffer.clone(),
-                    #[cfg(not(feature = "unchecked"))]
-                    state.max_function_expr_depth,
-                    #[cfg(not(feature = "unchecked"))]
-                    state.max_function_expr_depth,
-                );
+                let mut new_state =
+                    ParseState::new(state.engine, state.tokenizer_control.clone());
+                new_state.max_expr_depth = new_state.max_function_expr_depth;

                 let settings = ParseSettings {
                     allow_if_expr: true,


@@ -11,10 +11,11 @@ use crate::stdlib::{
     iter::{FusedIterator, Peekable},
     num::NonZeroUsize,
     ops::{Add, AddAssign},
+    rc::Rc,
     str::{Chars, FromStr},
     string::{String, ToString},
 };
-use crate::{Engine, LexError, Shared, StaticVec, INT};
+use crate::{Engine, LexError, StaticVec, INT};

 #[cfg(not(feature = "no_float"))]
 use crate::ast::FloatWrapper;
@@ -25,6 +26,17 @@ use rust_decimal::Decimal;
 #[cfg(not(feature = "no_function"))]
 use crate::engine::KEYWORD_IS_DEF_FN;

+/// A type containing commands to control the tokenizer.
+#[derive(Debug, Clone, Eq, PartialEq, Hash, Copy, Default)]
+pub struct TokenizeControlBlock {
+    /// Is the current tokenizer position within an interpolated text string?
+    /// This flag allows switching the tokenizer back to _text_ parsing after an interpolation stream.
+    pub is_within_text: bool,
+}
+
+/// A shared object that allows control of the tokenizer from outside.
+pub type TokenizerControl = Rc<Cell<TokenizeControlBlock>>;
+
 type LERR = LexError;

 /// Separator character for numbers.
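Because `TokenizeControlBlock` is `Copy`, the shared handle is read and written through `Cell::get`/`Cell::set` round trips, with no borrows held across the two sides. A standalone sketch of that usage, with the two definitions reproduced from the hunk above:

use std::cell::Cell;
use std::rc::Rc;

#[derive(Debug, Clone, Eq, PartialEq, Hash, Copy, Default)]
pub struct TokenizeControlBlock {
    pub is_within_text: bool,
}

pub type TokenizerControl = Rc<Cell<TokenizeControlBlock>>;

fn main() {
    let control: TokenizerControl = Default::default();
    let other_side = control.clone(); // e.g. held by the tokenizer

    // `Cell` hands out copies, so every update is a
    // get/modify/set round trip:
    let mut block = control.get();
    block.is_within_text = true;
    control.set(block);

    // The clone observes the same state.
    assert!(other_side.get().is_within_text);
}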
@@ -849,6 +861,9 @@ pub trait InputStream {
 /// _(INTERNALS)_ Parse a string literal ended by `termination_char`.
 /// Exported under the `internals` feature only.
 ///
+/// Returns the parsed string and a boolean indicating whether the string is
+/// terminated by an interpolation `${`.
+///
 /// # Volatile API
 ///
 /// This function is volatile and may change.
@@ -1840,8 +1855,8 @@ pub struct TokenIterator<'a> {
     state: TokenizeState,
     /// Current position.
     pos: Position,
-    /// Buffer containing the next character to read, if any.
-    buffer: Shared<Cell<Option<char>>>,
+    /// External buffer containing the next character to read, if any.
+    tokenizer_control: TokenizerControl,
     /// Input character stream.
     stream: MultiInputsStream<'a>,
     /// A processor function that maps a token to another.
@@ -1852,9 +1867,16 @@ impl<'a> Iterator for TokenIterator<'a> {
     type Item = (Token, Position);

     fn next(&mut self) -> Option<Self::Item> {
-        if let Some(ch) = self.buffer.take() {
-            self.stream.unget(ch);
+        let mut control = self.tokenizer_control.get();
+
+        if control.is_within_text {
+            // Push a back-tick into the stream
+            self.stream.unget('`');
+            // Rewind the current position by one character
             self.pos.rewind();
+            // Reset it
+            control.is_within_text = false;
+            self.tokenizer_control.set(control);
         }

         let (token, pos) = match get_next_token(&mut self.stream, &mut self.state, &mut self.pos) {
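Instead of replaying a saved character, the iterator now synthesizes a back-tick so that `get_next_token` re-enters string-literal mode on its own. A minimal sketch of the one-slot push-back that `unget` implies; `MultiInputsStream`'s real implementation is not shown in this diff, so its shape here is an assumption:

use std::str::Chars;

// Assumed shape of the stream's push-back: a single-slot buffer
// consulted before the underlying character iterator.
struct Stream<'a> {
    pushed_back: Option<char>,
    chars: Chars<'a>,
}

impl<'a> Stream<'a> {
    fn unget(&mut self, ch: char) {
        self.pushed_back = Some(ch);
    }
    fn next_ch(&mut self) -> Option<char> {
        self.pushed_back.take().or_else(|| self.chars.next())
    }
}

fn main() {
    let mut stream = Stream { pushed_back: None, chars: "world`".chars() };
    // Resume an interpolated literal: inject a back-tick so the next
    // token is lexed in string mode, then continue with the real input.
    stream.unget('`');
    assert_eq!(stream.next_ch(), Some('`'));
    assert_eq!(stream.next_ch(), Some('w'));
}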
@@ -1945,7 +1967,7 @@ impl Engine {
     pub fn lex<'a>(
         &'a self,
         input: impl IntoIterator<Item = &'a &'a str>,
-    ) -> (TokenIterator<'a>, Shared<Cell<Option<char>>>) {
+    ) -> (TokenIterator<'a>, TokenizerControl) {
         self.lex_raw(input, None)
     }

     /// _(INTERNALS)_ Tokenize an input text stream with a mapping function.
@@ -1956,7 +1978,7 @@ impl Engine {
         &'a self,
         input: impl IntoIterator<Item = &'a &'a str>,
         map: fn(Token) -> Token,
-    ) -> (TokenIterator<'a>, Shared<Cell<Option<char>>>) {
+    ) -> (TokenIterator<'a>, TokenizerControl) {
         self.lex_raw(input, Some(map))
     }

     /// Tokenize an input text stream with an optional mapping function.
@@ -1965,8 +1987,8 @@ impl Engine {
         &'a self,
         input: impl IntoIterator<Item = &'a &'a str>,
         map: Option<fn(Token) -> Token>,
-    ) -> (TokenIterator<'a>, Shared<Cell<Option<char>>>) {
+    ) -> (TokenIterator<'a>, TokenizerControl) {
-        let buffer: Shared<Cell<Option<char>>> = Cell::new(None).into();
+        let buffer: TokenizerControl = Default::default();
         let buffer2 = buffer.clone();

         (
@@ -1984,7 +2006,7 @@ impl Engine {
                 disable_doc_comments: self.disable_doc_comments,
             },
             pos: Position::new(1, 0),
-            buffer,
+            tokenizer_control: buffer,
             stream: MultiInputsStream {
                 buf: None,
                 streams: input.into_iter().map(|s| s.chars().peekable()).collect(),
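`lex_raw` ends by moving one clone of the control handle into the `TokenIterator` (`tokenizer_control: buffer`) and returning the other (`buffer2`) to the caller; both alias the same `Cell`, which is what lets the parser steer the tokenizer after the fact. A reduced sketch of that ownership split, with the payload simplified to a `bool`:

use std::cell::Cell;
use std::rc::Rc;

// One handle goes into the token iterator, the clone goes back to the
// caller (the parser); both point at the same shared `Cell`.
fn split_handles() -> (Rc<Cell<bool>>, Rc<Cell<bool>>) {
    let buffer: Rc<Cell<bool>> = Default::default();
    let buffer2 = buffer.clone();
    (buffer, buffer2)
}

fn main() {
    let (for_iterator, for_parser) = split_handles();
    for_parser.set(true);        // the parser requests a mode switch...
    assert!(for_iterator.get()); // ...and the iterator observes it
}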