Refactor tokenizer.

Stephen Chung 2020-06-26 19:44:50 +08:00
parent 14746f94ca
commit 31eaf321d0
2 changed files with 702 additions and 640 deletions

src/lib.rs

@@ -131,15 +131,11 @@ pub use optimize::OptimizationLevel;
 #[cfg(feature = "internals")]
 #[deprecated(note = "this type is volatile and may change")]
-pub use token::Token;
+pub use token::{get_next_token, parse_string_literal, InputStream, Token, TokenizeState};
 #[cfg(feature = "internals")]
 #[deprecated(note = "this type is volatile and may change")]
-pub use parser::Expr;
-#[cfg(feature = "internals")]
-#[deprecated(note = "this type is volatile and may change")]
-pub use parser::Stmt;
+pub use parser::{Expr, ReturnType, Stmt};
 #[cfg(feature = "internals")]
 #[deprecated(note = "this type is volatile and may change")]
@@ -148,7 +144,3 @@ pub use module::ModuleRef;
 #[cfg(feature = "internals")]
 #[deprecated(note = "this type is volatile and may change")]
 pub use utils::StaticVec;
-#[cfg(feature = "internals")]
-#[deprecated(note = "this type is volatile and may change")]
-pub use parser::ReturnType;
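
Taken together, these `internals` re-exports let external tooling drive the tokenizer piecewise instead of only through `lex`. A minimal sketch of such a consumer, assuming the `internals` feature is enabled; the `StrStream` type and the driver loop below are illustrative, not part of this commit, `TokenizeState`'s fields are private so only its `Default` values are reachable from outside the crate, and `Token` is assumed to derive `Debug`:

    #![allow(deprecated)] // the `internals` re-exports are marked volatile

    use rhai::{get_next_token, InputStream, Position, Token, TokenizeState};

    /// Hypothetical single-string stream, written for this sketch only.
    struct StrStream<'a> {
        chars: std::iter::Peekable<std::str::Chars<'a>>,
    }

    impl InputStream for StrStream<'_> {
        fn get_next(&mut self) -> Option<char> {
            self.chars.next()
        }
        fn peek_next(&mut self) -> Option<char> {
            self.chars.peek().copied()
        }
    }

    fn main() {
        let mut stream = StrStream { chars: "40 + 2".chars().peekable() };
        // Private fields mean defaults only (e.g. can_be_unary = false),
        // so this is not a full-fidelity replacement for lex().
        let mut state = TokenizeState::default();
        let mut pos = Position::new(1, 0);

        // get_next_token() keeps yielding EOF after the input runs out
        // (end_with_none defaults to false), so break on it explicitly.
        while let Some((token, token_pos)) = get_next_token(&mut stream, &mut state, &mut pos) {
            if let Token::EOF = token {
                break;
            }
            println!("{:?} at {}", token, token_pos);
        }
    }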

src/token.rs

@@ -209,6 +209,7 @@ pub enum Token {
     #[cfg(not(feature = "no_module"))]
     As,
     LexError(Box<LexError>),
+    Comment(String),
     EOF,
 }
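
The new `Comment` variant carries the comment text verbatim, delimiters included, as the tokenizer changes further down show. Nothing reaches it through the public `lex` API yet, because `include_comments` is hard-wired to `false` at the bottom of this diff, but a token stream that did produce it could be filtered with a fragment like this (illustrative, not code from the commit; `tokens` stands for any iterator of `(Token, Position)` pairs):

    let comments: Vec<String> = tokens
        .filter_map(|(token, _pos)| match token {
            Token::Comment(text) => Some(text),
            _ => None,
        })
        .collect();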
@@ -429,89 +430,46 @@ impl From<Token> for String {
     }
 }
 
-/// An iterator on a `Token` stream.
-pub struct TokenIterator<'a> {
+/// State of the tokenizer.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Default)]
+pub struct TokenizeState {
     /// Maximum length of a string (0 = unlimited).
     max_string_size: usize,
     /// Can the next token be a unary operator?
     can_be_unary: bool,
-    /// Current position.
-    pos: Position,
-    /// The input character streams.
-    streams: StaticVec<Peekable<Chars<'a>>>,
+    /// Is the tokenizer currently inside a block comment?
+    comment_level: usize,
+    /// Return `None` at the end of the stream instead of `Some(Token::EOF)`?
+    end_with_none: bool,
+    /// Include comments?
+    include_comments: bool,
 }
 
-impl<'a> TokenIterator<'a> {
-    /// Consume the next character.
-    fn eat_next(&mut self) {
-        self.get_next();
-        self.advance();
-    }
+/// Trait that encapsulates a peekable character input stream.
+pub trait InputStream {
     /// Get the next character
-    fn get_next(&mut self) -> Option<char> {
-        loop {
-            if self.streams.is_empty() {
-                // No more streams
-                return None;
-            } else if let Some(ch) = self.streams[0].next() {
-                // Next character in current stream
-                return Some(ch);
-            } else {
-                // Jump to the next stream
-                let _ = self.streams.remove(0);
-            }
-        }
-    }
+    fn get_next(&mut self) -> Option<char>;
     /// Peek the next character
-    fn peek_next(&mut self) -> Option<char> {
-        loop {
-            if self.streams.is_empty() {
-                // No more streams
-                return None;
-            } else if let Some(ch) = self.streams[0].peek() {
-                // Next character in current stream
-                return Some(*ch);
-            } else {
-                // Jump to the next stream
-                let _ = self.streams.remove(0);
-            }
-        }
-    }
-    /// Move the current position one character ahead.
-    fn advance(&mut self) {
-        self.pos.advance();
-    }
-    /// Move the current position back one character.
-    ///
-    /// # Panics
-    ///
-    /// Panics if already at the beginning of a line - cannot rewind to the previous line.
-    fn rewind(&mut self) {
-        self.pos.rewind();
-    }
-    /// Move the current position to the next line.
-    fn new_line(&mut self) {
-        self.pos.new_line()
-    }
+    fn peek_next(&mut self) -> Option<char>;
+}
 
 /// Parse a string literal wrapped by `enclosing_char`.
 pub fn parse_string_literal(
-    &mut self,
+    stream: &mut impl InputStream,
+    state: &mut TokenizeState,
+    pos: &mut Position,
     enclosing_char: char,
-    max_length: usize,
 ) -> Result<String, (LexError, Position)> {
     let mut result = Vec::new();
     let mut escape = String::with_capacity(12);
 
     loop {
-        let next_char = self
-            .get_next()
-            .ok_or((LERR::UnterminatedString, self.pos))?;
-        self.advance();
+        let next_char = stream.get_next().ok_or((LERR::UnterminatedString, *pos))?;
+        pos.advance();
 
-        if max_length > 0 && result.len() > max_length {
-            return Err((LexError::StringTooLong(max_length), self.pos));
-        }
+        if state.max_string_size > 0 && result.len() > state.max_string_size {
+            return Err((LexError::StringTooLong(state.max_string_size), *pos));
+        }
 
         match next_char {
@@ -554,22 +512,22 @@ impl<'a> TokenIterator<'a> {
                 };
 
                 for _ in 0..len {
-                    let c = self.get_next().ok_or_else(|| {
-                        (LERR::MalformedEscapeSequence(seq.to_string()), self.pos)
-                    })?;
+                    let c = stream
+                        .get_next()
+                        .ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?;
                     seq.push(c);
-                    self.advance();
+                    pos.advance();
 
                     out_val *= 16;
-                    out_val += c.to_digit(16).ok_or_else(|| {
-                        (LERR::MalformedEscapeSequence(seq.to_string()), self.pos)
-                    })?;
+                    out_val += c
+                        .to_digit(16)
+                        .ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?;
                 }
 
                 result.push(
                     char::from_u32(out_val)
-                        .ok_or_else(|| (LERR::MalformedEscapeSequence(seq), self.pos))?,
+                        .ok_or_else(|| (LERR::MalformedEscapeSequence(seq), *pos))?,
                 );
             }
@@ -583,14 +541,12 @@ impl<'a> TokenIterator<'a> {
             ch if enclosing_char == ch && escape.is_empty() => break,
 
             // Unknown escape sequence
-            _ if !escape.is_empty() => {
-                return Err((LERR::MalformedEscapeSequence(escape), self.pos))
-            }
+            _ if !escape.is_empty() => return Err((LERR::MalformedEscapeSequence(escape), *pos)),
 
             // Cannot have new-lines inside string literals
             '\n' => {
-                self.rewind();
-                return Err((LERR::UnterminatedString, self.pos));
+                pos.rewind();
+                return Err((LERR::UnterminatedString, *pos));
             }
 
             // All other characters
@@ -603,25 +559,94 @@ impl<'a> TokenIterator<'a> {
     let s = result.iter().collect::<String>();
 
-    if max_length > 0 && s.len() > max_length {
-        return Err((LexError::StringTooLong(max_length), self.pos));
+    if state.max_string_size > 0 && s.len() > state.max_string_size {
+        return Err((LexError::StringTooLong(state.max_string_size), *pos));
     }
 
     Ok(s)
 }
 
+/// Consume the next character.
+fn eat_next(stream: &mut impl InputStream, pos: &mut Position) {
+    stream.get_next();
+    pos.advance();
+}
+
+/// Scan for a block comment until the end.
+fn scan_comment(
+    stream: &mut impl InputStream,
+    state: &mut TokenizeState,
+    pos: &mut Position,
+    comment: &mut String,
+) {
+    while let Some(c) = stream.get_next() {
+        pos.advance();
+
+        if state.include_comments {
+            comment.push(c);
+        }
+
+        match c {
+            '/' => {
+                if let Some(c2) = stream.get_next() {
+                    if state.include_comments {
+                        comment.push(c2);
+                    }
+                    if c2 == '*' {
+                        state.comment_level += 1;
+                    }
+                }
+                pos.advance();
+            }
+            '*' => {
+                if let Some(c2) = stream.get_next() {
+                    if state.include_comments {
+                        comment.push(c2);
+                    }
+                    if c2 == '/' {
+                        state.comment_level -= 1;
+                    }
+                }
+                pos.advance();
+            }
+            '\n' => pos.new_line(),
+            _ => (),
+        }
+
+        if state.comment_level == 0 {
+            break;
+        }
+    }
+}
+
 /// Get the next token.
-fn inner_next(&mut self) -> Option<(Token, Position)> {
+pub fn get_next_token(
+    stream: &mut impl InputStream,
+    state: &mut TokenizeState,
+    pos: &mut Position,
+) -> Option<(Token, Position)> {
+    // Still inside a comment?
+    if state.comment_level > 0 {
+        let start_pos = *pos;
+        let mut comment = Default::default();
+        scan_comment(stream, state, pos, &mut comment);
+
+        if state.include_comments {
+            return Some((Token::Comment(comment), start_pos));
+        }
+    }
+
     let mut negated = false;
 
-    while let Some(c) = self.get_next() {
-        self.advance();
-        let pos = self.pos;
+    while let Some(c) = stream.get_next() {
+        pos.advance();
+
+        let start_pos = *pos;
 
-        match (c, self.peek_next().unwrap_or('\0')) {
+        match (c, stream.peek_next().unwrap_or('\0')) {
             // \n
-            ('\n', _) => self.new_line(),
+            ('\n', _) => pos.new_line(),
 
             // digit ...
             ('0'..='9', _) => {
@@ -629,32 +654,30 @@ impl<'a> TokenIterator<'a> {
                 let mut radix_base: Option<u32> = None;
                 result.push(c);
 
-                while let Some(next_char) = self.peek_next() {
+                while let Some(next_char) = stream.peek_next() {
                     match next_char {
                         '0'..='9' | '_' => {
                             result.push(next_char);
-                            self.eat_next();
+                            eat_next(stream, pos);
                         }
                         #[cfg(not(feature = "no_float"))]
                         '.' => {
                             result.push(next_char);
-                            self.eat_next();
-                            while let Some(next_char_in_float) = self.peek_next() {
+                            eat_next(stream, pos);
+                            while let Some(next_char_in_float) = stream.peek_next() {
                                 match next_char_in_float {
                                     '0'..='9' | '_' => {
                                         result.push(next_char_in_float);
-                                        self.eat_next();
+                                        eat_next(stream, pos);
                                     }
                                     _ => break,
                                 }
                             }
                         }
                         // 0x????, 0o????, 0b????
-                        ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B'
-                            if c == '0' =>
-                        {
+                        ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B' if c == '0' => {
                             result.push(next_char);
-                            self.eat_next();
+                            eat_next(stream, pos);
 
                             let valid = match ch {
                                 'x' | 'X' => [
@@ -679,13 +702,13 @@ impl<'a> TokenIterator<'a> {
                                 _ => unreachable!(),
                             });
 
-                            while let Some(next_char_in_escape_seq) = self.peek_next() {
+                            while let Some(next_char_in_escape_seq) = stream.peek_next() {
                                 if !valid.contains(&next_char_in_escape_seq) {
                                     break;
                                 }
                                 result.push(next_char_in_escape_seq);
-                                self.eat_next();
+                                eat_next(stream, pos);
                             }
                         }
@@ -709,7 +732,7 @@ impl<'a> TokenIterator<'a> {
                             result.into_iter().collect(),
                         )))
                     }),
-                    pos,
+                    start_pos,
                 ));
             } else {
                 let out: String = result.iter().filter(|&&c| c != '_').collect();
@@ -725,7 +748,7 @@ impl<'a> TokenIterator<'a> {
                             result.into_iter().collect(),
                         )))
                     }),
-                    pos,
+                    start_pos,
                 ));
             }
         }
@@ -735,11 +758,11 @@ impl<'a> TokenIterator<'a> {
                 let mut result = Vec::new();
                 result.push(c);
 
-                while let Some(next_char) = self.peek_next() {
+                while let Some(next_char) = stream.peek_next() {
                     match next_char {
                         x if x.is_ascii_alphanumeric() || x == '_' => {
                             result.push(x);
-                            self.eat_next();
+                            eat_next(stream, pos);
                         }
                         _ => break,
                     }
@@ -756,7 +779,7 @@ impl<'a> TokenIterator<'a> {
                 if !is_valid_identifier {
                     return Some((
                         Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))),
-                        pos,
+                        start_pos,
                     ));
                 }
@@ -790,30 +813,24 @@ impl<'a> TokenIterator<'a> {
                         _ => Token::Identifier(identifier),
                     },
-                    pos,
+                    start_pos,
                 ));
             }
 
             // " - string literal
-            ('"', _) => {
-                return self
-                    .parse_string_literal('"', self.max_string_size)
-                    .map_or_else(
-                        |err| Some((Token::LexError(Box::new(err.0)), err.1)),
-                        |out| Some((Token::StringConst(out), pos)),
-                    );
-            }
+            ('"', _) => return parse_string_literal(stream, state, pos, '"')
+                .map_or_else(
+                    |err| Some((Token::LexError(Box::new(err.0)), err.1)),
+                    |out| Some((Token::StringConst(out), start_pos)),
+                ),
 
             // ' - character literal
-            ('\'', '\'') => {
-                return Some((
-                    Token::LexError(Box::new(LERR::MalformedChar("".to_string()))),
-                    pos,
-                ));
-            }
-            ('\'', _) => {
-                return Some(
-                    self.parse_string_literal('\'', self.max_string_size)
-                        .map_or_else(
-                            |err| (Token::LexError(Box::new(err.0)), err.1),
-                            |result| {
+            ('\'', '\'') => return Some((
+                Token::LexError(Box::new(LERR::MalformedChar("".to_string()))),
+                start_pos,
+            )),
+            ('\'', _) => return Some(
+                parse_string_literal(stream, state, pos, '\'')
+                    .map_or_else(
+                        |err| (Token::LexError(Box::new(err.0)), err.1),
+                        |result| {
@@ -823,283 +840,329 @@ impl<'a> TokenIterator<'a> {
                             if chars.next().is_some() {
                                 (
                                     Token::LexError(Box::new(LERR::MalformedChar(result))),
-                                    pos,
+                                    start_pos,
                                 )
                             } else {
-                                (Token::CharConstant(first.expect("should be Some")), pos)
+                                (Token::CharConstant(first.expect("should be Some")), start_pos)
                             }
                         },
                     ),
-                );
-            }
+            ),
 
             // Braces
-            ('{', _) => return Some((Token::LeftBrace, pos)),
-            ('}', _) => return Some((Token::RightBrace, pos)),
+            ('{', _) => return Some((Token::LeftBrace, start_pos)),
+            ('}', _) => return Some((Token::RightBrace, start_pos)),
 
             // Parentheses
-            ('(', _) => return Some((Token::LeftParen, pos)),
-            (')', _) => return Some((Token::RightParen, pos)),
+            ('(', _) => return Some((Token::LeftParen, start_pos)),
+            (')', _) => return Some((Token::RightParen, start_pos)),
 
             // Indexing
-            ('[', _) => return Some((Token::LeftBracket, pos)),
-            (']', _) => return Some((Token::RightBracket, pos)),
+            ('[', _) => return Some((Token::LeftBracket, start_pos)),
+            (']', _) => return Some((Token::RightBracket, start_pos)),
 
             // Map literal
             #[cfg(not(feature = "no_object"))]
             ('#', '{') => {
-                self.eat_next();
-                return Some((Token::MapStart, pos));
+                eat_next(stream, pos);
+                return Some((Token::MapStart, start_pos));
             }
 
             // Operators
             ('+', '=') => {
-                self.eat_next();
-                return Some((Token::PlusAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::PlusAssign, start_pos));
             }
-            ('+', _) if self.can_be_unary => return Some((Token::UnaryPlus, pos)),
-            ('+', _) => return Some((Token::Plus, pos)),
+            ('+', _) if state.can_be_unary => return Some((Token::UnaryPlus, start_pos)),
+            ('+', _) => return Some((Token::Plus, start_pos)),
 
-            ('-', '0'..='9') if self.can_be_unary => negated = true,
-            ('-', '0'..='9') => return Some((Token::Minus, pos)),
+            ('-', '0'..='9') if state.can_be_unary => negated = true,
+            ('-', '0'..='9') => return Some((Token::Minus, start_pos)),
             ('-', '=') => {
-                self.eat_next();
-                return Some((Token::MinusAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::MinusAssign, start_pos));
             }
-            ('-', '>') => {
-                return Some((
-                    Token::LexError(Box::new(LERR::ImproperSymbol(
-                        "'->' is not a valid symbol. This is not C or C++!".to_string(),
-                    ))),
-                    pos,
-                ))
-            }
-            ('-', _) if self.can_be_unary => return Some((Token::UnaryMinus, pos)),
-            ('-', _) => return Some((Token::Minus, pos)),
+            ('-', '>') => return Some((
+                Token::LexError(Box::new(LERR::ImproperSymbol(
+                    "'->' is not a valid symbol. This is not C or C++!".to_string(),
+                ))),
+                start_pos,
+            )),
+            ('-', _) if state.can_be_unary => return Some((Token::UnaryMinus, start_pos)),
+            ('-', _) => return Some((Token::Minus, start_pos)),
 
             ('*', '=') => {
-                self.eat_next();
-                return Some((Token::MultiplyAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::MultiplyAssign, start_pos));
             }
-            ('*', _) => return Some((Token::Multiply, pos)),
+            ('*', _) => return Some((Token::Multiply, start_pos)),
             // Comments
             ('/', '/') => {
-                self.eat_next();
+                eat_next(stream, pos);
+
+                let mut comment = if state.include_comments {
+                    "//".to_string()
+                } else {
+                    Default::default()
+                };
 
-                while let Some(c) = self.get_next() {
+                while let Some(c) = stream.get_next() {
                     if c == '\n' {
-                        self.new_line();
+                        pos.new_line();
                         break;
                     }
 
-                    self.advance();
+                    if state.include_comments {
+                        comment.push(c);
+                    }
+
+                    pos.advance();
+                }
+
+                if state.include_comments {
+                    return Some((Token::Comment(comment), start_pos));
                 }
             }
             ('/', '*') => {
-                let mut level = 1;
-                self.eat_next();
+                state.comment_level = 1;
+                eat_next(stream, pos);
 
-                while let Some(c) = self.get_next() {
-                    self.advance();
-
-                    match c {
-                        '/' => {
-                            if self.get_next() == Some('*') {
-                                level += 1;
-                            }
-                            self.advance();
-                        }
-                        '*' => {
-                            if self.get_next() == Some('/') {
-                                level -= 1;
-                            }
-                            self.advance();
-                        }
-                        '\n' => self.new_line(),
-                        _ => (),
-                    }
-
-                    if level == 0 {
-                        break;
-                    }
-                }
+                let mut comment = if state.include_comments {
+                    "/*".to_string()
+                } else {
+                    Default::default()
+                };
+
+                scan_comment(stream, state, pos, &mut comment);
+
+                if state.include_comments {
+                    return Some((Token::Comment(comment), start_pos));
+                }
             }
             ('/', '=') => {
-                self.eat_next();
-                return Some((Token::DivideAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::DivideAssign, start_pos));
             }
-            ('/', _) => return Some((Token::Divide, pos)),
+            ('/', _) => return Some((Token::Divide, start_pos)),
 
-            (';', _) => return Some((Token::SemiColon, pos)),
-            (',', _) => return Some((Token::Comma, pos)),
-            ('.', _) => return Some((Token::Period, pos)),
+            (';', _) => return Some((Token::SemiColon, start_pos)),
+            (',', _) => return Some((Token::Comma, start_pos)),
+            ('.', _) => return Some((Token::Period, start_pos)),
 
             ('=', '=') => {
-                self.eat_next();
+                eat_next(stream, pos);
 
                 // Warn against `===`
-                if self.peek_next() == Some('=') {
+                if stream.peek_next() == Some('=') {
                     return Some((
                         Token::LexError(Box::new(LERR::ImproperSymbol(
                             "'===' is not a valid operator. This is not JavaScript! Should it be '=='?"
                                 .to_string(),
                         ))),
-                        pos,
+                        start_pos,
                     ));
                 }
 
-                return Some((Token::EqualsTo, pos));
+                return Some((Token::EqualsTo, start_pos));
             }
-            ('=', '>') => {
-                return Some((
-                    Token::LexError(Box::new(LERR::ImproperSymbol(
-                        "'=>' is not a valid symbol. This is not Rust! Should it be '>='?"
-                            .to_string(),
-                    ))),
-                    pos,
-                ))
-            }
-            ('=', _) => return Some((Token::Equals, pos)),
+            ('=', '>') => return Some((
+                Token::LexError(Box::new(LERR::ImproperSymbol(
+                    "'=>' is not a valid symbol. This is not Rust! Should it be '>='?"
+                        .to_string(),
+                ))),
+                start_pos,
+            )),
+            ('=', _) => return Some((Token::Equals, start_pos)),
 
             (':', ':') => {
-                self.eat_next();
-                return Some((Token::DoubleColon, pos));
+                eat_next(stream, pos);
+                return Some((Token::DoubleColon, start_pos));
             }
-            (':', '=') => {
-                return Some((
-                    Token::LexError(Box::new(LERR::ImproperSymbol(
-                        "':=' is not a valid assignment operator. This is not Pascal! Should it be simply '='?"
-                            .to_string(),
-                    ))),
-                    pos,
-                ))
-            }
-            (':', _) => return Some((Token::Colon, pos)),
+            (':', '=') => return Some((
+                Token::LexError(Box::new(LERR::ImproperSymbol(
+                    "':=' is not a valid assignment operator. This is not Pascal! Should it be simply '='?"
+                        .to_string(),
+                ))),
+                start_pos,
+            )),
+            (':', _) => return Some((Token::Colon, start_pos)),
 
             ('<', '=') => {
-                self.eat_next();
-                return Some((Token::LessThanEqualsTo, pos));
+                eat_next(stream, pos);
+                return Some((Token::LessThanEqualsTo, start_pos));
             }
-            ('<', '-') => {
-                return Some((
-                    Token::LexError(Box::new(LERR::ImproperSymbol(
-                        "'<-' is not a valid symbol. Should it be '<='?".to_string(),
-                    ))),
-                    pos,
-                ))
-            }
+            ('<', '-') => return Some((
+                Token::LexError(Box::new(LERR::ImproperSymbol(
+                    "'<-' is not a valid symbol. Should it be '<='?".to_string(),
+                ))),
+                start_pos,
+            )),
             ('<', '<') => {
-                self.eat_next();
+                eat_next(stream, pos);
 
                 return Some((
-                    if self.peek_next() == Some('=') {
-                        self.eat_next();
+                    if stream.peek_next() == Some('=') {
+                        eat_next(stream, pos);
                         Token::LeftShiftAssign
                     } else {
                         Token::LeftShift
                     },
-                    pos,
+                    start_pos,
                 ));
             }
-            ('<', _) => return Some((Token::LessThan, pos)),
+            ('<', _) => return Some((Token::LessThan, start_pos)),
 
             ('>', '=') => {
-                self.eat_next();
-                return Some((Token::GreaterThanEqualsTo, pos));
+                eat_next(stream, pos);
+                return Some((Token::GreaterThanEqualsTo, start_pos));
             }
             ('>', '>') => {
-                self.eat_next();
+                eat_next(stream, pos);
 
                 return Some((
-                    if self.peek_next() == Some('=') {
-                        self.eat_next();
+                    if stream.peek_next() == Some('=') {
+                        eat_next(stream, pos);
                         Token::RightShiftAssign
                     } else {
                         Token::RightShift
                     },
-                    pos,
+                    start_pos,
                 ));
             }
-            ('>', _) => return Some((Token::GreaterThan, pos)),
+            ('>', _) => return Some((Token::GreaterThan, start_pos)),
 
             ('!', '=') => {
-                self.eat_next();
+                eat_next(stream, pos);
 
                 // Warn against `!==`
-                if self.peek_next() == Some('=') {
+                if stream.peek_next() == Some('=') {
                     return Some((
                         Token::LexError(Box::new(LERR::ImproperSymbol(
                             "'!==' is not a valid operator. This is not JavaScript! Should it be '!='?"
                                 .to_string(),
                         ))),
-                        pos,
+                        start_pos,
                     ));
                 }
 
-                return Some((Token::NotEqualsTo, pos));
+                return Some((Token::NotEqualsTo, start_pos));
             }
-            ('!', _) => return Some((Token::Bang, pos)),
+            ('!', _) => return Some((Token::Bang, start_pos)),
 
             ('|', '|') => {
-                self.eat_next();
-                return Some((Token::Or, pos));
+                eat_next(stream, pos);
+                return Some((Token::Or, start_pos));
             }
             ('|', '=') => {
-                self.eat_next();
-                return Some((Token::OrAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::OrAssign, start_pos));
             }
-            ('|', _) => return Some((Token::Pipe, pos)),
+            ('|', _) => return Some((Token::Pipe, start_pos)),
 
             ('&', '&') => {
-                self.eat_next();
-                return Some((Token::And, pos));
+                eat_next(stream, pos);
+                return Some((Token::And, start_pos));
             }
             ('&', '=') => {
-                self.eat_next();
-                return Some((Token::AndAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::AndAssign, start_pos));
             }
-            ('&', _) => return Some((Token::Ampersand, pos)),
+            ('&', _) => return Some((Token::Ampersand, start_pos)),
 
             ('^', '=') => {
-                self.eat_next();
-                return Some((Token::XOrAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::XOrAssign, start_pos));
             }
-            ('^', _) => return Some((Token::XOr, pos)),
+            ('^', _) => return Some((Token::XOr, start_pos)),
 
             ('%', '=') => {
-                self.eat_next();
-                return Some((Token::ModuloAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::ModuloAssign, start_pos));
             }
-            ('%', _) => return Some((Token::Modulo, pos)),
+            ('%', _) => return Some((Token::Modulo, start_pos)),
 
             ('~', '=') => {
-                self.eat_next();
-                return Some((Token::PowerOfAssign, pos));
+                eat_next(stream, pos);
+                return Some((Token::PowerOfAssign, start_pos));
             }
-            ('~', _) => return Some((Token::PowerOf, pos)),
+            ('~', _) => return Some((Token::PowerOf, start_pos)),
 
             ('\0', _) => unreachable!(),
 
             (ch, _) if ch.is_whitespace() => (),
-            (ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), pos)),
+            (ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), start_pos)),
         }
     }
 
-    self.advance();
-    Some((Token::EOF, self.pos))
+    pos.advance();
+
+    if state.end_with_none {
+        None
+    } else {
+        Some((Token::EOF, *pos))
     }
 }
+/// A type that implements the `InputStream` trait.
+/// Multiple character streams are joined together to form one single stream.
+pub struct MultiInputsStream<'a> {
+    /// The input character streams.
+    streams: StaticVec<Peekable<Chars<'a>>>,
+}
+
+impl InputStream for MultiInputsStream<'_> {
+    /// Get the next character
+    fn get_next(&mut self) -> Option<char> {
+        loop {
+            if self.streams.is_empty() {
+                // No more streams
+                return None;
+            } else if let Some(ch) = self.streams[0].next() {
+                // Next character in current stream
+                return Some(ch);
+            } else {
+                // Jump to the next stream
+                let _ = self.streams.remove(0);
+            }
+        }
+    }
+    /// Peek the next character
+    fn peek_next(&mut self) -> Option<char> {
+        loop {
+            if self.streams.is_empty() {
+                // No more streams
+                return None;
+            } else if let Some(ch) = self.streams[0].peek() {
+                // Next character in current stream
+                return Some(*ch);
+            } else {
+                // Jump to the next stream
+                let _ = self.streams.remove(0);
+            }
+        }
+    }
+}
+
+/// An iterator on a `Token` stream.
+pub struct TokenIterator<'a> {
+    /// Current state.
+    state: TokenizeState,
+    /// Current position.
+    pos: Position,
+    /// Input character stream.
+    stream: MultiInputsStream<'a>,
+}
 impl<'a> Iterator for TokenIterator<'a> {
     type Item = (Token, Position);
 
     fn next(&mut self) -> Option<Self::Item> {
-        self.inner_next().map(|x| {
-            // Save the last token
-            self.can_be_unary = x.0.is_next_unary();
+        get_next_token(&mut self.stream, &mut self.state, &mut self.pos).map(|x| {
+            // Save the last token's state
+            self.state.can_be_unary = x.0.is_next_unary();
             x
         })
     }
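
For context, `MultiInputsStream` above is what lets `lex` accept a slice of strings and treat it as one continuous script: when one segment is exhausted, `get_next`/`peek_next` silently hop to the next, and no separator character is inserted at the boundary. A sketch of the observable behavior, assuming `Token` derives `Debug`, `Clone` and `PartialEq`:

    // Segment boundaries contribute no characters, so ["let x", "= 42;"]
    // tokenizes exactly like ["let x= 42;"]. The iterator repeats EOF
    // forever, hence the take_while() guard.
    let joined: Vec<Token> = lex(&["let x", "= 42;"], 0)
        .map(|(token, _pos)| token)
        .take_while(|token| !matches!(token, Token::EOF))
        .collect();

    let single: Vec<Token> = lex(&["let x = 42;"], 0)
        .map(|(token, _pos)| token)
        .take_while(|token| !matches!(token, Token::EOF))
        .collect();

    assert_eq!(joined, single);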
@@ -1108,9 +1171,16 @@ impl<'a> Iterator for TokenIterator<'a> {
 /// Tokenize an input text stream.
 pub fn lex<'a>(input: &'a [&'a str], max_string_size: usize) -> TokenIterator<'a> {
     TokenIterator {
+        state: TokenizeState {
             max_string_size,
             can_be_unary: true,
+            comment_level: 0,
+            end_with_none: false,
+            include_comments: false,
+        },
         pos: Position::new(1, 0),
+        stream: MultiInputsStream {
             streams: input.iter().map(|s| s.chars().peekable()).collect(),
+        },
     }
 }
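
The `max_string_size` argument flows straight into `TokenizeState::max_string_size`, so an over-long string literal surfaces as a lexer error token instead of being silently accepted. An illustrative check, assuming the behavior of `LexError::StringTooLong` as used in `parse_string_literal` above:

    // With max_string_size = 5, a 10-character string literal should come
    // back as Token::LexError rather than Token::StringConst.
    let (token, _pos) = lex(&[r#"let s = "0123456789";"#], 5)
        .find(|(token, _)| matches!(token, Token::LexError(_) | Token::StringConst(_)))
        .unwrap();

    assert!(matches!(token, Token::LexError(_)));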