From 31eaf321d0237322586487cb980140216afa22eb Mon Sep 17 00:00:00 2001 From: Stephen Chung Date: Fri, 26 Jun 2020 19:44:50 +0800 Subject: [PATCH] Refactor tokenizer. --- src/lib.rs | 12 +- src/token.rs | 1330 ++++++++++++++++++++++++++------------------------ 2 files changed, 702 insertions(+), 640 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 04c49e42..8ebea050 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -131,15 +131,11 @@ pub use optimize::OptimizationLevel; #[cfg(feature = "internals")] #[deprecated(note = "this type is volatile and may change")] -pub use token::Token; +pub use token::{get_next_token, parse_string_literal, InputStream, Token, TokenizeState}; #[cfg(feature = "internals")] #[deprecated(note = "this type is volatile and may change")] -pub use parser::Expr; - -#[cfg(feature = "internals")] -#[deprecated(note = "this type is volatile and may change")] -pub use parser::Stmt; +pub use parser::{Expr, ReturnType, Stmt}; #[cfg(feature = "internals")] #[deprecated(note = "this type is volatile and may change")] @@ -148,7 +144,3 @@ pub use module::ModuleRef; #[cfg(feature = "internals")] #[deprecated(note = "this type is volatile and may change")] pub use utils::StaticVec; - -#[cfg(feature = "internals")] -#[deprecated(note = "this type is volatile and may change")] -pub use parser::ReturnType; diff --git a/src/token.rs b/src/token.rs index 76019d3c..1e53d8a3 100644 --- a/src/token.rs +++ b/src/token.rs @@ -209,6 +209,7 @@ pub enum Token { #[cfg(not(feature = "no_module"))] As, LexError(Box), + Comment(String), EOF, } @@ -429,24 +430,690 @@ impl From for String { } } -/// An iterator on a `Token` stream. -pub struct TokenIterator<'a> { +/// State of the tokenizer. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Default)] +pub struct TokenizeState { /// Maximum length of a string (0 = unlimited). max_string_size: usize, /// Can the next token be a unary operator? can_be_unary: bool, - /// Current position. - pos: Position, + /// Is the tokenizer currently inside a block comment? + comment_level: usize, + /// Return `None` at the end of the stream instead of `Some(Token::EOF)`? + end_with_none: bool, + /// Include comments? + include_comments: bool, +} + +/// Trait that encapsulates a peekable character input stream. +pub trait InputStream { + /// Get the next character + fn get_next(&mut self) -> Option; + /// Peek the next character + fn peek_next(&mut self) -> Option; +} + +/// Parse a string literal wrapped by `enclosing_char`. +pub fn parse_string_literal( + stream: &mut impl InputStream, + state: &mut TokenizeState, + pos: &mut Position, + enclosing_char: char, +) -> Result { + let mut result = Vec::new(); + let mut escape = String::with_capacity(12); + + loop { + let next_char = stream.get_next().ok_or((LERR::UnterminatedString, *pos))?; + + pos.advance(); + + if state.max_string_size > 0 && result.len() > state.max_string_size { + return Err((LexError::StringTooLong(state.max_string_size), *pos)); + } + + match next_char { + // \... + '\\' if escape.is_empty() => { + escape.push('\\'); + } + // \\ + '\\' if !escape.is_empty() => { + escape.clear(); + result.push('\\'); + } + // \t + 't' if !escape.is_empty() => { + escape.clear(); + result.push('\t'); + } + // \n + 'n' if !escape.is_empty() => { + escape.clear(); + result.push('\n'); + } + // \r + 'r' if !escape.is_empty() => { + escape.clear(); + result.push('\r'); + } + // \x??, \u????, \U???????? + ch @ 'x' | ch @ 'u' | ch @ 'U' if !escape.is_empty() => { + let mut seq = escape.clone(); + seq.push(ch); + escape.clear(); + + let mut out_val: u32 = 0; + let len = match ch { + 'x' => 2, + 'u' => 4, + 'U' => 8, + _ => unreachable!(), + }; + + for _ in 0..len { + let c = stream + .get_next() + .ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?; + + seq.push(c); + pos.advance(); + + out_val *= 16; + out_val += c + .to_digit(16) + .ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?; + } + + result.push( + char::from_u32(out_val) + .ok_or_else(|| (LERR::MalformedEscapeSequence(seq), *pos))?, + ); + } + + // \{enclosing_char} - escaped + ch if enclosing_char == ch && !escape.is_empty() => { + escape.clear(); + result.push(ch) + } + + // Close wrapper + ch if enclosing_char == ch && escape.is_empty() => break, + + // Unknown escape sequence + _ if !escape.is_empty() => return Err((LERR::MalformedEscapeSequence(escape), *pos)), + + // Cannot have new-lines inside string literals + '\n' => { + pos.rewind(); + return Err((LERR::UnterminatedString, *pos)); + } + + // All other characters + ch => { + escape.clear(); + result.push(ch); + } + } + } + + let s = result.iter().collect::(); + + if state.max_string_size > 0 && s.len() > state.max_string_size { + return Err((LexError::StringTooLong(state.max_string_size), *pos)); + } + + Ok(s) +} + +/// Consume the next character. +fn eat_next(stream: &mut impl InputStream, pos: &mut Position) { + stream.get_next(); + pos.advance(); +} + +/// Scan for a block comment until the end. +fn scan_comment( + stream: &mut impl InputStream, + state: &mut TokenizeState, + pos: &mut Position, + comment: &mut String, +) { + while let Some(c) = stream.get_next() { + pos.advance(); + + if state.include_comments { + comment.push(c); + } + + match c { + '/' => { + if let Some(c2) = stream.get_next() { + if state.include_comments { + comment.push(c2); + } + if c2 == '*' { + state.comment_level += 1; + } + } + pos.advance(); + } + '*' => { + if let Some(c2) = stream.get_next() { + if state.include_comments { + comment.push(c2); + } + if c2 == '/' { + state.comment_level -= 1; + } + } + pos.advance(); + } + '\n' => pos.new_line(), + _ => (), + } + + if state.comment_level == 0 { + break; + } + } +} + +/// Get the next token. +pub fn get_next_token( + stream: &mut impl InputStream, + state: &mut TokenizeState, + pos: &mut Position, +) -> Option<(Token, Position)> { + // Still inside a comment? + if state.comment_level > 0 { + let start_pos = *pos; + let mut comment = Default::default(); + scan_comment(stream, state, pos, &mut comment); + + if state.include_comments { + println!("Comment ({}): {}", start_pos, comment); + return Some((Token::Comment(comment), start_pos)); + } + } + + let mut negated = false; + + while let Some(c) = stream.get_next() { + pos.advance(); + + let start_pos = *pos; + + match (c, stream.peek_next().unwrap_or('\0')) { + // \n + ('\n', _) => pos.new_line(), + + // digit ... + ('0'..='9', _) => { + let mut result = Vec::new(); + let mut radix_base: Option = None; + result.push(c); + + while let Some(next_char) = stream.peek_next() { + match next_char { + '0'..='9' | '_' => { + result.push(next_char); + eat_next(stream, pos); + } + #[cfg(not(feature = "no_float"))] + '.' => { + result.push(next_char); + eat_next(stream, pos); + while let Some(next_char_in_float) = stream.peek_next() { + match next_char_in_float { + '0'..='9' | '_' => { + result.push(next_char_in_float); + eat_next(stream, pos); + } + _ => break, + } + } + } + // 0x????, 0o????, 0b???? + ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B' if c == '0' => { + result.push(next_char); + eat_next(stream, pos); + + let valid = match ch { + 'x' | 'X' => [ + 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', + ], + 'o' | 'O' => [ + '0', '1', '2', '3', '4', '5', '6', '7', '_', '_', '_', '_', + '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', + ], + 'b' | 'B' => [ + '0', '1', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', + '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', + ], + _ => unreachable!(), + }; + + radix_base = Some(match ch { + 'x' | 'X' => 16, + 'o' | 'O' => 8, + 'b' | 'B' => 2, + _ => unreachable!(), + }); + + while let Some(next_char_in_escape_seq) = stream.peek_next() { + if !valid.contains(&next_char_in_escape_seq) { + break; + } + + result.push(next_char_in_escape_seq); + eat_next(stream, pos); + } + } + + _ => break, + } + } + + if negated { + result.insert(0, '-'); + } + + // Parse number + if let Some(radix) = radix_base { + let out: String = result.iter().skip(2).filter(|&&c| c != '_').collect(); + + return Some(( + INT::from_str_radix(&out, radix) + .map(Token::IntegerConstant) + .unwrap_or_else(|_| { + Token::LexError(Box::new(LERR::MalformedNumber( + result.into_iter().collect(), + ))) + }), + start_pos, + )); + } else { + let out: String = result.iter().filter(|&&c| c != '_').collect(); + let num = INT::from_str(&out).map(Token::IntegerConstant); + + // If integer parsing is unnecessary, try float instead + #[cfg(not(feature = "no_float"))] + let num = num.or_else(|_| FLOAT::from_str(&out).map(Token::FloatConstant)); + + return Some(( + num.unwrap_or_else(|_| { + Token::LexError(Box::new(LERR::MalformedNumber( + result.into_iter().collect(), + ))) + }), + start_pos, + )); + } + } + + // letter or underscore ... + ('A'..='Z', _) | ('a'..='z', _) | ('_', _) => { + let mut result = Vec::new(); + result.push(c); + + while let Some(next_char) = stream.peek_next() { + match next_char { + x if x.is_ascii_alphanumeric() || x == '_' => { + result.push(x); + eat_next(stream, pos); + } + _ => break, + } + } + + let is_valid_identifier = result + .iter() + .find(|&ch| char::is_ascii_alphanumeric(ch)) // first alpha-numeric character + .map(char::is_ascii_alphabetic) // is a letter + .unwrap_or(false); // if no alpha-numeric at all - syntax error + + let identifier: String = result.iter().collect(); + + if !is_valid_identifier { + return Some(( + Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), + start_pos, + )); + } + + return Some(( + match identifier.as_str() { + "true" => Token::True, + "false" => Token::False, + "let" => Token::Let, + "const" => Token::Const, + "if" => Token::If, + "else" => Token::Else, + "while" => Token::While, + "loop" => Token::Loop, + "continue" => Token::Continue, + "break" => Token::Break, + "return" => Token::Return, + "throw" => Token::Throw, + "for" => Token::For, + "in" => Token::In, + #[cfg(not(feature = "no_function"))] + "private" => Token::Private, + #[cfg(not(feature = "no_module"))] + "import" => Token::Import, + #[cfg(not(feature = "no_module"))] + "export" => Token::Export, + #[cfg(not(feature = "no_module"))] + "as" => Token::As, + + #[cfg(not(feature = "no_function"))] + "fn" => Token::Fn, + + _ => Token::Identifier(identifier), + }, + start_pos, + )); + } + + // " - string literal + ('"', _) => return parse_string_literal(stream, state, pos, '"') + .map_or_else( + |err| Some((Token::LexError(Box::new(err.0)), err.1)), + |out| Some((Token::StringConst(out), start_pos)), + ), + + // ' - character literal + ('\'', '\'') => return Some(( + Token::LexError(Box::new(LERR::MalformedChar("".to_string()))), + start_pos, + )), + ('\'', _) => return Some( + parse_string_literal(stream, state, pos, '\'') + .map_or_else( + |err| (Token::LexError(Box::new(err.0)), err.1), + |result| { + let mut chars = result.chars(); + let first = chars.next(); + + if chars.next().is_some() { + ( + Token::LexError(Box::new(LERR::MalformedChar(result))), + start_pos, + ) + } else { + (Token::CharConstant(first.expect("should be Some")), start_pos) + } + }, + ), + ), + + // Braces + ('{', _) => return Some((Token::LeftBrace, start_pos)), + ('}', _) => return Some((Token::RightBrace, start_pos)), + + // Parentheses + ('(', _) => return Some((Token::LeftParen, start_pos)), + (')', _) => return Some((Token::RightParen, start_pos)), + + // Indexing + ('[', _) => return Some((Token::LeftBracket, start_pos)), + (']', _) => return Some((Token::RightBracket, start_pos)), + + // Map literal + #[cfg(not(feature = "no_object"))] + ('#', '{') => { + eat_next(stream, pos); + return Some((Token::MapStart, start_pos)); + } + + // Operators + ('+', '=') => { + eat_next(stream, pos); + return Some((Token::PlusAssign, start_pos)); + } + ('+', _) if state.can_be_unary => return Some((Token::UnaryPlus, start_pos)), + ('+', _) => return Some((Token::Plus, start_pos)), + + ('-', '0'..='9') if state.can_be_unary => negated = true, + ('-', '0'..='9') => return Some((Token::Minus, start_pos)), + ('-', '=') => { + eat_next(stream, pos); + return Some((Token::MinusAssign, start_pos)); + } + ('-', '>') => return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "'->' is not a valid symbol. This is not C or C++!".to_string(), + ))), + start_pos, + )), + ('-', _) if state.can_be_unary => return Some((Token::UnaryMinus, start_pos)), + ('-', _) => return Some((Token::Minus, start_pos)), + + ('*', '=') => { + eat_next(stream, pos); + return Some((Token::MultiplyAssign, start_pos)); + } + ('*', _) => return Some((Token::Multiply, start_pos)), + + // Comments + ('/', '/') => { + eat_next(stream, pos); + + let mut comment = if state.include_comments { + "//".to_string() + } else { + Default::default() + }; + + while let Some(c) = stream.get_next() { + if c == '\n' { + pos.new_line(); + break; + } + + if state.include_comments { + comment.push(c); + } + pos.advance(); + } + + if state.include_comments { + println!("Comment ({}): {}", start_pos, comment); + return Some((Token::Comment(comment), start_pos)); + } + } + ('/', '*') => { + state.comment_level = 1; + + eat_next(stream, pos); + + let mut comment = if state.include_comments { + "/*".to_string() + } else { + Default::default() + }; + scan_comment(stream, state, pos, &mut comment); + + if state.include_comments { + println!("Comment ({}): {}", start_pos, comment); + return Some((Token::Comment(comment), start_pos)); + } + } + + ('/', '=') => { + eat_next(stream, pos); + return Some((Token::DivideAssign, start_pos)); + } + ('/', _) => return Some((Token::Divide, start_pos)), + + (';', _) => return Some((Token::SemiColon, start_pos)), + (',', _) => return Some((Token::Comma, start_pos)), + ('.', _) => return Some((Token::Period, start_pos)), + + ('=', '=') => { + eat_next(stream, pos); + + // Warn against `===` + if stream.peek_next() == Some('=') { + return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "'===' is not a valid operator. This is not JavaScript! Should it be '=='?" + .to_string(), + ))), + start_pos, + )); + } + + return Some((Token::EqualsTo, start_pos)); + } + ('=', '>') => return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "'=>' is not a valid symbol. This is not Rust! Should it be '>='?" + .to_string(), + ))), + start_pos, + )), + ('=', _) => return Some((Token::Equals, start_pos)), + + (':', ':') => { + eat_next(stream, pos); + return Some((Token::DoubleColon, start_pos)); + } + (':', '=') => return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "':=' is not a valid assignment operator. This is not Pascal! Should it be simply '='?" + .to_string(), + ))), + start_pos, + )), + (':', _) => return Some((Token::Colon, start_pos)), + + ('<', '=') => { + eat_next(stream, pos); + return Some((Token::LessThanEqualsTo, start_pos)); + } + ('<', '-') => return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "'<-' is not a valid symbol. Should it be '<='?".to_string(), + ))), + start_pos, + )), + ('<', '<') => { + eat_next(stream, pos); + + return Some(( + if stream.peek_next() == Some('=') { + eat_next(stream, pos); + Token::LeftShiftAssign + } else { + Token::LeftShift + }, + start_pos, + )); + } + ('<', _) => return Some((Token::LessThan, start_pos)), + + ('>', '=') => { + eat_next(stream, pos); + return Some((Token::GreaterThanEqualsTo, start_pos)); + } + ('>', '>') => { + eat_next(stream, pos); + + return Some(( + if stream.peek_next() == Some('=') { + eat_next(stream, pos); + Token::RightShiftAssign + } else { + Token::RightShift + }, + start_pos, + )); + } + ('>', _) => return Some((Token::GreaterThan, start_pos)), + + ('!', '=') => { + eat_next(stream, pos); + + // Warn against `!==` + if stream.peek_next() == Some('=') { + return Some(( + Token::LexError(Box::new(LERR::ImproperSymbol( + "'!==' is not a valid operator. This is not JavaScript! Should it be '!='?" + .to_string(), + ))), + start_pos, + )); + } + + return Some((Token::NotEqualsTo, start_pos)); + } + ('!', _) => return Some((Token::Bang, start_pos)), + + ('|', '|') => { + eat_next(stream, pos); + return Some((Token::Or, start_pos)); + } + ('|', '=') => { + eat_next(stream, pos); + return Some((Token::OrAssign, start_pos)); + } + ('|', _) => return Some((Token::Pipe, start_pos)), + + ('&', '&') => { + eat_next(stream, pos); + return Some((Token::And, start_pos)); + } + ('&', '=') => { + eat_next(stream, pos); + return Some((Token::AndAssign, start_pos)); + } + ('&', _) => return Some((Token::Ampersand, start_pos)), + + ('^', '=') => { + eat_next(stream, pos); + return Some((Token::XOrAssign, start_pos)); + } + ('^', _) => return Some((Token::XOr, start_pos)), + + ('%', '=') => { + eat_next(stream, pos); + return Some((Token::ModuloAssign, start_pos)); + } + ('%', _) => return Some((Token::Modulo, start_pos)), + + ('~', '=') => { + eat_next(stream, pos); + return Some((Token::PowerOfAssign, start_pos)); + } + ('~', _) => return Some((Token::PowerOf, start_pos)), + + ('\0', _) => unreachable!(), + + (ch, _) if ch.is_whitespace() => (), + (ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), start_pos)), + } + } + + pos.advance(); + + if state.end_with_none { + None + } else { + Some((Token::EOF, *pos)) + } +} + +/// An type that implements the `InputStream` trait. +/// Multiple charaacter streams are jointed together to form one single stream. +pub struct MultiInputsStream<'a> { /// The input character streams. streams: StaticVec>>, } -impl<'a> TokenIterator<'a> { - /// Consume the next character. - fn eat_next(&mut self) { - self.get_next(); - self.advance(); - } +impl InputStream for MultiInputsStream<'_> { /// Get the next character fn get_next(&mut self) -> Option { loop { @@ -477,629 +1144,25 @@ impl<'a> TokenIterator<'a> { } } } - /// Move the current position one character ahead. - fn advance(&mut self) { - self.pos.advance(); - } - /// Move the current position back one character. - /// - /// # Panics - /// - /// Panics if already at the beginning of a line - cannot rewind to the previous line. - fn rewind(&mut self) { - self.pos.rewind(); - } - /// Move the current position to the next line. - fn new_line(&mut self) { - self.pos.new_line() - } - - /// Parse a string literal wrapped by `enclosing_char`. - pub fn parse_string_literal( - &mut self, - enclosing_char: char, - max_length: usize, - ) -> Result { - let mut result = Vec::new(); - let mut escape = String::with_capacity(12); - - loop { - let next_char = self - .get_next() - .ok_or((LERR::UnterminatedString, self.pos))?; - - self.advance(); - - if max_length > 0 && result.len() > max_length { - return Err((LexError::StringTooLong(max_length), self.pos)); - } - - match next_char { - // \... - '\\' if escape.is_empty() => { - escape.push('\\'); - } - // \\ - '\\' if !escape.is_empty() => { - escape.clear(); - result.push('\\'); - } - // \t - 't' if !escape.is_empty() => { - escape.clear(); - result.push('\t'); - } - // \n - 'n' if !escape.is_empty() => { - escape.clear(); - result.push('\n'); - } - // \r - 'r' if !escape.is_empty() => { - escape.clear(); - result.push('\r'); - } - // \x??, \u????, \U???????? - ch @ 'x' | ch @ 'u' | ch @ 'U' if !escape.is_empty() => { - let mut seq = escape.clone(); - seq.push(ch); - escape.clear(); - - let mut out_val: u32 = 0; - let len = match ch { - 'x' => 2, - 'u' => 4, - 'U' => 8, - _ => unreachable!(), - }; - - for _ in 0..len { - let c = self.get_next().ok_or_else(|| { - (LERR::MalformedEscapeSequence(seq.to_string()), self.pos) - })?; - - seq.push(c); - self.advance(); - - out_val *= 16; - out_val += c.to_digit(16).ok_or_else(|| { - (LERR::MalformedEscapeSequence(seq.to_string()), self.pos) - })?; - } - - result.push( - char::from_u32(out_val) - .ok_or_else(|| (LERR::MalformedEscapeSequence(seq), self.pos))?, - ); - } - - // \{enclosing_char} - escaped - ch if enclosing_char == ch && !escape.is_empty() => { - escape.clear(); - result.push(ch) - } - - // Close wrapper - ch if enclosing_char == ch && escape.is_empty() => break, - - // Unknown escape sequence - _ if !escape.is_empty() => { - return Err((LERR::MalformedEscapeSequence(escape), self.pos)) - } - - // Cannot have new-lines inside string literals - '\n' => { - self.rewind(); - return Err((LERR::UnterminatedString, self.pos)); - } - - // All other characters - ch => { - escape.clear(); - result.push(ch); - } - } - } - - let s = result.iter().collect::(); - - if max_length > 0 && s.len() > max_length { - return Err((LexError::StringTooLong(max_length), self.pos)); - } - - Ok(s) - } - - /// Get the next token. - fn inner_next(&mut self) -> Option<(Token, Position)> { - let mut negated = false; - - while let Some(c) = self.get_next() { - self.advance(); - - let pos = self.pos; - - match (c, self.peek_next().unwrap_or('\0')) { - // \n - ('\n', _) => self.new_line(), - - // digit ... - ('0'..='9', _) => { - let mut result = Vec::new(); - let mut radix_base: Option = None; - result.push(c); - - while let Some(next_char) = self.peek_next() { - match next_char { - '0'..='9' | '_' => { - result.push(next_char); - self.eat_next(); - } - #[cfg(not(feature = "no_float"))] - '.' => { - result.push(next_char); - self.eat_next(); - while let Some(next_char_in_float) = self.peek_next() { - match next_char_in_float { - '0'..='9' | '_' => { - result.push(next_char_in_float); - self.eat_next(); - } - _ => break, - } - } - } - // 0x????, 0o????, 0b???? - ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B' - if c == '0' => - { - result.push(next_char); - self.eat_next(); - - let valid = match ch { - 'x' | 'X' => [ - 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', - ], - 'o' | 'O' => [ - '0', '1', '2', '3', '4', '5', '6', '7', '_', '_', '_', '_', - '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', - ], - 'b' | 'B' => [ - '0', '1', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', - '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', - ], - _ => unreachable!(), - }; - - radix_base = Some(match ch { - 'x' | 'X' => 16, - 'o' | 'O' => 8, - 'b' | 'B' => 2, - _ => unreachable!(), - }); - - while let Some(next_char_in_escape_seq) = self.peek_next() { - if !valid.contains(&next_char_in_escape_seq) { - break; - } - - result.push(next_char_in_escape_seq); - self.eat_next(); - } - } - - _ => break, - } - } - - if negated { - result.insert(0, '-'); - } - - // Parse number - if let Some(radix) = radix_base { - let out: String = result.iter().skip(2).filter(|&&c| c != '_').collect(); - - return Some(( - INT::from_str_radix(&out, radix) - .map(Token::IntegerConstant) - .unwrap_or_else(|_| { - Token::LexError(Box::new(LERR::MalformedNumber( - result.into_iter().collect(), - ))) - }), - pos, - )); - } else { - let out: String = result.iter().filter(|&&c| c != '_').collect(); - let num = INT::from_str(&out).map(Token::IntegerConstant); - - // If integer parsing is unnecessary, try float instead - #[cfg(not(feature = "no_float"))] - let num = num.or_else(|_| FLOAT::from_str(&out).map(Token::FloatConstant)); - - return Some(( - num.unwrap_or_else(|_| { - Token::LexError(Box::new(LERR::MalformedNumber( - result.into_iter().collect(), - ))) - }), - pos, - )); - } - } - - // letter or underscore ... - ('A'..='Z', _) | ('a'..='z', _) | ('_', _) => { - let mut result = Vec::new(); - result.push(c); - - while let Some(next_char) = self.peek_next() { - match next_char { - x if x.is_ascii_alphanumeric() || x == '_' => { - result.push(x); - self.eat_next(); - } - _ => break, - } - } - - let is_valid_identifier = result - .iter() - .find(|&ch| char::is_ascii_alphanumeric(ch)) // first alpha-numeric character - .map(char::is_ascii_alphabetic) // is a letter - .unwrap_or(false); // if no alpha-numeric at all - syntax error - - let identifier: String = result.iter().collect(); - - if !is_valid_identifier { - return Some(( - Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), - pos, - )); - } - - return Some(( - match identifier.as_str() { - "true" => Token::True, - "false" => Token::False, - "let" => Token::Let, - "const" => Token::Const, - "if" => Token::If, - "else" => Token::Else, - "while" => Token::While, - "loop" => Token::Loop, - "continue" => Token::Continue, - "break" => Token::Break, - "return" => Token::Return, - "throw" => Token::Throw, - "for" => Token::For, - "in" => Token::In, - #[cfg(not(feature = "no_function"))] - "private" => Token::Private, - #[cfg(not(feature = "no_module"))] - "import" => Token::Import, - #[cfg(not(feature = "no_module"))] - "export" => Token::Export, - #[cfg(not(feature = "no_module"))] - "as" => Token::As, - - #[cfg(not(feature = "no_function"))] - "fn" => Token::Fn, - - _ => Token::Identifier(identifier), - }, - pos, - )); - } - - // " - string literal - ('"', _) => { - return self - .parse_string_literal('"', self.max_string_size) - .map_or_else( - |err| Some((Token::LexError(Box::new(err.0)), err.1)), - |out| Some((Token::StringConst(out), pos)), - ); - } - - // ' - character literal - ('\'', '\'') => { - return Some(( - Token::LexError(Box::new(LERR::MalformedChar("".to_string()))), - pos, - )); - } - ('\'', _) => { - return Some( - self.parse_string_literal('\'', self.max_string_size) - .map_or_else( - |err| (Token::LexError(Box::new(err.0)), err.1), - |result| { - let mut chars = result.chars(); - let first = chars.next(); - - if chars.next().is_some() { - ( - Token::LexError(Box::new(LERR::MalformedChar(result))), - pos, - ) - } else { - (Token::CharConstant(first.expect("should be Some")), pos) - } - }, - ), - ); - } - - // Braces - ('{', _) => return Some((Token::LeftBrace, pos)), - ('}', _) => return Some((Token::RightBrace, pos)), - - // Parentheses - ('(', _) => return Some((Token::LeftParen, pos)), - (')', _) => return Some((Token::RightParen, pos)), - - // Indexing - ('[', _) => return Some((Token::LeftBracket, pos)), - (']', _) => return Some((Token::RightBracket, pos)), - - // Map literal - #[cfg(not(feature = "no_object"))] - ('#', '{') => { - self.eat_next(); - return Some((Token::MapStart, pos)); - } - - // Operators - ('+', '=') => { - self.eat_next(); - return Some((Token::PlusAssign, pos)); - } - ('+', _) if self.can_be_unary => return Some((Token::UnaryPlus, pos)), - ('+', _) => return Some((Token::Plus, pos)), - - ('-', '0'..='9') if self.can_be_unary => negated = true, - ('-', '0'..='9') => return Some((Token::Minus, pos)), - ('-', '=') => { - self.eat_next(); - return Some((Token::MinusAssign, pos)); - } - ('-', '>') => { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "'->' is not a valid symbol. This is not C or C++!".to_string(), - ))), - pos, - )) - } - ('-', _) if self.can_be_unary => return Some((Token::UnaryMinus, pos)), - ('-', _) => return Some((Token::Minus, pos)), - - ('*', '=') => { - self.eat_next(); - return Some((Token::MultiplyAssign, pos)); - } - ('*', _) => return Some((Token::Multiply, pos)), - - // Comments - ('/', '/') => { - self.eat_next(); - - while let Some(c) = self.get_next() { - if c == '\n' { - self.new_line(); - break; - } - - self.advance(); - } - } - ('/', '*') => { - let mut level = 1; - - self.eat_next(); - - while let Some(c) = self.get_next() { - self.advance(); - - match c { - '/' => { - if self.get_next() == Some('*') { - level += 1; - } - self.advance(); - } - '*' => { - if self.get_next() == Some('/') { - level -= 1; - } - self.advance(); - } - '\n' => self.new_line(), - _ => (), - } - - if level == 0 { - break; - } - } - } - - ('/', '=') => { - self.eat_next(); - return Some((Token::DivideAssign, pos)); - } - ('/', _) => return Some((Token::Divide, pos)), - - (';', _) => return Some((Token::SemiColon, pos)), - (',', _) => return Some((Token::Comma, pos)), - ('.', _) => return Some((Token::Period, pos)), - - ('=', '=') => { - self.eat_next(); - - // Warn against `===` - if self.peek_next() == Some('=') { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "'===' is not a valid operator. This is not JavaScript! Should it be '=='?" - .to_string(), - ))), - pos, - )); - } - - return Some((Token::EqualsTo, pos)); - } - ('=', '>') => { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "'=>' is not a valid symbol. This is not Rust! Should it be '>='?" - .to_string(), - ))), - pos, - )) - } - ('=', _) => return Some((Token::Equals, pos)), - - (':', ':') => { - self.eat_next(); - return Some((Token::DoubleColon, pos)); - } - (':', '=') => { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "':=' is not a valid assignment operator. This is not Pascal! Should it be simply '='?" - .to_string(), - ))), - pos, - )) - } - (':', _) => return Some((Token::Colon, pos)), - - ('<', '=') => { - self.eat_next(); - return Some((Token::LessThanEqualsTo, pos)); - } - ('<', '-') => { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "'<-' is not a valid symbol. Should it be '<='?".to_string(), - ))), - pos, - )) - } - ('<', '<') => { - self.eat_next(); - - return Some(( - if self.peek_next() == Some('=') { - self.eat_next(); - Token::LeftShiftAssign - } else { - Token::LeftShift - }, - pos, - )); - } - ('<', _) => return Some((Token::LessThan, pos)), - - ('>', '=') => { - self.eat_next(); - return Some((Token::GreaterThanEqualsTo, pos)); - } - ('>', '>') => { - self.eat_next(); - - return Some(( - if self.peek_next() == Some('=') { - self.eat_next(); - Token::RightShiftAssign - } else { - Token::RightShift - }, - pos, - )); - } - ('>', _) => return Some((Token::GreaterThan, pos)), - - ('!', '=') => { - self.eat_next(); - - // Warn against `!==` - if self.peek_next() == Some('=') { - return Some(( - Token::LexError(Box::new(LERR::ImproperSymbol( - "'!==' is not a valid operator. This is not JavaScript! Should it be '!='?" - .to_string(), - ))), - pos, - )); - } - - return Some((Token::NotEqualsTo, pos)); - } - ('!', _) => return Some((Token::Bang, pos)), - - ('|', '|') => { - self.eat_next(); - return Some((Token::Or, pos)); - } - ('|', '=') => { - self.eat_next(); - return Some((Token::OrAssign, pos)); - } - ('|', _) => return Some((Token::Pipe, pos)), - - ('&', '&') => { - self.eat_next(); - return Some((Token::And, pos)); - } - ('&', '=') => { - self.eat_next(); - return Some((Token::AndAssign, pos)); - } - ('&', _) => return Some((Token::Ampersand, pos)), - - ('^', '=') => { - self.eat_next(); - return Some((Token::XOrAssign, pos)); - } - ('^', _) => return Some((Token::XOr, pos)), - - ('%', '=') => { - self.eat_next(); - return Some((Token::ModuloAssign, pos)); - } - ('%', _) => return Some((Token::Modulo, pos)), - - ('~', '=') => { - self.eat_next(); - return Some((Token::PowerOfAssign, pos)); - } - ('~', _) => return Some((Token::PowerOf, pos)), - - ('\0', _) => unreachable!(), - - (ch, _) if ch.is_whitespace() => (), - (ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), pos)), - } - } - - self.advance(); - Some((Token::EOF, self.pos)) - } +} + +/// An iterator on a `Token` stream. +pub struct TokenIterator<'a> { + /// Current state. + state: TokenizeState, + /// Current position. + pos: Position, + /// Input character stream. + stream: MultiInputsStream<'a>, } impl<'a> Iterator for TokenIterator<'a> { type Item = (Token, Position); fn next(&mut self) -> Option { - self.inner_next().map(|x| { - // Save the last token - self.can_be_unary = x.0.is_next_unary(); + get_next_token(&mut self.stream, &mut self.state, &mut self.pos).map(|x| { + // Save the last token's state + self.state.can_be_unary = x.0.is_next_unary(); x }) } @@ -1108,9 +1171,16 @@ impl<'a> Iterator for TokenIterator<'a> { /// Tokenize an input text stream. pub fn lex<'a>(input: &'a [&'a str], max_string_size: usize) -> TokenIterator<'a> { TokenIterator { - max_string_size, - can_be_unary: true, + state: TokenizeState { + max_string_size, + can_be_unary: true, + comment_level: 0, + end_with_none: false, + include_comments: false, + }, pos: Position::new(1, 0), - streams: input.iter().map(|s| s.chars().peekable()).collect(), + stream: MultiInputsStream { + streams: input.iter().map(|s| s.chars().peekable()).collect(), + }, } }