Refactor tokenizer.

This commit is contained in:
Stephen Chung 2020-06-26 19:44:50 +08:00
parent 14746f94ca
commit 31eaf321d0
2 changed files with 702 additions and 640 deletions

View File

@ -131,15 +131,11 @@ pub use optimize::OptimizationLevel;
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
pub use token::Token;
pub use token::{get_next_token, parse_string_literal, InputStream, Token, TokenizeState};
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
pub use parser::Expr;
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
pub use parser::Stmt;
pub use parser::{Expr, ReturnType, Stmt};
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
@ -148,7 +144,3 @@ pub use module::ModuleRef;
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
pub use utils::StaticVec;
#[cfg(feature = "internals")]
#[deprecated(note = "this type is volatile and may change")]
pub use parser::ReturnType;

View File

@ -209,6 +209,7 @@ pub enum Token {
#[cfg(not(feature = "no_module"))]
As,
LexError(Box<LexError>),
Comment(String),
EOF,
}
@ -429,89 +430,46 @@ impl From<Token> for String {
}
}
/// An iterator on a `Token` stream.
pub struct TokenIterator<'a> {
/// State of the tokenizer.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Default)]
pub struct TokenizeState {
/// Maximum length of a string (0 = unlimited).
max_string_size: usize,
/// Can the next token be a unary operator?
can_be_unary: bool,
/// Current position.
pos: Position,
/// The input character streams.
streams: StaticVec<Peekable<Chars<'a>>>,
/// Nesting level of the block comment the tokenizer is currently inside (0 = not inside a block comment).
comment_level: usize,
/// Return `None` at the end of the stream instead of `Some(Token::EOF)`?
end_with_none: bool,
/// Include comments?
include_comments: bool,
}
impl<'a> TokenIterator<'a> {
/// Consume the next character.
fn eat_next(&mut self) {
self.get_next();
self.advance();
}
/// Trait that encapsulates a peekable character input stream.
pub trait InputStream {
/// Get the next character
fn get_next(&mut self) -> Option<char> {
loop {
if self.streams.is_empty() {
// No more streams
return None;
} else if let Some(ch) = self.streams[0].next() {
// Next character in current stream
return Some(ch);
} else {
// Jump to the next stream
let _ = self.streams.remove(0);
}
}
}
fn get_next(&mut self) -> Option<char>;
/// Peek the next character
fn peek_next(&mut self) -> Option<char> {
loop {
if self.streams.is_empty() {
// No more streams
return None;
} else if let Some(ch) = self.streams[0].peek() {
// Next character in current stream
return Some(*ch);
} else {
// Jump to the next stream
let _ = self.streams.remove(0);
}
}
}
/// Move the current position one character ahead.
fn advance(&mut self) {
self.pos.advance();
}
/// Move the current position back one character.
///
/// # Panics
///
/// Panics if already at the beginning of a line - cannot rewind to the previous line.
fn rewind(&mut self) {
self.pos.rewind();
}
/// Move the current position to the next line.
fn new_line(&mut self) {
self.pos.new_line()
fn peek_next(&mut self) -> Option<char>;
}
/// Parse a string literal wrapped by `enclosing_char`.
pub fn parse_string_literal(
&mut self,
stream: &mut impl InputStream,
state: &mut TokenizeState,
pos: &mut Position,
enclosing_char: char,
max_length: usize,
) -> Result<String, (LexError, Position)> {
let mut result = Vec::new();
let mut escape = String::with_capacity(12);
loop {
let next_char = self
.get_next()
.ok_or((LERR::UnterminatedString, self.pos))?;
let next_char = stream.get_next().ok_or((LERR::UnterminatedString, *pos))?;
self.advance();
pos.advance();
if max_length > 0 && result.len() > max_length {
return Err((LexError::StringTooLong(max_length), self.pos));
if state.max_string_size > 0 && result.len() > state.max_string_size {
return Err((LexError::StringTooLong(state.max_string_size), *pos));
}
match next_char {
@ -554,22 +512,22 @@ impl<'a> TokenIterator<'a> {
};
for _ in 0..len {
let c = self.get_next().ok_or_else(|| {
(LERR::MalformedEscapeSequence(seq.to_string()), self.pos)
})?;
let c = stream
.get_next()
.ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?;
seq.push(c);
self.advance();
pos.advance();
out_val *= 16;
out_val += c.to_digit(16).ok_or_else(|| {
(LERR::MalformedEscapeSequence(seq.to_string()), self.pos)
})?;
out_val += c
.to_digit(16)
.ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?;
}
result.push(
char::from_u32(out_val)
.ok_or_else(|| (LERR::MalformedEscapeSequence(seq), self.pos))?,
.ok_or_else(|| (LERR::MalformedEscapeSequence(seq), *pos))?,
);
}
@ -583,14 +541,12 @@ impl<'a> TokenIterator<'a> {
ch if enclosing_char == ch && escape.is_empty() => break,
// Unknown escape sequence
_ if !escape.is_empty() => {
return Err((LERR::MalformedEscapeSequence(escape), self.pos))
}
_ if !escape.is_empty() => return Err((LERR::MalformedEscapeSequence(escape), *pos)),
// Cannot have new-lines inside string literals
'\n' => {
self.rewind();
return Err((LERR::UnterminatedString, self.pos));
pos.rewind();
return Err((LERR::UnterminatedString, *pos));
}
// All other characters
@ -603,25 +559,94 @@ impl<'a> TokenIterator<'a> {
let s = result.iter().collect::<String>();
if max_length > 0 && s.len() > max_length {
return Err((LexError::StringTooLong(max_length), self.pos));
if state.max_string_size > 0 && s.len() > state.max_string_size {
return Err((LexError::StringTooLong(state.max_string_size), *pos));
}
Ok(s)
}
/// Skip over the next character.
///
/// The character (if any) is taken off `stream` and thrown away, and `pos` is moved
/// one character ahead. `pos` is advanced even when the stream is already exhausted.
fn eat_next(stream: &mut impl InputStream, pos: &mut Position) {
    // The character itself is of no interest to the caller - it has already been peeked.
    let _ = stream.get_next();
    pos.advance();
}
/// Scan the body of a block comment until it is closed or the input runs out.
///
/// The caller must already have consumed the opening `/*` and set
/// `state.comment_level` to the current nesting depth (at least 1).
/// Scanning stops as soon as `state.comment_level` drops back to zero.
/// If the stream ends first, `state.comment_level` is left non-zero so that the
/// scan can be resumed later with more input.
///
/// * `stream`  - character source; every character of the comment is consumed from it.
/// * `state`   - `comment_level` is raised on each nested `/*` and lowered on each `*/`;
///               `include_comments` decides whether the text is recorded.
/// * `pos`     - advanced once per consumed character and moved to the next line on `\n`.
/// * `comment` - receives the scanned text (including the closing `*/`) when
///               `state.include_comments` is set; untouched otherwise.
///
/// The character following a `/` or `*` is only *peeked*; it is consumed solely when it
/// completes a `/*` or `*/` delimiter. This keeps sequences such as `**/` and `//*`
/// correctly recognized and makes sure a new-line right after a `/` or `*` is still
/// counted as a new line.
fn scan_comment(
    stream: &mut impl InputStream,
    state: &mut TokenizeState,
    pos: &mut Position,
    comment: &mut String,
) {
    while let Some(c) = stream.get_next() {
        pos.advance();

        if state.include_comments {
            comment.push(c);
        }

        match c {
            // `/*` - a nested block comment starts
            '/' if stream.peek_next() == Some('*') => {
                stream.get_next();
                pos.advance();

                if state.include_comments {
                    comment.push('*');
                }
                state.comment_level += 1;
            }
            // `*/` - the innermost block comment ends
            '*' if stream.peek_next() == Some('/') => {
                stream.get_next();
                pos.advance();

                if state.include_comments {
                    comment.push('/');
                }
                state.comment_level -= 1;
            }
            '\n' => pos.new_line(),
            _ => (),
        }

        // Outermost comment closed - hand control back to the tokenizer
        if state.comment_level == 0 {
            break;
        }
    }
}
/// Get the next token.
fn inner_next(&mut self) -> Option<(Token, Position)> {
pub fn get_next_token(
stream: &mut impl InputStream,
state: &mut TokenizeState,
pos: &mut Position,
) -> Option<(Token, Position)> {
// Still inside a comment?
if state.comment_level > 0 {
let start_pos = *pos;
let mut comment = Default::default();
scan_comment(stream, state, pos, &mut comment);
if state.include_comments {
println!("Comment ({}): {}", start_pos, comment);
return Some((Token::Comment(comment), start_pos));
}
}
let mut negated = false;
while let Some(c) = self.get_next() {
self.advance();
while let Some(c) = stream.get_next() {
pos.advance();
let pos = self.pos;
let start_pos = *pos;
match (c, self.peek_next().unwrap_or('\0')) {
match (c, stream.peek_next().unwrap_or('\0')) {
// \n
('\n', _) => self.new_line(),
('\n', _) => pos.new_line(),
// digit ...
('0'..='9', _) => {
@ -629,32 +654,30 @@ impl<'a> TokenIterator<'a> {
let mut radix_base: Option<u32> = None;
result.push(c);
while let Some(next_char) = self.peek_next() {
while let Some(next_char) = stream.peek_next() {
match next_char {
'0'..='9' | '_' => {
result.push(next_char);
self.eat_next();
eat_next(stream, pos);
}
#[cfg(not(feature = "no_float"))]
'.' => {
result.push(next_char);
self.eat_next();
while let Some(next_char_in_float) = self.peek_next() {
eat_next(stream, pos);
while let Some(next_char_in_float) = stream.peek_next() {
match next_char_in_float {
'0'..='9' | '_' => {
result.push(next_char_in_float);
self.eat_next();
eat_next(stream, pos);
}
_ => break,
}
}
}
// 0x????, 0o????, 0b????
ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B'
if c == '0' =>
{
ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B' if c == '0' => {
result.push(next_char);
self.eat_next();
eat_next(stream, pos);
let valid = match ch {
'x' | 'X' => [
@ -679,13 +702,13 @@ impl<'a> TokenIterator<'a> {
_ => unreachable!(),
});
while let Some(next_char_in_escape_seq) = self.peek_next() {
while let Some(next_char_in_escape_seq) = stream.peek_next() {
if !valid.contains(&next_char_in_escape_seq) {
break;
}
result.push(next_char_in_escape_seq);
self.eat_next();
eat_next(stream, pos);
}
}
@ -709,7 +732,7 @@ impl<'a> TokenIterator<'a> {
result.into_iter().collect(),
)))
}),
pos,
start_pos,
));
} else {
let out: String = result.iter().filter(|&&c| c != '_').collect();
@ -725,7 +748,7 @@ impl<'a> TokenIterator<'a> {
result.into_iter().collect(),
)))
}),
pos,
start_pos,
));
}
}
@ -735,11 +758,11 @@ impl<'a> TokenIterator<'a> {
let mut result = Vec::new();
result.push(c);
while let Some(next_char) = self.peek_next() {
while let Some(next_char) = stream.peek_next() {
match next_char {
x if x.is_ascii_alphanumeric() || x == '_' => {
result.push(x);
self.eat_next();
eat_next(stream, pos);
}
_ => break,
}
@ -756,7 +779,7 @@ impl<'a> TokenIterator<'a> {
if !is_valid_identifier {
return Some((
Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))),
pos,
start_pos,
));
}
@ -790,30 +813,24 @@ impl<'a> TokenIterator<'a> {
_ => Token::Identifier(identifier),
},
pos,
start_pos,
));
}
// " - string literal
('"', _) => {
return self
.parse_string_literal('"', self.max_string_size)
('"', _) => return parse_string_literal(stream, state, pos, '"')
.map_or_else(
|err| Some((Token::LexError(Box::new(err.0)), err.1)),
|out| Some((Token::StringConst(out), pos)),
);
}
|out| Some((Token::StringConst(out), start_pos)),
),
// ' - character literal
('\'', '\'') => {
return Some((
('\'', '\'') => return Some((
Token::LexError(Box::new(LERR::MalformedChar("".to_string()))),
pos,
));
}
('\'', _) => {
return Some(
self.parse_string_literal('\'', self.max_string_size)
start_pos,
)),
('\'', _) => return Some(
parse_string_literal(stream, state, pos, '\'')
.map_or_else(
|err| (Token::LexError(Box::new(err.0)), err.1),
|result| {
@ -823,283 +840,329 @@ impl<'a> TokenIterator<'a> {
if chars.next().is_some() {
(
Token::LexError(Box::new(LERR::MalformedChar(result))),
pos,
start_pos,
)
} else {
(Token::CharConstant(first.expect("should be Some")), pos)
(Token::CharConstant(first.expect("should be Some")), start_pos)
}
},
),
);
}
),
// Braces
('{', _) => return Some((Token::LeftBrace, pos)),
('}', _) => return Some((Token::RightBrace, pos)),
('{', _) => return Some((Token::LeftBrace, start_pos)),
('}', _) => return Some((Token::RightBrace, start_pos)),
// Parentheses
('(', _) => return Some((Token::LeftParen, pos)),
(')', _) => return Some((Token::RightParen, pos)),
('(', _) => return Some((Token::LeftParen, start_pos)),
(')', _) => return Some((Token::RightParen, start_pos)),
// Indexing
('[', _) => return Some((Token::LeftBracket, pos)),
(']', _) => return Some((Token::RightBracket, pos)),
('[', _) => return Some((Token::LeftBracket, start_pos)),
(']', _) => return Some((Token::RightBracket, start_pos)),
// Map literal
#[cfg(not(feature = "no_object"))]
('#', '{') => {
self.eat_next();
return Some((Token::MapStart, pos));
eat_next(stream, pos);
return Some((Token::MapStart, start_pos));
}
// Operators
('+', '=') => {
self.eat_next();
return Some((Token::PlusAssign, pos));
eat_next(stream, pos);
return Some((Token::PlusAssign, start_pos));
}
('+', _) if self.can_be_unary => return Some((Token::UnaryPlus, pos)),
('+', _) => return Some((Token::Plus, pos)),
('+', _) if state.can_be_unary => return Some((Token::UnaryPlus, start_pos)),
('+', _) => return Some((Token::Plus, start_pos)),
('-', '0'..='9') if self.can_be_unary => negated = true,
('-', '0'..='9') => return Some((Token::Minus, pos)),
('-', '0'..='9') if state.can_be_unary => negated = true,
('-', '0'..='9') => return Some((Token::Minus, start_pos)),
('-', '=') => {
self.eat_next();
return Some((Token::MinusAssign, pos));
eat_next(stream, pos);
return Some((Token::MinusAssign, start_pos));
}
('-', '>') => {
return Some((
('-', '>') => return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"'->' is not a valid symbol. This is not C or C++!".to_string(),
))),
pos,
))
}
('-', _) if self.can_be_unary => return Some((Token::UnaryMinus, pos)),
('-', _) => return Some((Token::Minus, pos)),
start_pos,
)),
('-', _) if state.can_be_unary => return Some((Token::UnaryMinus, start_pos)),
('-', _) => return Some((Token::Minus, start_pos)),
('*', '=') => {
self.eat_next();
return Some((Token::MultiplyAssign, pos));
eat_next(stream, pos);
return Some((Token::MultiplyAssign, start_pos));
}
('*', _) => return Some((Token::Multiply, pos)),
('*', _) => return Some((Token::Multiply, start_pos)),
// Comments
('/', '/') => {
self.eat_next();
eat_next(stream, pos);
while let Some(c) = self.get_next() {
let mut comment = if state.include_comments {
"//".to_string()
} else {
Default::default()
};
while let Some(c) = stream.get_next() {
if c == '\n' {
self.new_line();
pos.new_line();
break;
}
self.advance();
if state.include_comments {
comment.push(c);
}
pos.advance();
}
if state.include_comments {
println!("Comment ({}): {}", start_pos, comment);
return Some((Token::Comment(comment), start_pos));
}
}
('/', '*') => {
let mut level = 1;
state.comment_level = 1;
self.eat_next();
eat_next(stream, pos);
while let Some(c) = self.get_next() {
self.advance();
let mut comment = if state.include_comments {
"/*".to_string()
} else {
Default::default()
};
scan_comment(stream, state, pos, &mut comment);
match c {
'/' => {
if self.get_next() == Some('*') {
level += 1;
}
self.advance();
}
'*' => {
if self.get_next() == Some('/') {
level -= 1;
}
self.advance();
}
'\n' => self.new_line(),
_ => (),
}
if level == 0 {
break;
}
if state.include_comments {
println!("Comment ({}): {}", start_pos, comment);
return Some((Token::Comment(comment), start_pos));
}
}
('/', '=') => {
self.eat_next();
return Some((Token::DivideAssign, pos));
eat_next(stream, pos);
return Some((Token::DivideAssign, start_pos));
}
('/', _) => return Some((Token::Divide, pos)),
('/', _) => return Some((Token::Divide, start_pos)),
(';', _) => return Some((Token::SemiColon, pos)),
(',', _) => return Some((Token::Comma, pos)),
('.', _) => return Some((Token::Period, pos)),
(';', _) => return Some((Token::SemiColon, start_pos)),
(',', _) => return Some((Token::Comma, start_pos)),
('.', _) => return Some((Token::Period, start_pos)),
('=', '=') => {
self.eat_next();
eat_next(stream, pos);
// Warn against `===`
if self.peek_next() == Some('=') {
if stream.peek_next() == Some('=') {
return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"'===' is not a valid operator. This is not JavaScript! Should it be '=='?"
.to_string(),
))),
pos,
start_pos,
));
}
return Some((Token::EqualsTo, pos));
return Some((Token::EqualsTo, start_pos));
}
('=', '>') => {
return Some((
('=', '>') => return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"'=>' is not a valid symbol. This is not Rust! Should it be '>='?"
.to_string(),
))),
pos,
))
}
('=', _) => return Some((Token::Equals, pos)),
start_pos,
)),
('=', _) => return Some((Token::Equals, start_pos)),
(':', ':') => {
self.eat_next();
return Some((Token::DoubleColon, pos));
eat_next(stream, pos);
return Some((Token::DoubleColon, start_pos));
}
(':', '=') => {
return Some((
(':', '=') => return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"':=' is not a valid assignment operator. This is not Pascal! Should it be simply '='?"
.to_string(),
))),
pos,
))
}
(':', _) => return Some((Token::Colon, pos)),
start_pos,
)),
(':', _) => return Some((Token::Colon, start_pos)),
('<', '=') => {
self.eat_next();
return Some((Token::LessThanEqualsTo, pos));
eat_next(stream, pos);
return Some((Token::LessThanEqualsTo, start_pos));
}
('<', '-') => {
return Some((
('<', '-') => return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"'<-' is not a valid symbol. Should it be '<='?".to_string(),
))),
pos,
))
}
start_pos,
)),
('<', '<') => {
self.eat_next();
eat_next(stream, pos);
return Some((
if self.peek_next() == Some('=') {
self.eat_next();
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
Token::LeftShiftAssign
} else {
Token::LeftShift
},
pos,
start_pos,
));
}
('<', _) => return Some((Token::LessThan, pos)),
('<', _) => return Some((Token::LessThan, start_pos)),
('>', '=') => {
self.eat_next();
return Some((Token::GreaterThanEqualsTo, pos));
eat_next(stream, pos);
return Some((Token::GreaterThanEqualsTo, start_pos));
}
('>', '>') => {
self.eat_next();
eat_next(stream, pos);
return Some((
if self.peek_next() == Some('=') {
self.eat_next();
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
Token::RightShiftAssign
} else {
Token::RightShift
},
pos,
start_pos,
));
}
('>', _) => return Some((Token::GreaterThan, pos)),
('>', _) => return Some((Token::GreaterThan, start_pos)),
('!', '=') => {
self.eat_next();
eat_next(stream, pos);
// Warn against `!==`
if self.peek_next() == Some('=') {
if stream.peek_next() == Some('=') {
return Some((
Token::LexError(Box::new(LERR::ImproperSymbol(
"'!==' is not a valid operator. This is not JavaScript! Should it be '!='?"
.to_string(),
))),
pos,
start_pos,
));
}
return Some((Token::NotEqualsTo, pos));
return Some((Token::NotEqualsTo, start_pos));
}
('!', _) => return Some((Token::Bang, pos)),
('!', _) => return Some((Token::Bang, start_pos)),
('|', '|') => {
self.eat_next();
return Some((Token::Or, pos));
eat_next(stream, pos);
return Some((Token::Or, start_pos));
}
('|', '=') => {
self.eat_next();
return Some((Token::OrAssign, pos));
eat_next(stream, pos);
return Some((Token::OrAssign, start_pos));
}
('|', _) => return Some((Token::Pipe, pos)),
('|', _) => return Some((Token::Pipe, start_pos)),
('&', '&') => {
self.eat_next();
return Some((Token::And, pos));
eat_next(stream, pos);
return Some((Token::And, start_pos));
}
('&', '=') => {
self.eat_next();
return Some((Token::AndAssign, pos));
eat_next(stream, pos);
return Some((Token::AndAssign, start_pos));
}
('&', _) => return Some((Token::Ampersand, pos)),
('&', _) => return Some((Token::Ampersand, start_pos)),
('^', '=') => {
self.eat_next();
return Some((Token::XOrAssign, pos));
eat_next(stream, pos);
return Some((Token::XOrAssign, start_pos));
}
('^', _) => return Some((Token::XOr, pos)),
('^', _) => return Some((Token::XOr, start_pos)),
('%', '=') => {
self.eat_next();
return Some((Token::ModuloAssign, pos));
eat_next(stream, pos);
return Some((Token::ModuloAssign, start_pos));
}
('%', _) => return Some((Token::Modulo, pos)),
('%', _) => return Some((Token::Modulo, start_pos)),
('~', '=') => {
self.eat_next();
return Some((Token::PowerOfAssign, pos));
eat_next(stream, pos);
return Some((Token::PowerOfAssign, start_pos));
}
('~', _) => return Some((Token::PowerOf, pos)),
('~', _) => return Some((Token::PowerOf, start_pos)),
('\0', _) => unreachable!(),
(ch, _) if ch.is_whitespace() => (),
(ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), pos)),
(ch, _) => return Some((Token::LexError(Box::new(LERR::UnexpectedChar(ch))), start_pos)),
}
}
self.advance();
Some((Token::EOF, self.pos))
pos.advance();
if state.end_with_none {
None
} else {
Some((Token::EOF, *pos))
}
}
/// A type that implements the `InputStream` trait.
///
/// Multiple character streams are joined together, in order, to form one single stream.
pub struct MultiInputsStream<'a> {
    /// The input character streams; the first entry is the one currently being read.
    streams: StaticVec<Peekable<Chars<'a>>>,
}

impl InputStream for MultiInputsStream<'_> {
    /// Take the next character off the joined stream.
    ///
    /// Streams that have run dry are dropped along the way.
    /// Returns `None` once every stream is exhausted.
    fn get_next(&mut self) -> Option<char> {
        while !self.streams.is_empty() {
            match self.streams[0].next() {
                // The current stream still has something to give
                Some(ch) => return Some(ch),
                // The current stream is finished - discard it and try the one after
                None => {
                    let _ = self.streams.remove(0);
                }
            }
        }

        // Nothing left anywhere
        None
    }

    /// Look at the next character of the joined stream without consuming it.
    ///
    /// Streams that have run dry are dropped along the way.
    /// Returns `None` once every stream is exhausted.
    fn peek_next(&mut self) -> Option<char> {
        while !self.streams.is_empty() {
            // Copy the character out so the borrow on the stream ends right here
            let upcoming = self.streams[0].peek().copied();

            match upcoming {
                Some(ch) => return Some(ch),
                // The current stream is finished - discard it and try the one after
                None => {
                    let _ = self.streams.remove(0);
                }
            }
        }

        // Nothing left anywhere
        None
    }
}
/// An iterator on a `Token` stream.
///
/// Bundles the three pieces of data that `get_next_token` works on: the tokenizer
/// state, the current position and the character input. Its `Iterator` implementation
/// yields `(Token, Position)` pairs.
pub struct TokenIterator<'a> {
/// Current tokenizer state (string size limit, unary-operator flag, comment nesting
/// level and end-of-stream/comment options).
state: TokenizeState,
/// Current position in the input; updated as characters are consumed.
pos: Position,
/// Input character stream, made up of one or more text fragments read in sequence.
stream: MultiInputsStream<'a>,
}
impl<'a> Iterator for TokenIterator<'a> {
type Item = (Token, Position);
fn next(&mut self) -> Option<Self::Item> {
self.inner_next().map(|x| {
// Save the last token
self.can_be_unary = x.0.is_next_unary();
get_next_token(&mut self.stream, &mut self.state, &mut self.pos).map(|x| {
// Save the last token's state
self.state.can_be_unary = x.0.is_next_unary();
x
})
}
@ -1108,9 +1171,16 @@ impl<'a> Iterator for TokenIterator<'a> {
/// Build a `TokenIterator` over a series of input text fragments.
///
/// The fragments in `input` are chained, in order, into one character stream.
/// `max_string_size` is the maximum length of a string (0 = unlimited).
///
/// The iterator starts at `Position::new(1, 0)` with unary operators allowed,
/// outside of any block comment, with comments not included, and with
/// `Token::EOF` (rather than `None`) marking the end of the stream.
pub fn lex<'a>(input: &'a [&'a str], max_string_size: usize) -> TokenIterator<'a> {
    // One peekable character iterator per text fragment
    let streams = input.iter().map(|text| text.chars().peekable()).collect();

    let state = TokenizeState {
        max_string_size,
        can_be_unary: true,
        comment_level: 0,
        end_with_none: false,
        include_comments: false,
    };

    TokenIterator {
        state,
        pos: Position::new(1, 0),
        stream: MultiInputsStream { streams },
    }
}