Remove matching by ASCII byte values because the compiler should already optimize char matching.

Stephen Chung 2023-03-16 13:05:29 +08:00
parent 952c77d9bb
commit 55f022174b


@@ -871,6 +871,8 @@ impl Token {
 #[inline]
 #[must_use]
 pub fn lookup_symbol_from_syntax(syntax: &str) -> Option<Self> {
+// This implementation is based upon a pre-calculated table generated
+// by GNU gperf on the list of keywords.
 let utf8 = syntax.as_bytes();
 let len = utf8.len();
 let mut hash_val = len;
@@ -891,7 +893,11 @@ impl Token {
 match KEYWORDS_LIST[hash_val] {
 (_, Token::EOF) => None,
-(s, ref t) if s == syntax => Some(t.clone()),
+// Fail early to avoid calling memcmp()
+// Since we are already working with bytes, we might as well check the first one
+(s, ref t) if s.len() == len && s.as_bytes()[0] == utf8[0] && s == syntax => {
+Some(t.clone())
+}
 _ => None,
 }
 }
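The guard added above can be read in isolation as follows. This is a simplified sketch with a hypothetical linear table rather than the gperf-generated KEYWORDS_LIST: the length and first-byte comparisons are cheap and short-circuit, so the full string equality (which may call memcmp) only runs for probable hits.

// Hypothetical keyword table for illustration only.
fn lookup(table: &[(&'static str, u32)], syntax: &str) -> Option<u32> {
    let utf8 = syntax.as_bytes();
    let len = utf8.len();
    table.iter().find_map(|&(s, v)| {
        // Cheap checks first; the short-circuiting && also keeps utf8[0]
        // from being evaluated when the lengths already disagree.
        if s.len() == len && s.as_bytes()[0] == utf8[0] && s == syntax {
            Some(v)
        } else {
            None
        }
    })
}

fn main() {
    let table = [("while", 1), ("loop", 2), ("if", 3)];
    assert_eq!(lookup(&table, "loop"), Some(2));
    assert_eq!(lookup(&table, "loops"), None);
}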
@@ -1543,103 +1549,11 @@ fn get_next_token_inner(
 // Identifiers and strings that can have non-ASCII characters
 match (c, cc) {
-// letter or underscore ...
-_ if is_id_first_alphabetic(c) || c == '_' => {
-return Some(parse_identifier_token(stream, state, pos, start_pos, c));
-}
-// " - string literal
-('"', ..) => {
-return parse_string_literal(stream, state, pos, c, false, true, false)
-.map_or_else(
-|(err, err_pos)| Some((Token::LexError(err.into()), err_pos)),
-|(result, ..)| Some((Token::StringConstant(result.into()), start_pos)),
-);
-}
-// ` - string literal
-('`', ..) => {
-// Start from the next line if at the end of line
-match stream.peek_next() {
-// `\r - start from next line
-Some('\r') => {
-eat_next_and_advance(stream, pos);
-// `\r\n
-if stream.peek_next() == Some('\n') {
-eat_next_and_advance(stream, pos);
-}
-pos.new_line();
-}
-// `\n - start from next line
-Some('\n') => {
-eat_next_and_advance(stream, pos);
-pos.new_line();
-}
-_ => (),
-}
-return parse_string_literal(stream, state, pos, c, true, false, true).map_or_else(
-|(err, err_pos)| Some((Token::LexError(err.into()), err_pos)),
-|(result, interpolated, ..)| {
-if interpolated {
-Some((Token::InterpolatedString(result.into()), start_pos))
-} else {
-Some((Token::StringConstant(result.into()), start_pos))
-}
-},
-);
-}
-// ' - character literal
-('\'', '\'') => {
-return Some((
-Token::LexError(LERR::MalformedChar(String::new()).into()),
-start_pos,
-))
-}
-('\'', ..) => {
-return Some(
-parse_string_literal(stream, state, pos, c, false, false, false).map_or_else(
-|(err, err_pos)| (Token::LexError(err.into()), err_pos),
-|(result, ..)| {
-let mut chars = result.chars();
-let first = chars.next().unwrap();
-if chars.next().is_some() {
-(
-Token::LexError(LERR::MalformedChar(result.to_string()).into()),
-start_pos,
-)
-} else {
-(Token::CharConstant(first), start_pos)
-}
-},
-),
-)
-}
-_ => (),
-}
-// Non-ASCII inputs are not valid here
-if !c.is_ascii() {
-return Some((
-Token::LexError(LERR::UnexpectedInput(c.to_string()).into()),
-start_pos,
-));
-}
-// Match ASCII byte values (faster?)
-let mut buf = [0_u8; 2];
-c.encode_utf8(&mut buf[0..1]);
-if cc.is_ascii() {
-cc.encode_utf8(&mut buf[1..]);
-}
-match (buf[0], buf[1]) {
 // \n
-(b'\n', ..) => pos.new_line(),
+('\n', ..) => pos.new_line(),
 // digit ...
-(b'0'..=b'9', ..) => {
+('0'..='9', ..) => {
 let mut result = SmartString::new_const();
 let mut radix_base: Option<u32> = None;
 let mut valid: fn(char) -> bool = is_numeric_digit;
@@ -1798,38 +1712,107 @@ fn get_next_token_inner(
 return Some((token, num_pos));
 }
+// " - string literal
+('"', ..) => {
+return parse_string_literal(stream, state, pos, c, false, true, false)
+.map_or_else(
+|(err, err_pos)| Some((Token::LexError(err.into()), err_pos)),
+|(result, ..)| Some((Token::StringConstant(result.into()), start_pos)),
+);
+}
+// ` - string literal
+('`', ..) => {
+// Start from the next line if at the end of line
+match stream.peek_next() {
+// `\r - start from next line
+Some('\r') => {
+eat_next_and_advance(stream, pos);
+// `\r\n
+if stream.peek_next() == Some('\n') {
+eat_next_and_advance(stream, pos);
+}
+pos.new_line();
+}
+// `\n - start from next line
+Some('\n') => {
+eat_next_and_advance(stream, pos);
+pos.new_line();
+}
+_ => (),
+}
+return parse_string_literal(stream, state, pos, c, true, false, true).map_or_else(
+|(err, err_pos)| Some((Token::LexError(err.into()), err_pos)),
+|(result, interpolated, ..)| {
+if interpolated {
+Some((Token::InterpolatedString(result.into()), start_pos))
+} else {
+Some((Token::StringConstant(result.into()), start_pos))
+}
+},
+);
+}
+// ' - character literal
+('\'', '\'') => {
+return Some((
+Token::LexError(LERR::MalformedChar(String::new()).into()),
+start_pos,
+))
+}
+('\'', ..) => {
+return Some(
+parse_string_literal(stream, state, pos, c, false, false, false).map_or_else(
+|(err, err_pos)| (Token::LexError(err.into()), err_pos),
+|(result, ..)| {
+let mut chars = result.chars();
+let first = chars.next().unwrap();
+if chars.next().is_some() {
+(
+Token::LexError(LERR::MalformedChar(result.to_string()).into()),
+start_pos,
+)
+} else {
+(Token::CharConstant(first), start_pos)
+}
+},
+),
+)
+}
 // Braces
-(b'{', ..) => return Some((Token::LeftBrace, start_pos)),
+('{', ..) => return Some((Token::LeftBrace, start_pos)),
-(b'}', ..) => return Some((Token::RightBrace, start_pos)),
+('}', ..) => return Some((Token::RightBrace, start_pos)),
 // Unit
-(b'(', b')') => {
+('(', ')') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Unit, start_pos));
 }
 // Parentheses
-(b'(', b'*') => {
+('(', '*') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("(*".into())), start_pos));
 }
-(b'(', ..) => return Some((Token::LeftParen, start_pos)),
+('(', ..) => return Some((Token::LeftParen, start_pos)),
-(b')', ..) => return Some((Token::RightParen, start_pos)),
+(')', ..) => return Some((Token::RightParen, start_pos)),
 // Indexing
-(b'[', ..) => return Some((Token::LeftBracket, start_pos)),
+('[', ..) => return Some((Token::LeftBracket, start_pos)),
-(b']', ..) => return Some((Token::RightBracket, start_pos)),
+(']', ..) => return Some((Token::RightBracket, start_pos)),
 // Map literal
 #[cfg(not(feature = "no_object"))]
-(b'#', b'{') => {
+('#', '{') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::MapStart, start_pos));
 }
 // Shebang
-(b'#', b'!') => return Some((Token::Reserved(Box::new("#!".into())), start_pos)),
+('#', '!') => return Some((Token::Reserved(Box::new("#!".into())), start_pos)),
-(b'#', b' ') => {
+('#', ' ') => {
 eat_next_and_advance(stream, pos);
 let token = if stream.peek_next() == Some('{') {
 eat_next_and_advance(stream, pos);
@@ -1840,50 +1823,50 @@ fn get_next_token_inner(
 return Some((Token::Reserved(Box::new(token.into())), start_pos));
 }
-(b'#', ..) => return Some((Token::Reserved(Box::new("#".into())), start_pos)),
+('#', ..) => return Some((Token::Reserved(Box::new("#".into())), start_pos)),
 // Operators
-(b'+', b'=') => {
+('+', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::PlusAssign, start_pos));
 }
-(b'+', b'+') => {
+('+', '+') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("++".into())), start_pos));
 }
-(b'+', ..) if !state.next_token_cannot_be_unary => {
+('+', ..) if !state.next_token_cannot_be_unary => {
 return Some((Token::UnaryPlus, start_pos))
 }
-(b'+', ..) => return Some((Token::Plus, start_pos)),
+('+', ..) => return Some((Token::Plus, start_pos)),
-(b'-', b'0'..=b'9') if !state.next_token_cannot_be_unary => negated = Some(start_pos),
+('-', '0'..='9') if !state.next_token_cannot_be_unary => negated = Some(start_pos),
-(b'-', b'0'..=b'9') => return Some((Token::Minus, start_pos)),
+('-', '0'..='9') => return Some((Token::Minus, start_pos)),
-(b'-', b'=') => {
+('-', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::MinusAssign, start_pos));
 }
-(b'-', b'>') => {
+('-', '>') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("->".into())), start_pos));
 }
-(b'-', b'-') => {
+('-', '-') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("--".into())), start_pos));
 }
-(b'-', ..) if !state.next_token_cannot_be_unary => {
+('-', ..) if !state.next_token_cannot_be_unary => {
 return Some((Token::UnaryMinus, start_pos))
 }
-(b'-', ..) => return Some((Token::Minus, start_pos)),
+('-', ..) => return Some((Token::Minus, start_pos)),
-(b'*', b')') => {
+('*', ')') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("*)".into())), start_pos));
 }
-(b'*', b'=') => {
+('*', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::MultiplyAssign, start_pos));
 }
-(b'*', b'*') => {
+('*', '*') => {
 eat_next_and_advance(stream, pos);
 return Some((
@@ -1896,10 +1879,10 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'*', ..) => return Some((Token::Multiply, start_pos)),
+('*', ..) => return Some((Token::Multiply, start_pos)),
 // Comments
-(b'/', b'/') => {
+('/', '/') => {
 eat_next_and_advance(stream, pos);
 let mut comment: Option<String> = match stream.peek_next() {
@@ -1956,7 +1939,7 @@ fn get_next_token_inner(
 }
 }
 }
-(b'/', b'*') => {
+('/', '*') => {
 state.comment_level = 1;
 eat_next_and_advance(stream, pos);
@@ -1984,16 +1967,16 @@ fn get_next_token_inner(
 }
 }
-(b'/', b'=') => {
+('/', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::DivideAssign, start_pos));
 }
-(b'/', ..) => return Some((Token::Divide, start_pos)),
+('/', ..) => return Some((Token::Divide, start_pos)),
-(b';', ..) => return Some((Token::SemiColon, start_pos)),
+(';', ..) => return Some((Token::SemiColon, start_pos)),
-(b',', ..) => return Some((Token::Comma, start_pos)),
+(',', ..) => return Some((Token::Comma, start_pos)),
-(b'.', b'.') => {
+('.', '.') => {
 eat_next_and_advance(stream, pos);
 return Some((
 match stream.peek_next() {
@@ -2010,9 +1993,9 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'.', ..) => return Some((Token::Period, start_pos)),
+('.', ..) => return Some((Token::Period, start_pos)),
-(b'=', b'=') => {
+('=', '=') => {
 eat_next_and_advance(stream, pos);
 if stream.peek_next() == Some('=') {
@@ -2022,14 +2005,14 @@ fn get_next_token_inner(
 return Some((Token::EqualsTo, start_pos));
 }
-(b'=', b'>') => {
+('=', '>') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::DoubleArrow, start_pos));
 }
-(b'=', ..) => return Some((Token::Equals, start_pos)),
+('=', ..) => return Some((Token::Equals, start_pos)),
 #[cfg(not(feature = "no_module"))]
-(b':', b':') => {
+(':', ':') => {
 eat_next_and_advance(stream, pos);
 if stream.peek_next() == Some('<') {
@@ -2039,25 +2022,25 @@ fn get_next_token_inner(
 return Some((Token::DoubleColon, start_pos));
 }
-(b':', b'=') => {
+(':', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new(":=".into())), start_pos));
 }
-(b':', b';') => {
+(':', ';') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new(":;".into())), start_pos));
 }
-(b':', ..) => return Some((Token::Colon, start_pos)),
+(':', ..) => return Some((Token::Colon, start_pos)),
-(b'<', b'=') => {
+('<', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::LessThanEqualsTo, start_pos));
 }
-(b'<', b'-') => {
+('<', '-') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("<-".into())), start_pos));
 }
-(b'<', b'<') => {
+('<', '<') => {
 eat_next_and_advance(stream, pos);
 return Some((
@@ -2070,17 +2053,17 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'<', b'|') => {
+('<', '|') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("<|".into())), start_pos));
 }
-(b'<', ..) => return Some((Token::LessThan, start_pos)),
+('<', ..) => return Some((Token::LessThan, start_pos)),
-(b'>', b'=') => {
+('>', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::GreaterThanEqualsTo, start_pos));
 }
-(b'>', b'>') => {
+('>', '>') => {
 eat_next_and_advance(stream, pos);
 return Some((
@@ -2093,9 +2076,9 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'>', ..) => return Some((Token::GreaterThan, start_pos)),
+('>', ..) => return Some((Token::GreaterThan, start_pos)),
-(b'!', b'i') => {
+('!', 'i') => {
 stream.get_next().unwrap();
 if stream.peek_next() == Some('n') {
 stream.get_next().unwrap();
@@ -2116,7 +2099,7 @@ fn get_next_token_inner(
 stream.unget('i');
 return Some((Token::Bang, start_pos));
 }
-(b'!', b'=') => {
+('!', '=') => {
 eat_next_and_advance(stream, pos);
 if stream.peek_next() == Some('=') {
@@ -2126,55 +2109,55 @@ fn get_next_token_inner(
 return Some((Token::NotEqualsTo, start_pos));
 }
-(b'!', b'.') => {
+('!', '.') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("!.".into())), start_pos));
 }
-(b'!', ..) => return Some((Token::Bang, start_pos)),
+('!', ..) => return Some((Token::Bang, start_pos)),
-(b'|', b'|') => {
+('|', '|') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Or, start_pos));
 }
-(b'|', b'=') => {
+('|', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::OrAssign, start_pos));
 }
-(b'|', b'>') => {
+('|', '>') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::Reserved(Box::new("|>".into())), start_pos));
 }
-(b'|', ..) => return Some((Token::Pipe, start_pos)),
+('|', ..) => return Some((Token::Pipe, start_pos)),
-(b'&', b'&') => {
+('&', '&') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::And, start_pos));
 }
-(b'&', b'=') => {
+('&', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::AndAssign, start_pos));
 }
-(b'&', ..) => return Some((Token::Ampersand, start_pos)),
+('&', ..) => return Some((Token::Ampersand, start_pos)),
-(b'^', b'=') => {
+('^', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::XOrAssign, start_pos));
 }
-(b'^', ..) => return Some((Token::XOr, start_pos)),
+('^', ..) => return Some((Token::XOr, start_pos)),
-(b'~', ..) => return Some((Token::Reserved(Box::new("~".into())), start_pos)),
+('~', ..) => return Some((Token::Reserved(Box::new("~".into())), start_pos)),
-(b'%', b'=') => {
+('%', '=') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::ModuloAssign, start_pos));
 }
-(b'%', ..) => return Some((Token::Modulo, start_pos)),
+('%', ..) => return Some((Token::Modulo, start_pos)),
-(b'@', ..) => return Some((Token::Reserved(Box::new("@".into())), start_pos)),
+('@', ..) => return Some((Token::Reserved(Box::new("@".into())), start_pos)),
-(b'$', ..) => return Some((Token::Reserved(Box::new("$".into())), start_pos)),
+('$', ..) => return Some((Token::Reserved(Box::new("$".into())), start_pos)),
-(b'?', b'.') => {
+('?', '.') => {
 eat_next_and_advance(stream, pos);
 return Some((
 #[cfg(not(feature = "no_object"))]
@@ -2184,11 +2167,11 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'?', b'?') => {
+('?', '?') => {
 eat_next_and_advance(stream, pos);
 return Some((Token::DoubleQuestion, start_pos));
 }
-(b'?', b'[') => {
+('?', '[') => {
 eat_next_and_advance(stream, pos);
 return Some((
 #[cfg(not(feature = "no_index"))]
@@ -2198,7 +2181,12 @@ fn get_next_token_inner(
 start_pos,
 ));
 }
-(b'?', ..) => return Some((Token::Reserved(Box::new("?".into())), start_pos)),
+('?', ..) => return Some((Token::Reserved(Box::new("?".into())), start_pos)),
+// letter or underscore ...
+_ if is_id_first_alphabetic(c) || c == '_' => {
+return Some(parse_identifier_token(stream, state, pos, start_pos, c));
+}
 _ if c.is_whitespace() => (),
@@ -2323,6 +2311,8 @@ pub fn is_id_continue(x: char) -> bool {
 #[inline]
 #[must_use]
 pub fn is_reserved_keyword_or_symbol(syntax: &str) -> (bool, bool, bool) {
+// This implementation is based upon a pre-calculated table generated
+// by GNU gperf on the list of keywords.
 let utf8 = syntax.as_bytes();
 let len = utf8.len();
 let rounds = len.min(3);
@@ -2342,7 +2332,13 @@ pub fn is_reserved_keyword_or_symbol(syntax: &str) -> (bool, bool, bool) {
 match RESERVED_LIST[hash_val] {
 ("", ..) => (false, false, false),
-(s, true, a, b) => (s == syntax, a, b),
+(s, true, a, b) => (
+// Fail early to avoid calling memcmp()
+// Since we are already working with bytes, we might as well check the first one
+s.len() == len && s.as_bytes()[0] == utf8[0] && s == syntax,
+a,
+b,
+),
 _ => (false, false, false),
 }
 }
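For reference, a toy sketch of the gperf-style lookup that the new comments describe (the association values and table below are invented for illustration, not the ones generated from Rhai's keyword list): a hash derived from the length plus an associated value for the first byte selects a single slot, and one guarded string comparison confirms the match.

// Associated value for the first byte, as a gperf-generated table would provide.
fn asso(b: u8) -> usize {
    match b {
        b'i' => 0,
        b'f' => 1,
        b'l' => 2,
        b'e' => 3,
        _ => 99, // a large value pushes non-keywords outside the table
    }
}

// Slots laid out so every keyword hashes to a unique index (a "perfect" hash).
const TABLE: [&str; 8] = ["", "", "if", "", "", "let", "", "else"];

fn is_keyword(syntax: &str) -> bool {
    let utf8 = syntax.as_bytes();
    let len = utf8.len();
    if len == 0 {
        return false;
    }
    // Hash = length + associated value of the first byte.
    let hash_val = len + asso(utf8[0]);
    match TABLE.get(hash_val) {
        // Cheap length/first-byte checks short-circuit before the full compare.
        Some(&s) => s.len() == len && s.as_bytes()[0] == utf8[0] && s == syntax,
        None => false,
    }
}

fn main() {
    assert!(is_keyword("if") && is_keyword("let") && is_keyword("else"));
    assert!(!is_keyword("iff") && !is_keyword("while"));
}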