Fine tune table-driven tokenizing.

This commit is contained in:
Stephen Chung 2023-03-15 17:22:11 +08:00
parent 2aa7b99d1e
commit 41636eac55
11 changed files with 351 additions and 159 deletions

2
.gitignore vendored
View File

@ -6,9 +6,9 @@ benches/results
clippy.toml clippy.toml
Rhai.toml Rhai.toml
**/*.bat **/*.bat
**/*.exe
doc/rhai-sync.json doc/rhai-sync.json
doc/rhai.json doc/rhai.json
tools/
.idea/ .idea/
.idea .idea
.idea/* .idea/*

View File

@ -31,6 +31,7 @@ Enhancements
* Range cases in `switch` statements now also match floating-point and decimal values. In order to support this, however, small numeric ranges cases are no longer unrolled. * Range cases in `switch` statements now also match floating-point and decimal values. In order to support this, however, small numeric ranges cases are no longer unrolled.
* Loading a module via `import` now gives the module access to the current scope, including variables and constants defined inside. * Loading a module via `import` now gives the module access to the current scope, including variables and constants defined inside.
* Some very simple operator calls (e.g. integer add) are short-circuited to avoid the overhead of a function call, resulting in a small speed improvement. * Some very simple operator calls (e.g. integer add) are short-circuited to avoid the overhead of a function call, resulting in a small speed improvement.
* The tokenizer now uses table-driven keyword recognizers generated by GNU gperf. At least _theoretically_ it should be faster...
Version 1.12.0 Version 1.12.0

View File

@ -28,4 +28,5 @@ Sub-Directories
| `func` | Support for function calls | | `func` | Support for function calls |
| `eval` | Evaluation engine | | `eval` | Evaluation engine |
| `serde` | Support for [`serde`](https://crates.io/crates/serde) | | `serde` | Support for [`serde`](https://crates.io/crates/serde) |
| `tools` | External tools needed for building |
| `bin` | Pre-built CLI binaries (e.g. `rhai-run`, `rhai-repl`) | | `bin` | Pre-built CLI binaries (e.g. `rhai-run`, `rhai-repl`) |

View File

@ -232,7 +232,7 @@ impl Engine {
} }
let token = Token::lookup_symbol_from_syntax(s).or_else(|| { let token = Token::lookup_symbol_from_syntax(s).or_else(|| {
if is_reserved_keyword_or_symbol(s) { if is_reserved_keyword_or_symbol(s).0 {
Some(Token::Reserved(Box::new(s.into()))) Some(Token::Reserved(Box::new(s.into())))
} else { } else {
None None
@ -296,7 +296,7 @@ impl Engine {
// Identifier or symbol in first position // Identifier or symbol in first position
_ if segments.is_empty() _ if segments.is_empty()
&& (is_valid_identifier(s) || is_reserved_keyword_or_symbol(s)) => && (is_valid_identifier(s) || is_reserved_keyword_or_symbol(s).0) =>
{ {
// Make it a custom keyword/symbol if it is disabled or reserved // Make it a custom keyword/symbol if it is disabled or reserved
if self if self

View File

@ -54,7 +54,7 @@ impl Engine {
#[inline(always)] #[inline(always)]
#[must_use] #[must_use]
pub fn module_resolver(&self) -> &dyn crate::ModuleResolver { pub fn module_resolver(&self) -> &dyn crate::ModuleResolver {
const DUMMY_RESOLVER: crate::module::resolvers::DummyModuleResolver = static DUMMY_RESOLVER: crate::module::resolvers::DummyModuleResolver =
crate::module::resolvers::DummyModuleResolver; crate::module::resolvers::DummyModuleResolver;
self.module_resolver.as_deref().unwrap_or(&DUMMY_RESOLVER) self.module_resolver.as_deref().unwrap_or(&DUMMY_RESOLVER)

View File

@ -11,7 +11,7 @@ use crate::engine::{Precedence, KEYWORD_THIS, OP_CONTAINS, OP_NOT};
use crate::eval::{Caches, GlobalRuntimeState}; use crate::eval::{Caches, GlobalRuntimeState};
use crate::func::{hashing::get_hasher, StraightHashMap}; use crate::func::{hashing::get_hasher, StraightHashMap};
use crate::tokenizer::{ use crate::tokenizer::{
is_keyword_function, is_valid_function_name, is_valid_identifier, Token, TokenStream, is_reserved_keyword_or_symbol, is_valid_function_name, is_valid_identifier, Token, TokenStream,
TokenizerControl, TokenizerControl,
}; };
use crate::types::dynamic::{AccessMode, Union}; use crate::types::dynamic::{AccessMode, Union};
@ -1665,7 +1665,9 @@ impl Engine {
match input.peek().expect(NEVER_ENDS).0 { match input.peek().expect(NEVER_ENDS).0 {
// Function call is allowed to have reserved keyword // Function call is allowed to have reserved keyword
Token::LeftParen | Token::Bang | Token::Unit if is_keyword_function(&s).0 => { Token::LeftParen | Token::Bang | Token::Unit
if is_reserved_keyword_or_symbol(&s).1 =>
{
Expr::Variable( Expr::Variable(
(None, ns, 0, state.get_interned_string(*s)).into(), (None, ns, 0, state.get_interned_string(*s)).into(),
None, None,
@ -1824,7 +1826,7 @@ impl Engine {
// Prevents capturing of the object properties as vars: xxx.<var> // Prevents capturing of the object properties as vars: xxx.<var>
state.allow_capture = false; state.allow_capture = false;
} }
(Token::Reserved(s), ..) if is_keyword_function(s).1 => (), (Token::Reserved(s), ..) if is_reserved_keyword_or_symbol(s).2 => (),
(Token::Reserved(s), pos) => { (Token::Reserved(s), pos) => {
return Err(PERR::Reserved(s.to_string()).into_err(*pos)) return Err(PERR::Reserved(s.to_string()).into_err(*pos))
} }

View File

@ -1,9 +1,6 @@
//! Main module defining the lexer and parser. //! Main module defining the lexer and parser.
use crate::engine::{ use crate::engine::Precedence;
Precedence, KEYWORD_DEBUG, KEYWORD_EVAL, KEYWORD_FN_PTR, KEYWORD_FN_PTR_CALL,
KEYWORD_FN_PTR_CURRY, KEYWORD_IS_DEF_VAR, KEYWORD_PRINT, KEYWORD_TYPE_OF,
};
use crate::func::native::OnParseTokenCallback; use crate::func::native::OnParseTokenCallback;
use crate::{Engine, Identifier, LexError, Position, SmartString, StaticVec, INT, UNSIGNED_INT}; use crate::{Engine, Identifier, LexError, Position, SmartString, StaticVec, INT, UNSIGNED_INT};
use smallvec::SmallVec; use smallvec::SmallVec;
@ -308,7 +305,9 @@ impl fmt::Display for Token {
} }
} }
// Table-driven keyword recognizer generated by GNU gperf. // Table-driven keyword recognizer generated by GNU gperf on the file `tools/keywords.txt`.
//
// When adding new keywords, make sure to update `tools/keywords.txt` and re-generate this.
const MIN_KEYWORD_LEN: usize = 1; const MIN_KEYWORD_LEN: usize = 1;
const MAX_KEYWORD_LEN: usize = 8; const MAX_KEYWORD_LEN: usize = 8;
@ -508,7 +507,9 @@ static KEYWORDS_LIST: [(&str, Token); 153] = [
("#{", Token::MapStart), ("#{", Token::MapStart),
]; ];
// Table-driven reserved symbol recognizer generated by GNU gperf. // Table-driven reserved symbol recognizer generated by GNU gperf on the file `tools/reserved.txt`.
//
// When adding new reserved symbols, make sure to update `tools/reserved.txt` and re-generate this.
const MIN_RESERVED_LEN: usize = 1; const MIN_RESERVED_LEN: usize = 1;
const MAX_RESERVED_LEN: usize = 10; const MAX_RESERVED_LEN: usize = 10;
@ -530,120 +531,120 @@ static RESERVED_ASSOC_VALUES: [u8; 256] = [
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
]; ];
static RESERVED_LIST: [(&str, bool); 113] = [ static RESERVED_LIST: [(&str, bool, bool, bool); 113] = [
("", false), ("", false, false, false),
("~", true), ("~", true, false, false),
("is", true), ("is", true, false, false),
("...", true), ("...", true, false, false),
("", false), ("", false, false, false),
("print", true), ("print", true, true, false),
("@", true), ("@", true, false, false),
("private", cfg!(feature = "no_function")), ("private", cfg!(feature = "no_function"), false, false),
("", false), ("", false, false, false),
("this", true), ("this", true, false, false),
("", false), ("", false, false, false),
("thread", true), ("thread", true, false, false),
("as", cfg!(feature = "no_module")), ("as", cfg!(feature = "no_module"), false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("spawn", true), ("spawn", true, false, false),
("static", true), ("static", true, false, false),
(":=", true), (":=", true, false, false),
("===", true), ("===", true, false, false),
("case", true), ("case", true, false, false),
("super", true), ("super", true, false, false),
("shared", true), ("shared", true, false, false),
("package", true), ("package", true, false, false),
("use", true), ("use", true, false, false),
("with", true), ("with", true, false, false),
("curry", true), ("curry", true, true, true),
("$", true), ("$", true, false, false),
("type_of", true), ("type_of", true, true, true),
("nil", true), ("nil", true, false, false),
("sync", true), ("sync", true, false, false),
("yield", true), ("yield", true, false, false),
("import", cfg!(feature = "no_module")), ("import", cfg!(feature = "no_module"), false, false),
("--", true), ("--", true, false, false),
("new", true), ("new", true, false, false),
("exit", true), ("exit", true, false, false),
("async", true), ("async", true, false, false),
("export", cfg!(feature = "no_module")), ("export", cfg!(feature = "no_module"), false, false),
("!.", true), ("!.", true, false, false),
("", false), ("", false, false, false),
("call", true), ("call", true, true, true),
("match", true), ("match", true, false, false),
("", false), ("", false, false, false),
("fn", cfg!(feature = "no_function")), ("fn", cfg!(feature = "no_function"), false, false),
("var", true), ("var", true, false, false),
("null", true), ("null", true, false, false),
("await", true), ("await", true, false, false),
("#", true), ("#", true, false, false),
("default", true), ("default", true, false, false),
("!==", true), ("!==", true, false, false),
("eval", true), ("eval", true, true, false),
("debug", true), ("debug", true, true, false),
("?", true), ("?", true, false, false),
("?.", cfg!(feature = "no_object")), ("?.", cfg!(feature = "no_object"), false, false),
("", false), ("", false, false, false),
("protected", true), ("protected", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("go", true), ("go", true, false, false),
("", false), ("", false, false, false),
("goto", true), ("goto", true, false, false),
("", false), ("", false, false, false),
("public", true), ("public", true, false, false),
("<-", true), ("<-", true, false, false),
("", false), ("", false, false, false),
("is_def_fn", cfg!(not(feature = "no_function"))), ("is_def_fn", cfg!(not(feature = "no_function")), true, false),
("is_def_var", true), ("is_def_var", true, true, false),
("", false), ("", false, false, false),
("<|", true), ("<|", true, false, false),
("::<", true), ("::<", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("->", true), ("->", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("module", true), ("module", true, false, false),
("|>", true), ("|>", true, false, false),
("", false), ("", false, false, false),
("void", true), ("void", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("#!", true), ("#!", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("?[", cfg!(feature = "no_index")), ("?[", cfg!(feature = "no_index"), false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("Fn", true), ("Fn", true, true, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
(":;", true), (":;", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("++", true), ("++", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("*)", true), ("*)", true, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("", false), ("", false, false, false),
("(*", true), ("(*", true, false, false),
]; ];
impl Token { impl Token {
@ -2250,7 +2251,7 @@ fn parse_identifier_token(
return (token, start_pos); return (token, start_pos);
} }
if is_reserved_keyword_or_symbol(&identifier) { if is_reserved_keyword_or_symbol(&identifier).0 {
return (Token::Reserved(Box::new(identifier)), start_pos); return (Token::Reserved(Box::new(identifier)), start_pos);
} }
@ -2264,30 +2265,6 @@ fn parse_identifier_token(
(Token::Identifier(identifier.into()), start_pos) (Token::Identifier(identifier.into()), start_pos)
} }
/// Can a keyword be called like a function?
///
/// # Return values
///
/// The first `bool` indicates whether the keyword can be called normally as a function.
///
/// The second `bool` indicates whether the keyword can be called in method-call style.
#[inline]
#[must_use]
pub fn is_keyword_function(name: &str) -> (bool, bool) {
match name {
KEYWORD_TYPE_OF | KEYWORD_FN_PTR_CALL | KEYWORD_FN_PTR_CURRY => (true, true),
KEYWORD_PRINT | KEYWORD_DEBUG | KEYWORD_EVAL | KEYWORD_FN_PTR | KEYWORD_IS_DEF_VAR => {
(true, false)
}
#[cfg(not(feature = "no_function"))]
crate::engine::KEYWORD_IS_DEF_FN => (true, false),
_ => (false, false),
}
}
/// _(internals)_ Is a text string a valid identifier? /// _(internals)_ Is a text string a valid identifier?
/// Exported under the `internals` feature only. /// Exported under the `internals` feature only.
#[must_use] #[must_use]
@ -2313,7 +2290,7 @@ pub fn is_valid_identifier(name: &str) -> bool {
#[must_use] #[must_use]
pub fn is_valid_function_name(name: &str) -> bool { pub fn is_valid_function_name(name: &str) -> bool {
is_valid_identifier(name) is_valid_identifier(name)
&& !is_reserved_keyword_or_symbol(name) && !is_reserved_keyword_or_symbol(name).0
&& Token::lookup_symbol_from_syntax(name).is_none() && Token::lookup_symbol_from_syntax(name).is_none()
} }
@ -2350,16 +2327,24 @@ pub const fn is_id_continue(x: char) -> bool {
} }
/// Is a piece of syntax a reserved keyword or reserved symbol? /// Is a piece of syntax a reserved keyword or reserved symbol?
///
/// # Return values
///
/// The first `bool` indicates whether it is a reserved keyword or symbol.
///
/// The second `bool` indicates whether the keyword can be called normally as a function.
///
/// The third `bool` indicates whether the keyword can be called in method-call style.
#[inline] #[inline]
#[must_use] #[must_use]
pub fn is_reserved_keyword_or_symbol(syntax: &str) -> bool { pub fn is_reserved_keyword_or_symbol(syntax: &str) -> (bool, bool, bool) {
let utf8 = syntax.as_bytes(); let utf8 = syntax.as_bytes();
let len = utf8.len(); let len = utf8.len();
let rounds = len.min(3); let rounds = len.min(3);
let mut hash_val = len; let mut hash_val = len;
if !(MIN_RESERVED_LEN..=MAX_RESERVED_LEN).contains(&len) { if !(MIN_RESERVED_LEN..=MAX_RESERVED_LEN).contains(&len) {
return false; return (false, false, false);
} }
for x in 0..rounds { for x in 0..rounds {
@ -2367,12 +2352,13 @@ pub fn is_reserved_keyword_or_symbol(syntax: &str) -> bool {
} }
if !(MIN_RESERVED_HASH_VALUE..=MAX_RESERVED_HASH_VALUE).contains(&hash_val) { if !(MIN_RESERVED_HASH_VALUE..=MAX_RESERVED_HASH_VALUE).contains(&hash_val) {
return false; return (false, false, false);
} }
match RESERVED_LIST[hash_val] { match RESERVED_LIST[hash_val] {
(s, t) if s == syntax => t, ("", ..) => (false, false, false),
_ => false, (s, true, a, b) => (s == syntax, a, b),
_ => (false, false, false),
} }
} }

7
src/tools/README.md Normal file
View File

@ -0,0 +1,7 @@
Build Tools
===========
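| File           | Description                                                                         |
| -------------- | ----------------------------------------------------------------------------------- |
| `keywords.txt` | Input file for GNU gperf: tokenizer keywords/symbols and their `Token` variants.    |
| `reserved.txt` | Input file for GNU gperf: tokenizer reserved keywords/symbols and their attributes. |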

102
src/tools/keywords.txt Normal file
View File

@ -0,0 +1,102 @@
// This file holds a list of keywords/symbols for the Rhai language, each mapped to
// the appropriate `Token` variant.
//
// Generate the output table via:
// ```bash
// gperf -t keywords.txt
// ```
//
// Since GNU gperf does not produce Rust output, the ANSI-C output must be hand-edited and
// manually spliced into `tokenizer.rs`.
//
// This includes:
// * Rewrite the C hashing program (especially since it uses a `switch` statement with fall-through)
// into equivalent Rust as the function `lookup_symbol_from_syntax`.
// * Update the values for the `???_KEYWORD_???` constants.
// * Copy the `asso_values` array into `KEYWORD_ASSOC_VALUES`.
// * Copy the `wordlist` array into `KEYWORDS_LIST` with the following modifications:
// - Remove the `#line` comments
// - Change the entry wrapping `{ .. }` into tuples `( .. )`
// - Replace all entries `("")` by `("", Token::EOF)`
// - Put feature flags on the appropriate lines, and duplicate each such line as an entry
// that maps to `Token::EOF` under the opposite feature flag
//
struct keyword;
%%
"{", Token::LeftBrace
"}", Token::RightBrace
"(", Token::LeftParen
")", Token::RightParen
"[", Token::LeftBracket
"]", Token::RightBracket
"()", Token::Unit
"+", Token::Plus
"-", Token::Minus
"*", Token::Multiply
"/", Token::Divide
";", Token::SemiColon
":", Token::Colon
"::", Token::DoubleColon
"=>", Token::DoubleArrow
"_", Token::Underscore
",", Token::Comma
".", Token::Period
"?.", Token::Elvis
"??", Token::DoubleQuestion
"?[", Token::QuestionBracket
"..", Token::ExclusiveRange
"..=", Token::InclusiveRange
"#{", Token::MapStart
"=", Token::Equals
"true", Token::True
"false", Token::False
"let", Token::Let
"const", Token::Const
"if", Token::If
"else", Token::Else
"switch", Token::Switch
"do", Token::Do
"while", Token::While
"until", Token::Until
"loop", Token::Loop
"for", Token::For
"in", Token::In
"!in", Token::NotIn
"<", Token::LessThan
">", Token::GreaterThan
"<=", Token::LessThanEqualsTo
">=", Token::GreaterThanEqualsTo
"==", Token::EqualsTo
"!=", Token::NotEqualsTo
"!", Token::Bang
"|", Token::Pipe
"||", Token::Or
"&", Token::Ampersand
"&&", Token::And
"continue", Token::Continue
"break", Token::Break
"return", Token::Return
"throw", Token::Throw
"try", Token::Try
"catch", Token::Catch
"+=", Token::PlusAssign
"-=", Token::MinusAssign
"*=", Token::MultiplyAssign
"/=", Token::DivideAssign
"<<=", Token::LeftShiftAssign
">>=", Token::RightShiftAssign
"&=", Token::AndAssign
"|=", Token::OrAssign
"^=", Token::XOrAssign
"<<", Token::LeftShift
">>", Token::RightShift
"^", Token::XOr
"%", Token::Modulo
"%=", Token::ModuloAssign
"**", Token::PowerOf
"**=", Token::PowerOfAssign
"fn", Token::Fn
"private", Token::Private
"import", Token::Import
"export", Token::Export
"as", Token::As

93
src/tools/reserved.txt Normal file
View File

@ -0,0 +1,93 @@
// This file holds a list of reserved keywords and symbols for the Rhai language.
//
// The mapped attributes are:
// - is this a reserved keyword or symbol? (bool)
// - can this keyword be called normally as a function? (bool)
// - can this keyword be called in method-call style? (bool)
//
// Generate the output table via:
// ```bash
// gperf -t reserved.txt
// ```
//
// Since GNU gperf does not produce Rust output, the ANSI-C output must be hand-edited and
// manually spliced into `tokenizer.rs`.
//
// This includes:
// * Rewrite the C hashing program (especially since it uses a `switch` statement with fall-through)
// into equivalent Rust as the function `is_reserved_keyword_or_symbol`.
// * Update the values for the `???_RESERVED_???` constants.
// * Copy the `asso_values` array into `RESERVED_ASSOC_VALUES`.
// * Copy the `wordlist` array into `RESERVED_LIST` with the following modifications:
// - Remove the `#line` comments
// - Change the entry wrapping `{ .. }` into tuples `( .. )`
// - Replace all entries `("")` by `("", false, false, false)`
// - Feature flags can be incorporated directly into the output via the `cfg!` macro
//
struct reserved;
%%
"?.", cfg!(feature = "no_object"), false, false
"?[", cfg!(feature = "no_index"), false, false
"fn", cfg!(feature = "no_function"), false, false
"private", cfg!(feature = "no_function"), false, false
"import", cfg!(feature = "no_module"), false, false
"export", cfg!(feature = "no_module"), false, false
"as", cfg!(feature = "no_module"), false, false
"===", true, false, false
"!==", true, false, false
"->", true, false, false
"<-", true, false, false
"?", true, false, false
":=", true, false, false
":;", true, false, false
"~", true, false, false
"!.", true, false, false
"::<", true, false, false
"(*", true, false, false
"*)", true, false, false
"#", true, false, false
"#!", true, false, false
"@", true, false, false
"$", true, false, false
"++", true, false, false
"--", true, false, false
"...", true, false, false
"<|", true, false, false
"|>", true, false, false
"public", true, false, false
"protected", true, false, false
"super", true, false, false
"new", true, false, false
"use", true, false, false
"module", true, false, false
"package", true, false, false
"var", true, false, false
"static", true, false, false
"shared", true, false, false
"with", true, false, false
"is", true, false, false
"goto", true, false, false
"exit", true, false, false
"match", true, false, false
"case", true, false, false
"default", true, false, false
"void", true, false, false
"null", true, false, false
"nil", true, false, false
"spawn", true, false, false
"thread", true, false, false
"go", true, false, false
"sync", true, false, false
"async", true, false, false
"await", true, false, false
"yield", true, false, false
"print", true, true, false
"debug", true, true, false
"type_of", true, true, true
"eval", true, true, false
"Fn", true, true, false
"call", true, true, true
"curry", true, true, true
"this", true, false, false
"is_def_var", true, true, false
"is_def_fn", cfg!(not(feature = "no_function")), true, false

View File

@ -538,7 +538,7 @@ impl TryFrom<ImmutableString> for FnPtr {
#[cfg(not(feature = "no_function"))] #[cfg(not(feature = "no_function"))]
fn_def: None, fn_def: None,
}) })
} else if is_reserved_keyword_or_symbol(&value) } else if is_reserved_keyword_or_symbol(&value).0
|| Token::lookup_symbol_from_syntax(&value).is_some() || Token::lookup_symbol_from_syntax(&value).is_some()
{ {
Err( Err(