diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 49318e1c..1370e03e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,6 +29,7 @@ jobs: - "--features no_object" - "--features no_function" - "--features no_module" + - "--features unicode-xid-ident" toolchain: [stable] experimental: [false] include: diff --git a/Cargo.toml b/Cargo.toml index 2f2ab525..66bdba28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ no_object = [] # no custom objects no_function = [] # no script-defined functions no_module = [] # no modules internals = [] # expose internal data structures +unicode-xid-ident = ["unicode-xid"] # allow unicode-xid for identifiers. # compiling for no-std no_std = [ "num-traits/libm", "hashbrown", "core-error", "libm", "ahash" ] @@ -73,6 +74,11 @@ default_features = false features = ["derive", "alloc"] optional = true +[dependencies.unicode-xid] +version = "0.2.1" +default_features = false +optional = true + [target.'cfg(target_arch = "wasm32")'.dependencies] instant= { version = "0.1.4", features = ["wasm-bindgen"] } # WASM implementation of std::time::Instant diff --git a/RELEASES.md b/RELEASES.md index bd4623c0..87d12d44 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -20,6 +20,7 @@ New features * Custom syntax now works even without the `internals` feature. * Currying of function pointers is supported via the new `curry` keyword. * `Module::set_indexer_get_set_fn` is added as a shorthand of both `Module::set_indexer_get_fn` and `Module::set_indexer_set_fn`. +* New `unicode-xid-ident` feature to allow unicode-xid for identifiers. 
Breaking changes ---------------- diff --git a/doc/src/links.md b/doc/src/links.md index 889cee29..936c714d 100644 --- a/doc/src/links.md +++ b/doc/src/links.md @@ -12,6 +12,7 @@ [`no_std`]: {{rootUrl}}/start/features.md [`no-std`]: {{rootUrl}}/start/features.md [`internals`]: {{rootUrl}}/start/features.md +[`unicode-xid-ident`]: {{rootUrl}}/start/features.md [minimal builds]: {{rootUrl}}/start/builds/minimal.md [WASM]: {{rootUrl}}/start/builds/wasm.md diff --git a/doc/src/start/features.md b/doc/src/start/features.md index 0d31d077..83311cf1 100644 --- a/doc/src/start/features.md +++ b/doc/src/start/features.md @@ -26,6 +26,7 @@ more control over what a script can (or cannot) do. | `no_std` | Build for `no-std`. Notice that additional dependencies will be pulled in to replace `std` features. | | `serde` | Enable serialization/deserialization via `serde`. Notice that the [`serde`](https://crates.io/crates/serde) crate will be pulled in together with its dependencies. | | `internals` | Expose internal data structures (e.g. [`AST`] nodes). Beware that Rhai internals are volatile and may change from version to version. | +| `unicode-xid-ident` | Allow Unicode identifiers: characters with the `XID_Start`/`XID_Continue` properties are accepted in identifiers. Notice that the [`unicode-xid`](https://crates.io/crates/unicode-xid) crate will be pulled in. 
| Example diff --git a/src/token.rs b/src/token.rs index edbac0aa..9f5456d9 100644 --- a/src/token.rs +++ b/src/token.rs @@ -735,32 +735,6 @@ pub trait InputStream { fn peek_next(&mut self) -> Option; } -pub fn is_keyword_function(name: &str) -> bool { - name == KEYWORD_PRINT - || name == KEYWORD_DEBUG - || name == KEYWORD_TYPE_OF - || name == KEYWORD_EVAL - || name == KEYWORD_FN_PTR - || name == KEYWORD_FN_PTR_CALL - || name == KEYWORD_FN_PTR_CURRY -} - -pub fn is_valid_identifier(name: impl Iterator) -> bool { - let mut first_alphabetic = false; - - for ch in name { - match ch { - '_' => (), - _ if char::is_ascii_alphabetic(&ch) => first_alphabetic = true, - _ if !first_alphabetic => return false, - _ if char::is_ascii_alphanumeric(&ch) => (), - _ => return false, - } - } - - first_alphabetic -} - /// [INTERNALS] Parse a string literal wrapped by `enclosing_char`. /// Exported under the `internals` feature only. /// @@ -1107,35 +1081,7 @@ fn get_next_token_inner( // letter or underscore ... 
('A'..='Z', _) | ('a'..='z', _) | ('_', _) => { - let mut result = Vec::new(); - result.push(c); - - while let Some(next_char) = stream.peek_next() { - match next_char { - x if x.is_ascii_alphanumeric() || x == '_' => { - result.push(x); - eat_next(stream, pos); - } - _ => break, - } - } - - let is_valid_identifier = is_valid_identifier(result.iter().cloned()); - - let identifier: String = result.into_iter().collect(); - - if !is_valid_identifier { - return Some(( - Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), - start_pos, - )); - } - - return Some(( - Token::lookup_from_syntax(&identifier) - .unwrap_or_else(|| Token::Identifier(identifier)), - start_pos, - )); + return get_identifier(stream, pos, start_pos, c); } // " - string literal @@ -1413,6 +1359,10 @@ fn get_next_token_inner( ('\0', _) => unreachable!(), (ch, _) if ch.is_whitespace() => (), + #[cfg(feature = "unicode-xid-ident")] + (ch, _) if unicode_xid::UnicodeXID::is_xid_start(ch) => { + return get_identifier(stream, pos, start_pos, c); + } (ch, _) => { return Some(( Token::LexError(Box::new(LERR::UnexpectedInput(ch.to_string()))), @@ -1431,6 +1381,91 @@ fn get_next_token_inner( } } +/// Get the next identifier. 
+///
+/// `first_char` is taken as given (it is not read from `stream` here); following characters
+/// are consumed while `is_id_continue` holds, then the text is validated and classified.
+fn get_identifier(
+    stream: &mut impl InputStream,
+    pos: &mut Position,
+    start_pos: Position,
+    first_char: char,
+) -> Option<(Token, Position)> {
+    let mut result = vec![first_char];
+
+    while let Some(next_char) = stream.peek_next() {
+        if !is_id_continue(next_char) {
+            break;
+        }
+        result.push(next_char);
+        eat_next(stream, pos);
+    }
+
+    let is_valid = is_valid_identifier(result.iter().cloned());
+    let identifier: String = result.into_iter().collect();
+    let token = if is_valid {
+        Token::lookup_from_syntax(&identifier).unwrap_or_else(|| Token::Identifier(identifier))
+    } else {
+        Token::LexError(Box::new(LERR::MalformedIdentifier(identifier)))
+    };
+
+    Some((token, start_pos))
+}
+
+/// Is this keyword allowed as a function?
+pub fn is_keyword_function(name: &str) -> bool {
+    name == KEYWORD_PRINT
+        || name == KEYWORD_DEBUG
+        || name == KEYWORD_TYPE_OF
+        || name == KEYWORD_EVAL
+        || name == KEYWORD_FN_PTR
+        || name == KEYWORD_FN_PTR_CALL
+        || name == KEYWORD_FN_PTR_CURRY
+}
+
+/// Is the character sequence a valid identifier? Only `_` may precede the first
+/// `is_id_first_alphabetic` character, which is required; after it, further such characters,
+/// `_`, and anything `is_id_continue` accepts (the rule `get_identifier` scans with) pass.
+pub fn is_valid_identifier(name: impl Iterator<Item = char>) -> bool {
+    let mut first_alphabetic = false;
+
+    for ch in name {
+        match ch {
+            '_' => (),
+            _ if is_id_first_alphabetic(ch) => first_alphabetic = true,
+            _ if !first_alphabetic => return false,
+            _ if is_id_continue(ch) => (),
+            _ => return false,
+        }
+    }
+
+    first_alphabetic
+}
+
+/// First-character test with `unicode-xid-ident`: Unicode `XID_Start`.
+#[cfg(feature = "unicode-xid-ident")]
+fn is_id_first_alphabetic(x: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_start(x)
+}
+
+/// Continuation test with `unicode-xid-ident`: Unicode `XID_Continue`.
+#[cfg(feature = "unicode-xid-ident")]
+fn is_id_continue(x: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_continue(x)
+}
+
+/// First-character test without `unicode-xid-ident`: ASCII letters only.
+#[cfg(not(feature = "unicode-xid-ident"))]
+fn is_id_first_alphabetic(x: char) -> bool {
+    x.is_ascii_alphabetic()
+}
+
+/// Continuation test without `unicode-xid-ident`: ASCII letters, digits and `_`.
+#[cfg(not(feature = "unicode-xid-ident"))]
+fn is_id_continue(x: char) -> bool {
+    x.is_ascii_alphanumeric() || x == '_'
+}
+
 /// A type that implements the `InputStream` trait.
/// Multiple character streams are jointed together to form one single stream. pub struct MultiInputsStream<'a> { diff --git a/tests/tokens.rs b/tests/tokens.rs index 523beab7..843fc719 100644 --- a/tests/tokens.rs +++ b/tests/tokens.rs @@ -51,3 +51,29 @@ fn test_tokens_custom_operator() -> Result<(), Box> { Ok(()) } + +#[test] +fn test_tokens_unicode_xid_ident() -> Result<(), Box> { + let engine = Engine::new(); + let result = engine.eval::( + r" + fn すべての答え() { 42 } + すべての答え() + ", + ); + #[cfg(feature = "unicode-xid-ident")] + assert_eq!(result?, 42); + + #[cfg(not(feature = "unicode-xid-ident"))] + assert!(result.is_err()); + + let result = engine.eval::( + r" + fn _1() { 1 } + _1() + ", + ); + assert!(result.is_err()); + + Ok(()) +}