From 56a8120d7589a2a8296d58a866892e4e2b3dca22 Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 05:26:57 +0900 Subject: [PATCH 1/8] add get_identifier --- src/token.rs | 66 +++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/src/token.rs b/src/token.rs index 775e78db..2bcb3c00 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1076,35 +1076,7 @@ fn get_next_token_inner( // letter or underscore ... ('A'..='Z', _) | ('a'..='z', _) | ('_', _) => { - let mut result = Vec::new(); - result.push(c); - - while let Some(next_char) = stream.peek_next() { - match next_char { - x if x.is_ascii_alphanumeric() || x == '_' => { - result.push(x); - eat_next(stream, pos); - } - _ => break, - } - } - - let is_valid_identifier = is_valid_identifier(result.iter().cloned()); - - let identifier: String = result.into_iter().collect(); - - if !is_valid_identifier { - return Some(( - Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), - start_pos, - )); - } - - return Some(( - Token::lookup_from_syntax(&identifier) - .unwrap_or_else(|| Token::Identifier(identifier)), - start_pos, - )); + return get_identifier(stream, pos, start_pos, c); } // " - string literal @@ -1400,6 +1372,42 @@ fn get_next_token_inner( } } +/// Get the next identifier. +fn get_identifier( + stream: &mut impl InputStream, + pos: &mut Position, + start_pos: Position, + first_char: char, +) -> Option<(Token, Position)> { + let mut result = Vec::new(); + result.push(first_char); + + while let Some(next_char) = stream.peek_next() { + match next_char { + x if x.is_ascii_alphanumeric() || x == '_' => { + result.push(x); + eat_next(stream, pos); + } + _ => break, + } + } + + let is_valid_identifier = is_valid_identifier(result.iter().cloned()); + + let identifier: String = result.into_iter().collect(); + + if !is_valid_identifier { + return Some(( + Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), + start_pos, + )); + } + + return Some(( + Token::lookup_from_syntax(&identifier).unwrap_or_else(|| Token::Identifier(identifier)), + start_pos, + )); +} /// A type that implements the `InputStream` trait. /// Multiple character streams are jointed together to form one single stream. pub struct MultiInputsStream<'a> { From c6d5bd000d8e35536c8a5e11b5f39e65b66e2f43 Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 06:24:41 +0900 Subject: [PATCH 2/8] move is_valid_identifier --- src/token.rs | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/token.rs b/src/token.rs index 2bcb3c00..ec357a71 100644 --- a/src/token.rs +++ b/src/token.rs @@ -714,22 +714,6 @@ pub trait InputStream { fn peek_next(&mut self) -> Option; } -pub fn is_valid_identifier(name: impl Iterator) -> bool { - let mut first_alphabetic = false; - - for ch in name { - match ch { - '_' => (), - _ if char::is_ascii_alphabetic(&ch) => first_alphabetic = true, - _ if !first_alphabetic => return false, - _ if char::is_ascii_alphanumeric(&ch) => (), - _ => return false, - } - } - - first_alphabetic -} - /// [INTERNALS] Parse a string literal wrapped by `enclosing_char`. /// Exported under the `internals` feature only. /// @@ -1384,7 +1368,7 @@ fn get_identifier( while let Some(next_char) = stream.peek_next() { match next_char { - x if x.is_ascii_alphanumeric() || x == '_' => { + x if is_id_continue(x) => { result.push(x); eat_next(stream, pos); } @@ -1408,6 +1392,31 @@ fn get_identifier( start_pos, )); } + +pub fn is_valid_identifier(name: impl Iterator) -> bool { + let mut first_alphabetic = false; + + for ch in name { + match ch { + '_' => (), + _ if is_first_alphabetic(ch) => first_alphabetic = true, + _ if !first_alphabetic => return false, + _ if char::is_ascii_alphanumeric(&ch) => (), + _ => return false, + } + } + + first_alphabetic +} + +fn is_first_alphabetic(x: char) -> bool { + x.is_ascii_alphabetic() +} + +fn is_id_continue(x: char) -> bool { + x.is_ascii_alphanumeric() || x == '_' +} + /// A type that implements the `InputStream` trait. /// Multiple character streams are jointed together to form one single stream. pub struct MultiInputsStream<'a> { From 79022b185809b615502ef165e61f8e86212ea845 Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 06:54:23 +0900 Subject: [PATCH 3/8] refactoring --- src/token.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/token.rs b/src/token.rs index ec357a71..d79c5f72 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1399,7 +1399,7 @@ pub fn is_valid_identifier(name: impl Iterator) -> bool { for ch in name { match ch { '_' => (), - _ if is_first_alphabetic(ch) => first_alphabetic = true, + _ if is_id_first_alphabetic(ch) => first_alphabetic = true, _ if !first_alphabetic => return false, _ if char::is_ascii_alphanumeric(&ch) => (), _ => return false, @@ -1409,7 +1409,7 @@ pub fn is_valid_identifier(name: impl Iterator) -> bool { first_alphabetic } -fn is_first_alphabetic(x: char) -> bool { +fn is_id_first_alphabetic(x: char) -> bool { x.is_ascii_alphabetic() } From 9b0375b870d5e1812d0bb384376e143039c49aff Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 07:03:21 +0900 Subject: [PATCH 4/8] add unicode-xid --- Cargo.toml | 1 + src/token.rs | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 2f2ab525..f0fdaa33 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ categories = [ "no-std", "embedded", "parser-implementations" ] [dependencies] num-traits = { version = "0.2.11", default-features = false } +unicode-xid = "0.2.1" [features] #default = ["unchecked", "sync", "no_optimize", "no_float", "only_i32", "no_index", "no_object", "no_function", "no_module"] diff --git a/src/token.rs b/src/token.rs index d79c5f72..6cb1a4d2 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1338,6 +1338,9 @@ fn get_next_token_inner( ('\0', _) => unreachable!(), (ch, _) if ch.is_whitespace() => (), + (ch, _) if unicode_xid::UnicodeXID::is_xid_start(ch) => { + return get_identifier(stream, pos, start_pos, c); + } (ch, _) => { return Some(( Token::LexError(Box::new(LERR::UnexpectedInput(ch.to_string()))), @@ -1409,6 +1412,15 @@ pub fn is_valid_identifier(name: impl Iterator) -> bool { first_alphabetic } +fn is_id_first_alphabetic(x: char) -> bool { + unicode_xid::UnicodeXID::is_xid_start(x) +} + +fn is_id_continue(x: char) -> bool { + unicode_xid::UnicodeXID::is_xid_continue(x) +} + +/* fn is_id_first_alphabetic(x: char) -> bool { x.is_ascii_alphabetic() } @@ -1416,6 +1428,7 @@ fn is_id_first_alphabetic(x: char) -> bool { fn is_id_continue(x: char) -> bool { x.is_ascii_alphanumeric() || x == '_' } +*/ /// A type that implements the `InputStream` trait. /// Multiple character streams are jointed together to form one single stream. From 288e0a4d14ce1f5511560e33e24f2a9aa6c9109d Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 08:25:37 +0900 Subject: [PATCH 5/8] add unicode_xid, test --- Cargo.toml | 7 ++++++- src/token.rs | 8 ++++++-- tests/tokens.rs | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f0fdaa33..66bdba28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,6 @@ categories = [ "no-std", "embedded", "parser-implementations" ] [dependencies] num-traits = { version = "0.2.11", default-features = false } -unicode-xid = "0.2.1" [features] #default = ["unchecked", "sync", "no_optimize", "no_float", "only_i32", "no_index", "no_object", "no_function", "no_module"] @@ -35,6 +34,7 @@ no_object = [] # no custom objects no_function = [] # no script-defined functions no_module = [] # no modules internals = [] # expose internal data structures +unicode-xid-ident = ["unicode-xid"] # allow unicode-xid for identifiers. # compiling for no-std no_std = [ "num-traits/libm", "hashbrown", "core-error", "libm", "ahash" ] @@ -74,6 +74,11 @@ default_features = false features = ["derive", "alloc"] optional = true +[dependencies.unicode-xid] +version = "0.2.1" +default_features = false +optional = true + [target.'cfg(target_arch = "wasm32")'.dependencies] instant= { version = "0.1.4", features = ["wasm-bindgen"] } # WASM implementation of std::time::Instant diff --git a/src/token.rs b/src/token.rs index 6cb1a4d2..d2f016cb 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1338,6 +1338,7 @@ fn get_next_token_inner( ('\0', _) => unreachable!(), (ch, _) if ch.is_whitespace() => (), + #[cfg(feature = "unicode-xid-ident")] (ch, _) if unicode_xid::UnicodeXID::is_xid_start(ch) => { return get_identifier(stream, pos, start_pos, c); } @@ -1412,23 +1413,26 @@ pub fn is_valid_identifier(name: impl Iterator) -> bool { first_alphabetic } +#[cfg(feature = "unicode-xid-ident")] fn is_id_first_alphabetic(x: char) -> bool { unicode_xid::UnicodeXID::is_xid_start(x) } +#[cfg(feature = "unicode-xid-ident")] fn is_id_continue(x: char) -> bool { unicode_xid::UnicodeXID::is_xid_continue(x) } -/* +#[cfg(not(feature = "unicode-xid-ident"))] + fn is_id_first_alphabetic(x: char) -> bool { x.is_ascii_alphabetic() } +#[cfg(not(feature = "unicode-xid-ident"))] fn is_id_continue(x: char) -> bool { x.is_ascii_alphanumeric() || x == '_' } -*/ /// A type that implements the `InputStream` trait. /// Multiple character streams are jointed together to form one single stream. diff --git a/tests/tokens.rs b/tests/tokens.rs index 523beab7..2523af7d 100644 --- a/tests/tokens.rs +++ b/tests/tokens.rs @@ -51,3 +51,21 @@ fn test_tokens_custom_operator() -> Result<(), Box> { Ok(()) } + +#[test] +fn test_tokens_unicode_xid_ident() -> Result<(), Box> { + let engine = Engine::new(); + let result = engine.eval::( + r" + fn すべての答え() { 42 } + すべての答え() + ", + ); + #[cfg(feature = "unicode-xid-ident")] + assert_eq!(result?, 42); + + #[cfg(not(feature = "unicode-xid-ident"))] + assert!(result.is_err()); + + Ok(()) +} From aff7550f7dfd859e09fef3282b5030c056c26e01 Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 08:38:38 +0900 Subject: [PATCH 6/8] document update --- .github/workflows/build.yml | 1 + doc/src/links.md | 1 + doc/src/start/features.md | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 49318e1c..1370e03e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,6 +29,7 @@ jobs: - "--features no_object" - "--features no_function" - "--features no_module" + - "--features unicode-xid-ident" toolchain: [stable] experimental: [false] include: diff --git a/doc/src/links.md b/doc/src/links.md index 889cee29..936c714d 100644 --- a/doc/src/links.md +++ b/doc/src/links.md @@ -12,6 +12,7 @@ [`no_std`]: {{rootUrl}}/start/features.md [`no-std`]: {{rootUrl}}/start/features.md [`internals`]: {{rootUrl}}/start/features.md +[`unicode-xid-ident`]: {{rootUrl}}/start/features.md [minimal builds]: {{rootUrl}}/start/builds/minimal.md [WASM]: {{rootUrl}}/start/builds/wasm.md diff --git a/doc/src/start/features.md b/doc/src/start/features.md index f2286a38..9f9f3808 100644 --- a/doc/src/start/features.md +++ b/doc/src/start/features.md @@ -26,6 +26,7 @@ more control over what a script can (or cannot) do. | `no_std` | Build for `no-std`. Notice that additional dependencies will be pulled in to replace `std` features. | | `serde` | Enable serialization/deserialization via [`serde`]. Notice that the [`serde`](https://crates.io/crates/serde) crate will be pulled in together with its dependencies. | | `internals` | Expose internal data structures (e.g. [`AST`] nodes). Beware that Rhai internals are volatile and may change from version to version. | +| `unicode-xid-ident` | Allow unicode-xid for identifiers. | Example From a836eb7f8b7fe8c77b894a9c0943e6915ac38b4e Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 10:20:25 +0900 Subject: [PATCH 7/8] add RELEASES.md --- RELEASES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASES.md b/RELEASES.md index 22aae65d..364e9b10 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -19,6 +19,7 @@ New features * Custom syntax now works even without the `internals` feature. * Currying of function pointers is supported via the `curry` keyword. * `Module::set_indexer_get_set_fn` is added as a shorthand of both `Module::set_indexer_get_fn` and `Module::set_indexer_set_fn`. +* New `unicode-xid-ident` feature to allow unicode-xid for identifiers. Breaking changes ---------------- From 7d333fdc4ed8bcdc5fb237ba6b4e7b3e9f55c47d Mon Sep 17 00:00:00 2001 From: ekicyou Date: Wed, 29 Jul 2020 10:41:44 +0900 Subject: [PATCH 8/8] add test --- tests/tokens.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tokens.rs b/tests/tokens.rs index 2523af7d..843fc719 100644 --- a/tests/tokens.rs +++ b/tests/tokens.rs @@ -67,5 +67,13 @@ fn test_tokens_unicode_xid_ident() -> Result<(), Box> { #[cfg(not(feature = "unicode-xid-ident"))] assert!(result.is_err()); + let result = engine.eval::( + r" + fn _1() { 1 } + _1() + ", + ); + assert!(result.is_err()); + Ok(()) }