rhai/src/token.rs

1768 lines
53 KiB
Rust
Raw Normal View History

//! Main module defining the lexer and parser.
2020-07-16 06:09:31 +02:00
use crate::engine::{
2020-11-16 16:10:14 +01:00
KEYWORD_DEBUG, KEYWORD_EVAL, KEYWORD_FN_PTR, KEYWORD_FN_PTR_CALL, KEYWORD_FN_PTR_CURRY,
2020-10-03 10:25:58 +02:00
KEYWORD_IS_DEF_FN, KEYWORD_IS_DEF_VAR, KEYWORD_PRINT, KEYWORD_THIS, KEYWORD_TYPE_OF,
2020-07-16 06:09:31 +02:00
};
use crate::stdlib::{
borrow::Cow,
boxed::Box,
2020-10-25 14:57:18 +01:00
char, fmt, format,
iter::Peekable,
str::{Chars, FromStr},
string::{String, ToString},
};
2020-11-16 16:10:14 +01:00
use crate::{Engine, LexError, StaticVec, INT};
#[cfg(not(feature = "no_float"))]
use crate::FLOAT;
/// Shorthand alias for [`LexError`], used to keep error construction terse in this module.
type LERR = LexError;
2020-07-05 09:23:51 +02:00
/// A peekable stream of tokens: a [`Peekable`] wrapper around a [`TokenIterator`].
pub type TokenStream<'a, 't> = Peekable<TokenIterator<'a, 't>>;
2020-06-11 12:13:33 +02:00
/// A location (line number + character position) in the input script.
///
/// # Limitations
///
/// In order to keep footprint small, both line number and character position have 16-bit
/// resolution, meaning they go up to a maximum of 65,535 lines and 65,535 characters per line.
///
/// Advancing beyond the maximum line length or maximum number of lines is not an error but has
/// no effect.
#[derive(Eq, PartialEq, Ord, PartialOrd, Hash, Clone, Copy)]
pub struct Position {
    /// Line number - 0 = none
    line: u16,
    /// Character position - 0 = BOL
    pos: u16,
}

impl Position {
    /// A [`Position`] representing no position.
    pub const NONE: Self = Self { line: 0, pos: 0 };
    /// A [`Position`] representing the first position.
    pub const START: Self = Self { line: 1, pos: 0 };

    /// Create a new [`Position`].
    ///
    /// `line` must not be zero.
    /// If `position` is zero, then it is at the beginning of a line.
    ///
    /// # Panics
    ///
    /// Panics if `line` is zero.
    #[inline(always)]
    pub fn new(line: u16, position: u16) -> Self {
        assert!(line != 0, "line cannot be zero");

        Self {
            line,
            pos: position,
        }
    }

    /// Get the line number (1-based), or [`None`] if there is no position.
    #[inline(always)]
    pub fn line(self) -> Option<usize> {
        if self.is_none() {
            None
        } else {
            Some(self.line as usize)
        }
    }

    /// Get the character position (1-based), or [`None`] if at beginning of a line
    /// (or if there is no position at all).
    #[inline(always)]
    pub fn position(self) -> Option<usize> {
        if self.is_none() || self.pos == 0 {
            None
        } else {
            Some(self.pos as usize)
        }
    }

    /// Advance by one character position.
    ///
    /// # Panics
    ///
    /// Panics if this is [`Position::NONE`].
    #[inline(always)]
    pub(crate) fn advance(&mut self) {
        assert!(!self.is_none(), "cannot advance Position::none");

        // Advance up to maximum position; saturate silently beyond that
        if self.pos < u16::MAX {
            self.pos += 1;
        }
    }

    /// Go backwards by one character position.
    ///
    /// # Panics
    ///
    /// Panics if already at beginning of a line - cannot rewind to a previous line.
    /// Also panics if this is [`Position::NONE`].
    #[inline(always)]
    pub(crate) fn rewind(&mut self) {
        assert!(!self.is_none(), "cannot rewind Position::none");
        assert!(self.pos > 0, "cannot rewind at position 0");
        self.pos -= 1;
    }

    /// Advance to the next line.
    ///
    /// # Panics
    ///
    /// Panics if this is [`Position::NONE`].
    #[inline(always)]
    pub(crate) fn new_line(&mut self) {
        assert!(!self.is_none(), "cannot advance Position::none");

        // Advance up to maximum line number; saturate silently beyond that
        if self.line < u16::MAX {
            self.line += 1;
            self.pos = 0;
        }
    }

    /// Is this [`Position`] at the beginning of a line?
    ///
    /// Always `false` for [`Position::NONE`].
    #[inline(always)]
    pub fn is_beginning_of_line(self) -> bool {
        // The character position (not the line number) tells whether we sit at BOL.
        self.pos == 0 && !self.is_none()
    }

    /// Is there no [`Position`]?
    #[inline(always)]
    pub fn is_none(self) -> bool {
        self == Self::NONE
    }
}

impl Default for Position {
    /// The default position is [`Position::START`].
    #[inline(always)]
    fn default() -> Self {
        Self::START
    }
}

impl fmt::Display for Position {
    /// Formats as `line L, position P`, or `none` when there is no position.
    #[inline(always)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_none() {
            write!(f, "none")
        } else {
            write!(f, "line {}, position {}", self.line, self.pos)
        }
    }
}

impl fmt::Debug for Position {
    /// Formats compactly as `line:pos`.
    #[inline(always)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}:{}", self.line, self.pos)
    }
}
2020-11-20 09:52:28 +01:00
/// _(INTERNALS)_ A Rhai language token.
/// Exported under the `internals` feature only.
///
/// ## WARNING
///
/// This type is volatile and may change.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
/// An `INT` constant.
IntegerConstant(INT),
2020-07-28 13:11:37 +02:00
/// A `FLOAT` constant.
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_float` feature.
2020-04-17 14:08:41 +02:00
#[cfg(not(feature = "no_float"))]
FloatConstant(FLOAT),
/// An identifier.
Identifier(String),
/// A character constant.
CharConstant(char),
/// A string constant.
2020-07-05 09:23:51 +02:00
StringConstant(String),
/// `{`
LeftBrace,
/// `}`
RightBrace,
/// `(`
LeftParen,
/// `)`
RightParen,
/// `[`
LeftBracket,
/// `]`
RightBracket,
/// `+`
Plus,
/// `+` (unary)
UnaryPlus,
/// `-`
Minus,
/// `-` (unary)
UnaryMinus,
/// `*`
Multiply,
/// `/`
Divide,
/// `%`
Modulo,
/// `~`
PowerOf,
/// `<<`
LeftShift,
/// `>>`
RightShift,
/// `;`
SemiColon,
/// `:`
Colon,
/// `::`
2020-05-03 19:19:01 +02:00
DoubleColon,
2020-11-13 11:32:18 +01:00
/// `=>`
DoubleArrow,
/// `_`
Underscore,
/// `,`
Comma,
/// `.`
Period,
/// `#{`
MapStart,
/// `=`
Equals,
/// `true`
True,
/// `false`
False,
/// `let`
Let,
/// `const`
Const,
/// `if`
If,
/// `else`
Else,
2020-11-13 11:32:18 +01:00
/// `switch`
Switch,
2020-11-20 15:23:37 +01:00
/// `do`
Do,
/// `while`
While,
2020-11-20 15:23:37 +01:00
/// `until`
Until,
/// `loop`
Loop,
/// `for`
For,
/// `in`
In,
/// `<`
LessThan,
/// `>`
GreaterThan,
/// `<=`
LessThanEqualsTo,
/// `>=`
GreaterThanEqualsTo,
/// `==`
EqualsTo,
/// `!=`
NotEqualsTo,
/// `!`
Bang,
/// `|`
Pipe,
/// `||`
Or,
/// `^`
XOr,
/// `&`
Ampersand,
/// `&&`
And,
/// `fn`
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_function` feature.
#[cfg(not(feature = "no_function"))]
Fn,
/// `continue`
Continue,
/// `break`
Break,
/// `return`
Return,
/// `throw`
Throw,
2020-10-20 17:16:03 +02:00
/// `try`
Try,
/// `catch`
Catch,
/// `+=`
PlusAssign,
/// `-=`
MinusAssign,
/// `*=`
MultiplyAssign,
/// `/=`
DivideAssign,
/// `<<=`
LeftShiftAssign,
/// `>>=`
RightShiftAssign,
/// `&=`
AndAssign,
/// `|=`
OrAssign,
/// `^=`
XOrAssign,
/// `%=`
ModuloAssign,
/// `~=`
PowerOfAssign,
/// `private`
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_function` feature.
2020-06-02 07:33:16 +02:00
#[cfg(not(feature = "no_function"))]
Private,
/// `import`
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_module` feature.
2020-06-25 05:07:46 +02:00
#[cfg(not(feature = "no_module"))]
2020-05-04 11:43:54 +02:00
Import,
/// `export`
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_module` feature.
#[cfg(not(feature = "no_module"))]
2020-05-04 11:43:54 +02:00
Export,
/// `as`
///
2020-07-28 13:11:37 +02:00
/// Reserved under the `no_module` feature.
2020-06-25 05:07:46 +02:00
#[cfg(not(feature = "no_module"))]
2020-05-04 11:43:54 +02:00
As,
/// A lexer error.
2020-11-02 16:54:19 +01:00
LexError(LexError),
/// A comment block.
2020-06-26 13:44:50 +02:00
Comment(String),
/// A reserved symbol.
2020-07-08 06:09:18 +02:00
Reserved(String),
/// A custom keyword.
2020-07-05 11:41:45 +02:00
Custom(String),
/// End of the input stream.
EOF,
}
impl Token {
/// Get the syntax of the token.
///
/// Constants, identifiers, reserved symbols, custom keywords, comments and lexer errors are
/// rendered from their payload (string constants are rendered as the fixed text `string`);
/// every other token maps to its fixed source text.
pub fn syntax(&self) -> Cow<'static, str> {
    use Token::*;

    match self {
        // Tokens carrying data - must allocate
        IntegerConstant(i) => i.to_string().into(),
        #[cfg(not(feature = "no_float"))]
        FloatConstant(f) => f.to_string().into(),
        StringConstant(_) => "string".into(),
        CharConstant(c) => c.to_string().into(),
        Identifier(s) => s.clone().into(),
        Reserved(s) => s.clone().into(),
        Custom(s) => s.clone().into(),
        LexError(err) => err.to_string().into(),
        // A comment token is its own syntax; without this arm it would fall through to
        // the `unreachable!` below and panic.
        Comment(s) => s.clone().into(),

        // Fixed tokens - borrow a static string
        token => match token {
            LeftBrace => "{",
            RightBrace => "}",
            LeftParen => "(",
            RightParen => ")",
            LeftBracket => "[",
            RightBracket => "]",
            Plus => "+",
            UnaryPlus => "+",
            Minus => "-",
            UnaryMinus => "-",
            Multiply => "*",
            Divide => "/",
            SemiColon => ";",
            Colon => ":",
            DoubleColon => "::",
            DoubleArrow => "=>",
            Underscore => "_",
            Comma => ",",
            Period => ".",
            MapStart => "#{",
            Equals => "=",
            True => "true",
            False => "false",
            Let => "let",
            Const => "const",
            If => "if",
            Else => "else",
            Switch => "switch",
            Do => "do",
            While => "while",
            Until => "until",
            Loop => "loop",
            For => "for",
            In => "in",
            LessThan => "<",
            GreaterThan => ">",
            Bang => "!",
            LessThanEqualsTo => "<=",
            GreaterThanEqualsTo => ">=",
            EqualsTo => "==",
            NotEqualsTo => "!=",
            Pipe => "|",
            Or => "||",
            Ampersand => "&",
            And => "&&",
            Continue => "continue",
            Break => "break",
            Return => "return",
            Throw => "throw",
            Try => "try",
            Catch => "catch",
            PlusAssign => "+=",
            MinusAssign => "-=",
            MultiplyAssign => "*=",
            DivideAssign => "/=",
            LeftShiftAssign => "<<=",
            RightShiftAssign => ">>=",
            AndAssign => "&=",
            OrAssign => "|=",
            XOrAssign => "^=",
            LeftShift => "<<",
            RightShift => ">>",
            XOr => "^",
            Modulo => "%",
            ModuloAssign => "%=",
            PowerOf => "~",
            PowerOfAssign => "~=",
            #[cfg(not(feature = "no_function"))]
            Fn => "fn",
            #[cfg(not(feature = "no_function"))]
            Private => "private",
            #[cfg(not(feature = "no_module"))]
            Import => "import",
            #[cfg(not(feature = "no_module"))]
            Export => "export",
            #[cfg(not(feature = "no_module"))]
            As => "as",
            EOF => "{EOF}",
            // All data-carrying variants are handled by the outer match
            _ => unreachable!("operator should be match in outer scope"),
        }
        .into(),
    }
}
2020-07-09 13:54:28 +02:00
/// Reverse lookup a token from a piece of syntax.
pub fn lookup_from_syntax(syntax: &str) -> Option<Self> {
use Token::*;
Some(match syntax {
"{" => LeftBrace,
"}" => RightBrace,
"(" => LeftParen,
")" => RightParen,
"[" => LeftBracket,
"]" => RightBracket,
"+" => Plus,
"-" => Minus,
"*" => Multiply,
"/" => Divide,
";" => SemiColon,
":" => Colon,
"::" => DoubleColon,
2020-11-13 11:32:18 +01:00
"=>" => DoubleArrow,
"_" => Underscore,
2020-07-09 13:54:28 +02:00
"," => Comma,
"." => Period,
"#{" => MapStart,
"=" => Equals,
"true" => True,
"false" => False,
"let" => Let,
"const" => Const,
"if" => If,
"else" => Else,
2020-11-13 11:32:18 +01:00
"switch" => Switch,
2020-11-20 15:23:37 +01:00
"do" => Do,
2020-07-09 13:54:28 +02:00
"while" => While,
2020-11-20 15:23:37 +01:00
"until" => Until,
2020-07-09 13:54:28 +02:00
"loop" => Loop,
"for" => For,
"in" => In,
"<" => LessThan,
">" => GreaterThan,
"!" => Bang,
"<=" => LessThanEqualsTo,
">=" => GreaterThanEqualsTo,
"==" => EqualsTo,
"!=" => NotEqualsTo,
"|" => Pipe,
"||" => Or,
"&" => Ampersand,
"&&" => And,
"continue" => Continue,
"break" => Break,
"return" => Return,
"throw" => Throw,
2020-10-20 17:16:03 +02:00
"try" => Try,
"catch" => Catch,
2020-07-09 13:54:28 +02:00
"+=" => PlusAssign,
"-=" => MinusAssign,
"*=" => MultiplyAssign,
"/=" => DivideAssign,
"<<=" => LeftShiftAssign,
">>=" => RightShiftAssign,
"&=" => AndAssign,
"|=" => OrAssign,
"^=" => XOrAssign,
"<<" => LeftShift,
">>" => RightShift,
"^" => XOr,
"%" => Modulo,
"%=" => ModuloAssign,
"~" => PowerOf,
"~=" => PowerOfAssign,
#[cfg(not(feature = "no_function"))]
"fn" => Fn,
2020-07-09 13:54:28 +02:00
#[cfg(not(feature = "no_function"))]
"private" => Private,
2020-07-09 13:54:28 +02:00
#[cfg(not(feature = "no_module"))]
"import" => Import,
#[cfg(not(feature = "no_module"))]
"export" => Export,
#[cfg(not(feature = "no_module"))]
"as" => As,
#[cfg(feature = "no_function")]
"fn" | "private" => Reserved(syntax.into()),
#[cfg(feature = "no_module")]
"import" | "export" | "as" => Reserved(syntax.into()),
2020-11-13 11:32:18 +01:00
"===" | "!==" | "->" | "<-" | ":=" | "::<" | "(*" | "*)" | "#" | "public" | "new"
2020-11-20 15:23:37 +01:00
| "use" | "module" | "package" | "var" | "static" | "shared" | "with" | "each"
| "then" | "goto" | "unless" | "exit" | "match" | "case" | "default" | "void"
2020-11-13 11:32:18 +01:00
| "null" | "nil" | "spawn" | "thread" | "go" | "sync" | "async" | "await" | "yield" => {
Reserved(syntax.into())
}
2020-07-26 15:57:30 +02:00
2020-07-16 06:09:31 +02:00
KEYWORD_PRINT | KEYWORD_DEBUG | KEYWORD_TYPE_OF | KEYWORD_EVAL | KEYWORD_FN_PTR
2020-10-03 10:25:58 +02:00
| KEYWORD_FN_PTR_CALL | KEYWORD_FN_PTR_CURRY | KEYWORD_IS_DEF_VAR
| KEYWORD_IS_DEF_FN | KEYWORD_THIS => Reserved(syntax.into()),
#[cfg(not(feature = "no_closure"))]
2020-11-16 09:28:04 +01:00
crate::engine::KEYWORD_IS_SHARED => Reserved(syntax.into()),
2020-07-09 13:54:28 +02:00
_ => return None,
})
}
2020-11-20 09:52:28 +01:00
// Is this token [`EOF`][Token::EOF]?
2020-10-08 16:25:50 +02:00
#[inline(always)]
pub fn is_eof(&self) -> bool {
use Token::*;
match self {
EOF => true,
_ => false,
}
}
// If another operator is after these, it's probably an unary operator
2020-11-20 09:52:28 +01:00
// (not sure about `fn` name).
pub fn is_next_unary(&self) -> bool {
use Token::*;
match self {
LexError(_) |
2020-07-05 11:41:45 +02:00
LeftBrace | // {+expr} - is unary
// RightBrace | {expr} - expr not unary & is closing
2020-07-05 11:41:45 +02:00
LeftParen | // (-expr) - is unary
// RightParen | (expr) - expr not unary & is closing
LeftBracket | // [-expr] - is unary
// RightBracket | [expr] - expr not unary & is closing
Plus |
UnaryPlus |
Minus |
UnaryMinus |
Multiply |
Divide |
Comma |
Period |
Equals |
LessThan |
GreaterThan |
Bang |
LessThanEqualsTo |
GreaterThanEqualsTo |
EqualsTo |
NotEqualsTo |
Pipe |
Or |
Ampersand |
And |
If |
2020-11-20 15:23:37 +01:00
Do |
While |
2020-11-20 15:23:37 +01:00
Until |
PlusAssign |
MinusAssign |
MultiplyAssign |
DivideAssign |
LeftShiftAssign |
RightShiftAssign |
AndAssign |
OrAssign |
XOrAssign |
LeftShift |
RightShift |
XOr |
Modulo |
ModuloAssign |
Return |
Throw |
PowerOf |
In |
2020-07-05 11:41:45 +02:00
PowerOfAssign => true,
_ => false,
}
}
/// Get the precedence number of the token.
2020-10-25 14:57:18 +01:00
pub fn precedence(&self) -> u8 {
use Token::*;
match self {
// Assignments are not considered expressions - set to zero
Equals | PlusAssign | MinusAssign | MultiplyAssign | DivideAssign | LeftShiftAssign
| RightShiftAssign | AndAssign | OrAssign | XOrAssign | ModuloAssign
| PowerOfAssign => 0,
2020-07-05 11:41:45 +02:00
Or | XOr | Pipe => 30,
2020-07-05 11:41:45 +02:00
And | Ampersand => 60,
EqualsTo | NotEqualsTo => 90,
2020-10-13 10:01:42 +02:00
In => 110,
2020-10-13 11:16:19 +02:00
LessThan | LessThanEqualsTo | GreaterThan | GreaterThanEqualsTo => 130,
Plus | Minus => 150,
2020-10-13 09:49:09 +02:00
Divide | Multiply | Modulo => 180,
2020-10-13 03:33:16 +02:00
2020-10-13 09:49:09 +02:00
PowerOf => 190,
LeftShift | RightShift => 210,
2020-07-05 11:41:45 +02:00
Period => 240,
_ => 0,
}
}
/// Does this token bind to the right (instead of to the left)?
///
/// This is the case for every assignment operator and for `.` (property access).
pub fn is_bind_right(&self) -> bool {
    use Token::*;

    matches!(
        self,
        // Assignments bind to the right
        Equals
            | PlusAssign
            | MinusAssign
            | MultiplyAssign
            | DivideAssign
            | LeftShiftAssign
            | RightShiftAssign
            | AndAssign
            | OrAssign
            | XOrAssign
            | ModuloAssign
            | PowerOfAssign
            // Property access binds to the right
            | Period
    )
}
2020-07-05 09:23:51 +02:00
/// Is this token an operator?
///
/// Brackets, separators and other punctuation symbols are counted as operators here,
/// together with the arithmetic, logical, comparison and assignment operators.
pub fn is_operator(&self) -> bool {
    use Token::*;

    matches!(
        self,
        // Brackets
        LeftBrace | RightBrace | LeftParen | RightParen | LeftBracket | RightBracket
            // Arithmetic and bit-wise
            | Plus | UnaryPlus | Minus | UnaryMinus | Multiply | Divide | Modulo | PowerOf
            | LeftShift | RightShift
            // Punctuation
            | SemiColon | Colon | DoubleColon | Comma | Period | MapStart | Equals
            // Comparison
            | LessThan | GreaterThan | LessThanEqualsTo | GreaterThanEqualsTo | EqualsTo
            | NotEqualsTo
            // Logical
            | Bang | Pipe | Or | XOr | Ampersand | And
            // Compound assignment
            | PlusAssign | MinusAssign | MultiplyAssign | DivideAssign | LeftShiftAssign
            | RightShiftAssign | AndAssign | OrAssign | XOrAssign | ModuloAssign
            | PowerOfAssign
    )
}
2020-07-28 13:11:37 +02:00
/// Is this token an active standard keyword?
2020-07-05 09:23:51 +02:00
pub fn is_keyword(&self) -> bool {
use Token::*;
match self {
#[cfg(not(feature = "no_function"))]
Fn | Private => true,
#[cfg(not(feature = "no_module"))]
Import | Export | As => true,
2020-11-20 15:23:37 +01:00
True | False | Let | Const | If | Else | Do | While | Until | Loop | For | In
| Continue | Break | Return | Throw | Try | Catch => true,
2020-07-05 09:23:51 +02:00
_ => false,
}
}
/// Is this token a reserved symbol?
2020-10-08 16:25:50 +02:00
#[inline(always)]
pub fn is_reserved(&self) -> bool {
    // Only the `Reserved` variant qualifies; its payload is irrelevant.
    matches!(self, Self::Reserved(_))
}
2020-07-26 16:25:30 +02:00
/// Convert a token into a function name, if possible.
2020-08-05 16:53:01 +02:00
#[cfg(not(feature = "no_function"))]
2020-07-30 12:18:28 +02:00
pub(crate) fn into_function_name_for_override(self) -> Result<String, Self> {
2020-07-26 16:25:30 +02:00
match self {
2020-07-30 12:18:28 +02:00
Self::Reserved(s) if can_override_keyword(&s) => Ok(s),
2020-07-26 16:25:30 +02:00
Self::Custom(s) | Self::Identifier(s) if is_valid_identifier(s.chars()) => Ok(s),
_ => Err(self),
}
}
/// Is this token a custom keyword?
2020-10-08 16:25:50 +02:00
#[inline(always)]
pub fn is_custom(&self) -> bool {
    // Only the `Custom` variant qualifies; its payload is irrelevant.
    matches!(self, Self::Custom(_))
}
}
2020-05-04 13:36:58 +02:00
impl From<Token> for String {
    /// Convert a token into an owned string holding its syntax (see `Token::syntax`).
    #[inline(always)]
    fn from(token: Token) -> Self {
        token.syntax().into()
    }
}
2020-11-20 09:52:28 +01:00
/// _(INTERNALS)_ State of the tokenizer.
/// Exported under the `internals` feature only.
///
/// ## WARNING
///
/// This type is volatile and may change.
#[derive(Debug, Clone, Eq, PartialEq, Default)]
pub struct TokenizeState {
    /// Maximum length of a string (0 = unlimited).
    pub max_string_size: usize,
    /// If `true`, the next token cannot be a unary operator
    /// (i.e. a following `+` or `-` is read as a binary operator).
    pub non_unary: bool,
    /// Current nesting depth of block comments (0 = not inside a block comment).
    pub comment_level: usize,
    /// Return [`None`] at the end of the stream instead of [`Some(Token::EOF)`][Token::EOF]?
    pub end_with_none: bool,
    /// Include comments in the token stream?
    pub include_comments: bool,
}
2020-11-20 09:52:28 +01:00
/// _(INTERNALS)_ Trait that encapsulates a peekable character input stream.
/// Exported under the `internals` feature only.
///
/// ## WARNING
///
/// This trait is volatile and may change.
pub trait InputStream {
    /// Push a character back into the stream (un-get).
    ///
    /// NOTE(review): presumably the next `get_next`/`peek_next` yields this character
    /// again - confirm against the implementors.
    fn unread(&mut self, ch: char);
    /// Get the next character, consuming it.
    fn get_next(&mut self) -> Option<char>;
    /// Peek the next character without consuming it.
    fn peek_next(&mut self) -> Option<char>;
}
2020-11-20 09:52:28 +01:00
/// _(INTERNALS)_ Parse a string literal wrapped by `enclosing_char`.
/// Exported under the `internals` feature only.
///
/// ## WARNING
///
/// This type is volatile and may change.
2020-06-26 13:44:50 +02:00
pub fn parse_string_literal(
stream: &mut impl InputStream,
state: &mut TokenizeState,
pos: &mut Position,
enclosing_char: char,
) -> Result<String, (LexError, Position)> {
2020-07-29 10:10:06 +02:00
let mut result: StaticVec<char> = Default::default();
let mut escape: StaticVec<char> = Default::default();
2020-06-26 13:44:50 +02:00
let start = *pos;
2020-06-26 13:44:50 +02:00
loop {
let next_char = stream.get_next().ok_or((LERR::UnterminatedString, start))?;
2020-06-26 13:44:50 +02:00
pos.advance();
if state.max_string_size > 0 && result.len() > state.max_string_size {
return Err((LexError::StringTooLong(state.max_string_size), *pos));
}
2020-06-26 13:44:50 +02:00
match next_char {
// \...
'\\' if escape.is_empty() => {
escape.push('\\');
}
// \\
'\\' if !escape.is_empty() => {
escape.clear();
result.push('\\');
}
// \t
't' if !escape.is_empty() => {
escape.clear();
result.push('\t');
}
// \n
'n' if !escape.is_empty() => {
escape.clear();
result.push('\n');
}
// \r
'r' if !escape.is_empty() => {
escape.clear();
result.push('\r');
}
// \x??, \u????, \U????????
ch @ 'x' | ch @ 'u' | ch @ 'U' if !escape.is_empty() => {
let mut seq = escape.clone();
escape.clear();
2020-07-29 10:10:06 +02:00
seq.push(ch);
2020-06-26 13:44:50 +02:00
let mut out_val: u32 = 0;
let len = match ch {
'x' => 2,
'u' => 4,
'U' => 8,
_ => unreachable!(),
};
for _ in 0..len {
2020-07-29 10:10:06 +02:00
let c = stream.get_next().ok_or_else(|| {
(
LERR::MalformedEscapeSequence(seq.iter().cloned().collect()),
*pos,
)
})?;
2020-06-26 13:44:50 +02:00
seq.push(c);
pos.advance();
out_val *= 16;
2020-07-29 10:10:06 +02:00
out_val += c.to_digit(16).ok_or_else(|| {
(
LERR::MalformedEscapeSequence(seq.iter().cloned().collect()),
*pos,
)
})?;
2020-06-26 13:44:50 +02:00
}
2020-07-29 10:10:06 +02:00
result.push(char::from_u32(out_val).ok_or_else(|| {
(
LERR::MalformedEscapeSequence(seq.into_iter().collect()),
*pos,
)
})?);
2020-06-26 13:44:50 +02:00
}
// \{enclosing_char} - escaped
ch if enclosing_char == ch && !escape.is_empty() => {
escape.clear();
result.push(ch)
}
// Close wrapper
ch if enclosing_char == ch && escape.is_empty() => break,
2020-06-26 13:44:50 +02:00
// Unknown escape sequence
ch if !escape.is_empty() => {
escape.push(ch);
2020-07-29 10:10:06 +02:00
return Err((
LERR::MalformedEscapeSequence(escape.into_iter().collect()),
*pos,
));
2020-07-29 10:10:06 +02:00
}
2020-06-26 13:44:50 +02:00
// Cannot have new-lines inside string literals
'\n' => {
pos.rewind();
return Err((LERR::UnterminatedString, start));
2020-06-14 08:25:47 +02:00
}
2020-06-26 13:44:50 +02:00
// All other characters
ch => {
escape.clear();
result.push(ch);
}
}
}
2020-06-26 13:44:50 +02:00
let s = result.iter().collect::<String>();
2020-06-26 13:44:50 +02:00
if state.max_string_size > 0 && s.len() > state.max_string_size {
return Err((LexError::StringTooLong(state.max_string_size), *pos));
}
2020-06-26 13:44:50 +02:00
Ok(s)
}
2020-06-26 13:44:50 +02:00
/// Consume the next character.
2020-10-08 16:25:50 +02:00
#[inline(always)]
2020-07-09 13:54:28 +02:00
fn eat_next(stream: &mut impl InputStream, pos: &mut Position) -> Option<char> {
2020-06-26 13:44:50 +02:00
pos.advance();
2020-07-09 13:54:28 +02:00
stream.get_next()
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
/// Scan for a block comment until the end.
///
/// Consumes characters from `stream` until `state.comment_level` drops to zero or the stream
/// is exhausted. Every nested `/*` raises the level and every `*/` lowers it. When
/// `state.include_comments` is set, all consumed characters are appended to `comment`.
///
/// `state.comment_level` must be non-zero on entry.
fn scan_comment(
    stream: &mut impl InputStream,
    state: &mut TokenizeState,
    pos: &mut Position,
    comment: &mut String,
) {
    while let Some(c) = stream.get_next() {
        pos.advance();

        if state.include_comments {
            comment.push(c);
        }

        // Only consume the second character when it actually completes a `/*` or `*/`
        // marker. Consuming it unconditionally would swallow the first character of the
        // real marker in `**/` or `//*`, and would skip line counting for a swallowed
        // new-line.
        match c {
            '/' if stream.peek_next() == Some('*') => {
                stream.get_next();
                pos.advance();
                if state.include_comments {
                    comment.push('*');
                }
                state.comment_level += 1;
            }
            '*' if stream.peek_next() == Some('/') => {
                stream.get_next();
                pos.advance();
                if state.include_comments {
                    comment.push('/');
                }
                state.comment_level -= 1;
            }
            '\n' => pos.new_line(),
            _ => (),
        }

        if state.comment_level == 0 {
            break;
        }
    }
}
2020-06-14 08:25:47 +02:00
2020-11-20 09:52:28 +01:00
/// _(INTERNALS)_ Get the next token from the `stream`.
/// Exported under the `internals` feature only.
///
/// ## WARNING
///
/// This type is volatile and may change.
2020-10-08 16:25:50 +02:00
#[inline]
2020-06-26 16:33:27 +02:00
pub fn get_next_token(
2020-06-26 16:03:21 +02:00
stream: &mut impl InputStream,
state: &mut TokenizeState,
pos: &mut Position,
) -> Option<(Token, Position)> {
let result = get_next_token_inner(stream, state, pos);
// Save the last token's state
2020-07-05 09:23:51 +02:00
if let Some((ref token, _)) = result {
2020-06-26 16:03:21 +02:00
state.non_unary = !token.is_next_unary();
}
result
}
/// Test if the given character is a hex character.
2020-10-08 16:25:50 +02:00
#[inline(always)]
fn is_hex_char(c: char) -> bool {
    // ASCII digits plus `a`-`f` in either case
    c.is_ascii_hexdigit()
}
/// Test if the given character is an octal character.
2020-10-08 16:25:50 +02:00
#[inline(always)]
fn is_octal_char(c: char) -> bool {
    // Digits `0` through `7` only
    matches!(c, '0'..='7')
}
/// Test if the given character is a binary character.
2020-10-08 16:25:50 +02:00
#[inline(always)]
fn is_binary_char(c: char) -> bool {
    // Just the two binary digits
    c == '0' || c == '1'
}
2020-06-26 16:03:21 +02:00
/// Get the next token.
fn get_next_token_inner(
2020-06-26 13:44:50 +02:00
stream: &mut impl InputStream,
state: &mut TokenizeState,
pos: &mut Position,
) -> Option<(Token, Position)> {
// Still inside a comment?
if state.comment_level > 0 {
let start_pos = *pos;
2020-06-26 16:03:21 +02:00
let mut comment = String::new();
2020-06-26 13:44:50 +02:00
scan_comment(stream, state, pos, &mut comment);
if state.include_comments {
return Some((Token::Comment(comment), start_pos));
}
}
2020-06-26 13:44:50 +02:00
let mut negated = false;
2020-06-26 13:44:50 +02:00
while let Some(c) = stream.get_next() {
pos.advance();
2020-06-26 13:44:50 +02:00
let start_pos = *pos;
2020-06-26 13:44:50 +02:00
match (c, stream.peek_next().unwrap_or('\0')) {
// \n
('\n', _) => pos.new_line(),
2020-06-26 13:44:50 +02:00
// digit ...
('0'..='9', _) => {
2020-07-29 10:10:06 +02:00
let mut result: StaticVec<char> = Default::default();
2020-06-26 13:44:50 +02:00
let mut radix_base: Option<u32> = None;
result.push(c);
2020-06-26 13:44:50 +02:00
while let Some(next_char) = stream.peek_next() {
match next_char {
'0'..='9' | '_' => {
result.push(next_char);
eat_next(stream, pos);
}
#[cfg(not(feature = "no_float"))]
'.' => {
stream.get_next().unwrap();
// Check if followed by digits (or _)
match stream.peek_next().unwrap_or('\0') {
'0'..='9' | '_' => {
result.push(next_char);
pos.advance()
}
_ => {
// Not a floating-point number
stream.unread(next_char);
break;
}
}
2020-06-26 13:44:50 +02:00
while let Some(next_char_in_float) = stream.peek_next() {
match next_char_in_float {
'0'..='9' | '_' => {
result.push(next_char_in_float);
eat_next(stream, pos);
}
2020-06-26 13:44:50 +02:00
_ => break,
}
}
2020-06-26 13:44:50 +02:00
}
// 0x????, 0o????, 0b????
2020-07-08 06:09:18 +02:00
ch @ 'x' | ch @ 'X' | ch @ 'o' | ch @ 'O' | ch @ 'b' | ch @ 'B'
if c == '0' =>
{
2020-06-26 13:44:50 +02:00
result.push(next_char);
eat_next(stream, pos);
let valid = match ch {
'x' | 'X' => is_hex_char,
'o' | 'O' => is_octal_char,
'b' | 'B' => is_binary_char,
2020-06-26 13:44:50 +02:00
_ => unreachable!(),
};
radix_base = Some(match ch {
'x' | 'X' => 16,
'o' | 'O' => 8,
'b' | 'B' => 2,
_ => unreachable!(),
});
while let Some(next_char_in_escape_seq) = stream.peek_next() {
if !valid(next_char_in_escape_seq) {
2020-06-26 13:44:50 +02:00
break;
}
2020-06-26 13:44:50 +02:00
result.push(next_char_in_escape_seq);
eat_next(stream, pos);
}
}
2020-06-26 13:44:50 +02:00
_ => break,
}
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
if negated {
result.insert(0, '-');
}
2020-06-26 13:44:50 +02:00
// Parse number
if let Some(radix) = radix_base {
let out: String = result.iter().skip(2).filter(|&&c| c != '_').collect();
2020-06-26 13:44:50 +02:00
return Some((
INT::from_str_radix(&out, radix)
.map(Token::IntegerConstant)
.unwrap_or_else(|_| {
2020-11-02 16:54:19 +01:00
Token::LexError(LERR::MalformedNumber(result.into_iter().collect()))
}),
2020-06-26 13:44:50 +02:00
start_pos,
));
} else {
let out: String = result.iter().filter(|&&c| c != '_').collect();
let num = INT::from_str(&out).map(Token::IntegerConstant);
// If integer parsing is unnecessary, try float instead
#[cfg(not(feature = "no_float"))]
let num = num.or_else(|_| FLOAT::from_str(&out).map(Token::FloatConstant));
return Some((
num.unwrap_or_else(|_| {
2020-11-02 16:54:19 +01:00
Token::LexError(LERR::MalformedNumber(result.into_iter().collect()))
2020-06-26 13:44:50 +02:00
}),
start_pos,
));
}
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
// letter or underscore ...
('A'..='Z', _) | ('a'..='z', _) | ('_', _) => {
2020-07-28 22:26:57 +02:00
return get_identifier(stream, pos, start_pos, c);
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
// " - string literal
2020-07-08 06:09:18 +02:00
('"', _) => {
return parse_string_literal(stream, state, pos, '"').map_or_else(
2020-11-02 16:54:19 +01:00
|err| Some((Token::LexError(err.0), err.1)),
2020-07-08 06:09:18 +02:00
|out| Some((Token::StringConstant(out), start_pos)),
)
}
2020-06-26 13:44:50 +02:00
// ' - character literal
2020-07-08 06:09:18 +02:00
('\'', '\'') => {
return Some((
2020-11-02 16:54:19 +01:00
Token::LexError(LERR::MalformedChar("".to_string())),
2020-07-08 06:09:18 +02:00
start_pos,
))
}
('\'', _) => {
return Some(parse_string_literal(stream, state, pos, '\'').map_or_else(
2020-11-02 16:54:19 +01:00
|err| (Token::LexError(err.0), err.1),
2020-07-08 06:09:18 +02:00
|result| {
let mut chars = result.chars();
let first = chars.next().unwrap();
2020-07-08 06:09:18 +02:00
if chars.next().is_some() {
2020-11-02 16:54:19 +01:00
(Token::LexError(LERR::MalformedChar(result)), start_pos)
2020-07-08 06:09:18 +02:00
} else {
(Token::CharConstant(first), start_pos)
2020-07-08 06:09:18 +02:00
}
},
))
}
2020-06-26 13:44:50 +02:00
// Braces
('{', _) => return Some((Token::LeftBrace, start_pos)),
('}', _) => return Some((Token::RightBrace, start_pos)),
// Parentheses
('(', '*') => {
eat_next(stream, pos);
return Some((Token::Reserved("(*".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
('(', _) => return Some((Token::LeftParen, start_pos)),
(')', _) => return Some((Token::RightParen, start_pos)),
// Indexing
('[', _) => return Some((Token::LeftBracket, start_pos)),
(']', _) => return Some((Token::RightBracket, start_pos)),
// Map literal
#[cfg(not(feature = "no_object"))]
('#', '{') => {
eat_next(stream, pos);
return Some((Token::MapStart, start_pos));
}
2020-07-09 13:54:28 +02:00
('#', _) => return Some((Token::Reserved("#".into()), start_pos)),
2020-06-26 13:44:50 +02:00
// Operators
('+', '=') => {
eat_next(stream, pos);
return Some((Token::PlusAssign, start_pos));
}
2020-10-10 16:14:10 +02:00
('+', '+') => {
eat_next(stream, pos);
return Some((Token::Reserved("++".into()), start_pos));
}
2020-06-26 16:03:21 +02:00
('+', _) if !state.non_unary => return Some((Token::UnaryPlus, start_pos)),
2020-06-26 13:44:50 +02:00
('+', _) => return Some((Token::Plus, start_pos)),
2020-06-26 16:03:21 +02:00
('-', '0'..='9') if !state.non_unary => negated = true,
2020-06-26 13:44:50 +02:00
('-', '0'..='9') => return Some((Token::Minus, start_pos)),
('-', '=') => {
eat_next(stream, pos);
return Some((Token::MinusAssign, start_pos));
}
('-', '>') => {
eat_next(stream, pos);
return Some((Token::Reserved("->".into()), start_pos));
}
2020-10-10 16:14:10 +02:00
('-', '-') => {
eat_next(stream, pos);
return Some((Token::Reserved("--".into()), start_pos));
}
2020-06-26 16:03:21 +02:00
('-', _) if !state.non_unary => return Some((Token::UnaryMinus, start_pos)),
2020-06-26 13:44:50 +02:00
('-', _) => return Some((Token::Minus, start_pos)),
2020-10-13 09:51:59 +02:00
('*', '*') => {
eat_next(stream, pos);
return Some((Token::Reserved("**".into()), start_pos));
}
('*', ')') => {
eat_next(stream, pos);
return Some((Token::Reserved("*)".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
('*', '=') => {
eat_next(stream, pos);
return Some((Token::MultiplyAssign, start_pos));
}
('*', _) => return Some((Token::Multiply, start_pos)),
// Comments
('/', '/') => {
eat_next(stream, pos);
let mut comment = if state.include_comments {
"//".to_string()
} else {
2020-06-26 16:03:21 +02:00
String::new()
2020-06-26 13:44:50 +02:00
};
while let Some(c) = stream.get_next() {
if c == '\n' {
pos.new_line();
break;
}
2020-06-26 13:44:50 +02:00
if state.include_comments {
comment.push(c);
}
pos.advance();
}
2020-06-26 13:44:50 +02:00
if state.include_comments {
return Some((Token::Comment(comment), start_pos));
}
2020-06-26 13:44:50 +02:00
}
('/', '*') => {
state.comment_level = 1;
2020-06-26 13:44:50 +02:00
eat_next(stream, pos);
2020-06-26 13:44:50 +02:00
let mut comment = if state.include_comments {
"/*".to_string()
} else {
2020-06-26 16:03:21 +02:00
String::new()
2020-06-26 13:44:50 +02:00
};
scan_comment(stream, state, pos, &mut comment);
2020-06-26 13:44:50 +02:00
if state.include_comments {
return Some((Token::Comment(comment), start_pos));
}
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
('/', '=') => {
eat_next(stream, pos);
return Some((Token::DivideAssign, start_pos));
}
('/', _) => return Some((Token::Divide, start_pos)),
2020-06-26 13:44:50 +02:00
(';', _) => return Some((Token::SemiColon, start_pos)),
(',', _) => return Some((Token::Comma, start_pos)),
2020-10-10 16:14:10 +02:00
('.', '.') => {
eat_next(stream, pos);
if stream.peek_next() == Some('.') {
eat_next(stream, pos);
return Some((Token::Reserved("...".into()), start_pos));
} else {
return Some((Token::Reserved("..".into()), start_pos));
}
}
2020-06-26 13:44:50 +02:00
('.', _) => return Some((Token::Period, start_pos)),
2020-06-26 13:44:50 +02:00
('=', '=') => {
eat_next(stream, pos);
2020-06-26 13:44:50 +02:00
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
2020-07-08 06:09:18 +02:00
return Some((Token::Reserved("===".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
return Some((Token::EqualsTo, start_pos));
}
('=', '>') => {
eat_next(stream, pos);
2020-11-13 11:32:18 +01:00
return Some((Token::DoubleArrow, start_pos));
}
2020-06-26 13:44:50 +02:00
('=', _) => return Some((Token::Equals, start_pos)),
(':', ':') => {
eat_next(stream, pos);
2020-07-08 06:09:18 +02:00
if stream.peek_next() == Some('<') {
eat_next(stream, pos);
2020-07-08 06:09:18 +02:00
return Some((Token::Reserved("::<".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
return Some((Token::DoubleColon, start_pos));
}
(':', '=') => {
eat_next(stream, pos);
return Some((Token::Reserved(":=".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
(':', _) => return Some((Token::Colon, start_pos)),
('<', '=') => {
eat_next(stream, pos);
return Some((Token::LessThanEqualsTo, start_pos));
}
('<', '-') => {
eat_next(stream, pos);
return Some((Token::Reserved("<-".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
('<', '<') => {
eat_next(stream, pos);
return Some((
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
Token::LeftShiftAssign
} else {
Token::LeftShift
},
start_pos,
));
}
('<', _) => return Some((Token::LessThan, start_pos)),
2020-04-22 11:36:51 +02:00
2020-06-26 13:44:50 +02:00
('>', '=') => {
eat_next(stream, pos);
return Some((Token::GreaterThanEqualsTo, start_pos));
}
('>', '>') => {
eat_next(stream, pos);
2020-06-26 13:44:50 +02:00
return Some((
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
Token::RightShiftAssign
} else {
Token::RightShift
},
start_pos,
));
}
('>', _) => return Some((Token::GreaterThan, start_pos)),
2020-05-03 19:19:01 +02:00
2020-06-26 13:44:50 +02:00
('!', '=') => {
eat_next(stream, pos);
2020-06-26 13:44:50 +02:00
if stream.peek_next() == Some('=') {
eat_next(stream, pos);
2020-07-08 06:09:18 +02:00
return Some((Token::Reserved("!==".into()), start_pos));
}
2020-06-26 13:44:50 +02:00
return Some((Token::NotEqualsTo, start_pos));
}
('!', _) => return Some((Token::Bang, start_pos)),
2020-06-26 13:44:50 +02:00
('|', '|') => {
eat_next(stream, pos);
return Some((Token::Or, start_pos));
}
('|', '=') => {
eat_next(stream, pos);
return Some((Token::OrAssign, start_pos));
}
('|', _) => return Some((Token::Pipe, start_pos)),
2020-04-22 11:36:51 +02:00
2020-06-26 13:44:50 +02:00
('&', '&') => {
eat_next(stream, pos);
return Some((Token::And, start_pos));
}
('&', '=') => {
eat_next(stream, pos);
return Some((Token::AndAssign, start_pos));
}
('&', _) => return Some((Token::Ampersand, start_pos)),
2020-06-26 13:44:50 +02:00
('^', '=') => {
eat_next(stream, pos);
return Some((Token::XOrAssign, start_pos));
}
('^', _) => return Some((Token::XOr, start_pos)),
2020-06-26 13:44:50 +02:00
('%', '=') => {
eat_next(stream, pos);
return Some((Token::ModuloAssign, start_pos));
}
('%', _) => return Some((Token::Modulo, start_pos)),
2020-06-26 13:44:50 +02:00
('~', '=') => {
eat_next(stream, pos);
return Some((Token::PowerOfAssign, start_pos));
}
('~', _) => return Some((Token::PowerOf, start_pos)),
2020-07-08 06:09:18 +02:00
('@', _) => return Some((Token::Reserved("@".into()), start_pos)),
2020-10-05 17:02:50 +02:00
('$', _) => return Some((Token::Reserved("$".into()), start_pos)),
2020-07-08 06:09:18 +02:00
2020-06-26 13:44:50 +02:00
('\0', _) => unreachable!(),
2020-06-26 13:44:50 +02:00
(ch, _) if ch.is_whitespace() => (),
2020-07-29 01:25:37 +02:00
#[cfg(feature = "unicode-xid-ident")]
2020-07-29 00:03:21 +02:00
(ch, _) if unicode_xid::UnicodeXID::is_xid_start(ch) => {
return get_identifier(stream, pos, start_pos, c);
}
2020-07-08 06:09:18 +02:00
(ch, _) => {
return Some((
2020-11-02 16:54:19 +01:00
Token::LexError(LERR::UnexpectedInput(ch.to_string())),
2020-07-08 06:09:18 +02:00
start_pos,
))
}
2020-06-26 13:44:50 +02:00
}
}
2020-06-26 13:44:50 +02:00
pos.advance();
if state.end_with_none {
None
} else {
Some((Token::EOF, *pos))
}
}
2020-07-28 22:26:57 +02:00
/// Get the next identifier.
fn get_identifier(
stream: &mut impl InputStream,
pos: &mut Position,
start_pos: Position,
first_char: char,
) -> Option<(Token, Position)> {
2020-07-29 10:10:06 +02:00
let mut result: StaticVec<_> = Default::default();
2020-07-28 22:26:57 +02:00
result.push(first_char);
while let Some(next_char) = stream.peek_next() {
match next_char {
2020-07-28 23:24:41 +02:00
x if is_id_continue(x) => {
2020-07-28 22:26:57 +02:00
result.push(x);
eat_next(stream, pos);
}
_ => break,
}
}
let is_valid_identifier = is_valid_identifier(result.iter().cloned());
2020-11-13 11:32:18 +01:00
let identifier: String = result.into_iter().collect();
if let Some(token) = Token::lookup_from_syntax(&identifier) {
return Some((token, start_pos));
}
2020-07-28 22:26:57 +02:00
if !is_valid_identifier {
return Some((
2020-11-02 16:54:19 +01:00
Token::LexError(LERR::MalformedIdentifier(identifier)),
2020-07-28 22:26:57 +02:00
start_pos,
));
}
2020-11-13 11:32:18 +01:00
return Some((Token::Identifier(identifier), start_pos));
2020-07-28 22:26:57 +02:00
}
2020-07-28 23:24:41 +02:00
/// Is this keyword allowed as a function?
#[inline(always)]
pub fn is_keyword_function(name: &str) -> bool {
2020-07-31 12:43:34 +02:00
match name {
2020-08-03 06:10:20 +02:00
#[cfg(not(feature = "no_closure"))]
2020-11-16 09:28:04 +01:00
crate::engine::KEYWORD_IS_SHARED => true,
2020-07-31 12:43:34 +02:00
KEYWORD_PRINT | KEYWORD_DEBUG | KEYWORD_TYPE_OF | KEYWORD_EVAL | KEYWORD_FN_PTR
2020-10-03 10:25:58 +02:00
| KEYWORD_FN_PTR_CALL | KEYWORD_FN_PTR_CURRY | KEYWORD_IS_DEF_VAR | KEYWORD_IS_DEF_FN => {
true
}
2020-07-31 12:43:34 +02:00
_ => false,
2020-07-31 12:06:01 +02:00
}
}
2020-07-30 12:18:28 +02:00
/// Can this keyword be overridden as a function?
2020-08-05 16:53:01 +02:00
#[cfg(not(feature = "no_function"))]
2020-07-30 12:18:28 +02:00
#[inline(always)]
pub fn can_override_keyword(name: &str) -> bool {
2020-07-31 12:43:34 +02:00
match name {
2020-10-03 10:25:58 +02:00
KEYWORD_PRINT | KEYWORD_DEBUG | KEYWORD_TYPE_OF | KEYWORD_EVAL | KEYWORD_FN_PTR
| KEYWORD_IS_DEF_VAR | KEYWORD_IS_DEF_FN => true,
2020-07-31 12:43:34 +02:00
_ => false,
}
2020-07-30 12:18:28 +02:00
}
2020-11-20 09:52:28 +01:00
/// Is a text string a valid identifier?
2020-07-28 23:24:41 +02:00
pub fn is_valid_identifier(name: impl Iterator<Item = char>) -> bool {
let mut first_alphabetic = false;
for ch in name {
match ch {
'_' => (),
2020-07-28 23:54:23 +02:00
_ if is_id_first_alphabetic(ch) => first_alphabetic = true,
2020-07-28 23:24:41 +02:00
_ if !first_alphabetic => return false,
_ if char::is_ascii_alphanumeric(&ch) => (),
_ => return false,
}
}
first_alphabetic
}
2020-07-29 01:25:37 +02:00
/// Can this character start the alphabetic part of an identifier?
///
/// With the `unicode-xid-ident` feature enabled, this delegates to
/// `unicode_xid::UnicodeXID::is_xid_start`.
#[cfg(feature = "unicode-xid-ident")]
#[inline(always)]
fn is_id_first_alphabetic(x: char) -> bool {
    unicode_xid::UnicodeXID::is_xid_start(x)
}
2020-07-29 01:25:37 +02:00
/// Can this character continue an identifier?
///
/// With the `unicode-xid-ident` feature enabled, this delegates to
/// `unicode_xid::UnicodeXID::is_xid_continue`.
#[cfg(feature = "unicode-xid-ident")]
#[inline(always)]
fn is_id_continue(x: char) -> bool {
    unicode_xid::UnicodeXID::is_xid_continue(x)
}
2020-07-29 01:25:37 +02:00
/// Can this character start the alphabetic part of an identifier?
///
/// Without the `unicode-xid-ident` feature, only ASCII letters (`a`-`z`, `A`-`Z`) qualify.
#[cfg(not(feature = "unicode-xid-ident"))]
#[inline(always)]
fn is_id_first_alphabetic(x: char) -> bool {
    x.is_ascii_alphabetic()
}
2020-07-29 01:25:37 +02:00
/// Can this character continue an identifier?
///
/// Without the `unicode-xid-ident` feature, only ASCII letters, ASCII digits and `_` qualify.
#[cfg(not(feature = "unicode-xid-ident"))]
#[inline(always)]
fn is_id_continue(x: char) -> bool {
    x.is_ascii_alphanumeric() || x == '_'
}
2020-11-20 09:52:28 +01:00
/// A type that implements the [`InputStream`] trait.
2020-07-09 13:54:28 +02:00
/// Multiple character streams are jointed together to form one single stream.
2020-06-26 13:44:50 +02:00
pub struct MultiInputsStream<'a> {
/// Buffered character, if any.
buf: Option<char>,
2020-06-26 13:44:50 +02:00
/// The input character streams.
streams: StaticVec<Peekable<Chars<'a>>>,
2020-07-09 13:54:28 +02:00
/// The current stream index.
index: usize,
2020-06-26 13:44:50 +02:00
}
2020-06-26 13:44:50 +02:00
impl InputStream for MultiInputsStream<'_> {
    /// Buffer a character.
    ///
    /// The buffered character is handed out by the next `get_next`/`peek_next` call.
    /// Only one character is held: buffering again replaces any character already buffered.
    #[inline(always)]
    fn unread(&mut self, ch: char) {
        self.buf = Some(ch);
    }

    /// Get the next character.
    ///
    /// A buffered character, if any, is returned (and cleared) first. Otherwise characters
    /// are taken from the current stream, moving on to the following stream whenever the
    /// current one is exhausted. Returns `None` once all streams are exhausted.
    fn get_next(&mut self) -> Option<char> {
        if let Some(ch) = self.buf.take() {
            return Some(ch);
        }

        while self.index < self.streams.len() {
            if let Some(ch) = self.streams[self.index].next() {
                // Next character in current stream
                return Some(ch);
            }
            // Jump to the next stream
            self.index += 1;
        }

        // No more streams
        None
    }

    /// Peek the next character.
    ///
    /// Returns the same character the next `get_next` call would return, without consuming
    /// it. Exhausted streams are skipped over (advancing the current stream index) while
    /// looking for a character.
    fn peek_next(&mut self) -> Option<char> {
        if let Some(ch) = self.buf {
            return Some(ch);
        }

        while self.index < self.streams.len() {
            if let Some(&ch) = self.streams[self.index].peek() {
                // Next character in current stream
                return Some(ch);
            }
            // Jump to the next stream
            self.index += 1;
        }

        // No more streams
        None
    }
}
2020-11-20 09:52:28 +01:00
/// An iterator on a [`Token`] stream.
2020-07-05 11:41:45 +02:00
pub struct TokenIterator<'a, 'e> {
/// Reference to the scripting `Engine`.
engine: &'e Engine,
2020-06-26 13:44:50 +02:00
/// Current state.
state: TokenizeState,
/// Current position.
pos: Position,
/// Input character stream.
stream: MultiInputsStream<'a>,
2020-07-26 16:25:30 +02:00
/// A processor function (if any) that maps a token to another.
map: Option<Box<dyn Fn(Token) -> Token>>,
}
2020-07-05 09:23:51 +02:00
impl<'a> Iterator for TokenIterator<'a, '_> {
type Item = (Token, Position);
fn next(&mut self) -> Option<Self::Item> {
2020-10-25 14:57:18 +01:00
let token = match get_next_token(&mut self.stream, &mut self.state, &mut self.pos) {
// {EOF}
2020-10-25 14:57:18 +01:00
None => None,
// Reserved keyword/symbol
2020-10-25 14:57:18 +01:00
Some((Token::Reserved(s), pos)) => Some((match
(s.as_str(), self.engine.custom_keywords.contains_key(&s))
{
2020-11-02 16:54:19 +01:00
("===", false) => Token::LexError(LERR::ImproperSymbol(
"'===' is not a valid operator. This is not JavaScript! Should it be '=='?".to_string(),
2020-11-02 16:54:19 +01:00
)),
("!==", false) => Token::LexError(LERR::ImproperSymbol(
"'!==' is not a valid operator. This is not JavaScript! Should it be '!='?".to_string(),
2020-11-02 16:54:19 +01:00
)),
("->", false) => Token::LexError(LERR::ImproperSymbol(
"'->' is not a valid symbol. This is not C or C++!".to_string())),
("<-", false) => Token::LexError(LERR::ImproperSymbol(
2020-07-08 06:09:18 +02:00
"'<-' is not a valid symbol. This is not Go! Should it be '<='?".to_string(),
2020-11-02 16:54:19 +01:00
)),
(":=", false) => Token::LexError(LERR::ImproperSymbol(
"':=' is not a valid assignment operator. This is not Go! Should it be simply '='?".to_string(),
2020-11-02 16:54:19 +01:00
)),
("::<", false) => Token::LexError(LERR::ImproperSymbol(
"'::<>' is not a valid symbol. This is not Rust! Should it be '::'?".to_string(),
2020-11-02 16:54:19 +01:00
)),
("(*", false) | ("*)", false) => Token::LexError(LERR::ImproperSymbol(
"'(* .. *)' is not a valid comment format. This is not Pascal! Should it be '/* .. */'?".to_string(),
2020-11-02 16:54:19 +01:00
)),
("#", false) => Token::LexError(LERR::ImproperSymbol(
"'#' is not a valid symbol. Should it be '#{'?".to_string(),
2020-11-02 16:54:19 +01:00
)),
// Reserved keyword/operator that is custom.
(_, true) => Token::Custom(s),
// Reserved operator that is not custom.
2020-11-02 16:54:19 +01:00
(token, false) if !is_valid_identifier(token.chars()) => Token::LexError(LERR::ImproperSymbol(
format!("'{}' is a reserved symbol", token)
2020-11-02 16:54:19 +01:00
)),
// Reserved keyword that is not custom and disabled.
2020-11-02 16:54:19 +01:00
(token, false) if self.engine.disabled_symbols.contains(token) => Token::LexError(LERR::ImproperSymbol(
format!("reserved symbol '{}' is disabled", token)
2020-11-02 16:54:19 +01:00
)),
// Reserved keyword/operator that is not custom.
(_, false) => Token::Reserved(s),
2020-07-08 06:09:18 +02:00
}, pos)),
// Custom keyword
2020-10-25 14:57:18 +01:00
Some((Token::Identifier(s), pos)) if self.engine.custom_keywords.contains_key(&s) => {
Some((Token::Custom(s), pos))
}
// Custom standard keyword - must be disabled
2020-10-25 14:57:18 +01:00
Some((token, pos)) if token.is_keyword() && self.engine.custom_keywords.contains_key(token.syntax().as_ref()) => {
if self.engine.disabled_symbols.contains(token.syntax().as_ref()) {
// Disabled standard keyword
Some((Token::Custom(token.syntax().into()), pos))
} else {
// Active standard keyword - should never be a custom keyword!
unreachable!()
}
}
// Disabled operator
2020-10-25 14:57:18 +01:00
Some((token, pos)) if token.is_operator() && self.engine.disabled_symbols.contains(token.syntax().as_ref()) => {
2020-07-05 09:23:51 +02:00
Some((
2020-11-02 16:54:19 +01:00
Token::LexError(LexError::UnexpectedInput(token.syntax().into())),
2020-07-05 09:23:51 +02:00
pos,
))
}
// Disabled standard keyword
2020-10-25 14:57:18 +01:00
Some((token, pos)) if token.is_keyword() && self.engine.disabled_symbols.contains(token.syntax().as_ref()) => {
Some((Token::Reserved(token.syntax().into()), pos))
2020-07-05 09:23:51 +02:00
}
2020-10-25 14:57:18 +01:00
r => r,
2020-07-26 16:25:30 +02:00
};
match token {
None => None,
Some((token, pos)) => {
if let Some(ref map) = self.map {
Some((map(token), pos))
} else {
Some((token, pos))
}
}
2020-07-05 09:23:51 +02:00
}
}
}
impl Engine {
    /// Tokenize an input text stream.
    ///
    /// `input` supplies the string slices to tokenize; they are read one after another as a
    /// single character stream. `map`, if provided, is applied to every token the returned
    /// iterator yields.
    ///
    /// The iterator starts at line 1, position 0, with comments excluded. The maximum string
    /// size is taken from the engine's limits, or is zero when the `unchecked` feature is
    /// enabled.
    #[inline]
    pub fn lex<'a, 'e>(
        &'e self,
        input: impl IntoIterator<Item = &'a &'a str>,
        map: Option<Box<dyn Fn(Token) -> Token>>,
    ) -> TokenIterator<'a, 'e> {
        TokenIterator {
            engine: self,
            state: TokenizeState {
                #[cfg(not(feature = "unchecked"))]
                max_string_size: self.limits.max_string_size,
                #[cfg(feature = "unchecked")]
                max_string_size: 0,
                non_unary: false,
                comment_level: 0,
                end_with_none: false,
                include_comments: false,
            },
            pos: Position::new(1, 0),
            stream: MultiInputsStream {
                buf: None,
                streams: input.into_iter().map(|s| s.chars().peekable()).collect(),
                index: 0,
            },
            map,
        }
    }
}