From 9bf3a9d78fe1b54b19a000e68a7a1c2b26ae7b27 Mon Sep 17 00:00:00 2001
From: Stephen Chung
Date: Wed, 21 Dec 2022 13:54:54 +0800
Subject: [PATCH] Add Engine::compact_script.

---
 CHANGELOG.md                             |  8 ++-
 src/api/{type_names.rs => formatting.rs} | 34 +++++++++-
 src/api/mod.rs                           |  2 +-
 src/module/mod.rs                        |  2 +-
 src/serde/metadata.rs                    |  2 +-
 src/tokenizer.rs                         | 85 ++++++++++++++++++++++--
 6 files changed, 123 insertions(+), 10 deletions(-)
 rename src/api/{type_names.rs => formatting.rs} (86%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3fc67c28..2a9aa87d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,7 +37,7 @@ New features
 
 * A function pointer created via a closure definition now links to the particular anonymous function itself.
   * This avoids a potentially expensive function lookup when the function pointer is called, speeding up closures.
-* An additional benefit is that function pointers can now be `export`ed from modules!
+* Closures now also encapsulate their defining environment, so function pointers can now be freely `export`ed from modules!
 
 ### `!in`
 
@@ -49,6 +49,12 @@
 * The options are for future-proofing the API.
 * In this version, it gains the ability to set the value of the _custom state_ (accessible via `NativeCallContext::tag`) for a function evaluation, overriding `Engine::set_default_tag`.
 
+### Compact a script for compression
+
+* `Engine::compact_script` is added, which takes a valid script (it still returns parsing errors) and returns a _compacted_ version of the script with all insignificant whitespace and all comments removed.
+* A compacted script compresses better than one with liberal whitespace and comments.
+* Unlike some uglifiers or minifiers, `Engine::compact_script` does not optimize the script in any way, nor does it rename variables.
+
 Enhancements
 ------------
 
diff --git a/src/api/type_names.rs b/src/api/formatting.rs
similarity index 86%
rename from src/api/type_names.rs
rename to src/api/formatting.rs
index e6beb268..87f9bf0c 100644
--- a/src/api/type_names.rs
+++ b/src/api/formatting.rs
@@ -1,6 +1,9 @@
+//! Module that provides formatting services to the [`Engine`].
 use crate::packages::iter_basic::{BitRange, CharsStream, StepRange};
+use crate::parser::{ParseResult, ParseState};
 use crate::{
-    Engine, ExclusiveRange, FnPtr, ImmutableString, InclusiveRange, Position, RhaiError, ERR,
+    Engine, ExclusiveRange, FnPtr, ImmutableString, InclusiveRange, OptimizationLevel, Position,
+    RhaiError, Scope, SmartString, StringsInterner, ERR,
 };
 use std::any::type_name;
 #[cfg(feature = "no_std")]
@@ -263,4 +266,33 @@ impl Engine {
         let t = self.map_type_name(type_name::<T>()).into();
         ERR::ErrorMismatchDataType(t, typ.into(), pos).into()
     }
+
+    /// Compact a script to eliminate insignificant whitespace and comments.
+    ///
+    /// This is useful to prepare a script for further compression.
+    ///
+    /// The output script is semantically identical to the input script, only smaller in size.
+    ///
+    /// Unlike other uglifiers and minifiers, this method does not rename variables nor perform any
+    /// optimization on the input script.
+    #[inline]
+    pub fn compact_script(&self, script: impl AsRef<str>) -> ParseResult<String> {
+        let scripts = [script];
+        let (mut stream, tc) = self.lex_raw(&scripts, self.token_mapper.as_deref());
+        tc.borrow_mut().compressed = Some(String::new());
+        stream.state.last_token = Some(SmartString::new_const());
+        let scope = Scope::new();
+        let mut interner = StringsInterner::new();
+        let mut state = ParseState::new(&scope, &mut interner, tc);
+        let mut _ast = self.parse(
+            stream.peekable(),
+            &mut state,
+            #[cfg(not(feature = "no_optimize"))]
+            OptimizationLevel::None,
+            #[cfg(feature = "no_optimize")]
+            (),
+        )?;
+        let tc = state.tokenizer_control.borrow();
+        Ok(tc.compressed.as_ref().unwrap().into())
+    }
 }
diff --git a/src/api/mod.rs b/src/api/mod.rs
index 028cbe59..e59d7757 100644
--- a/src/api/mod.rs
+++ b/src/api/mod.rs
@@ -23,7 +23,7 @@ pub mod limits_unchecked;
 
 pub mod events;
 
-pub mod type_names;
+pub mod formatting;
 
 pub mod custom_syntax;
 
diff --git a/src/module/mod.rs b/src/module/mod.rs
index 26c3dcbb..082c4493 100644
--- a/src/module/mod.rs
+++ b/src/module/mod.rs
@@ -1,7 +1,7 @@
 //! Module defining external-loaded modules for Rhai.
 
 #[cfg(feature = "metadata")]
-use crate::api::type_names::format_type;
+use crate::api::formatting::format_type;
 use crate::ast::FnAccess;
 use crate::func::{
     shared_take_or_clone, CallableFunction, FnCallArgs, IteratorFn, RegisterNativeFunction,
diff --git a/src/serde/metadata.rs b/src/serde/metadata.rs
index 31610850..fa41fd6b 100644
--- a/src/serde/metadata.rs
+++ b/src/serde/metadata.rs
@@ -1,7 +1,7 @@
 //! Serialization of functions metadata.
 #![cfg(feature = "metadata")]
 
-use crate::api::type_names::format_type;
+use crate::api::formatting::format_type;
 use crate::module::{calc_native_fn_hash, FuncInfo, ModuleFlags};
 use crate::{calc_fn_hash, Engine, FnAccess, SmartString, StaticVec, AST};
 use serde::Serialize;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 1abec947..dd9a5b6f 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -26,6 +26,8 @@ pub struct TokenizerControlBlock {
     /// Global comments.
     #[cfg(feature = "metadata")]
     pub global_comments: String,
+    /// Whitespace-compressed version of the script (if any).
+    pub compressed: Option<String>,
 }
 
 impl TokenizerControlBlock {
@@ -37,6 +39,7 @@ impl TokenizerControlBlock {
             is_within_text: false,
             #[cfg(feature = "metadata")]
             global_comments: String::new(),
+            compressed: None,
         }
     }
 }
@@ -879,6 +882,8 @@ pub struct TokenizeState {
     pub include_comments: bool,
     /// Is the current tokenizer position within the text stream of an interpolated string?
     pub is_within_text_terminated_by: Option<char>,
+    /// Last token
+    pub last_token: Option<SmartString>,
 }
 
 /// _(internals)_ Trait that encapsulates a peekable character input stream.
@@ -956,6 +961,10 @@ pub fn parse_string_literal(
     let mut skip_whitespace_until = 0;
 
     state.is_within_text_terminated_by = Some(termination_char);
+    state.last_token.as_mut().map(|last| {
+        last.clear();
+        last.push(termination_char);
+    });
 
     loop {
         assert!(
@@ -985,6 +994,8 @@
             }
         };
 
+        state.last_token.as_mut().map(|last| last.push(next_char));
+
         // String interpolation?
         if allow_interpolation
             && next_char == '$'
@@ -1004,6 +1015,10 @@
             // Double wrapper
             if stream.peek_next().map_or(false, |c| c == termination_char) {
                 eat_next(stream, pos);
+                state
+                    .last_token
+                    .as_mut()
+                    .map(|last| last.push(termination_char));
             } else {
                 state.is_within_text_terminated_by = None;
                 break;
             }
@@ -1060,6 +1075,7 @@
                         .get_next()
                         .ok_or_else(|| (LERR::MalformedEscapeSequence(seq.to_string()), *pos))?;
 
+                    state.last_token.as_mut().map(|last| last.push(c));
                     seq.push(c);
                     pos.advance();
@@ -1240,6 +1256,8 @@ fn get_next_token_inner(
     state: &mut TokenizeState,
     pos: &mut Position,
 ) -> Option<(Token, Position)> {
+    state.last_token.as_mut().map(|last| last.clear());
+
     // Still inside a comment?
     if state.comment_level > 0 {
         let start_pos = *pos;
@@ -1398,6 +1416,8 @@
                 negated_pos
             });
 
+            state.last_token.as_mut().map(|last| *last = result.clone());
+
             // Parse number
             let token = radix_base.map_or_else(
                 || {
@@ -1452,14 +1472,14 @@
         #[cfg(not(feature = "unicode-xid-ident"))]
         ('a'..='z' | '_' | 'A'..='Z', ..) => {
             return Some(
-                parse_identifier_token(stream, pos, start_pos, c)
+                parse_identifier_token(stream, state, pos, start_pos, c)
                     .unwrap_or_else(|err| (Token::LexError(err.into()), start_pos)),
             );
         }
         #[cfg(feature = "unicode-xid-ident")]
         (ch, ..) if unicode_xid::UnicodeXID::is_xid_start(ch) || ch == '_' => {
             return Some(
-                parse_identifier_token(stream, pos, start_pos, c)
+                parse_identifier_token(stream, state, pos, start_pos, c)
                     .unwrap_or_else(|err| (Token::LexError(err.into()), start_pos)),
             );
         }
@@ -1942,18 +1962,24 @@
 /// Get the next token, parsing it as an identifier.
 fn parse_identifier_token(
     stream: &mut impl InputStream,
+    state: &mut TokenizeState,
     pos: &mut Position,
     start_pos: Position,
     first_char: char,
 ) -> Result<(Token, Position), LexError> {
     let mut identifier = SmartString::new_const();
     identifier.push(first_char);
+    state.last_token.as_mut().map(|last| {
+        last.clear();
+        last.push(first_char);
+    });
 
     while let Some(next_char) = stream.peek_next() {
         match next_char {
             x if is_id_continue(x) => {
-                identifier.push(x);
                 eat_next(stream, pos);
+                identifier.push(x);
+                state.last_token.as_mut().map(|last| last.push(x));
             }
             _ => break,
         }
@@ -2129,7 +2155,7 @@ impl<'a> Iterator for TokenIterator<'a> {
     type Item = (Token, Position);
 
     fn next(&mut self) -> Option<Self::Item> {
-        {
+        let (within_interpolated, compress_script) = {
             let control = &mut *self.state.tokenizer_control.borrow_mut();
 
             if control.is_within_text {
@@ -2138,7 +2164,12 @@
                 // Reset it
                 control.is_within_text = false;
             }
-        }
+
+            (
+                self.state.is_within_text_terminated_by.is_some(),
+                control.compressed.is_some(),
+            )
+        };
 
         let (token, pos) = match get_next_token(&mut self.stream, &mut self.state, &mut self.pos) {
             // {EOF}
@@ -2230,6 +2261,49 @@
             None => token,
         };
 
+        // Collect the compressed script, if needed
+        if compress_script {
+            let control = &mut *self.state.tokenizer_control.borrow_mut();
+
+            if let Some(ref mut compressed) = control.compressed {
+                if !matches!(token, Token::EOF) {
+                    use std::fmt::Write;
+
+                    let last_token = self.state.last_token.as_ref().unwrap();
+                    let mut buf = SmartString::new_const();
+
+                    if last_token.is_empty() {
+                        write!(buf, "{token}").unwrap();
+                    } else if within_interpolated
+                        && matches!(
+                            token,
+                            Token::StringConstant(..) | Token::InterpolatedString(..)
+ ) + { + compressed.push_str(&last_token[1..]); + } else { + buf = last_token.clone(); + } + + if !buf.is_empty() { + if !compressed.is_empty() { + let prev = compressed.chars().last().unwrap(); + let cur = buf.chars().next().unwrap(); + if (prev == '_' || is_id_first_alphabetic(prev) || is_id_continue(prev)) + && (cur == '_' + || is_id_first_alphabetic(cur) + || is_id_continue(cur)) + { + compressed.push(' '); + } + } + + compressed.push_str(&buf); + } + } + } + } + Some((token, pos)) } } @@ -2281,6 +2355,7 @@ impl Engine { comment_level: 0, include_comments: false, is_within_text_terminated_by: None, + last_token: None, }, pos: Position::new(1, 0), stream: MultiInputsStream {