Simplify strings interner.

This commit is contained in:
Stephen Chung 2022-09-27 08:52:51 +08:00
parent 25476d1cea
commit a518ab62bb
2 changed files with 57 additions and 44 deletions

View File

@ -47,9 +47,6 @@ const NEVER_ENDS: &str = "`Token`";
/// Unroll `switch` ranges no larger than this. /// Unroll `switch` ranges no larger than this.
const SMALL_SWITCH_RANGE: INT = 16; const SMALL_SWITCH_RANGE: INT = 16;
/// Number of string interners used: two additional for property getters/setters if not `no_object`
const NUM_INTERNERS: usize = if cfg!(feature = "no_object") { 1 } else { 3 };
/// _(internals)_ A type that encapsulates the current state of the parser. /// _(internals)_ A type that encapsulates the current state of the parser.
/// Exported under the `internals` feature only. /// Exported under the `internals` feature only.
pub struct ParseState<'e> { pub struct ParseState<'e> {
@ -58,7 +55,7 @@ pub struct ParseState<'e> {
/// Controls whether parsing of an expression should stop given the next token. /// Controls whether parsing of an expression should stop given the next token.
pub expr_filter: fn(&Token) -> bool, pub expr_filter: fn(&Token) -> bool,
/// String interners. /// String interners.
interned_strings: [StringsInterner<'e>; NUM_INTERNERS], interned_strings: StringsInterner<'e>,
/// External [scope][Scope] with constants. /// External [scope][Scope] with constants.
pub scope: &'e Scope<'e>, pub scope: &'e Scope<'e>,
/// Global runtime state. /// Global runtime state.
@ -88,6 +85,8 @@ pub struct ParseState<'e> {
} }
impl fmt::Debug for ParseState<'_> { impl fmt::Debug for ParseState<'_> {
#[cold]
#[inline(never)]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut f = f.debug_struct("ParseState"); let mut f = f.debug_struct("ParseState");
@ -116,7 +115,7 @@ impl<'e> ParseState<'e> {
pub fn new( pub fn new(
engine: &Engine, engine: &Engine,
scope: &'e Scope, scope: &'e Scope,
interned_strings: [StringsInterner<'e>; NUM_INTERNERS], interned_strings: StringsInterner<'e>,
tokenizer_control: TokenizerControl, tokenizer_control: TokenizerControl,
) -> Self { ) -> Self {
Self { Self {
@ -254,7 +253,7 @@ impl<'e> ParseState<'e> {
&mut self, &mut self,
text: impl AsRef<str> + Into<ImmutableString>, text: impl AsRef<str> + Into<ImmutableString>,
) -> ImmutableString { ) -> ImmutableString {
self.interned_strings[0].get(text) self.interned_strings.get(text)
} }
/// Get an interned property getter, creating one if it is not yet interned. /// Get an interned property getter, creating one if it is not yet interned.
@ -265,8 +264,11 @@ impl<'e> ParseState<'e> {
&mut self, &mut self,
text: impl AsRef<str> + Into<ImmutableString>, text: impl AsRef<str> + Into<ImmutableString>,
) -> ImmutableString { ) -> ImmutableString {
self.interned_strings[1] self.interned_strings.get_with_mapper(
.get_with_mapper(|s| crate::engine::make_getter(s.as_ref()).into(), text) crate::engine::FN_GET,
|s| crate::engine::make_getter(s.as_ref()).into(),
text,
)
} }
/// Get an interned property setter, creating one if it is not yet interned. /// Get an interned property setter, creating one if it is not yet interned.
@ -277,8 +279,11 @@ impl<'e> ParseState<'e> {
&mut self, &mut self,
text: impl AsRef<str> + Into<ImmutableString>, text: impl AsRef<str> + Into<ImmutableString>,
) -> ImmutableString { ) -> ImmutableString {
self.interned_strings[2] self.interned_strings.get_with_mapper(
.get_with_mapper(|s| crate::engine::make_setter(s.as_ref()).into(), text) crate::engine::FN_SET,
|s| crate::engine::make_setter(s.as_ref()).into(),
text,
)
} }
} }

View File

@ -1,3 +1,4 @@
use super::BloomFilterU64;
use crate::func::{hashing::get_hasher, StraightHashMap}; use crate::func::{hashing::get_hasher, StraightHashMap};
use crate::ImmutableString; use crate::ImmutableString;
#[cfg(feature = "no_std")] #[cfg(feature = "no_std")]
@ -14,7 +15,7 @@ use std::{
}; };
/// Maximum number of strings interned. /// Maximum number of strings interned.
pub const MAX_INTERNED_STRINGS: usize = 256; pub const MAX_INTERNED_STRINGS: usize = 1024;
/// Maximum length of strings interned. /// Maximum length of strings interned.
pub const MAX_STRING_LEN: usize = 24; pub const MAX_STRING_LEN: usize = 24;
@ -28,8 +29,10 @@ pub struct StringsInterner<'a> {
pub capacity: usize, pub capacity: usize,
/// Maximum string length. /// Maximum string length.
pub max_string_len: usize, pub max_string_len: usize,
/// Normal strings. /// Cached strings.
strings: StraightHashMap<ImmutableString>, cache: StraightHashMap<ImmutableString>,
/// Bloom filter to avoid caching "one-hit wonders".
filter: BloomFilterU64,
/// Take care of the lifetime parameter. /// Take care of the lifetime parameter.
dummy: PhantomData<&'a ()>, dummy: PhantomData<&'a ()>,
} }
@ -42,9 +45,10 @@ impl Default for StringsInterner<'_> {
} }
impl fmt::Debug for StringsInterner<'_> { impl fmt::Debug for StringsInterner<'_> {
#[inline] #[cold]
#[inline(never)]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list().entries(self.strings.values()).finish() f.debug_list().entries(self.cache.values()).finish()
} }
} }
@ -56,7 +60,8 @@ impl StringsInterner<'_> {
Self { Self {
capacity: MAX_INTERNED_STRINGS, capacity: MAX_INTERNED_STRINGS,
max_string_len: MAX_STRING_LEN, max_string_len: MAX_STRING_LEN,
strings: StraightHashMap::default(), cache: StraightHashMap::default(),
filter: BloomFilterU64::new(),
dummy: PhantomData, dummy: PhantomData,
} }
} }
@ -65,7 +70,7 @@ impl StringsInterner<'_> {
#[inline(always)] #[inline(always)]
#[must_use] #[must_use]
pub fn get<S: AsRef<str> + Into<ImmutableString>>(&mut self, text: S) -> ImmutableString { pub fn get<S: AsRef<str> + Into<ImmutableString>>(&mut self, text: S) -> ImmutableString {
self.get_with_mapper(Into::into, text) self.get_with_mapper("", Into::into, text)
} }
/// Get an identifier from a text string, adding it to the interner if necessary. /// Get an identifier from a text string, adding it to the interner if necessary.
@ -73,20 +78,23 @@ impl StringsInterner<'_> {
#[must_use] #[must_use]
pub fn get_with_mapper<S: AsRef<str>>( pub fn get_with_mapper<S: AsRef<str>>(
&mut self, &mut self,
id: &str,
mapper: impl Fn(S) -> ImmutableString, mapper: impl Fn(S) -> ImmutableString,
text: S, text: S,
) -> ImmutableString { ) -> ImmutableString {
let key = text.as_ref(); let key = text.as_ref();
if key.len() > MAX_STRING_LEN { let hasher = &mut get_hasher();
id.hash(hasher);
key.hash(hasher);
let hash = hasher.finish();
// Cache long strings only on the second try to avoid caching "one-hit wonders".
if key.len() > MAX_STRING_LEN && self.filter.is_absent_and_set(hash) {
return mapper(text); return mapper(text);
} }
let hasher = &mut get_hasher(); let result = match self.cache.entry(hash) {
key.hash(hasher);
let key = hasher.finish();
let result = match self.strings.entry(key) {
Entry::Occupied(e) => return e.get().clone(), Entry::Occupied(e) => return e.get().clone(),
Entry::Vacant(e) => { Entry::Vacant(e) => {
let value = mapper(text); let value = mapper(text);
@ -100,7 +108,7 @@ impl StringsInterner<'_> {
}; };
// If the interner is over capacity, remove the longest entry that has the lowest count // If the interner is over capacity, remove the longest entry that has the lowest count
if self.strings.len() > self.capacity { if self.cache.len() > self.capacity {
// Leave some buffer to grow when shrinking the cache. // Leave some buffer to grow when shrinking the cache.
// We leave at least two entries, one for the empty string, and one for the string // We leave at least two entries, one for the empty string, and one for the string
// that has just been inserted. // that has just been inserted.
@ -110,21 +118,21 @@ impl StringsInterner<'_> {
self.capacity - 3 self.capacity - 3
}; };
while self.strings.len() > max { while self.cache.len() > max {
let (_, _, n) = let (_, _, n) = self
self.strings .cache
.iter() .iter()
.fold((0, usize::MAX, 0), |(x, c, n), (&k, v)| { .fold((0, usize::MAX, 0), |(x, c, n), (&k, v)| {
if k != key if k != hash
&& (v.strong_count() < c || (v.strong_count() == c && v.len() > x)) && (v.strong_count() < c || (v.strong_count() == c && v.len() > x))
{ {
(v.len(), v.strong_count(), k) (v.len(), v.strong_count(), k)
} else { } else {
(x, c, n) (x, c, n)
} }
}); });
self.strings.remove(&n); self.cache.remove(&n);
} }
} }
@ -136,7 +144,7 @@ impl StringsInterner<'_> {
#[must_use] #[must_use]
#[allow(dead_code)] #[allow(dead_code)]
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.strings.len() self.cache.len()
} }
/// Returns `true` if there are no interned strings. /// Returns `true` if there are no interned strings.
@ -144,28 +152,28 @@ impl StringsInterner<'_> {
#[must_use] #[must_use]
#[allow(dead_code)] #[allow(dead_code)]
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.strings.is_empty() self.cache.is_empty()
} }
/// Clear all interned strings. /// Clear all interned strings.
#[inline(always)] #[inline(always)]
#[allow(dead_code)] #[allow(dead_code)]
pub fn clear(&mut self) { pub fn clear(&mut self) {
self.strings.clear(); self.cache.clear();
} }
} }
impl AddAssign<Self> for StringsInterner<'_> { impl AddAssign<Self> for StringsInterner<'_> {
#[inline(always)] #[inline(always)]
fn add_assign(&mut self, rhs: Self) { fn add_assign(&mut self, rhs: Self) {
self.strings.extend(rhs.strings.into_iter()); self.cache.extend(rhs.cache.into_iter());
} }
} }
impl AddAssign<&Self> for StringsInterner<'_> { impl AddAssign<&Self> for StringsInterner<'_> {
#[inline(always)] #[inline(always)]
fn add_assign(&mut self, rhs: &Self) { fn add_assign(&mut self, rhs: &Self) {
self.strings self.cache
.extend(rhs.strings.iter().map(|(&k, v)| (k, v.clone()))); .extend(rhs.cache.iter().map(|(&k, v)| (k, v.clone())));
} }
} }