diff --git a/crates/typst-eval/src/markup.rs b/crates/typst-eval/src/markup.rs index 25ea5751a..3a5ebe1fc 100644 --- a/crates/typst-eval/src/markup.rs +++ b/crates/typst-eval/src/markup.rs @@ -11,6 +11,7 @@ use typst_library::text::{ LinebreakElem, RawContent, RawElem, SmartQuoteElem, SpaceElem, TextElem, }; use typst_syntax::ast::{self, AstNode}; +use typst_utils::PicoStr; use crate::{Eval, Vm}; @@ -204,7 +205,7 @@ impl Eval for ast::Label<'_> { type Output = Value; fn eval(self, _: &mut Vm) -> SourceResult { - Ok(Value::Label(Label::new(self.get()))) + Ok(Value::Label(Label::new(PicoStr::intern(self.get())))) } } @@ -212,7 +213,7 @@ impl Eval for ast::Ref<'_> { type Output = Content; fn eval(self, vm: &mut Vm) -> SourceResult { - let target = Label::new(self.target()); + let target = Label::new(PicoStr::intern(self.target())); let mut elem = RefElem::new(target); if let Some(supplement) = self.supplement() { elem.push_supplement(Smart::Custom(Some(Supplement::Content( diff --git a/crates/typst-ide/src/analyze.rs b/crates/typst-ide/src/analyze.rs index 5e3dfd700..eaf7248b7 100644 --- a/crates/typst-ide/src/analyze.rs +++ b/crates/typst-ide/src/analyze.rs @@ -88,9 +88,7 @@ pub fn analyze_labels(document: &Document) -> (Vec<(Label, Option)>, let split = output.len(); // Bibliography keys. 
- for (key, detail) in BibliographyElem::keys(document.introspector.track()) { - output.push((Label::new(key.as_str()), detail)); - } + output.extend(BibliographyElem::keys(document.introspector.track())); (output, split) } diff --git a/crates/typst-ide/src/complete.rs b/crates/typst-ide/src/complete.rs index a2791e071..510db54ce 100644 --- a/crates/typst-ide/src/complete.rs +++ b/crates/typst-ide/src/complete.rs @@ -1254,11 +1254,11 @@ impl<'a> CompletionContext<'a> { eco_format!( "{}{}{}", if open { "<" } else { "" }, - label.as_str(), + label.resolve(), if close { ">" } else { "" } ) }), - label: label.as_str().into(), + label: label.resolve().as_str().into(), detail, }); } diff --git a/crates/typst-ide/src/definition.rs b/crates/typst-ide/src/definition.rs index 94def1c18..9303aee43 100644 --- a/crates/typst-ide/src/definition.rs +++ b/crates/typst-ide/src/definition.rs @@ -1,6 +1,7 @@ use typst::foundations::{Label, Selector, Value}; use typst::model::Document; use typst::syntax::{ast, LinkedNode, Side, Source, Span}; +use typst::utils::PicoStr; use crate::utils::globals; use crate::{ @@ -71,7 +72,7 @@ pub fn definition( // Try to jump to the referenced content. 
DerefTarget::Ref(node) => { - let label = Label::new(node.cast::()?.target()); + let label = Label::new(PicoStr::intern(node.cast::()?.target())); let selector = Selector::Label(label); let elem = document?.introspector.query_first(&selector)?; return Some(Definition::Span(elem.span())); diff --git a/crates/typst-ide/src/tooltip.rs b/crates/typst-ide/src/tooltip.rs index d62826522..30aca24fb 100644 --- a/crates/typst-ide/src/tooltip.rs +++ b/crates/typst-ide/src/tooltip.rs @@ -181,7 +181,7 @@ fn label_tooltip(document: &Document, leaf: &LinkedNode) -> Option { }; for (label, detail) in analyze_labels(document).0 { - if label.as_str() == target { + if label.resolve().as_str() == target { return Some(Tooltip::Text(detail?)); } } diff --git a/crates/typst-library/src/foundations/label.rs b/crates/typst-library/src/foundations/label.rs index 726958df7..2f5520b1c 100644 --- a/crates/typst-library/src/foundations/label.rs +++ b/crates/typst-library/src/foundations/label.rs @@ -1,7 +1,7 @@ use ecow::{eco_format, EcoString}; -use typst_utils::PicoStr; +use typst_utils::{PicoStr, ResolvedPicoStr}; -use crate::foundations::{func, scope, ty, Repr}; +use crate::foundations::{func, scope, ty, Repr, Str}; /// A label for an element. /// @@ -45,17 +45,17 @@ use crate::foundations::{func, scope, ty, Repr}; /// Currently, labels can only be attached to elements in markup mode, not in /// code mode. This might change in the future. #[ty(scope, cast)] -#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] pub struct Label(PicoStr); impl Label { - /// Creates a label from a string, interning it. - pub fn new(name: impl Into) -> Self { - Self(name.into()) + /// Creates a label from an interned string. + pub fn new(name: PicoStr) -> Self { + Self(name) } /// Resolves the label to a string. 
- pub fn as_str(&self) -> &'static str { + pub fn resolve(self) -> ResolvedPicoStr { self.0.resolve() } @@ -71,15 +71,15 @@ impl Label { #[func(constructor)] pub fn construct( /// The name of the label. - name: PicoStr, + name: Str, ) -> Label { - Self(name) + Self(PicoStr::intern(name.as_str())) } } impl Repr for Label { fn repr(&self) -> EcoString { - eco_format!("<{}>", self.as_str()) + eco_format!("<{}>", self.resolve()) } } diff --git a/crates/typst-library/src/foundations/str.rs b/crates/typst-library/src/foundations/str.rs index 1431e9f98..72fdcc53a 100644 --- a/crates/typst-library/src/foundations/str.rs +++ b/crates/typst-library/src/foundations/str.rs @@ -7,7 +7,6 @@ use comemo::Tracked; use ecow::EcoString; use serde::{Deserialize, Serialize}; use typst_syntax::{Span, Spanned}; -use typst_utils::PicoStr; use unicode_segmentation::UnicodeSegmentation; use crate::diag::{bail, At, SourceResult, StrResult}; @@ -753,12 +752,6 @@ cast! { v: Str => v.into(), } -cast! { - PicoStr, - self => Value::Str(self.resolve().into()), - v: Str => v.as_str().into(), -} - cast! { String, self => Value::Str(self.into()), @@ -784,7 +777,7 @@ cast! { .map_err(|_| "bytes are not valid utf-8")? .into() ), - v: Label => Self::Str(v.as_str().into()), + v: Label => Self::Str(v.resolve().as_str().into()), v: Type => Self::Str(v.long_name().into()), v: Str => Self::Str(v), } diff --git a/crates/typst-library/src/model/bibliography.rs b/crates/typst-library/src/model/bibliography.rs index 569167311..280ac4a42 100644 --- a/crates/typst-library/src/model/bibliography.rs +++ b/crates/typst-library/src/model/bibliography.rs @@ -179,16 +179,13 @@ impl BibliographyElem { } /// Find all bibliography keys. 
- pub fn keys( - introspector: Tracked, - ) -> Vec<(EcoString, Option)> { + pub fn keys(introspector: Tracked) -> Vec<(Label, Option)> { let mut vec = vec![]; for elem in introspector.query(&Self::elem().select()).iter() { let this = elem.to_packed::().unwrap(); - for entry in this.bibliography().entries() { - let key = entry.key().into(); + for (key, entry) in this.bibliography().iter() { let detail = entry.title().map(|title| title.value.to_str().into()); - vec.push((key, detail)) + vec.push((Label::new(key), detail)) } } vec @@ -341,7 +338,7 @@ impl Bibliography { }; for entry in library { - match map.entry(entry.key().into()) { + match map.entry(PicoStr::intern(entry.key())) { indexmap::map::Entry::Vacant(vacant) => { vacant.insert(entry); } @@ -366,8 +363,8 @@ impl Bibliography { self.map.contains_key(&key.into()) } - fn entries(&self) -> impl Iterator { - self.map.values() + fn iter(&self) -> impl Iterator { + self.map.iter().map(|(&k, v)| (k, v)) } } @@ -661,7 +658,7 @@ impl<'a> Generator<'a> { errors.push(error!( child.span(), "key `{}` does not exist in the bibliography", - key.as_str() + key.resolve() )); continue; }; @@ -775,7 +772,9 @@ impl<'a> Generator<'a> { let mut output = std::mem::take(&mut self.failures); for (info, citation) in self.infos.iter().zip(&rendered.citations) { let supplement = |i: usize| info.subinfos.get(i)?.supplement.clone(); - let link = |i: usize| links.get(info.subinfos.get(i)?.key.as_str()).copied(); + let link = |i: usize| { + links.get(info.subinfos.get(i)?.key.resolve().as_str()).copied() + }; let renderer = ElemRenderer { routines: self.routines, @@ -820,7 +819,7 @@ impl<'a> Generator<'a> { let mut first_occurrences = HashMap::new(); for info in &self.infos { for subinfo in &info.subinfos { - let key = subinfo.key.as_str(); + let key = subinfo.key.resolve(); first_occurrences.entry(key).or_insert(info.location); } } diff --git a/crates/typst-pdf/src/catalog.rs b/crates/typst-pdf/src/catalog.rs index 1412afe63..35c5ce681 
100644 --- a/crates/typst-pdf/src/catalog.rs +++ b/crates/typst-pdf/src/catalog.rs @@ -174,7 +174,7 @@ pub fn write_catalog( let mut dests_name_tree = name_dict.destinations(); let mut names = dests_name_tree.names(); for &(name, dest_ref, ..) in &ctx.references.named_destinations.dests { - names.insert(Str(name.as_str().as_bytes()), dest_ref); + names.insert(Str(name.resolve().as_bytes()), dest_ref); } } diff --git a/crates/typst-pdf/src/named_destination.rs b/crates/typst-pdf/src/named_destination.rs index 90552335c..7ae2c5e6f 100644 --- a/crates/typst-pdf/src/named_destination.rs +++ b/crates/typst-pdf/src/named_destination.rs @@ -53,12 +53,12 @@ pub fn write_named_destinations( .collect(); // Named destinations must be sorted by key. - matches.sort_by_key(|&(_, label)| label); + matches.sort_by_key(|&(_, label)| label.resolve()); for (loc, label) in matches { // Don't encode named destinations that would exceed the limit. Those // will instead be encoded as normal links. - if label.as_str().len() > Str::PDFA_LIMIT { + if label.resolve().len() > Str::PDFA_LIMIT { continue; } diff --git a/crates/typst-pdf/src/page.rs b/crates/typst-pdf/src/page.rs index 27daf6c95..4e95f3c70 100644 --- a/crates/typst-pdf/src/page.rs +++ b/crates/typst-pdf/src/page.rs @@ -154,7 +154,7 @@ fn write_page( .action() .action_type(ActionType::GoTo) // `key` must be a `Str`, not a `Name`. 
- .pair(Name(b"D"), Str(key.as_str().as_bytes())); + .pair(Name(b"D"), Str(key.resolve().as_bytes())); continue; } else { ctx.document.introspector.position(*loc) diff --git a/crates/typst-svg/src/lib.rs b/crates/typst-svg/src/lib.rs index 0ae7b2dad..fb7d27c22 100644 --- a/crates/typst-svg/src/lib.rs +++ b/crates/typst-svg/src/lib.rs @@ -241,7 +241,7 @@ impl SVGRenderer { self.xml.write_attribute("class", "typst-group"); if let Some(label) = group.label { - self.xml.write_attribute("data-typst-label", label.as_str()); + self.xml.write_attribute("data-typst-label", &label.resolve()); } if let Some(clip_path) = &group.clip_path { diff --git a/crates/typst-utils/src/lib.rs b/crates/typst-utils/src/lib.rs index e199e1bbb..61703250a 100644 --- a/crates/typst-utils/src/lib.rs +++ b/crates/typst-utils/src/lib.rs @@ -16,7 +16,7 @@ pub use self::bitset::{BitSet, SmallBitSet}; pub use self::deferred::Deferred; pub use self::duration::format_duration; pub use self::hash::LazyHash; -pub use self::pico::PicoStr; +pub use self::pico::{PicoStr, ResolvedPicoStr}; pub use self::round::{round_int_with_precision, round_with_precision}; pub use self::scalar::Scalar; diff --git a/crates/typst-utils/src/pico.rs b/crates/typst-utils/src/pico.rs index 7fcd33435..dbab14a1c 100644 --- a/crates/typst-utils/src/pico.rs +++ b/crates/typst-utils/src/pico.rs @@ -1,87 +1,418 @@ +use std::borrow::Borrow; use std::cmp::Ordering; use std::collections::HashMap; -use std::fmt::{self, Debug, Formatter}; -use std::num::NonZeroU32; +use std::fmt::{self, Debug, Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::num::NonZeroU64; +use std::ops::Deref; use std::sync::{LazyLock, RwLock}; -/// The global string interner. -static INTERNER: LazyLock> = LazyLock::new(|| { - RwLock::new(Interner { to_id: HashMap::new(), from_id: Vec::new() }) -}); +/// Marks a number as a bitcode encoded `PicoStr`. +const MARKER: u64 = 1 << 63; + +/// The global runtime string interner. 
+static INTERNER: LazyLock> = + LazyLock::new(|| RwLock::new(Interner { seen: HashMap::new(), strings: Vec::new() })); /// A string interner. struct Interner { - to_id: HashMap<&'static str, PicoStr>, - from_id: Vec<&'static str>, + seen: HashMap<&'static str, PicoStr>, + strings: Vec<&'static str>, } -/// An interned string. +/// An interned string representation that is cheap to copy and hash, but more +/// expensive to access. /// -/// The API is purposefully kept small. This is because it might be relatively -/// slow to look up a string in the interner, so we want to avoid doing it -/// unnecessarily. For this reason, the user should use the [`PicoStr::resolve`] -/// method to get the underlying string, such that the lookup is done only once. +/// This type takes up 8 bytes and is copyable and null-optimized (i.e. +/// `Option` also takes 8 bytes). +/// +/// Supports compile-time string interning via [`PicoStr::constant`] in two +/// flavors: +/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4', +/// and '-' are stored inline in the number +/// - Other strings _can_ be compile-time interned the same way, but must first +/// be added to the list in `exceptions::LIST`. +/// +/// No such restrictions apply at runtime (via [`PicoStr::intern`]). #[derive(Copy, Clone, Eq, PartialEq, Hash)] -pub struct PicoStr(NonZeroU32); +pub struct PicoStr(NonZeroU64); impl PicoStr { - /// Creates a new interned string. - pub fn new(string: &str) -> Self { + /// Intern a string at runtime. + pub fn intern(string: &str) -> PicoStr { + // Try to use bitcode or exception representations. + if let Ok(value) = PicoStr::try_constant(string) { + return value; + } + // Try to find an existing entry that we can reuse. // // We could check with just a read lock, but if the string is not yet // present, we would then need to recheck after acquiring a write lock, // which is probably not worth it. 
let mut interner = INTERNER.write().unwrap(); - if let Some(&id) = interner.to_id.get(string) { + if let Some(&id) = interner.seen.get(string) { return id; } // Create a new entry forever by leaking the string. PicoStr is only // used for strings that aren't created en masse, so it is okay. - let num = u32::try_from(interner.from_id.len() + 1) - .and_then(NonZeroU32::try_from) - .expect("out of string ids"); - - let id = Self(num); + let num = exceptions::LIST.len() + interner.strings.len() + 1; + let id = Self(NonZeroU64::new(num as u64).unwrap()); let string = Box::leak(string.to_string().into_boxed_str()); - interner.to_id.insert(string, id); - interner.from_id.push(string); + interner.seen.insert(string, id); + interner.strings.push(string); id } - /// Resolves the interned string. - pub fn resolve(&self) -> &'static str { - INTERNER.read().unwrap().from_id[(self.0.get() - 1) as usize] + /// Creates a compile-time constant `PicoStr`. + /// + /// Should only be used in const contexts because it can panic. + #[track_caller] + pub const fn constant(string: &'static str) -> PicoStr { + match PicoStr::try_constant(string) { + Ok(value) => value, + Err(err) => panic!("{}", err.message()), + } + } + + /// Try to intern a string statically at compile-time. + pub const fn try_constant(string: &str) -> Result { + // Try to encode with bitcode. + let value = match bitcode::encode(string) { + // Store representation marker in high bit. Bitcode doesn't use + // 4 high bits. + Ok(v) => v | MARKER, + + // If that fails, try to use the exception list. + Err(e) => { + if let Some(i) = exceptions::get(string) { + // Offset by one to make it non-zero. + i as u64 + 1 + } else { + return Err(e); + } + } + }; + + match NonZeroU64::new(value) { + Some(value) => Ok(Self(value)), + None => unreachable!(), + } + } + + /// Resolve to a decoded string. + pub fn resolve(self) -> ResolvedPicoStr { + // If high bit is set, this is a bitcode-encoded string. 
+ let value = self.0.get(); + if value & MARKER != 0 { + return bitcode::decode(value & !MARKER); + } + + let index = (value - 1) as usize; + let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) { + INTERNER.read().unwrap().strings[runtime] + } else { + exceptions::LIST[index] + }; + + ResolvedPicoStr(Repr::Static(string)) } } impl Debug for PicoStr { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.resolve().fmt(f) + Debug::fmt(self.resolve().as_str(), f) } } -impl Ord for PicoStr { +/// A 5-bit encoding for strings with length up to 12 that are restricted to a +/// specific charset. +mod bitcode { + use super::{Repr, ResolvedPicoStr}; + + /// Maps from encodings to their bytes. + const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234"; + + /// Maps from bytes to their encodings. + const ENCODE: &[u8; 256] = &{ + let mut map = [0; 256]; + let mut i = 0; + while i < DECODE.len() { + map[DECODE[i] as usize] = i as u8; + i += 1; + } + map + }; + + /// Try to encode a string as a 64-bit integer. + pub const fn encode(string: &str) -> Result { + let bytes = string.as_bytes(); + + if bytes.len() > 12 { + return Err(EncodingError::TooLong); + } + + let mut num: u64 = 0; + let mut i = bytes.len(); + while i > 0 { + i -= 1; + let b = bytes[i]; + let v = ENCODE[b as usize]; + if v == 0 { + return Err(EncodingError::BadChar); + } + num <<= 5; + num |= v as u64; + } + + Ok(num) + } + + /// Decode the string for a 64-bit integer. + pub const fn decode(mut value: u64) -> ResolvedPicoStr { + let mut buf = [0; 12]; + let mut len = 0; + + while value != 0 { + let v = value & 0b11111; + buf[len as usize] = DECODE[v as usize]; + len += 1; + value >>= 5; + } + + ResolvedPicoStr(Repr::Inline(buf, len)) + } + + /// A failure during compile-time interning. 
+ pub enum EncodingError { + TooLong, + BadChar, + } + + impl EncodingError { + pub const fn message(&self) -> &'static str { + match self { + Self::TooLong => { + "the maximum auto-internible string length is 12. \ + you can add an exception to typst-utils/src/pico.rs \ + to intern longer strings." + } + Self::BadChar => { + "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'. \ + you can add an exception to typst-utils/src/pico.rs \ + to intern other strings." + } + } + } + } +} + +/// Compile-time interned strings that cannot be encoded with `bitcode`. +mod exceptions { + use std::cmp::Ordering; + + /// A global list of non-bitcode-encodable compile-time internible strings. + pub const LIST: &[&str] = &[ + "cjk-latin-spacing", + "discretionary-ligatures", + "historical-ligatures", + "number-clearance", + "number-margin", + "numbering-scope", + "page-numbering", + "par-line-marker", + "transparentize", + ]; + + /// Try to find the index of an exception if it exists. + pub const fn get(string: &str) -> Option { + let mut lo = 0; + let mut hi = LIST.len(); + while lo < hi { + let mid = (lo + hi) / 2; + match strcmp(string, LIST[mid]) { + Ordering::Less => hi = mid, + Ordering::Greater => lo = mid + 1, + Ordering::Equal => return Some(mid), + } + } + None + } + + /// Compare two strings. + const fn strcmp(a: &str, b: &str) -> Ordering { + let a = a.as_bytes(); + let b = b.as_bytes(); + let l = min(a.len(), b.len()); + + let mut i = 0; + while i < l { + if a[i] == b[i] { + i += 1; + } else if a[i] < b[i] { + return Ordering::Less; + } else { + return Ordering::Greater; + } + } + + if i < b.len() { + Ordering::Less + } else if i < a.len() { + Ordering::Greater + } else { + Ordering::Equal + } + } + + /// Determine the minimum of two integers. + const fn min(a: usize, b: usize) -> usize { + if a < b { + a + } else { + b + } + } +} + +/// This is returned by [`PicoStr::resolve`]. +/// +/// Dereferences to a `str`. 
+pub struct ResolvedPicoStr(Repr); + +/// Representation of a resolved string. +enum Repr { + Inline([u8; 12], u8), + Static(&'static str), +} + +impl ResolvedPicoStr { + /// Retrieve the underlying string. + pub fn as_str(&self) -> &str { + match &self.0 { + Repr::Inline(buf, len) => unsafe { + std::str::from_utf8_unchecked(&buf[..*len as usize]) + }, + Repr::Static(s) => s, + } + } +} + +impl Debug for ResolvedPicoStr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Debug::fmt(self.as_str(), f) + } +} + +impl Display for ResolvedPicoStr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(self.as_str(), f) + } +} + +impl Deref for ResolvedPicoStr { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +impl AsRef for ResolvedPicoStr { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl Borrow for ResolvedPicoStr { + fn borrow(&self) -> &str { + self.as_str() + } +} + +impl Eq for ResolvedPicoStr {} + +impl PartialEq for ResolvedPicoStr { + fn eq(&self, other: &Self) -> bool { + self.as_str().eq(other.as_str()) + } +} + +impl Ord for ResolvedPicoStr { fn cmp(&self, other: &Self) -> Ordering { - self.resolve().cmp(other.resolve()) + self.as_str().cmp(other.as_str()) } } -impl PartialOrd for PicoStr { +impl PartialOrd for ResolvedPicoStr { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl AsRef for PicoStr { - fn as_ref(&self) -> &str { - self.resolve() +impl Hash for ResolvedPicoStr { + fn hash(&self, state: &mut H) { + self.as_str().hash(state); } } -impl From<&str> for PicoStr { - fn from(value: &str) -> Self { - Self::new(value) +#[cfg(test)] +mod tests { + use super::*; + + #[track_caller] + fn roundtrip(s: &str) { + assert_eq!(PicoStr::intern(s).resolve().as_str(), s); + } + + #[test] + fn test_pico_str() { + // Test comparing compile-time and runtime-interned bitcode string. 
+ const H1: PicoStr = PicoStr::constant("h1"); + assert_eq!(H1, PicoStr::intern("h1")); + assert_eq!(H1.resolve().as_str(), "h1"); + + // Test comparing compile-time and runtime-interned exception. + const DISC: PicoStr = PicoStr::constant("discretionary-ligatures"); + assert_eq!(DISC, PicoStr::intern("discretionary-ligatures")); + assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures"); + + // Test just roundtripping some strings. + roundtrip(""); + roundtrip("hi"); + roundtrip("∆@