use std::borrow::{Borrow, Cow}; use std::fmt::{self, Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::{Add, AddAssign, Deref, Range}; use comemo::Tracked; use ecow::EcoString; use serde::{Deserialize, Serialize}; use typst_syntax::{Span, Spanned}; use typst_utils::PicoStr; use unicode_segmentation::UnicodeSegmentation; use crate::diag::{bail, At, SourceResult, StrResult}; use crate::engine::Engine; use crate::foundations::{ cast, dict, func, repr, scope, ty, Array, Bytes, Context, Decimal, Dict, Func, IntoValue, Label, Repr, Type, Value, Version, }; use crate::layout::Alignment; /// Create a new [`Str`] from a format string. #[macro_export] #[doc(hidden)] macro_rules! __format_str { ($($tts:tt)*) => {{ $crate::foundations::Str::from($crate::foundations::eco_format!($($tts)*)) }}; } #[doc(hidden)] pub use ecow::eco_format; #[doc(inline)] pub use crate::__format_str as format_str; /// A sequence of Unicode codepoints. /// /// You can iterate over the grapheme clusters of the string using a [for /// loop]($scripting/#loops). Grapheme clusters are basically characters but /// keep together things that belong together, e.g. multiple codepoints that /// together form a flag emoji. Strings can be added with the `+` operator, /// [joined together]($scripting/#blocks) and multiplied with integers. /// /// Typst provides utility methods for string manipulation. Many of these /// methods (e.g., `split`, `trim` and `replace`) operate on _patterns:_ A /// pattern can be either a string or a [regular expression]($regex). This makes /// the methods quite versatile. /// /// All lengths and indices are expressed in terms of UTF-8 bytes. Indices are /// zero-based and negative indices wrap around to the end of the string. /// /// You can convert a value to a string with this type's constructor. /// /// # Example /// ```example /// #"hello world!" \ /// #"\"hello\n world\"!" \ /// #"1 2 3".split() \ /// #"1,2;3".split(regex("[,;]")) \ /// #(regex("\d+") in "ten euros") \ /// #(regex("\d+") in "10 euros") /// ``` /// /// # Escape sequences { #escapes } /// Just like in markup, you can escape a few symbols in strings: /// - `[\\]` for a backslash /// - `[\"]` for a quote /// - `[\n]` for a newline /// - `[\r]` for a carriage return /// - `[\t]` for a tab /// - `[\u{1f600}]` for a hexadecimal Unicode escape sequence #[ty(scope, cast, title = "String")] #[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] #[derive(Serialize, Deserialize)] #[serde(transparent)] pub struct Str(EcoString); impl Str { /// Create a new, empty string. pub fn new() -> Self { Self(EcoString::new()) } /// Return `true` if the length is 0. pub fn is_empty(&self) -> bool { self.0.is_empty() } /// Repeat the string a number of times. pub fn repeat(&self, n: usize) -> StrResult { if self.0.len().checked_mul(n).is_none() { return Err(eco_format!("cannot repeat this string {n} times")); } Ok(Self(self.0.repeat(n))) } /// A string slice containing the entire string. pub fn as_str(&self) -> &str { self } /// Resolve an index or throw an out of bounds error. fn locate(&self, index: i64) -> StrResult { self.locate_opt(index)? .ok_or_else(|| out_of_bounds(index, self.len())) } /// Resolve an index, if it is within bounds and on a valid char boundary. /// /// `index == len` is considered in bounds. fn locate_opt(&self, index: i64) -> StrResult> { let wrapped = if index >= 0 { Some(index) } else { (self.len() as i64).checked_add(index) }; let resolved = wrapped .and_then(|v| usize::try_from(v).ok()) .filter(|&v| v <= self.0.len()); if resolved.is_some_and(|i| !self.0.is_char_boundary(i)) { return Err(not_a_char_boundary(index)); } Ok(resolved) } } #[scope] impl Str { /// Converts a value to a string. /// /// - Integers are formatted in base 10. This can be overridden with the /// optional `base` parameter. /// - Floats are formatted in base 10 and never in exponential notation. /// - Negative integers and floats are formatted with the Unicode minus sign /// ("−" U+2212) instead of the ASCII minus sign ("-" U+002D). /// - From labels the name is extracted. /// - Bytes are decoded as UTF-8. /// /// If you wish to convert from and to Unicode code points, see the /// [`to-unicode`]($str.to-unicode) and [`from-unicode`]($str.from-unicode) /// functions. /// /// ```example /// #str(10) \ /// #str(4000, base: 16) \ /// #str(2.7) \ /// #str(1e8) \ /// #str() /// ``` #[func(constructor)] pub fn construct( /// The value that should be converted to a string. value: ToStr, /// The base (radix) to display integers in, between 2 and 36. #[named] #[default(Spanned::new(10, Span::detached()))] base: Spanned, ) -> SourceResult { Ok(match value { ToStr::Str(s) => { if base.v != 10 { bail!(base.span, "base is only supported for integers"); } s } ToStr::Int(n) => { if base.v < 2 || base.v > 36 { bail!(base.span, "base must be between 2 and 36"); } repr::format_int_with_base(n, base.v).into() } }) } /// The length of the string in UTF-8 encoded bytes. #[func(title = "Length")] pub fn len(&self) -> usize { self.0.len() } /// Extracts the first grapheme cluster of the string. /// Fails with an error if the string is empty. #[func] pub fn first(&self) -> StrResult { self.0 .graphemes(true) .next() .map(Into::into) .ok_or_else(string_is_empty) } /// Extracts the last grapheme cluster of the string. /// Fails with an error if the string is empty. #[func] pub fn last(&self) -> StrResult { self.0 .graphemes(true) .next_back() .map(Into::into) .ok_or_else(string_is_empty) } /// Extracts the first grapheme cluster after the specified index. Returns /// the default value if the index is out of bounds or fails with an error /// if no default value was specified. #[func] pub fn at( &self, /// The byte index. If negative, indexes from the back. index: i64, /// A default value to return if the index is out of bounds. #[named] default: Option, ) -> StrResult { let len = self.len(); self.locate_opt(index)? .and_then(|i| self.0[i..].graphemes(true).next().map(|s| s.into_value())) .or(default) .ok_or_else(|| no_default_and_out_of_bounds(index, len)) } /// Extracts a substring of the string. /// Fails with an error if the start or end index is out of bounds. #[func] pub fn slice( &self, /// The start byte index (inclusive). If negative, indexes from the /// back. start: i64, /// The end byte index (exclusive). If omitted, the whole slice until /// the end of the string is extracted. If negative, indexes from the /// back. #[default] end: Option, /// The number of bytes to extract. This is equivalent to passing /// `start + count` as the `end` position. Mutually exclusive with `end`. #[named] count: Option, ) -> StrResult { let end = end.or(count.map(|c| start + c)).unwrap_or(self.len() as i64); let start = self.locate(start)?; let end = self.locate(end)?.max(start); Ok(self.0[start..end].into()) } /// Returns the grapheme clusters of the string as an array of substrings. #[func] pub fn clusters(&self) -> Array { self.as_str().graphemes(true).map(|s| Value::Str(s.into())).collect() } /// Returns the Unicode codepoints of the string as an array of substrings. #[func] pub fn codepoints(&self) -> Array { self.chars().map(|c| Value::Str(c.into())).collect() } /// Converts a character into its corresponding code point. /// /// ```example /// #"a".to-unicode() \ /// #("a\u{0300}" /// .codepoints() /// .map(str.to-unicode)) /// ``` #[func] pub fn to_unicode( /// The character that should be converted. character: char, ) -> u32 { character as u32 } /// Converts a unicode code point into its corresponding string. /// /// ```example /// #str.from-unicode(97) /// ``` #[func] pub fn from_unicode( /// The code point that should be converted. value: u32, ) -> StrResult { let c: char = value .try_into() .map_err(|_| eco_format!("{value:#x} is not a valid codepoint"))?; Ok(c.into()) } /// Whether the string contains the specified pattern. /// /// This method also has dedicated syntax: You can write `{"bc" in "abcd"}` /// instead of `{"abcd".contains("bc")}`. #[func] pub fn contains( &self, /// The pattern to search for. pattern: StrPattern, ) -> bool { match pattern { StrPattern::Str(pat) => self.0.contains(pat.as_str()), StrPattern::Regex(re) => re.is_match(self), } } /// Whether the string starts with the specified pattern. #[func] pub fn starts_with( &self, /// The pattern the string might start with. pattern: StrPattern, ) -> bool { match pattern { StrPattern::Str(pat) => self.0.starts_with(pat.as_str()), StrPattern::Regex(re) => re.find(self).is_some_and(|m| m.start() == 0), } } /// Whether the string ends with the specified pattern. #[func] pub fn ends_with( &self, /// The pattern the string might end with. pattern: StrPattern, ) -> bool { match pattern { StrPattern::Str(pat) => self.0.ends_with(pat.as_str()), StrPattern::Regex(re) => { let mut start_byte = 0; while let Some(mat) = re.find_at(self, start_byte) { if mat.end() == self.0.len() { return true; } // There might still be a match overlapping this one, so // restart at the next code point. let Some(c) = self[mat.start()..].chars().next() else { break }; start_byte = mat.start() + c.len_utf8(); } false } } } /// Searches for the specified pattern in the string and returns the first /// match as a string or `{none}` if there is no match. #[func] pub fn find( &self, /// The pattern to search for. pattern: StrPattern, ) -> Option { match pattern { StrPattern::Str(pat) => self.0.contains(pat.as_str()).then_some(pat), StrPattern::Regex(re) => re.find(self).map(|m| m.as_str().into()), } } /// Searches for the specified pattern in the string and returns the index /// of the first match as an integer or `{none}` if there is no match. #[func] pub fn position( &self, /// The pattern to search for. pattern: StrPattern, ) -> Option { match pattern { StrPattern::Str(pat) => self.0.find(pat.as_str()), StrPattern::Regex(re) => re.find(self).map(|m| m.start()), } } /// Searches for the specified pattern in the string and returns a /// dictionary with details about the first match or `{none}` if there is no /// match. /// /// The returned dictionary has the following keys: /// - `start`: The start offset of the match /// - `end`: The end offset of the match /// - `text`: The text that matched. /// - `captures`: An array containing a string for each matched capturing /// group. The first item of the array contains the first matched /// capturing, not the whole match! This is empty unless the `pattern` was /// a regex with capturing groups. #[func] pub fn match_( &self, /// The pattern to search for. pattern: StrPattern, ) -> Option { match pattern { StrPattern::Str(pat) => { self.0.match_indices(pat.as_str()).next().map(match_to_dict) } StrPattern::Regex(re) => re.captures(self).map(captures_to_dict), } } /// Searches for the specified pattern in the string and returns an array of /// dictionaries with details about all matches. For details about the /// returned dictionaries, see above. #[func] pub fn matches( &self, /// The pattern to search for. pattern: StrPattern, ) -> Array { match pattern { StrPattern::Str(pat) => self .0 .match_indices(pat.as_str()) .map(match_to_dict) .map(Value::Dict) .collect(), StrPattern::Regex(re) => re .captures_iter(self) .map(captures_to_dict) .map(Value::Dict) .collect(), } } /// Replace at most `count` occurrences of the given pattern with a /// replacement string or function (beginning from the start). If no count /// is given, all occurrences are replaced. #[func] pub fn replace( &self, /// The engine. engine: &mut Engine, /// The callsite context. context: Tracked, /// The pattern to search for. pattern: StrPattern, /// The string to replace the matches with or a function that gets a /// dictionary for each match and can return individual replacement /// strings. replacement: Replacement, /// If given, only the first `count` matches of the pattern are placed. #[named] count: Option, ) -> SourceResult { // Heuristic: Assume the new string is about the same length as // the current string. let mut output = EcoString::with_capacity(self.as_str().len()); // Replace one match of a pattern with the replacement. let mut last_match = 0; let mut handle_match = |range: Range, dict: Dict| -> SourceResult<()> { // Push everything until the match. output.push_str(&self[last_match..range.start]); last_match = range.end; // Determine and push the replacement. match &replacement { Replacement::Str(s) => output.push_str(s), Replacement::Func(func) => { let piece = func .call(engine, context, [dict])? .cast::() .at(func.span())?; output.push_str(&piece); } } Ok(()) }; // Iterate over the matches of the `pattern`. let count = count.unwrap_or(usize::MAX); match &pattern { StrPattern::Str(pat) => { for m in self.match_indices(pat.as_str()).take(count) { let (start, text) = m; handle_match(start..start + text.len(), match_to_dict(m))?; } } StrPattern::Regex(re) => { for caps in re.captures_iter(self).take(count) { // Extract the entire match over all capture groups. let m = caps.get(0).unwrap(); handle_match(m.start()..m.end(), captures_to_dict(caps))?; } } } // Push the remainder. output.push_str(&self[last_match..]); Ok(output.into()) } /// Removes matches of a pattern from one or both sides of the string, once or /// repeatedly and returns the resulting string. #[func] pub fn trim( &self, /// The pattern to search for. If `{none}`, trims white spaces. #[default] pattern: Option, /// Can be `{start}` or `{end}` to only trim the start or end of the /// string. If omitted, both sides are trimmed. #[named] at: Option, /// Whether to repeatedly removes matches of the pattern or just once. /// Defaults to `{true}`. #[named] #[default(true)] repeat: bool, ) -> Str { let mut start = matches!(at, Some(StrSide::Start) | None); let end = matches!(at, Some(StrSide::End) | None); let trimmed = match pattern { None => match at { None => self.0.trim(), Some(StrSide::Start) => self.0.trim_start(), Some(StrSide::End) => self.0.trim_end(), }, Some(StrPattern::Str(pat)) => { let pat = pat.as_str(); let mut s = self.as_str(); if repeat { if start { s = s.trim_start_matches(pat); } if end { s = s.trim_end_matches(pat); } } else { if start { s = s.strip_prefix(pat).unwrap_or(s); } if end { s = s.strip_suffix(pat).unwrap_or(s); } } s } Some(StrPattern::Regex(re)) => { let s = self.as_str(); let mut last = None; let mut range = 0..s.len(); for m in re.find_iter(s) { // Does this match follow directly after the last one? let consecutive = last == Some(m.start()); // As long as we're at the beginning or in a consecutive run // of matches, and we're still trimming at the start, trim. start &= m.start() == 0 || consecutive; if start { range.start = m.end(); start &= repeat; } // Reset end trim if we aren't consecutive anymore or aren't // repeating. if end && (!consecutive || !repeat) { range.end = m.start(); } last = Some(m.end()); } // Is the last match directly at the end? if last.is_some_and(|last| last < s.len()) { range.end = s.len(); } &s[range.start..range.start.max(range.end)] } }; trimmed.into() } /// Splits a string at matches of a specified pattern and returns an array /// of the resulting parts. #[func] pub fn split( &self, /// The pattern to split at. Defaults to whitespace. #[default] pattern: Option, ) -> Array { let s = self.as_str(); match pattern { None => s.split_whitespace().map(|v| Value::Str(v.into())).collect(), Some(StrPattern::Str(pat)) => { s.split(pat.as_str()).map(|v| Value::Str(v.into())).collect() } Some(StrPattern::Regex(re)) => { re.split(s).map(|v| Value::Str(v.into())).collect() } } } /// Reverse the string. #[func(title = "Reverse")] pub fn rev(&self) -> Str { let mut s = EcoString::with_capacity(self.0.len()); for grapheme in self.as_str().graphemes(true).rev() { s.push_str(grapheme); } s.into() } } impl Deref for Str { type Target = str; fn deref(&self) -> &str { &self.0 } } impl Debug for Str { fn fmt(&self, f: &mut Formatter) -> fmt::Result { Debug::fmt(self.as_str(), f) } } impl Display for Str { fn fmt(&self, f: &mut Formatter) -> fmt::Result { Display::fmt(self.as_str(), f) } } impl Repr for Str { fn repr(&self) -> EcoString { self.as_ref().repr() } } impl Repr for EcoString { fn repr(&self) -> EcoString { self.as_ref().repr() } } impl Repr for str { fn repr(&self) -> EcoString { let mut r = EcoString::with_capacity(self.len() + 2); r.push('"'); for c in self.chars() { match c { '\0' => r.push_str(r"\u{0}"), '\'' => r.push('\''), '"' => r.push_str(r#"\""#), _ => c.escape_debug().for_each(|c| r.push(c)), } } r.push('"'); r } } impl Add for Str { type Output = Self; fn add(mut self, rhs: Self) -> Self::Output { self += rhs; self } } impl AddAssign for Str { fn add_assign(&mut self, rhs: Self) { self.0.push_str(rhs.as_str()); } } impl AsRef for Str { fn as_ref(&self) -> &str { self } } impl Borrow for Str { fn borrow(&self) -> &str { self } } impl From for Str { fn from(c: char) -> Self { Self(c.into()) } } impl From<&str> for Str { fn from(s: &str) -> Self { Self(s.into()) } } impl From for Str { fn from(s: EcoString) -> Self { Self(s) } } impl From for Str { fn from(s: String) -> Self { Self(s.into()) } } impl From> for Str { fn from(s: Cow) -> Self { Self(s.into()) } } impl FromIterator for Str { fn from_iter>(iter: T) -> Self { Self(iter.into_iter().collect()) } } impl From for EcoString { fn from(str: Str) -> Self { str.0 } } impl From for String { fn from(s: Str) -> Self { s.0.into() } } cast! { char, self => Value::Str(self.into()), string: Str => { let mut chars = string.chars(); match (chars.next(), chars.next()) { (Some(c), None) => c, _ => bail!("expected exactly one character"), } }, } cast! { &str, self => Value::Str(self.into()), } cast! { EcoString, self => Value::Str(self.into()), v: Str => v.into(), } cast! { PicoStr, self => Value::Str(self.resolve().into()), v: Str => v.as_str().into(), } cast! { String, self => Value::Str(self.into()), v: Str => v.into(), } /// A value that can be cast to a string. pub enum ToStr { /// A string value ready to be used as-is. Str(Str), /// An integer about to be formatted in a given base. Int(i64), } cast! { ToStr, v: i64 => Self::Int(v), v: f64 => Self::Str(repr::display_float(v).into()), v: Decimal => Self::Str(format_str!("{}", v)), v: Version => Self::Str(format_str!("{}", v)), v: Bytes => Self::Str( std::str::from_utf8(&v) .map_err(|_| "bytes are not valid utf-8")? .into() ), v: Label => Self::Str(v.as_str().into()), v: Type => Self::Str(v.long_name().into()), v: Str => Self::Str(v), } /// Convert an item of std's `match_indices` to a dictionary. fn match_to_dict((start, text): (usize, &str)) -> Dict { dict! { "start" => start, "end" => start + text.len(), "text" => text, "captures" => Array::new(), } } /// Convert regex captures to a dictionary. fn captures_to_dict(cap: regex::Captures) -> Dict { let m = cap.get(0).expect("missing first match"); dict! { "start" => m.start(), "end" => m.end(), "text" => m.as_str(), "captures" => cap.iter() .skip(1) .map(|opt| opt.map_or(Value::None, |m| m.as_str().into_value())) .collect::(), } } /// The out of bounds access error message. #[cold] fn out_of_bounds(index: i64, len: usize) -> EcoString { eco_format!("string index out of bounds (index: {}, len: {})", index, len) } /// The out of bounds access error message when no default value was given. #[cold] fn no_default_and_out_of_bounds(index: i64, len: usize) -> EcoString { eco_format!("no default value was specified and string index out of bounds (index: {}, len: {})", index, len) } /// The char boundary access error message. #[cold] fn not_a_char_boundary(index: i64) -> EcoString { eco_format!("string index {} is not a character boundary", index) } /// The error message when the string is empty. #[cold] fn string_is_empty() -> EcoString { "string is empty".into() } /// A regular expression. /// /// Can be used as a [show rule selector]($styling/#show-rules) and with /// [string methods]($str) like `find`, `split`, and `replace`. /// /// [See here](https://docs.rs/regex/latest/regex/#syntax) for a specification /// of the supported syntax. /// /// # Example /// ```example /// // Works with string methods. /// #"a,b;c".split(regex("[,;]")) /// /// // Works with show rules. /// #show regex("\d+"): set text(red) /// /// The numbers 1 to 10. /// ``` #[ty(scope)] #[derive(Debug, Clone)] pub struct Regex(regex::Regex); impl Regex { /// Create a new regular expression. pub fn new(re: &str) -> StrResult { regex::Regex::new(re).map(Self).map_err(|err| eco_format!("{err}")) } } #[scope] impl Regex { /// Create a regular expression from a string. #[func(constructor)] pub fn construct( /// The regular expression as a string. /// /// Most regex escape sequences just work because they are not valid Typst /// escape sequences. To produce regex escape sequences that are also valid in /// Typst (e.g. `[\\]`), you need to escape twice. Thus, to match a verbatim /// backslash, you would need to write `{regex("\\\\")}`. /// /// If you need many escape sequences, you can also create a raw element /// and extract its text to use it for your regular expressions: /// ```{regex(`\d+\.\d+\.\d+`.text)}```. regex: Spanned, ) -> SourceResult { Self::new(®ex.v).at(regex.span) } } impl Deref for Regex { type Target = regex::Regex; fn deref(&self) -> &Self::Target { &self.0 } } impl Repr for Regex { fn repr(&self) -> EcoString { eco_format!("regex({})", self.0.as_str().repr()) } } impl PartialEq for Regex { fn eq(&self, other: &Self) -> bool { self.0.as_str() == other.0.as_str() } } impl Hash for Regex { fn hash(&self, state: &mut H) { self.0.as_str().hash(state); } } /// A pattern which can be searched for in a string. #[derive(Debug, Clone)] pub enum StrPattern { /// Just a string. Str(Str), /// A regular expression. Regex(Regex), } cast! { StrPattern, self => match self { Self::Str(v) => v.into_value(), Self::Regex(v) => v.into_value(), }, v: Str => Self::Str(v), v: Regex => Self::Regex(v), } /// A side of a string. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] pub enum StrSide { /// The logical start of the string, may be left or right depending on the /// language. Start, /// The logical end of the string. End, } cast! { StrSide, v: Alignment => match v { Alignment::START => Self::Start, Alignment::END => Self::End, _ => bail!("expected either `start` or `end`"), }, } /// A replacement for a matched [`Str`] pub enum Replacement { /// A string a match is replaced with. Str(Str), /// Function of type Dict -> Str (see `captures_to_dict` or `match_to_dict`) /// whose output is inserted for the match. Func(Func), } cast! { Replacement, self => match self { Self::Str(v) => v.into_value(), Self::Func(v) => v.into_value(), }, v: Str => Self::Str(v), v: Func => Self::Func(v) }