diff --git a/docs/src/reference/types.md b/docs/src/reference/types.md index 11580c8b3..47f8d9e1a 100644 --- a/docs/src/reference/types.md +++ b/docs/src/reference/types.md @@ -197,18 +197,18 @@ $arrow.t.quad$ # String A sequence of Unicode codepoints. -You can iterate over the characters (or rather, grapheme clusters) of the string -using a [for loop]($scripting/#loops). Strings can be added with -the `+` operator, [joined together]($scripting/#blocks) and -multiplied with integers. +You can iterate over the grapheme clusters of the string using a +[for loop]($scripting/#loops). Grapheme clusters are basically characters but +keep together things that belong together, e.g. multiple codepoints that +together form a flag emoji. Strings can be added with the `+` operator, +[joined together]($scripting/#blocks) and multiplied with integers. Typst provides utility methods for string manipulation. Many of these methods (e.g., `split`, `trim` and `replace`) operate on _patterns:_ A pattern can be either a string or a [regular expression]($func/regex). This makes the methods quite versatile. -_Note:_ Currently all lengths and indices are expressed in terms of UTF-8 bytes. -This _might_ change to grapheme clusters in the future. +All lengths and indices are expressed in terms of UTF-8 bytes. ### Example ```example @@ -236,20 +236,20 @@ The length of the string in UTF-8 encoded bytes. - returns: integer ### first() -Extract the first character (or rather, grapheme cluster) of the string. +Extract the first grapheme cluster of the string. Fails with an error if the string is empty. - returns: any ### last() -Extract the last character (or rather, grapheme cluster) of the string. +Extract the last grapheme cluster of the string. Fails with an error if the string is empty. - returns: any ### at() -Extract the first character (or rather, grapheme cluster) after the specified -index. Fails with an error if the index is out of bounds. 
+Extract the first grapheme cluster after the specified index. Fails with an +error if the index is out of bounds. - index: integer (positional, required) The byte index. @@ -269,6 +269,16 @@ Fails with an error if the start or end index is out of bounds. as the `end` position. Mutually exclusive with `end`. - returns: string +### clusters() +Returns the grapheme clusters of the string as an array of substrings. + +- returns: array + +### codepoints() +Returns the Unicode codepoints of the string as an array of substrings. + +- returns: array + ### contains() Whether the string contains the specified pattern. diff --git a/src/model/methods.rs b/src/model/methods.rs index c0b636694..38ebebda5 100644 --- a/src/model/methods.rs +++ b/src/model/methods.rs @@ -37,6 +37,8 @@ pub fn call( } Value::Str(string.slice(start, end).at(span)?) } + "clusters" => Value::Array(string.clusters()), + "codepoints" => Value::Array(string.codepoints()), "contains" => Value::Bool(string.contains(args.expect("pattern")?)), "starts-with" => Value::Bool(string.starts_with(args.expect("pattern")?)), "ends-with" => Value::Bool(string.ends_with(args.expect("pattern")?)), @@ -218,6 +220,8 @@ pub fn methods_on(type_name: &str) -> &[(&'static str, bool)] { "string" => &[ ("len", false), ("at", true), + ("clusters", false), + ("codepoints", false), ("contains", true), ("ends-with", true), ("find", true), diff --git a/src/model/str.rs b/src/model/str.rs index ae0ef8994..8da5b50c4 100644 --- a/src/model/str.rs +++ b/src/model/str.rs @@ -42,11 +42,6 @@ impl Str { self } - /// The grapheme clusters the string consists of. - pub fn graphemes(&self) -> Array { - self.as_str().graphemes(true).map(|s| Value::Str(s.into())).collect() - } - /// Extract the first grapheme cluster. pub fn first(&self) -> StrResult { self.0 @@ -82,6 +77,16 @@ impl Str { Ok(self.0[start..end].into()) } + /// The grapheme clusters the string consists of. 
+ pub fn clusters(&self) -> Array { + self.as_str().graphemes(true).map(|s| Value::Str(s.into())).collect() + } + + /// The codepoints the string consists of. + pub fn codepoints(&self) -> Array { + self.chars().map(|c| Value::Str(c.into())).collect() + } + /// Whether the given pattern exists in this string. pub fn contains(&self, pattern: StrPattern) -> bool { match pattern { @@ -350,12 +355,10 @@ impl Debug for Str { f.write_char('"')?; for c in self.chars() { match c { - '\\' => f.write_str(r"\\")?, + '\0' => f.write_str("\\u{0}")?, + '\'' => f.write_str("'")?, '"' => f.write_str(r#"\""#)?, - '\n' => f.write_str(r"\n")?, - '\r' => f.write_str(r"\r")?, - '\t' => f.write_str(r"\t")?, - _ => f.write_char(c)?, + _ => Display::fmt(&c.escape_debug(), f)?, } } f.write_char('"') diff --git a/tests/typ/compiler/string.typ b/tests/typ/compiler/string.typ index 017e1cdd9..7692b41f5 100644 --- a/tests/typ/compiler/string.typ +++ b/tests/typ/compiler/string.typ @@ -45,6 +45,13 @@ // Error: 2-21 string index -1 is not a character boundary #"🏳️‍🌈".slice(0, -1) +--- +// Test the `clusters` and `codepoints` methods. +#test("abc".clusters(), ("a", "b", "c")) +#test("abc".codepoints(), ("a", "b", "c")) +#test("🏳️‍🌈!".clusters(), ("🏳️‍🌈", "!")) +#test("🏳️‍🌈!".codepoints(), ("🏳", "\u{fe0f}", "\u{200d}", "🌈", "!")) + --- // Test the `contains` method. #test("abc".contains("b"), true)