Add clusters and codepoints methods

This commit is contained in:
Laurenz 2023-02-17 10:29:55 +01:00
parent 585f656487
commit dd5f07eb91
4 changed files with 44 additions and 20 deletions

View File

@ -197,18 +197,18 @@ $arrow.t.quad$
# String
A sequence of Unicode codepoints.
You can iterate over the characters (or rather, grapheme clusters) of the string
using a [for loop]($scripting/#loops). Strings can be added with
the `+` operator, [joined together]($scripting/#blocks) and
multiplied with integers.
You can iterate over the grapheme clusters of the string using a
[for loop]($scripting/#loops). A grapheme cluster is roughly what a reader
perceives as a single character: it keeps together codepoints that belong
together, e.g. the multiple codepoints that form a flag emoji. Strings can be
added with the `+` operator, [joined together]($scripting/#blocks) and
multiplied with integers.
Typst provides utility methods for string manipulation. Many of these methods
(e.g., `split`, `trim` and `replace`) operate on _patterns:_ A pattern can be
either a string or a [regular expression]($func/regex). This makes the methods
quite versatile.
_Note:_ Currently all lengths and indices are expressed in terms of UTF-8 bytes.
This _might_ change to grapheme clusters in the future.
All lengths and indices are expressed in terms of UTF-8 bytes.
### Example
```example
@ -236,20 +236,20 @@ The length of the string in UTF-8 encoded bytes.
- returns: integer
### first()
Extract the first character (or rather, grapheme cluster) of the string.
Extract the first grapheme cluster of the string.
Fails with an error if the string is empty.
- returns: any
### last()
Extract the last character (or rather, grapheme cluster) of the string.
Extract the last grapheme cluster of the string.
Fails with an error if the string is empty.
- returns: any
### at()
Extract the first character (or rather, grapheme cluster) after the specified
index. Fails with an error if the index is out of bounds.
Extract the first grapheme cluster after the specified index. Fails with an
error if the index is out of bounds.
- index: integer (positional, required)
The byte index.
@ -269,6 +269,16 @@ Fails with an error if the start or end index is out of bounds.
as the `end` position. Mutually exclusive with `end`.
- returns: string
### clusters()
Returns the grapheme clusters of the string as an array of substrings.
- returns: array
### codepoints()
Returns the Unicode codepoints of the string as an array of substrings.
- returns: array
### contains()
Whether the string contains the specified pattern.

View File

@ -37,6 +37,8 @@ pub fn call(
}
Value::Str(string.slice(start, end).at(span)?)
}
"clusters" => Value::Array(string.clusters()),
"codepoints" => Value::Array(string.codepoints()),
"contains" => Value::Bool(string.contains(args.expect("pattern")?)),
"starts-with" => Value::Bool(string.starts_with(args.expect("pattern")?)),
"ends-with" => Value::Bool(string.ends_with(args.expect("pattern")?)),
@ -218,6 +220,8 @@ pub fn methods_on(type_name: &str) -> &[(&'static str, bool)] {
"string" => &[
("len", false),
("at", true),
("clusters", false),
("codepoints", false),
("contains", true),
("ends-with", true),
("find", true),

View File

@ -42,11 +42,6 @@ impl Str {
self
}
/// Splits the string into its grapheme clusters.
///
/// Each cluster becomes one string value in the returned array, in order.
pub fn graphemes(&self) -> Array {
    // `true` asks the segmenter for extended grapheme clusters.
    let pieces = self.as_str().graphemes(true);
    pieces.map(|piece| Value::Str(piece.into())).collect()
}
/// Extract the first grapheme cluster.
pub fn first(&self) -> StrResult<Self> {
self.0
@ -82,6 +77,16 @@ impl Str {
Ok(self.0[start..end].into())
}
/// Splits the string into its grapheme clusters.
///
/// Returns an array holding one string value per cluster, in the order in
/// which the clusters appear in the string.
pub fn clusters(&self) -> Array {
    let text = self.as_str();
    // `true` asks the segmenter for extended grapheme clusters.
    text.graphemes(true)
        .map(|cluster| Value::Str(cluster.into()))
        .collect()
}
/// Splits the string into its Unicode codepoints.
///
/// Returns an array holding one single-codepoint string value per `char`,
/// in the order in which the codepoints appear in the string.
pub fn codepoints(&self) -> Array {
    // Wrap each codepoint in its own string value.
    let as_value = |ch: char| Value::Str(ch.into());
    self.chars().map(as_value).collect()
}
/// Whether the given pattern exists in this string.
pub fn contains(&self, pattern: StrPattern) -> bool {
match pattern {
@ -350,12 +355,10 @@ impl Debug for Str {
f.write_char('"')?;
for c in self.chars() {
match c {
'\\' => f.write_str(r"\\")?,
'\0' => f.write_str("\\u{0}")?,
'\'' => f.write_str("'")?,
'"' => f.write_str(r#"\""#)?,
'\n' => f.write_str(r"\n")?,
'\r' => f.write_str(r"\r")?,
'\t' => f.write_str(r"\t")?,
_ => f.write_char(c)?,
_ => Display::fmt(&c.escape_debug(), f)?,
}
}
f.write_char('"')

View File

@ -45,6 +45,13 @@
// Error: 2-21 string index -1 is not a character boundary
#"🏳️‍🌈".slice(0, -1)
---
// Test the `clusters` and `codepoints` methods.
// For plain ASCII text both methods yield one entry per character.
#test("abc".clusters(), ("a", "b", "c"))
#test("abc".codepoints(), ("a", "b", "c"))
// The rainbow flag is a single cluster made up of four codepoints.
#test("🏳️‍🌈!".clusters(), ("🏳️‍🌈", "!"))
#test("🏳️‍🌈!".codepoints(), ("🏳", "\u{fe0f}", "\u{200d}", "🌈", "!"))
---
// Test the `contains` method.
#test("abc".contains("b"), true)