diff --git a/Cargo.lock b/Cargo.lock
index 1851134a5..86f04ee52 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2995,6 +2995,7 @@ dependencies = [
  "typst-timing",
  "typst-utils",
  "unicode-math-class",
+ "unicode-normalization",
  "unicode-segmentation",
  "unscanny",
  "usvg",
diff --git a/Cargo.toml b/Cargo.toml
index 36195230e..f643856e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -129,6 +129,7 @@ unicode-bidi = "0.3.18"
 unicode-ident = "1.0"
 unicode-math-class = "0.1"
 unicode-script = "0.5"
+unicode-normalization = "0.1.24"
 unicode-segmentation = "1"
 unscanny = "0.1"
 ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] }
diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml
index fb45ec862..71729b63a 100644
--- a/crates/typst-library/Cargo.toml
+++ b/crates/typst-library/Cargo.toml
@@ -61,6 +61,7 @@ ttf-parser = { workspace = true }
 two-face = { workspace = true }
 typed-arena = { workspace = true }
 unicode-math-class = { workspace = true }
+unicode-normalization = { workspace = true }
 unicode-segmentation = { workspace = true }
 unscanny = { workspace = true }
 usvg = { workspace = true }
diff --git a/crates/typst-library/src/foundations/str.rs b/crates/typst-library/src/foundations/str.rs
index 551ac04f5..23a1bd4cf 100644
--- a/crates/typst-library/src/foundations/str.rs
+++ b/crates/typst-library/src/foundations/str.rs
@@ -7,12 +7,13 @@ use comemo::Tracked;
 use ecow::EcoString;
 use serde::{Deserialize, Serialize};
 use typst_syntax::{Span, Spanned};
+use unicode_normalization::UnicodeNormalization;
 use unicode_segmentation::UnicodeSegmentation;
 
 use crate::diag::{bail, At, SourceResult, StrResult};
 use crate::engine::Engine;
 use crate::foundations::{
-    cast, dict, func, repr, scope, ty, Array, Bytes, Context, Decimal, Dict, Func,
+    cast, dict, func, repr, scope, ty, Array, Bytes, Cast, Context, Decimal, Dict, Func,
     IntoValue, Label, Repr, Type, Value, Version,
 };
 use crate::layout::Alignment;
@@ -286,6 +287,30 @@ impl Str {
         Ok(c.into())
     }
 
+    /// Normalizes the string to the given Unicode normal form.
+    ///
+    /// This is useful when manipulating strings containing Unicode combining
+    /// characters.
+    ///
+    /// ```typ
+    /// #assert.eq("é".normalize(form: "nfd"), "e\u{0301}")
+    /// #assert.eq("ſ́".normalize(form: "nfkc"), "ś")
+    /// ```
+    #[func]
+    pub fn normalize(
+        &self,
+        #[named]
+        #[default(UnicodeNormalForm::Nfc)]
+        form: UnicodeNormalForm,
+    ) -> Str {
+        match form {
+            UnicodeNormalForm::Nfc => self.nfc().collect(),
+            UnicodeNormalForm::Nfd => self.nfd().collect(),
+            UnicodeNormalForm::Nfkc => self.nfkc().collect(),
+            UnicodeNormalForm::Nfkd => self.nfkd().collect(),
+        }
+    }
+
     /// Whether the string contains the specified pattern.
     ///
     /// This method also has dedicated syntax: You can write `{"bc" in "abcd"}`
@@ -788,6 +813,25 @@ cast! {
     v: Str => Self::Str(v),
 }
 
+/// A Unicode normalization form.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Cast)]
+pub enum UnicodeNormalForm {
+    /// Canonical composition where e.g. accented letters are turned into a
+    /// single Unicode codepoint.
+    #[string("nfc")]
+    Nfc,
+    /// Canonical decomposition where e.g. accented letters are split into a
+    /// separate base and diacritic.
+    #[string("nfd")]
+    Nfd,
+    /// Like NFC, but using the Unicode compatibility decompositions.
+    #[string("nfkc")]
+    Nfkc,
+    /// Like NFD, but using the Unicode compatibility decompositions.
+    #[string("nfkd")]
+    Nfkd,
+}
+
 /// Convert an item of std's `match_indices` to a dictionary.
 fn match_to_dict((start, text): (usize, &str)) -> Dict {
     dict! {
diff --git a/tests/suite/foundations/str.typ b/tests/suite/foundations/str.typ
index 56756416d..66fb912c0 100644
--- a/tests/suite/foundations/str.typ
+++ b/tests/suite/foundations/str.typ
@@ -86,6 +86,13 @@
 // Error: 2-28 0x110000 is not a valid codepoint
 #str.from-unicode(0x110000) // 0x10ffff is the highest valid code point
 
+--- str-normalize ---
+// Test the `normalize` method.
+#test("e\u{0301}".normalize(form: "nfc"), "é")
+#test("é".normalize(form: "nfd"), "e\u{0301}")
+#test("ſ\u{0301}".normalize(form: "nfkc"), "ś")
+#test("ſ\u{0301}".normalize(form: "nfkd"), "s\u{0301}")
+
 --- string-len ---
 // Test the `len` method.
 #test("Hello World!".len(), 12)
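For reference, and not part of the patch: a minimal standalone Rust sketch of the `unicode-normalization` calls that the new `Str::normalize` method dispatches to. It uses only the `UnicodeNormalization` trait methods (`nfc`, `nfd`, `nfkc`, `nfkd`) seen in the diff above; the example values mirror the added tests.

```rust
// Illustrative sketch only: shows the behavior of the four normal forms
// that `Str::normalize` matches on, using the `unicode-normalization` crate
// added as a dependency in the patch.
use unicode_normalization::UnicodeNormalization;

fn main() {
    // "e" followed by a combining acute accent (U+0301).
    let decomposed = "e\u{0301}";

    // NFC: canonical composition into the single codepoint "é" (U+00E9).
    let composed: String = decomposed.nfc().collect();
    assert_eq!(composed, "\u{00e9}");

    // NFD: canonical decomposition back into base letter + diacritic.
    let split: String = "\u{00e9}".nfd().collect();
    assert_eq!(split, "e\u{0301}");

    // NFKC/NFKD additionally apply compatibility decompositions, e.g. the
    // long s (ſ, U+017F) becomes a regular "s" before recomposition.
    let compat_composed: String = "\u{017f}\u{0301}".nfkc().collect();
    assert_eq!(compat_composed, "\u{015b}"); // "ś" as one codepoint

    let compat_decomposed: String = "\u{017f}\u{0301}".nfkd().collect();
    assert_eq!(compat_decomposed, "s\u{0301}");
}
```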