Add #str.normalize(form) (#5631)

Co-authored-by: +merlan #flirora <uruwi@protonmail.com> Co-authored-by: Laurenz <laurmaedje@gmail.com>
2025-07-16 00:52:54 +08:00 · 2025-02-25 06:01:01 -08:00 · 2025-02-25 06:01:01 -08:00 · d11ad80dee
commit d11ad80dee
parent bad343748b
5 changed files with 55 additions and 1 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2995,6 +2995,7 @@ dependencies = [
 "typst-timing",
 "typst-utils",
 "unicode-math-class",
 "unicode-normalization",
 "unicode-segmentation",
 "unscanny",
 "usvg",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -129,6 +129,7 @@ unicode-bidi = "0.3.18"
 unicode-ident = "1.0"
 unicode-math-class = "0.1"
 unicode-script = "0.5"
 unicode-normalization = "0.1.24"
 unicode-segmentation = "1"
 unscanny = "0.1"
 ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] }
--- a/crates/typst-library/Cargo.toml
+++ b/crates/typst-library/Cargo.toml
@ -61,6 +61,7 @@ ttf-parser = { workspace = true }
 two-face = { workspace = true }
 typed-arena = { workspace = true }
 unicode-math-class = { workspace = true }
 unicode-normalization = { workspace = true }
 unicode-segmentation = { workspace = true }
 unscanny = { workspace = true }
 usvg = { workspace = true }
--- a/crates/typst-library/src/foundations/str.rs
+++ b/crates/typst-library/src/foundations/str.rs
@ -7,12 +7,13 @@ use comemo::Tracked;
 use ecow::EcoString;
 use serde::{Deserialize, Serialize};
 use typst_syntax::{Span, Spanned};
 use unicode_normalization::UnicodeNormalization;
 use unicode_segmentation::UnicodeSegmentation;
 use crate::diag::{bail, At, SourceResult, StrResult};
 use crate::engine::Engine;
 use crate::foundations::{
-    cast, dict, func, repr, scope, ty, Array, Bytes, Context, Decimal, Dict, Func,
+    cast, dict, func, repr, scope, ty, Array, Bytes, Cast, Context, Decimal, Dict, Func,
    IntoValue, Label, Repr, Type, Value, Version,
 };
 use crate::layout::Alignment;
@ -286,6 +287,30 @@ impl Str {
        Ok(c.into())
    }
    /// Normalizes the string to the given Unicode normal form.
    ///
    /// This is useful when manipulating strings containing Unicode combining
    /// characters.
    ///
    /// ```typ
    /// #assert.eq("é".normalize(form: "nfd"), "e\u{0301}")
    /// #assert.eq("ſ́".normalize(form: "nfkc"), "ś")
    /// ```
    #[func]
    pub fn normalize(
        &self,
        #[named]
        #[default(UnicodeNormalForm::Nfc)]
        form: UnicodeNormalForm,
    ) -> Str {
        match form {
            UnicodeNormalForm::Nfc => self.nfc().collect(),
            UnicodeNormalForm::Nfd => self.nfd().collect(),
            UnicodeNormalForm::Nfkc => self.nfkc().collect(),
            UnicodeNormalForm::Nfkd => self.nfkd().collect(),
        }
    }
    /// Whether the string contains the specified pattern.
    ///
    /// This method also has dedicated syntax: You can write `{"bc" in "abcd"}`
@ -788,6 +813,25 @@ cast! {
    v: Str => Self::Str(v),
 }
 /// One of the Unicode normalization forms a string can be converted to.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Cast)]
 pub enum UnicodeNormalForm {
    /// Canonical composition (NFC): e.g. an accented letter is represented
    /// by a single Unicode codepoint.
    #[string("nfc")]
    Nfc,
    /// Canonical decomposition (NFD): e.g. an accented letter is split into
    /// its base letter and a separate diacritic.
    #[string("nfd")]
    Nfd,
    /// Compatibility composition (NFKC): like NFC, but based on the Unicode
    /// compatibility decompositions.
    #[string("nfkc")]
    Nfkc,
    /// Compatibility decomposition (NFKD): like NFD, but based on the Unicode
    /// compatibility decompositions.
    #[string("nfkd")]
    Nfkd,
 }
 /// Convert an item of std's `match_indices` to a dictionary.
 fn match_to_dict((start, text): (usize, &str)) -> Dict {
    dict! {
--- a/tests/suite/foundations/str.typ
+++ b/tests/suite/foundations/str.typ
@ -86,6 +86,13 @@
 // Error: 2-28 0x110000 is not a valid codepoint
 #str.from-unicode(0x110000) // 0x10ffff is the highest valid code point
 --- str-normalize ---
 // Test the `normalize` method with each explicit normalization form.
 #test("e\u{0301}".normalize(form: "nfc"), "é")
 #test("é".normalize(form: "nfd"), "e\u{0301}")
 #test("ſ\u{0301}".normalize(form: "nfkc"), "ś")
 #test("ſ\u{0301}".normalize(form: "nfkd"), "s\u{0301}")
 // Without an explicit `form`, the string is normalized to NFC.
 #test("e\u{0301}".normalize(), "é")
 // Normalizing an empty string yields an empty string.
 #test("".normalize(), "")
 --- string-len ---
 // Test the `len` method.
 #test("Hello World!".len(), 12)