Added Unicode Escaping for body text 👙

2025-07-13 15:42:53 +08:00 · 2020-08-30 13:21:07 +02:00 · 2020-08-30 13:21:07 +02:00 · 7041e0938d
commit 7041e0938d
parent 2a6cde7272
2 changed files with 71 additions and 10 deletions
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@ -110,6 +110,20 @@ impl Parser<'_> {
                    self.with_span(SyntaxNode::Text(text.to_string()))
                }

+                Token::UnicodeEscape(ues) => {
+                    // The hex digits come straight from user input: an empty
+                    // (`\u{}`) or overlong sequence fails to parse as `u32`,
+                    // and a parsed value may still not be a valid scalar
+                    // value. Both cases are reported as a diagnostic instead
+                    // of panicking.
+                    let decoded = u32::from_str_radix(ues, 16)
+                        .ok()
+                        .and_then(std::char::from_u32);
+                    if let Some(c) = decoded {
+                        self.with_span(SyntaxNode::Text(c.to_string()))
+                    } else {
+                        error!(@self.feedback, token.span, "invalid unicode codepoint");
+                        self.with_span(SyntaxNode::Text("".to_string()))
+                    }
+                }
+
                unexpected => {
                    self.eat();
                    error!(
@ -944,6 +958,7 @@ mod tests {
        t!("*hi"         => B, T("hi"));
        t!("hi_"         => T("hi"), I);
        t!("hi you"      => T("hi"), S, T("you"));
+        t!("\\u{1f303}"  => T("🌃"));
        t!("\n\n\nhello" => P, T("hello"));
        t!(r"a\ b"       => T("a"), L, S, T("b"));
        t!("`py`"        => R!["py"]);
@ -960,8 +975,9 @@ mod tests {
        t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
            Some("typst"), " Typst uses ``` to indicate code blocks"
        ]);
-        e!("``` hi\nyou"      => s(1,3, 1,3, "expected backticks"));
-        e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
+        e!("``` hi\nyou"      => s(1,3, 1,3,  "expected backticks"));
+        e!("```🌍 hi\nyou```" => s(0,3, 0,4,  "invalid identifier"));
+        e!("\\u{d421c809}"    => s(0,0, 0,12, "invalid unicode codepoint"));
        t!("💜\n\n 🌍"       => T("💜"), P, T("🌍"));

        ts!("hi"   => s(0,0, 0,2, T("hi")));
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@ -82,6 +82,9 @@ pub enum Token<'s> {
    /// A backslash followed by whitespace in text.
    Backslash,

+    /// A unicode escape sequence: the hex digits between the braces of `\u{...}`.
+    UnicodeEscape(&'s str),
+
    /// Raw text.
    Raw {
        /// The raw text (not yet unescaped as for strings).
@ -136,6 +139,7 @@ impl<'s> Token<'s> {
            Star => "star",
            Underscore => "underscore",
            Backslash => "backslash",
+            UnicodeEscape(_) => "unicode escape sequence",
            Raw { .. } => "raw text",
            Code { .. } => "code block",
            Text(_) => "text",
@ -426,6 +430,41 @@ impl<'s> Tokens<'s> {
        }

        match self.peek() {
+            Some('u') => {
+                // Byte index of the backslash that starts the escape sequence.
+                let index = self.index() - 1;
+                self.eat();

+                if self.peek() == Some('{') {
+                    self.eat();
+                    // This loop eats all hexadecimal digits and an optional
+                    // closing brace; `end` stops before the brace, so the
+                    // brace is never part of the emitted slice.
+                    let mut end = self.index();
+                    let mut valid = true;
+                    while let Some(c) = self.peek() {
+                        if c == '}' {
+                            self.eat();
+                            break;
+                        }

+                        if !c.is_ascii_hexdigit() {
+                            // Leave the offending char for the next token.
+                            valid = false;
+                            break;
+                        }

+                        self.eat();
+                        end = self.index();
+                    }
+                    if valid {
+                        // Skip the three-byte `\u{` prefix: only the hex
+                        // digits are handed on in the token.
+                        UnicodeEscape(&self.src[index + 3..end])
+                    } else {
+                        // `index..end` covers only ASCII (`\u{` plus hex
+                        // digits), so both bounds are char boundaries.
+                        Text(&self.src[index..end])
+                    }
+                } else {
+                    // A `\u` without an opening brace is emitted as text.
+                    Text("\\u")
+                }
+            }
            Some(c) if is_escapable(c) => {
                let index = self.index();
                self.eat();
@ -579,6 +618,7 @@ mod tests {
        Plus,
        Hyphen as Min,
        Slash,
+        UnicodeEscape as UE,
        Star,
        Text as T,
    };
@ -701,14 +741,16 @@ mod tests {

    #[test]
    fn tokenize_escaped_symbols() {
-        t!(Body, r"\\"   => T(r"\"));
-        t!(Body, r"\["   => T("["));
-        t!(Body, r"\]"   => T("]"));
-        t!(Body, r"\*"   => T("*"));
-        t!(Body, r"\_"   => T("_"));
-        t!(Body, r"\`"   => T("`"));
-        t!(Body, r"\/"   => T("/"));
-        t!(Body, r#"\""# => T("\""));
+        t!(Body, r"\\"       => T(r"\"));
+        t!(Body, r"\["       => T("["));
+        t!(Body, r"\]"       => T("]"));
+        t!(Body, r"\*"       => T("*"));
+        t!(Body, r"\_"       => T("_"));
+        t!(Body, r"\`"       => T("`"));
+        t!(Body, r"\/"       => T("/"));
+        t!(Body, r"\u{2603}" => UE("2603"));
+        t!(Body, r"\u{26A4"  => UE("26A4"));
+        t!(Body, r#"\""#     => T("\""));
    }

    #[test]
@ -716,6 +758,9 @@ mod tests {
        t!(Body, r"\a"     => T("\\"), T("a"));
        t!(Body, r"\:"     => T(r"\"), T(":"));
        t!(Body, r"\="     => T(r"\"), T("="));
+        t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4"));
+        t!(Body, r"\u{ "   => T(r"\u{"), Space(0));
+        t!(Body, r"\u"     => T(r"\u"));
        t!(Header, r"\\\\" => Invalid(r"\\\\"));
        t!(Header, r"\a"   => Invalid(r"\a"));
        t!(Header, r"\:"   => Invalid(r"\"), Colon);