diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs
index 0d12f6e18..95c88c6ec 100644
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@@ -110,6 +110,22 @@ impl Parser<'_> {
                 self.with_span(SyntaxNode::Text(text.to_string()))
             }
 
+            Token::UnicodeEscape(ues) => {
+                // The tokenizer only passes hex digits, but the run may be empty
+                // (`\u{}`) or overflow a u32, so parsing can fail: report that as
+                // an invalid codepoint instead of panicking on user input.
+                let codepoint = u32::from_str_radix(ues, 16)
+                    .ok()
+                    .and_then(std::char::from_u32);
+
+                if let Some(c) = codepoint {
+                    self.with_span(SyntaxNode::Text(c.to_string()))
+                } else {
+                    error!(@self.feedback, token.span, "invalid unicode codepoint");
+                    self.with_span(SyntaxNode::Text(String::new()))
+                }
+            }
+
             unexpected => {
                 self.eat();
                 error!(
@@ -944,6 +960,7 @@ mod tests {
         t!("*hi" => B, T("hi"));
         t!("hi_" => T("hi"), I);
         t!("hi you" => T("hi"), S, T("you"));
+        t!("\\u{1f303}" => T("🌃"));
         t!("\n\n\nhello" => P, T("hello"));
         t!(r"a\ b" => T("a"), L, S, T("b"));
         t!("`py`" => R!["py"]);
@@ -960,8 +977,11 @@ mod tests {
         t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
             Some("typst"), " Typst uses ``` to indicate code blocks"
         ]);
-        e!("``` hi\nyou"      => s(1,3, 1,3, "expected backticks"));
-        e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
+        e!("``` hi\nyou"      => s(1,3, 1,3,  "expected backticks"));
+        e!("```🌍 hi\nyou```" => s(0,3, 0,4,  "invalid identifier"));
+        e!("\\u{d421c809}"    => s(0,0, 0,12, "invalid unicode codepoint"));
+        e!("\\u{}"            => s(0,0, 0,4,  "invalid unicode codepoint"));
+        e!("\\u{100000000}"   => s(0,0, 0,13, "invalid unicode codepoint"));
         t!("💜\n\n 🌍"       => T("💜"), P, T("🌍"));
 
         ts!("hi" => s(0,0, 0,2, T("hi")));
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 7ecb05fe4..f00c1b661 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -82,6 +82,9 @@ pub enum Token<'s> {
     /// A backslash followed by whitespace in text.
     Backslash,
 
+    /// A unicode escape sequence: the hex digits of `\u{...}`, braces excluded.
+    UnicodeEscape(&'s str),
+
     /// Raw text.
     Raw {
         /// The raw text (not yet unescaped as for strings).
@@ -136,6 +139,7 @@ impl<'s> Token<'s> {
             Star => "star",
             Underscore => "underscore",
             Backslash => "backslash",
+            UnicodeEscape(_) => "unicode escape sequence",
             Raw { .. } => "raw text",
             Code { .. } => "code block",
             Text(_) => "text",
@@ -426,6 +430,43 @@ impl<'s> Tokens<'s> {
         }
 
         match self.peek() {
+            Some('u') => {
+                // Index which points to the start of the escape sequence.
+                let index = self.index() - 1;
+                self.eat();
+
+                if self.peek() == Some('{') {
+                    self.eat();
+                    // This loop eats all hexadecimal digits and an optional
+                    // closing brace (the brace is not part of `..end`).
+                    let mut end = self.index();
+                    let mut valid = true;
+                    while let Some(c) = self.peek() {
+                        if c == '}' {
+                            self.eat();
+                            break;
+                        }
+
+                        if !c.is_ascii_hexdigit() {
+                            valid = false;
+                            break;
+                        }
+
+                        self.eat();
+                        end = self.index();
+                    }
+                    if valid {
+                        // Skip the three-byte `\u{` prefix.
+                        UnicodeEscape(&self.src[index + 3..end])
+                    } else {
+                        // Only single-byte ASCII chars were eaten, so the
+                        // slice cannot split a multi-byte character.
+                        Text(&self.src[index..end])
+                    }
+                } else {
+                    Text("\\u")
+                }
+            }
             Some(c) if is_escapable(c) => {
                 let index = self.index();
                 self.eat();
@@ -579,6 +620,7 @@ mod tests {
         Plus,
         Hyphen as Min,
         Slash,
+        UnicodeEscape as UE,
         Star,
         Text as T,
     };
@@ -701,14 +743,16 @@ mod tests {
 
     #[test]
     fn tokenize_escaped_symbols() {
-        t!(Body, r"\\"   => T(r"\"));
-        t!(Body, r"\["   => T("["));
-        t!(Body, r"\]"   => T("]"));
-        t!(Body, r"\*"   => T("*"));
-        t!(Body, r"\_"   => T("_"));
-        t!(Body, r"\`"   => T("`"));
-        t!(Body, r"\/"   => T("/"));
-        t!(Body, r#"\""# => T("\""));
+        t!(Body, r"\\"       => T(r"\"));
+        t!(Body, r"\["       => T("["));
+        t!(Body, r"\]"       => T("]"));
+        t!(Body, r"\*"       => T("*"));
+        t!(Body, r"\_"       => T("_"));
+        t!(Body, r"\`"       => T("`"));
+        t!(Body, r"\/"       => T("/"));
+        t!(Body, r"\u{2603}" => UE("2603"));
+        t!(Body, r"\u{26A4"  => UE("26A4"));
+        t!(Body, r#"\""#     => T("\""));
     }
 
     #[test]
@@ -716,6 +760,9 @@ mod tests {
         t!(Body, r"\a" => T("\\"), T("a"));
         t!(Body, r"\:" => T(r"\"), T(":"));
         t!(Body, r"\=" => T(r"\"), T("="));
+        t!(Body, r"\u{2GA4" => T(r"\u{2"), T("GA4"));
+        t!(Body, r"\u{ " => T(r"\u{"), Space(0));
+        t!(Body, r"\u" => T(r"\u"));
         t!(Header, r"\\\\" => Invalid(r"\\\\"));
         t!(Header, r"\a" => Invalid(r"\a"));
         t!(Header, r"\:" => Invalid(r"\"), Colon);