diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 06ea8167d..e35835c8c 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -110,17 +110,23 @@ impl Parser<'_> { self.with_span(SyntaxNode::Text(text.to_string())) } - Token::UnicodeEscape(ues) => { - if let Some(c) = std::char::from_u32( - u32::from_str_radix(ues, 16) - .expect("Unicode escape string not convertible to int") - ) { - let mut s = String::with_capacity(1); - s.push(c); - self.with_span(SyntaxNode::Text(s)) + Token::UnicodeEscape { sequence, terminated } => { + if !terminated { + error!( + @self.feedback, Span::at(token.span.end), + "expected closing brace", + ); + } + + if let Some(c) = unescape_char(sequence) { + self.with_span(SyntaxNode::Text(c.to_string())) } else { - error!(@self.feedback, token.span, "invalid unicode codepoint"); - self.with_span(SyntaxNode::Text("".to_string())) + self.eat(); + error!( + @self.feedback, token.span, + "invalid unicode escape sequence", + ); + continue; } } @@ -608,7 +614,7 @@ impl Group { } fn unescape_string(string: &str) -> String { - let mut iter = string.chars(); + let mut iter = string.chars().peekable(); let mut out = String::with_capacity(string.len()); while let Some(c) = iter.next() { @@ -616,45 +622,33 @@ fn unescape_string(string: &str) -> String { match iter.next() { Some('\\') => out.push('\\'), Some('"') => out.push('"'), - Some('u') => { - // Index which points to start of escape sequence - let mut seen = "\\u".to_string(); + Some('u') if iter.peek() == Some(&'{') => { + iter.next(); - let next = iter.next(); - if next == Some('{') { - seen.push('{'); - - let mut valid = true; - let mut closed = false; - while let Some(c) = iter.next() { - seen.push(c); - if c == '}' { - closed = true; - break; + let mut sequence = String::new(); + let terminated = loop { + match iter.peek() { + // TODO: Feedback that closing brace is missing. 
+ Some('}') => { + iter.next(); + break true; } - - if !c.is_ascii_hexdigit() { - valid = false; - break; + Some(&c) if c.is_ascii_hexdigit() => { + iter.next(); + sequence.push(c); } + _ => break false, } - if valid != false && seen.len() >= 3 { - if let Some(c) = std::char::from_u32( - u32::from_str_radix(&seen[3..seen.len() - if closed { 1 } else { 0 }], 16) - .expect("Unicode escape string not convertible to int") - ) { - out.push(c); - } else { - // Somehow provide feedback here that conversion failed? - out.push_str(&seen); - } - } else { - out.push_str(&seen); - } + }; + + // TODO: Feedback that escape sequence is wrong. + if let Some(c) = unescape_char(&sequence) { + out.push(c); } else { - out.push_str("\\u"); - if let Some(c) = next { - out.push(c); + out.push_str("\\u{"); + out.push_str(&sequence); + if terminated { + out.push('}'); } } } @@ -673,7 +667,7 @@ fn unescape_string(string: &str) -> String { /// Unescape raw markup and split it into into lines. fn unescape_raw(raw: &str) -> Vec { - let mut iter = raw.chars().peekable(); + let mut iter = raw.chars(); let mut text = String::new(); while let Some(c) = iter.next() { @@ -761,6 +755,11 @@ fn unescape_code(raw: &str) -> Vec { split_lines(&text) } +/// Converts a hexadecimal sequence (without braces or "\u") into a character. 
+fn unescape_char(sequence: &str) -> Option { + u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) +} + fn split_lines(text: &str) -> Vec { let mut iter = text.chars().peekable(); let mut line = String::new(); @@ -947,7 +946,7 @@ mod tests { test(r#"hello\nworld"#, "hello\nworld"); test(r#"a\"bc"#, "a\"bc"); test(r#"a\u{2603}bc"#, "a☃bc"); - test(r#"a\u{26c3bg"#, "a\\u{26c3bg"); + test(r#"a\u{26c3bg"#, "a𦰻g"); test(r#"av\u{6797"#, "av林"); test(r#"a\\"#, "a\\"); test(r#"a\\\nbc"#, "a\\\nbc"); @@ -1011,18 +1010,16 @@ mod tests { e!("`hi\nyou" => s(1,3, 1,3, "expected backtick")); t!("`hi\\`du`" => R!["hi`du"]); - t!("```java System.out.print```" => C![ - Some("java"), "System.out.print" - ]); - t!("``` console.log(\n\"alert\"\n)" => C![ - None, "console.log(", "\"alert\"", ")" - ]); + t!("```java System.out.print```" => C![Some("java"), "System.out.print"]); + t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]); t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ Some("typst"), " Typst uses ``` to indicate code blocks" ]); + e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks")); e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier")); - e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode codepoint")); + e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode escape sequence")); + e!("\\u{abc" => s(0,6, 0,6, "expected closing brace")); t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); ts!("hi" => s(0,0, 0,2, T("hi"))); diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 92576d896..fe20d11a3 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -82,8 +82,13 @@ pub enum Token<'s> { /// A backslash followed by whitespace in text. Backslash, - /// A unicode escape sequence - UnicodeEscape(&'s str), + /// A unicode escape sequence. + UnicodeEscape { + /// The escape sequence between two braces. + sequence: &'s str, + /// Whether the closing brace was present. + terminated: bool, + }, /// Raw text. 
Raw { @@ -139,7 +144,7 @@ impl<'s> Token<'s> { Star => "star", Underscore => "underscore", Backslash => "backslash", - UnicodeEscape(_) => "unicode escape sequence", + UnicodeEscape { .. } => "unicode escape sequence", Raw { .. } => "raw text", Code { .. } => "code block", Text(_) => "text", @@ -431,36 +436,20 @@ impl<'s> Tokens<'s> { match self.peek() { Some('u') => { - // Index which points to start of escape sequence - let index = self.index() - 1; self.eat(); - if self.peek() == Some('{') { self.eat(); - // This loop will eat all hexadecimal chars and an - // optional closing brace (brace not in end index range). - let mut end = self.index(); - let mut valid = true; - while let Some(c) = self.peek() { - if c == '}' { - self.eat(); - break; - } - - if !c.is_ascii_hexdigit() { - valid = false; - break; - } + let sequence = self.read_string_until( + |c| !c.is_ascii_hexdigit(), + false, 0, 0, + ).0; + let terminated = self.peek() == Some('}'); + if terminated { self.eat(); - end = self.index(); - } - if valid == false { - // There are only 8-bit ASCII chars in that range - Text(&self.src[index..end]) - } else { - UnicodeEscape(&self.src[index + 3..end]) } + + UnicodeEscape { sequence, terminated } } else { Text("\\u") } @@ -618,7 +607,6 @@ mod tests { Plus, Hyphen as Min, Slash, - UnicodeEscape as UE, Star, Text as T, }; @@ -628,6 +616,7 @@ mod tests { fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> { Token::Code { lang: lang.map(Spanned::zero), raw, terminated } } + fn UE(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape { sequence, terminated } } macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } macro_rules! 
ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } @@ -748,8 +737,8 @@ mod tests { t!(Body, r"\_" => T("_")); t!(Body, r"\`" => T("`")); t!(Body, r"\/" => T("/")); - t!(Body, r"\u{2603}" => UE("2603")); - t!(Body, r"\u{26A4" => UE("26A4")); + t!(Body, r"\u{2603}" => UE("2603", true)); + t!(Body, r"\u{26A4" => UE("26A4", false)); t!(Body, r#"\""# => T("\"")); } @@ -758,8 +747,8 @@ mod tests { t!(Body, r"\a" => T("\\"), T("a")); t!(Body, r"\:" => T(r"\"), T(":")); t!(Body, r"\=" => T(r"\"), T("=")); - t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4")); - t!(Body, r"\u{ " => T(r"\u{"), Space(0)); + t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4")); + t!(Body, r"\u{ " => UE("", false), Space(0)); t!(Body, r"\u" => T(r"\u")); t!(Header, r"\\\\" => Invalid(r"\\\\")); t!(Header, r"\a" => Invalid(r"\a"));