mirror of
https://github.com/typst/typst
synced 2025-05-16 01:55:28 +08:00
Merge pull request #12 from typst/unicode-escape
Unicode escape sequences in strings and body text
This commit is contained in:
commit
07f387d088
@ -110,6 +110,26 @@ impl Parser<'_> {
|
|||||||
self.with_span(SyntaxNode::Text(text.to_string()))
|
self.with_span(SyntaxNode::Text(text.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Token::UnicodeEscape { sequence, terminated } => {
|
||||||
|
if !terminated {
|
||||||
|
error!(
|
||||||
|
@self.feedback, Span::at(token.span.end),
|
||||||
|
"expected closing brace",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(c) = unescape_char(sequence) {
|
||||||
|
self.with_span(SyntaxNode::Text(c.to_string()))
|
||||||
|
} else {
|
||||||
|
self.eat();
|
||||||
|
error!(
|
||||||
|
@self.feedback, token.span,
|
||||||
|
"invalid unicode escape sequence",
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
unexpected => {
|
unexpected => {
|
||||||
self.eat();
|
self.eat();
|
||||||
error!(
|
error!(
|
||||||
@ -594,7 +614,7 @@ impl Group {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn unescape_string(string: &str) -> String {
|
fn unescape_string(string: &str) -> String {
|
||||||
let mut iter = string.chars();
|
let mut iter = string.chars().peekable();
|
||||||
let mut out = String::with_capacity(string.len());
|
let mut out = String::with_capacity(string.len());
|
||||||
|
|
||||||
while let Some(c) = iter.next() {
|
while let Some(c) = iter.next() {
|
||||||
@ -602,6 +622,36 @@ fn unescape_string(string: &str) -> String {
|
|||||||
match iter.next() {
|
match iter.next() {
|
||||||
Some('\\') => out.push('\\'),
|
Some('\\') => out.push('\\'),
|
||||||
Some('"') => out.push('"'),
|
Some('"') => out.push('"'),
|
||||||
|
Some('u') if iter.peek() == Some(&'{') => {
|
||||||
|
iter.next();
|
||||||
|
|
||||||
|
let mut sequence = String::new();
|
||||||
|
let terminated = loop {
|
||||||
|
match iter.peek() {
|
||||||
|
// TODO: Feedback that closing brace is missing.
|
||||||
|
Some('}') => {
|
||||||
|
iter.next();
|
||||||
|
break true;
|
||||||
|
}
|
||||||
|
Some(&c) if c.is_ascii_hexdigit() => {
|
||||||
|
iter.next();
|
||||||
|
sequence.push(c);
|
||||||
|
}
|
||||||
|
_ => break false,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: Feedback that escape sequence is wrong.
|
||||||
|
if let Some(c) = unescape_char(&sequence) {
|
||||||
|
out.push(c);
|
||||||
|
} else {
|
||||||
|
out.push_str("\\u{");
|
||||||
|
out.push_str(&sequence);
|
||||||
|
if terminated {
|
||||||
|
out.push('}');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Some('n') => out.push('\n'),
|
Some('n') => out.push('\n'),
|
||||||
Some('t') => out.push('\t'),
|
Some('t') => out.push('\t'),
|
||||||
Some(c) => { out.push('\\'); out.push(c); }
|
Some(c) => { out.push('\\'); out.push(c); }
|
||||||
@ -617,7 +667,7 @@ fn unescape_string(string: &str) -> String {
|
|||||||
|
|
||||||
/// Unescape raw markup and split it into lines.
|
/// Unescape raw markup and split it into lines.
|
||||||
fn unescape_raw(raw: &str) -> Vec<String> {
|
fn unescape_raw(raw: &str) -> Vec<String> {
|
||||||
let mut iter = raw.chars().peekable();
|
let mut iter = raw.chars();
|
||||||
let mut text = String::new();
|
let mut text = String::new();
|
||||||
|
|
||||||
while let Some(c) = iter.next() {
|
while let Some(c) = iter.next() {
|
||||||
@ -705,6 +755,11 @@ fn unescape_code(raw: &str) -> Vec<String> {
|
|||||||
split_lines(&text)
|
split_lines(&text)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts a hexadecimal sequence (without braces or "\u") into a character.
///
/// Returns `None` if the sequence is empty, contains anything other than
/// ASCII hex digits, does not fit into a `u32`, or does not denote a valid
/// Unicode scalar value (e.g. a surrogate or a value above `10FFFF`).
fn unescape_char(sequence: &str) -> Option<char> {
    // `from_str_radix` tolerates a leading `+` sign, which is not part of a
    // hexadecimal escape sequence, so the digits are validated up front.
    if !sequence.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    // An empty sequence passes the check above but fails to parse here.
    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}
|
||||||
|
|
||||||
fn split_lines(text: &str) -> Vec<String> {
|
fn split_lines(text: &str) -> Vec<String> {
|
||||||
let mut iter = text.chars().peekable();
|
let mut iter = text.chars().peekable();
|
||||||
let mut line = String::new();
|
let mut line = String::new();
|
||||||
@ -890,6 +945,9 @@ mod tests {
|
|||||||
test(r#"hello world"#, "hello world");
|
test(r#"hello world"#, "hello world");
|
||||||
test(r#"hello\nworld"#, "hello\nworld");
|
test(r#"hello\nworld"#, "hello\nworld");
|
||||||
test(r#"a\"bc"#, "a\"bc");
|
test(r#"a\"bc"#, "a\"bc");
|
||||||
|
test(r#"a\u{2603}bc"#, "a☃bc");
|
||||||
|
test(r#"a\u{26c3bg"#, "a𦰻g");
|
||||||
|
test(r#"av\u{6797"#, "av林");
|
||||||
test(r#"a\\"#, "a\\");
|
test(r#"a\\"#, "a\\");
|
||||||
test(r#"a\\\nbc"#, "a\\\nbc");
|
test(r#"a\\\nbc"#, "a\\\nbc");
|
||||||
test(r#"a\tbc"#, "a\tbc");
|
test(r#"a\tbc"#, "a\tbc");
|
||||||
@ -944,6 +1002,7 @@ mod tests {
|
|||||||
t!("*hi" => B, T("hi"));
|
t!("*hi" => B, T("hi"));
|
||||||
t!("hi_" => T("hi"), I);
|
t!("hi_" => T("hi"), I);
|
||||||
t!("hi you" => T("hi"), S, T("you"));
|
t!("hi you" => T("hi"), S, T("you"));
|
||||||
|
t!("\\u{1f303}" => T("🌃"));
|
||||||
t!("\n\n\nhello" => P, T("hello"));
|
t!("\n\n\nhello" => P, T("hello"));
|
||||||
t!(r"a\ b" => T("a"), L, S, T("b"));
|
t!(r"a\ b" => T("a"), L, S, T("b"));
|
||||||
t!("`py`" => R!["py"]);
|
t!("`py`" => R!["py"]);
|
||||||
@ -951,17 +1010,16 @@ mod tests {
|
|||||||
e!("`hi\nyou" => s(1,3, 1,3, "expected backtick"));
|
e!("`hi\nyou" => s(1,3, 1,3, "expected backtick"));
|
||||||
t!("`hi\\`du`" => R!["hi`du"]);
|
t!("`hi\\`du`" => R!["hi`du"]);
|
||||||
|
|
||||||
t!("```java System.out.print```" => C![
|
t!("```java System.out.print```" => C![Some("java"), "System.out.print"]);
|
||||||
Some("java"), "System.out.print"
|
t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
|
||||||
]);
|
|
||||||
t!("``` console.log(\n\"alert\"\n)" => C![
|
|
||||||
None, "console.log(", "\"alert\"", ")"
|
|
||||||
]);
|
|
||||||
t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
|
t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
|
||||||
Some("typst"), " Typst uses ``` to indicate code blocks"
|
Some("typst"), " Typst uses ``` to indicate code blocks"
|
||||||
]);
|
]);
|
||||||
|
|
||||||
e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
|
e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
|
||||||
e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
|
e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
|
||||||
|
e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode escape sequence"));
|
||||||
|
e!("\\u{abc" => s(0,6, 0,6, "expected closing brace"));
|
||||||
t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
|
t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
|
||||||
|
|
||||||
ts!("hi" => s(0,0, 0,2, T("hi")));
|
ts!("hi" => s(0,0, 0,2, T("hi")));
|
||||||
|
@ -82,6 +82,14 @@ pub enum Token<'s> {
|
|||||||
/// A backslash followed by whitespace in text.
|
/// A backslash followed by whitespace in text.
|
||||||
Backslash,
|
Backslash,
|
||||||
|
|
||||||
|
/// A unicode escape sequence.
|
||||||
|
UnicodeEscape {
|
||||||
|
/// The escape sequence between two braces.
|
||||||
|
sequence: &'s str,
|
||||||
|
/// Whether the closing brace was present.
|
||||||
|
terminated: bool,
|
||||||
|
},
|
||||||
|
|
||||||
/// Raw text.
|
/// Raw text.
|
||||||
Raw {
|
Raw {
|
||||||
/// The raw text (not yet unescaped as for strings).
|
/// The raw text (not yet unescaped as for strings).
|
||||||
@ -136,6 +144,7 @@ impl<'s> Token<'s> {
|
|||||||
Star => "star",
|
Star => "star",
|
||||||
Underscore => "underscore",
|
Underscore => "underscore",
|
||||||
Backslash => "backslash",
|
Backslash => "backslash",
|
||||||
|
UnicodeEscape { .. } => "unicode escape sequence",
|
||||||
Raw { .. } => "raw text",
|
Raw { .. } => "raw text",
|
||||||
Code { .. } => "code block",
|
Code { .. } => "code block",
|
||||||
Text(_) => "text",
|
Text(_) => "text",
|
||||||
@ -426,6 +435,25 @@ impl<'s> Tokens<'s> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
match self.peek() {
|
match self.peek() {
|
||||||
|
Some('u') => {
|
||||||
|
self.eat();
|
||||||
|
if self.peek() == Some('{') {
|
||||||
|
self.eat();
|
||||||
|
let sequence = self.read_string_until(
|
||||||
|
|c| !c.is_ascii_hexdigit(),
|
||||||
|
false, 0, 0,
|
||||||
|
).0;
|
||||||
|
|
||||||
|
let terminated = self.peek() == Some('}');
|
||||||
|
if terminated {
|
||||||
|
self.eat();
|
||||||
|
}
|
||||||
|
|
||||||
|
UnicodeEscape { sequence, terminated }
|
||||||
|
} else {
|
||||||
|
Text("\\u")
|
||||||
|
}
|
||||||
|
}
|
||||||
Some(c) if is_escapable(c) => {
|
Some(c) if is_escapable(c) => {
|
||||||
let index = self.index();
|
let index = self.index();
|
||||||
self.eat();
|
self.eat();
|
||||||
@ -588,6 +616,7 @@ mod tests {
|
|||||||
fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> {
|
fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> {
|
||||||
Token::Code { lang: lang.map(Spanned::zero), raw, terminated }
|
Token::Code { lang: lang.map(Spanned::zero), raw, terminated }
|
||||||
}
|
}
|
||||||
|
/// Test shorthand that builds a `Token::UnicodeEscape` from the hex digits
/// between the braces and a flag saying whether the closing brace was present.
fn UE<'a>(sequence: &'a str, terminated: bool) -> Token<'a> {
    Token::UnicodeEscape {
        sequence,
        terminated,
    }
}
|
||||||
|
|
||||||
macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
|
macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
|
||||||
macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
|
macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
|
||||||
@ -708,6 +737,8 @@ mod tests {
|
|||||||
t!(Body, r"\_" => T("_"));
|
t!(Body, r"\_" => T("_"));
|
||||||
t!(Body, r"\`" => T("`"));
|
t!(Body, r"\`" => T("`"));
|
||||||
t!(Body, r"\/" => T("/"));
|
t!(Body, r"\/" => T("/"));
|
||||||
|
t!(Body, r"\u{2603}" => UE("2603", true));
|
||||||
|
t!(Body, r"\u{26A4" => UE("26A4", false));
|
||||||
t!(Body, r#"\""# => T("\""));
|
t!(Body, r#"\""# => T("\""));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -716,6 +747,9 @@ mod tests {
|
|||||||
t!(Body, r"\a" => T("\\"), T("a"));
|
t!(Body, r"\a" => T("\\"), T("a"));
|
||||||
t!(Body, r"\:" => T(r"\"), T(":"));
|
t!(Body, r"\:" => T(r"\"), T(":"));
|
||||||
t!(Body, r"\=" => T(r"\"), T("="));
|
t!(Body, r"\=" => T(r"\"), T("="));
|
||||||
|
t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4"));
|
||||||
|
t!(Body, r"\u{ " => UE("", false), Space(0));
|
||||||
|
t!(Body, r"\u" => T(r"\u"));
|
||||||
t!(Header, r"\\\\" => Invalid(r"\\\\"));
|
t!(Header, r"\\\\" => Invalid(r"\\\\"));
|
||||||
t!(Header, r"\a" => Invalid(r"\a"));
|
t!(Header, r"\a" => Invalid(r"\a"));
|
||||||
t!(Header, r"\:" => Invalid(r"\"), Colon);
|
t!(Header, r"\:" => Invalid(r"\"), Colon);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user