Added Unicode Escaping for body text 👙

This commit is contained in:
Martin Haug 2020-08-30 13:21:07 +02:00
parent 2a6cde7272
commit 7041e0938d
2 changed files with 71 additions and 10 deletions

View File

@ -110,6 +110,20 @@ impl Parser<'_> {
self.with_span(SyntaxNode::Text(text.to_string()))
}
Token::UnicodeEscape(ues) => {
    // `ues` holds the characters between the braces of `\u{...}`.
    // Parsing can legitimately fail on user input: the sequence may
    // be empty (`\u{}`) or have too many digits to fit in a `u32`.
    // Such failures are reported like any other invalid codepoint
    // instead of panicking.
    let parsed = u32::from_str_radix(ues, 16)
        .ok()
        .and_then(std::char::from_u32);

    if let Some(c) = parsed {
        self.with_span(SyntaxNode::Text(c.to_string()))
    } else {
        // Covers unparsable numbers as well as values that are not
        // Unicode scalar values; an empty text node is emitted so
        // the tree stays well-formed.
        error!(@self.feedback, token.span, "invalid unicode codepoint");
        self.with_span(SyntaxNode::Text("".to_string()))
    }
}
unexpected => {
self.eat();
error!(
@ -944,6 +958,7 @@ mod tests {
t!("*hi" => B, T("hi"));
t!("hi_" => T("hi"), I);
t!("hi you" => T("hi"), S, T("you"));
t!("\\u{1f303}" => T("🌃"));
t!("\n\n\nhello" => P, T("hello"));
t!(r"a\ b" => T("a"), L, S, T("b"));
t!("`py`" => R!["py"]);
@ -960,8 +975,9 @@ mod tests {
t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
Some("typst"), " Typst uses ``` to indicate code blocks"
]);
e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode codepoint"));
t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
ts!("hi" => s(0,0, 0,2, T("hi")));

View File

@ -82,6 +82,9 @@ pub enum Token<'s> {
/// A backslash followed by whitespace in text.
Backslash,
/// A unicode escape sequence: holds the hex digits between the braces of `\u{...}`.
UnicodeEscape(&'s str),
/// Raw text.
Raw {
/// The raw text (not yet unescaped as for strings).
@ -136,6 +139,7 @@ impl<'s> Token<'s> {
Star => "star",
Underscore => "underscore",
Backslash => "backslash",
UnicodeEscape(_) => "unicode escape sequence",
Raw { .. } => "raw text",
Code { .. } => "code block",
Text(_) => "text",
@ -426,6 +430,41 @@ impl<'s> Tokens<'s> {
}
match self.peek() {
Some('u') => {
    // Byte offset of the backslash that introduced the escape.
    let start = self.index() - 1;
    self.eat();

    if self.peek() != Some('{') {
        // A lone `\u` without an opening brace is plain text.
        Text("\\u")
    } else {
        self.eat();

        // `stop` trails the last hex digit consumed; a closing brace
        // is eaten but never included in the range.
        let mut stop = self.index();
        let mut malformed = false;
        loop {
            match self.peek() {
                Some('}') => {
                    self.eat();
                    break;
                }
                Some(digit) if digit.is_ascii_hexdigit() => {
                    self.eat();
                    stop = self.index();
                }
                Some(_) => {
                    // Leave the offending character for the next token.
                    malformed = true;
                    break;
                }
                // End of input: the closing brace is optional.
                None => break,
            }
        }

        if malformed {
            // Everything in this range is ASCII (`\u{` plus hex
            // digits), so the byte slice is on char boundaries.
            Text(&self.src[start..stop])
        } else {
            // Skip the three bytes of `\u{` to keep only the digits.
            UnicodeEscape(&self.src[start + 3..stop])
        }
    }
}
Some(c) if is_escapable(c) => {
let index = self.index();
self.eat();
@ -579,6 +618,7 @@ mod tests {
Plus,
Hyphen as Min,
Slash,
UnicodeEscape as UE,
Star,
Text as T,
};
@ -701,14 +741,16 @@ mod tests {
// Checks how backslash escapes are tokenized in body mode.
// NOTE(review): the first eight assertions appear twice in this listing --
// this looks like the pre- and post-change lines of the diff shown together;
// confirm against the real file before dropping either copy.
#[test]
fn tokenize_escaped_symbols() {
// A backslash followed by an escapable symbol yields that symbol as text.
t!(Body, r"\\" => T(r"\"));
t!(Body, r"\[" => T("["));
t!(Body, r"\]" => T("]"));
t!(Body, r"\*" => T("*"));
t!(Body, r"\_" => T("_"));
t!(Body, r"\`" => T("`"));
t!(Body, r"\/" => T("/"));
t!(Body, r#"\""# => T("\""));
t!(Body, r"\\" => T(r"\"));
t!(Body, r"\[" => T("["));
t!(Body, r"\]" => T("]"));
t!(Body, r"\*" => T("*"));
t!(Body, r"\_" => T("_"));
t!(Body, r"\`" => T("`"));
t!(Body, r"\/" => T("/"));
// A `\u{...}` escape yields a UnicodeEscape token carrying only the hex digits.
t!(Body, r"\u{2603}" => UE("2603"));
// The closing brace is optional: an unterminated escape is still accepted.
t!(Body, r"\u{26A4" => UE("26A4"));
t!(Body, r#"\""# => T("\""));
}
#[test]
@ -716,6 +758,9 @@ mod tests {
t!(Body, r"\a" => T("\\"), T("a"));
t!(Body, r"\:" => T(r"\"), T(":"));
t!(Body, r"\=" => T(r"\"), T("="));
t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4"));
t!(Body, r"\u{ " => T(r"\u{"), Space(0));
t!(Body, r"\u" => T(r"\u"));
t!(Header, r"\\\\" => Invalid(r"\\\\"));
t!(Header, r"\a" => Invalid(r"\a"));
t!(Header, r"\:" => Invalid(r"\"), Colon);