Slight simplifications 🍃

This commit is contained in:
Laurenz 2020-08-30 15:16:55 +02:00
parent c043692c96
commit 9861a9583e
2 changed files with 72 additions and 86 deletions

View File

@ -110,17 +110,23 @@ impl Parser<'_> {
self.with_span(SyntaxNode::Text(text.to_string())) self.with_span(SyntaxNode::Text(text.to_string()))
} }
Token::UnicodeEscape(ues) => { Token::UnicodeEscape { sequence, terminated } => {
if let Some(c) = std::char::from_u32( if !terminated {
u32::from_str_radix(ues, 16) error!(
.expect("Unicode escape string not convertible to int") @self.feedback, Span::at(token.span.end),
) { "expected closing brace",
let mut s = String::with_capacity(1); );
s.push(c); }
self.with_span(SyntaxNode::Text(s))
if let Some(c) = unescape_char(sequence) {
self.with_span(SyntaxNode::Text(c.to_string()))
} else { } else {
error!(@self.feedback, token.span, "invalid unicode codepoint"); self.eat();
self.with_span(SyntaxNode::Text("".to_string())) error!(
@self.feedback, token.span,
"invalid unicode escape sequence",
);
continue;
} }
} }
@ -608,7 +614,7 @@ impl Group {
} }
fn unescape_string(string: &str) -> String { fn unescape_string(string: &str) -> String {
let mut iter = string.chars(); let mut iter = string.chars().peekable();
let mut out = String::with_capacity(string.len()); let mut out = String::with_capacity(string.len());
while let Some(c) = iter.next() { while let Some(c) = iter.next() {
@ -616,45 +622,33 @@ fn unescape_string(string: &str) -> String {
match iter.next() { match iter.next() {
Some('\\') => out.push('\\'), Some('\\') => out.push('\\'),
Some('"') => out.push('"'), Some('"') => out.push('"'),
Some('u') => { Some('u') if iter.peek() == Some(&'{') => {
// Index which points to start of escape sequence iter.next();
let mut seen = "\\u".to_string();
let next = iter.next(); let mut sequence = String::new();
if next == Some('{') { let terminated = loop {
seen.push('{'); match iter.peek() {
// TODO: Feedback that closing brace is missing.
Some('}') => {
iter.next();
break true;
}
Some(&c) if c.is_ascii_hexdigit() => {
iter.next();
sequence.push(c);
}
_ => break false,
}
};
let mut valid = true; // TODO: Feedback that escape sequence is wrong.
let mut closed = false; if let Some(c) = unescape_char(&sequence) {
while let Some(c) = iter.next() {
seen.push(c);
if c == '}' {
closed = true;
break;
}
if !c.is_ascii_hexdigit() {
valid = false;
break;
}
}
if valid != false && seen.len() >= 3 {
if let Some(c) = std::char::from_u32(
u32::from_str_radix(&seen[3..seen.len() - if closed { 1 } else { 0 }], 16)
.expect("Unicode escape string not convertible to int")
) {
out.push(c); out.push(c);
} else { } else {
// Somehow provide feedback here that conversion failed? out.push_str("\\u{");
out.push_str(&seen); out.push_str(&sequence);
} if terminated {
} else { out.push('}');
out.push_str(&seen);
}
} else {
out.push_str("\\u");
if let Some(c) = next {
out.push(c);
} }
} }
} }
@ -673,7 +667,7 @@ fn unescape_string(string: &str) -> String {
/// Unescape raw markup and split it into lines. /// Unescape raw markup and split it into lines.
fn unescape_raw(raw: &str) -> Vec<String> { fn unescape_raw(raw: &str) -> Vec<String> {
let mut iter = raw.chars().peekable(); let mut iter = raw.chars();
let mut text = String::new(); let mut text = String::new();
while let Some(c) = iter.next() { while let Some(c) = iter.next() {
@ -761,6 +755,11 @@ fn unescape_code(raw: &str) -> Vec<String> {
split_lines(&text) split_lines(&text)
} }
/// Converts a hexadecimal sequence (without braces or "\u") into a character.
///
/// Returns `None` when the sequence is empty, contains anything other than
/// ASCII hex digits, does not fit into a `u32`, or does not denote a valid
/// Unicode scalar value (for example a surrogate code point).
fn unescape_char(sequence: &str) -> Option<char> {
    // `from_str_radix` tolerates a leading `+` sign, which is not part of a
    // hexadecimal escape sequence, so reject non-hex-digit input up front.
    if !sequence.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }

    // An empty sequence passes the check above but fails to parse here.
    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}
fn split_lines(text: &str) -> Vec<String> { fn split_lines(text: &str) -> Vec<String> {
let mut iter = text.chars().peekable(); let mut iter = text.chars().peekable();
let mut line = String::new(); let mut line = String::new();
@ -947,7 +946,7 @@ mod tests {
test(r#"hello\nworld"#, "hello\nworld"); test(r#"hello\nworld"#, "hello\nworld");
test(r#"a\"bc"#, "a\"bc"); test(r#"a\"bc"#, "a\"bc");
test(r#"a\u{2603}bc"#, "a☃bc"); test(r#"a\u{2603}bc"#, "a☃bc");
test(r#"a\u{26c3bg"#, "a\\u{26c3bg"); test(r#"a\u{26c3bg"#, "a𦰻g");
test(r#"av\u{6797"#, "av林"); test(r#"av\u{6797"#, "av林");
test(r#"a\\"#, "a\\"); test(r#"a\\"#, "a\\");
test(r#"a\\\nbc"#, "a\\\nbc"); test(r#"a\\\nbc"#, "a\\\nbc");
@ -1011,18 +1010,16 @@ mod tests {
e!("`hi\nyou" => s(1,3, 1,3, "expected backtick")); e!("`hi\nyou" => s(1,3, 1,3, "expected backtick"));
t!("`hi\\`du`" => R!["hi`du"]); t!("`hi\\`du`" => R!["hi`du"]);
t!("```java System.out.print```" => C![ t!("```java System.out.print```" => C![Some("java"), "System.out.print"]);
Some("java"), "System.out.print" t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
]);
t!("``` console.log(\n\"alert\"\n)" => C![
None, "console.log(", "\"alert\"", ")"
]);
t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
Some("typst"), " Typst uses ``` to indicate code blocks" Some("typst"), " Typst uses ``` to indicate code blocks"
]); ]);
e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks")); e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier")); e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode codepoint")); e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode escape sequence"));
e!("\\u{abc" => s(0,6, 0,6, "expected closing brace"));
t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
ts!("hi" => s(0,0, 0,2, T("hi"))); ts!("hi" => s(0,0, 0,2, T("hi")));

View File

@ -82,8 +82,13 @@ pub enum Token<'s> {
/// A backslash followed by whitespace in text. /// A backslash followed by whitespace in text.
Backslash, Backslash,
/// A unicode escape sequence /// A unicode escape sequence.
UnicodeEscape(&'s str), UnicodeEscape {
/// The escape sequence between two braces.
sequence: &'s str,
/// Whether the closing brace was present.
terminated: bool,
},
/// Raw text. /// Raw text.
Raw { Raw {
@ -139,7 +144,7 @@ impl<'s> Token<'s> {
Star => "star", Star => "star",
Underscore => "underscore", Underscore => "underscore",
Backslash => "backslash", Backslash => "backslash",
UnicodeEscape(_) => "unicode escape sequence", UnicodeEscape { .. } => "unicode escape sequence",
Raw { .. } => "raw text", Raw { .. } => "raw text",
Code { .. } => "code block", Code { .. } => "code block",
Text(_) => "text", Text(_) => "text",
@ -431,36 +436,20 @@ impl<'s> Tokens<'s> {
match self.peek() { match self.peek() {
Some('u') => { Some('u') => {
// Index which points to start of escape sequence
let index = self.index() - 1;
self.eat(); self.eat();
if self.peek() == Some('{') { if self.peek() == Some('{') {
self.eat(); self.eat();
// This loop will eat all hexadecimal chars and an let sequence = self.read_string_until(
// optional closing brace (brace not in end index range). |c| !c.is_ascii_hexdigit(),
let mut end = self.index(); false, 0, 0,
let mut valid = true; ).0;
while let Some(c) = self.peek() {
if c == '}' { let terminated = self.peek() == Some('}');
if terminated {
self.eat(); self.eat();
break;
} }
if !c.is_ascii_hexdigit() { UnicodeEscape { sequence, terminated }
valid = false;
break;
}
self.eat();
end = self.index();
}
if valid == false {
// There are only 8-bit ASCII chars in that range
Text(&self.src[index..end])
} else {
UnicodeEscape(&self.src[index + 3..end])
}
} else { } else {
Text("\\u") Text("\\u")
} }
@ -618,7 +607,6 @@ mod tests {
Plus, Plus,
Hyphen as Min, Hyphen as Min,
Slash, Slash,
UnicodeEscape as UE,
Star, Star,
Text as T, Text as T,
}; };
@ -628,6 +616,7 @@ mod tests {
fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> { fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> {
Token::Code { lang: lang.map(Spanned::zero), raw, terminated } Token::Code { lang: lang.map(Spanned::zero), raw, terminated }
} }
/// Shorthand for constructing a `Token::UnicodeEscape` in test expectations.
fn UE(sequence: &str, terminated: bool) -> Token {
    Token::UnicodeEscape { sequence, terminated }
}
macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
@ -748,8 +737,8 @@ mod tests {
t!(Body, r"\_" => T("_")); t!(Body, r"\_" => T("_"));
t!(Body, r"\`" => T("`")); t!(Body, r"\`" => T("`"));
t!(Body, r"\/" => T("/")); t!(Body, r"\/" => T("/"));
t!(Body, r"\u{2603}" => UE("2603")); t!(Body, r"\u{2603}" => UE("2603", true));
t!(Body, r"\u{26A4" => UE("26A4")); t!(Body, r"\u{26A4" => UE("26A4", false));
t!(Body, r#"\""# => T("\"")); t!(Body, r#"\""# => T("\""));
} }
@ -758,8 +747,8 @@ mod tests {
t!(Body, r"\a" => T("\\"), T("a")); t!(Body, r"\a" => T("\\"), T("a"));
t!(Body, r"\:" => T(r"\"), T(":")); t!(Body, r"\:" => T(r"\"), T(":"));
t!(Body, r"\=" => T(r"\"), T("=")); t!(Body, r"\=" => T(r"\"), T("="));
t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4")); t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4"));
t!(Body, r"\u{ " => T(r"\u{"), Space(0)); t!(Body, r"\u{ " => UE("", false), Space(0));
t!(Body, r"\u" => T(r"\u")); t!(Body, r"\u" => T(r"\u"));
t!(Header, r"\\\\" => Invalid(r"\\\\")); t!(Header, r"\\\\" => Invalid(r"\\\\"));
t!(Header, r"\a" => Invalid(r"\a")); t!(Header, r"\a" => Invalid(r"\a"));