typst/src/syntax/parsing/escaping.rs
Laurenz 7f8f225cb3 Split up parser into multiple files 🧱
Splits up into:
- escaping: resolving of escape sequences
- parser: the parsing code
- tests: all integrated parsing tests

Also moves Ident from the root syntax module into the tree module.
2020-09-03 19:16:19 +02:00

244 lines
7.4 KiB
Rust

use crate::syntax::tokens::is_newline_char;
/// Resolves all escape sequences in a string.
pub fn unescape_string(string: &str) -> String {
let mut iter = string.chars().peekable();
let mut out = String::with_capacity(string.len());
while let Some(c) = iter.next() {
if c == '\\' {
match iter.next() {
Some('\\') => out.push('\\'),
Some('"') => out.push('"'),
Some('u') if iter.peek() == Some(&'{') => {
iter.next();
let mut sequence = String::new();
let terminated = loop {
match iter.peek() {
// TODO: Feedback that closing brace is missing.
Some('}') => {
iter.next();
break true;
}
Some(&c) if c.is_ascii_hexdigit() => {
iter.next();
sequence.push(c);
}
_ => break false,
}
};
// TODO: Feedback that escape sequence is wrong.
if let Some(c) = hex_to_char(&sequence) {
out.push(c);
} else {
out.push_str("\\u{");
out.push_str(&sequence);
if terminated {
out.push('}');
}
}
}
Some('n') => out.push('\n'),
Some('t') => out.push('\t'),
Some(c) => {
out.push('\\');
out.push(c);
}
None => out.push('\\'),
}
} else {
out.push(c);
}
}
out
}
/// Resolves all escape sequences in raw markup (between backticks) and splits it into
/// into lines.
pub fn unescape_raw(raw: &str) -> Vec<String> {
let mut iter = raw.chars();
let mut text = String::new();
while let Some(c) = iter.next() {
if c == '\\' {
if let Some(c) = iter.next() {
if c != '\\' && c != '`' {
text.push('\\');
}
text.push(c);
} else {
text.push('\\');
}
} else {
text.push(c);
}
}
split_lines(&text)
}
/// Resolves all escape sequences in code markup (between triple backticks) and splits it
/// into into lines.
pub fn unescape_code(raw: &str) -> Vec<String> {
let mut iter = raw.chars().peekable();
let mut text = String::new();
let mut backticks = 0u32;
let mut update_backtick_count;
while let Some(c) = iter.next() {
update_backtick_count = true;
if c == '\\' && backticks > 0 {
let mut tail = String::new();
let mut escape_success = false;
let mut backticks_after_slash = 0u32;
while let Some(&s) = iter.peek() {
match s {
'\\' => {
if backticks_after_slash == 0 {
tail.push('\\');
} else {
// Pattern like `\`\` should fail
// escape and just be printed verbantim.
break;
}
}
'`' => {
tail.push(s);
backticks_after_slash += 1;
if backticks_after_slash == 2 {
escape_success = true;
iter.next();
break;
}
}
_ => break,
}
iter.next();
}
if !escape_success {
text.push(c);
backticks = backticks_after_slash;
update_backtick_count = false;
} else {
backticks = 0;
}
text.push_str(&tail);
} else {
text.push(c);
}
if update_backtick_count {
if c == '`' {
backticks += 1;
} else {
backticks = 0;
}
}
}
split_lines(&text)
}
/// Converts a hexademical sequence (without braces or "\u") into a character.
pub fn hex_to_char(sequence: &str) -> Option<char> {
u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}
/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks).
pub fn split_lines(text: &str) -> Vec<String> {
let mut iter = text.chars().peekable();
let mut line = String::new();
let mut lines = Vec::new();
while let Some(c) = iter.next() {
if is_newline_char(c) {
if c == '\r' && iter.peek() == Some(&'\n') {
iter.next();
}
lines.push(std::mem::take(&mut line));
} else {
line.push(c);
}
}
lines.push(line);
lines
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[rustfmt::skip]
fn test_unescape_strings() {
fn test(string: &str, expected: &str) {
assert_eq!(unescape_string(string), expected.to_string());
}
test(r#"hello world"#, "hello world");
test(r#"hello\nworld"#, "hello\nworld");
test(r#"a\"bc"#, "a\"bc");
test(r#"a\u{2603}bc"#, "a☃bc");
test(r#"a\u{26c3bg"#, "a𦰻g");
test(r#"av\u{6797"#, "av林");
test(r#"a\\"#, "a\\");
test(r#"a\\\nbc"#, "a\\\nbc");
test(r#"a\tbc"#, "a\tbc");
test(r"🌎", "🌎");
test(r"🌎\", r"🌎\");
test(r"\🌎", r"\🌎");
}
#[test]
#[rustfmt::skip]
fn test_unescape_raws() {
fn test(raw: &str, expected: Vec<&str>) {
assert_eq!(unescape_raw(raw), expected);
}
test("raw\\`", vec!["raw`"]);
test("raw\\\\`", vec!["raw\\`"]);
test("raw\ntext", vec!["raw", "text"]);
test("a\r\nb", vec!["a", "b"]);
test("a\n\nb", vec!["a", "", "b"]);
test("a\r\x0Bb", vec!["a", "", "b"]);
test("a\r\n\r\nb", vec!["a", "", "b"]);
test("raw\\a", vec!["raw\\a"]);
test("raw\\", vec!["raw\\"]);
}
#[test]
#[rustfmt::skip]
fn test_unescape_code() {
fn test(raw: &str, expected: Vec<&str>) {
assert_eq!(unescape_code(raw), expected);
}
test("code\\`", vec!["code\\`"]);
test("code`\\``", vec!["code```"]);
test("code`\\`a", vec!["code`\\`a"]);
test("code``hi`\\``", vec!["code``hi```"]);
test("code`\\\\``", vec!["code`\\``"]);
test("code`\\`\\`go", vec!["code`\\`\\`go"]);
test("code`\\`\\``", vec!["code`\\```"]);
test("code\ntext", vec!["code", "text"]);
test("a\r\nb", vec!["a", "b"]);
test("a\n\nb", vec!["a", "", "b"]);
test("a\r\x0Bb", vec!["a", "", "b"]);
test("a\r\n\r\nb", vec!["a", "", "b"]);
test("code\\a", vec!["code\\a"]);
test("code\\", vec!["code\\"]);
}
}