Better tokenization testing 🌋

- Better tokenization test coverage.
- Suffix testing: each test case is tokenized with many different suffixes to ensure that tokens end at the correct position (see the sketch below).
- Improves expression parsing (fixes #3).

Laurenz · 2020-12-16 15:42:02 +01:00
parent 0cfce1de7e · commit 6bbedeaa2c
5 changed files with 532 additions and 243 deletions
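
The suffix-testing idea, as a minimal self-contained sketch (the `check_with_suffixes` harness and its parameters are hypothetical, for illustration only; the commit's actual mechanism is the `t!` macro and `SUFFIXES` table in the tokenizer diff below):

/// Hypothetical harness illustrating suffix testing. `tokenize` stands in
/// for whatever turns source code into tokens; in this crate that would be
/// `Tokens::new(&src, mode).collect()`.
fn check_with_suffixes<T: Clone + PartialEq + std::fmt::Debug>(
    tokenize: impl Fn(&str) -> Vec<T>,
    src: &str,
    expected: &[T],
    suffixes: &[(&str, T)],
) {
    // The bare source must produce exactly the expected tokens.
    assert_eq!(tokenize(src), expected);

    // Appending a suffix may only append that suffix's own token: a
    // tokenizer that ends a token too late swallows the suffix and fails.
    for (suffix, suffix_token) in suffixes {
        let source = format!("{}{}", src, suffix);
        let mut want = expected.to_vec();
        want.push(suffix_token.clone());
        assert_eq!(tokenize(&source), want);
    }
}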

.gitignore

@@ -5,3 +5,4 @@ Cargo.lock
 bench/target
 tests/out
 _things
+tarpaulin-report.html

@@ -66,7 +66,6 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Spanned<SynNode>> {
         // Markup.
         Token::Star => SynNode::Strong,
         Token::Underscore => SynNode::Emph,
-        Token::Backslash => SynNode::Linebreak,
         Token::Hashtag => {
             if at_start {
                 SynNode::Heading(heading(p, start))
@@ -74,9 +73,10 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Spanned<SynNode>> {
                 SynNode::Text(p.eaten_from(start).into())
             }
         }
-        Token::NonBreakingSpace => SynNode::Text("\u{00A0}".into()),
-        Token::Raw(token) => SynNode::Raw(raw(p, token)),
+        Token::Tilde => SynNode::Text("\u{00A0}".into()),
+        Token::Backslash => SynNode::Linebreak,
         Token::UnicodeEscape(token) => SynNode::Text(unicode_escape(p, token, start)),
+        Token::Raw(token) => SynNode::Raw(raw(p, token)),

         // Functions.
         Token::LeftBracket => {


@@ -63,57 +63,79 @@ impl<'s> Iterator for Tokens<'s> {
     /// Parse the next token in the source code.
     fn next(&mut self) -> Option<Self::Item> {
         let start = self.s.index();
-        Some(match self.s.eat()? {
-            // Whitespace with fast path for just a single space.
-            ' ' if !self.s.check(|c| c.is_whitespace()) => Token::Space(0),
-            c if c.is_whitespace() => {
-                self.s.jump(start);
-                self.read_whitespace()
-            }
-
-            // Comments.
-            '/' if self.s.eat_if('/') => self.read_line_comment(),
-            '/' if self.s.eat_if('*') => self.read_block_comment(),
-            '*' if self.s.eat_if('/') => Token::Invalid("*/"),
-
-            // Functions.
-            '[' => Token::LeftBracket,
-            ']' => Token::RightBracket,
-            '{' => Token::LeftBrace,
-            '}' => Token::RightBrace,
-
-            // Syntactic elements in body text.
-            '*' if self.mode == Body => Token::Star,
-            '_' if self.mode == Body => Token::Underscore,
-            '#' if self.mode == Body => Token::Hashtag,
-            '~' if self.mode == Body => Token::NonBreakingSpace,
-            '`' if self.mode == Body => self.read_raw(),
-            '\\' if self.mode == Body => self.read_escaped(),
-
-            // Syntactic elements in headers.
-            '(' if self.mode == Header => Token::LeftParen,
-            ')' if self.mode == Header => Token::RightParen,
-            ':' if self.mode == Header => Token::Colon,
-            ',' if self.mode == Header => Token::Comma,
-            '=' if self.mode == Header => Token::Equals,
-            '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain,
-            '+' if self.mode == Header => Token::Plus,
-            '-' if self.mode == Header => Token::Hyphen,
-            '*' if self.mode == Header => Token::Star,
-            '/' if self.mode == Header => Token::Slash,
-
-            // Expressions in headers.
-            '#' if self.mode == Header => self.read_hex(),
-            '"' if self.mode == Header => self.read_string(),
-
-            // Expressions or just plain text.
-            _ => self.read_text_or_expr(start),
-        })
+        let c = self.s.eat()?;
+
+        // This never loops. It just exists to allow breaking out of it.
+        loop {
+            // Common elements.
+            return Some(match c {
+                // Whitespace.
+                c if c.is_whitespace() => self.whitespace(c, start),
+
+                // Comments.
+                '/' if self.s.eat_if('/') => self.line_comment(),
+                '/' if self.s.eat_if('*') => self.block_comment(),
+                '*' if self.s.eat_if('/') => Token::Invalid("*/"),
+
+                // Functions and blocks.
+                '[' => Token::LeftBracket,
+                ']' => Token::RightBracket,
+                '{' => Token::LeftBrace,
+                '}' => Token::RightBrace,
+
+                _ => break,
+            });
+        }
+
+        Some(match self.mode {
+            Body => match c {
+                // Markup.
+                '*' => Token::Star,
+                '_' => Token::Underscore,
+                '~' => Token::Tilde,
+                '#' => Token::Hashtag,
+                '`' => self.raw(),
+
+                // Escape sequences.
+                '\\' => self.escaped(),
+
+                // Plain text.
+                _ => self.text(start),
+            },
+
+            Header => match c {
+                // Syntactic elements in headers.
+                '(' => Token::LeftParen,
+                ')' => Token::RightParen,
+                ':' => Token::Colon,
+                ',' => Token::Comma,
+                '=' => Token::Equals,
+                '>' if self.s.eat_if('>') => Token::Chain,
+                '+' => Token::Plus,
+                '-' => Token::Hyphen,
+                '*' => Token::Star,
+                '/' => Token::Slash,
+
+                // Expressions in headers.
+                '#' => self.hex(),
+                '"' => self.string(),
+
+                // Expressions.
+                c => self.expr(c, start),
+            },
+        })
     }
 }

 impl<'s> Tokens<'s> {
-    fn read_whitespace(&mut self) -> Token<'s> {
+    fn whitespace(&mut self, first: char, start: usize) -> Token<'s> {
+        // Fast path for just a single space.
+        if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
+            return Token::Space(0);
+        }
+
+        self.s.jump(start);
+
         // Count the number of newlines.
         let mut newlines = 0;
         while let Some(c) = self.s.eat_merging_crlf() {
@@ -130,11 +152,11 @@ impl<'s> Tokens<'s> {
         Token::Space(newlines)
     }

-    fn read_line_comment(&mut self) -> Token<'s> {
+    fn line_comment(&mut self) -> Token<'s> {
         Token::LineComment(self.s.eat_until(is_newline))
     }

-    fn read_block_comment(&mut self) -> Token<'s> {
+    fn block_comment(&mut self) -> Token<'s> {
         let start = self.s.index();
         let mut state = '_';
@@ -164,7 +186,7 @@ impl<'s> Tokens<'s> {
         Token::BlockComment(self.s.get(start .. end))
     }

-    fn read_raw(&mut self) -> Token<'s> {
+    fn raw(&mut self) -> Token<'s> {
         let mut backticks = 1;
         while self.s.eat_if('`') {
             backticks += 1;
@@ -191,10 +213,15 @@ impl<'s> Tokens<'s> {
         })
     }

-    fn read_escaped(&mut self) -> Token<'s> {
+    fn escaped(&mut self) -> Token<'s> {
         if let Some(c) = self.s.peek() {
             match c {
-                '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => {
+                // Backslash and comments.
+                '\\' | '/' |
+                // Parenthesis.
+                '[' | ']' | '{' | '}' |
+                // Markup.
+                '*' | '_' | '~' | '#' | '`' => {
                     let start = self.s.index();
                     self.s.eat_assert(c);
                     Token::Text(&self.s.eaten_from(start))
@@ -203,7 +230,8 @@ impl<'s> Tokens<'s> {
                     self.s.eat_assert('u');
                     self.s.eat_assert('{');
                     Token::UnicodeEscape(TokenUnicodeEscape {
-                        sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()),
+                        // Allow more than `ascii_hexdigit` for better error recovery.
+                        sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
                         terminated: self.s.eat_if('}'),
                     })
                 }
@@ -215,13 +243,35 @@ impl<'s> Tokens<'s> {
         }
     }

-    fn read_hex(&mut self) -> Token<'s> {
-        // This parses more than the permissible 0-9, a-f, A-F character ranges
-        // to provide nicer error messages later.
+    fn text(&mut self, start: usize) -> Token<'s> {
+        while let Some(c) = self.s.eat() {
+            if match c {
+                // Whitespace.
+                c if c.is_whitespace() => true,
+                // Comments.
+                '/' if self.s.check(|c| c == '/' || c == '*') => true,
+                // Parenthesis.
+                '[' | ']' | '{' | '}' => true,
+                // Markup.
+                '*' | '_' | '#' | '~' | '`' => true,
+                // Escaping.
+                '\\' => true,
+                _ => false,
+            } {
+                self.s.uneat();
+                break;
+            }
+        }
+
+        Token::Text(self.s.eaten_from(start))
+    }
+
+    fn hex(&mut self) -> Token<'s> {
+        // Allow more than `ascii_hexdigit` for better error recovery.
         Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric()))
     }

-    fn read_string(&mut self) -> Token<'s> {
+    fn string(&mut self) -> Token<'s> {
         let mut escaped = false;
         Token::Str(TokenStr {
             string: self.s.eat_until(|c| {
@@ -236,29 +286,60 @@ impl<'s> Tokens<'s> {
             })
         })
     }

-    fn read_text_or_expr(&mut self, start: usize) -> Token<'s> {
-        let body = self.mode == Body;
-        let header = self.mode == Header;
-
-        let mut last_was_e = false;
-        self.s.eat_until(|c| {
-            let end = match c {
-                c if c.is_whitespace() => true,
-                '[' | ']' | '{' | '}' | '*' | '/' | '#' => true,
-                '_' | '`' | '~' | '\\' if body => true,
-                '(' | ')' | ':' | ',' | '=' | '"' if header => true,
-                '+' | '-' if header && !last_was_e => true,
-                _ => false,
-            };
-            last_was_e = c == 'e' || c == 'E';
-            end
-        });
-
-        let read = self.s.eaten_from(start);
-        if self.mode == Header {
-            parse_expr(read)
-        } else {
-            Token::Text(read)
-        }
-    }
+    fn expr(&mut self, first: char, start: usize) -> Token<'s> {
+        if is_id_start(first) {
+            self.ident(start)
+        } else if first.is_ascii_digit()
+            || (first == '.' && self.s.check(|c| c.is_ascii_digit()))
+        {
+            self.number(start)
+        } else {
+            Token::Invalid(self.s.eaten_from(start))
+        }
+    }
+
+    fn ident(&mut self, start: usize) -> Token<'s> {
+        self.s.eat_while(is_id_continue);
+        let string = self.s.eaten_from(start);
+        match string {
+            "true" => Token::Bool(true),
+            "false" => Token::Bool(false),
+            _ => Token::Ident(string),
+        }
+    }
+
+    fn number(&mut self, start: usize) -> Token<'s> {
+        self.s.jump(start);
+
+        // Read the integer part.
+        self.s.eat_while(|c| c.is_ascii_digit());
+
+        // Read the fractional part if present.
+        if self.s.eat_if('.') {
+            self.s.eat_while(|c| c.is_ascii_digit());
+        }
+
+        // Read the exponent.
+        if self.s.eat_if('e') || self.s.eat_if('E') {
+            let _ = self.s.eat_if('+') || self.s.eat_if('-');
+            self.s.eat_while(|c| c.is_ascii_digit());
+        }
+
+        // Read the suffix.
+        self.s.eat_while(|c| c == '%' || c.is_ascii_alphanumeric());
+
+        // Parse into one of the suitable types.
+        let string = self.s.eaten_from(start);
+        if let Some(percent) = parse_percent(string) {
+            Token::Percent(percent)
+        } else if let Some((val, unit)) = parse_length(string) {
+            Token::Length(val, unit)
+        } else if let Ok(int) = string.parse::<i64>() {
+            Token::Int(int)
+        } else if let Ok(float) = string.parse::<f64>() {
+            Token::Float(float)
+        } else {
+            Token::Invalid(string)
+        }
+    }
 }
@@ -269,30 +350,12 @@ impl Debug for Tokens<'_> {
     }
 }

-fn parse_expr(text: &str) -> Token<'_> {
-    if let Ok(b) = text.parse::<bool>() {
-        Token::Bool(b)
-    } else if let Ok(int) = text.parse::<i64>() {
-        Token::Int(int)
-    } else if let Ok(num) = text.parse::<f64>() {
-        Token::Float(num)
-    } else if let Some(percent) = parse_percent(text) {
-        Token::Percent(percent)
-    } else if let Some((val, unit)) = parse_length(text) {
-        Token::Length(val, unit)
-    } else if is_ident(text) {
-        Token::Ident(text)
-    } else {
-        Token::Invalid(text)
-    }
-}
-
-fn parse_percent(text: &str) -> Option<f64> {
-    text.strip_suffix('%').and_then(|num| num.parse::<f64>().ok())
+fn parse_percent(string: &str) -> Option<f64> {
+    string.strip_suffix('%').and_then(|prefix| prefix.parse::<f64>().ok())
 }

-fn parse_length(text: &str) -> Option<(f64, Unit)> {
-    let len = text.len();
+fn parse_length(string: &str) -> Option<(f64, Unit)> {
+    let len = string.len();

     // We need at least some number and the unit.
     if len <= 2 {
@@ -302,7 +365,7 @@ fn parse_length(string: &str) -> Option<(f64, Unit)> {
     // We can view the string as bytes since a multibyte UTF-8 char cannot
     // have valid ASCII chars as subbytes.
     let split = len - 2;
-    let bytes = text.as_bytes();
+    let bytes = string.as_bytes();
     let unit = match &bytes[split ..] {
         b"pt" => Unit::Pt,
         b"mm" => Unit::Mm,
@@ -311,7 +374,7 @@ fn parse_length(string: &str) -> Option<(f64, Unit)> {
         _ => return None,
     };

-    text[.. split].parse::<f64>().ok().map(|val| (val, unit))
+    string[.. split].parse::<f64>().ok().map(|val| (val, unit))
 }
 #[cfg(test)]
@@ -321,33 +384,106 @@ mod tests {
     use crate::parse::tests::check;

     use Token::{
-        BlockComment as BC, Hyphen as Min, Ident as Id, LeftBrace as LB,
-        LeftBracket as L, LeftParen as LP, LineComment as LC, NonBreakingSpace as Nbsp,
-        RightBrace as RB, RightBracket as R, RightParen as RP, Space as S, Text as T, *,
+        BlockComment as BC, Ident as Id, LeftBrace as LB, LeftBracket as L,
+        LeftParen as LP, LineComment as LC, RightBrace as RB, RightBracket as R,
+        RightParen as RP, Space as S, Text as T, *,
     };
     use Unit::*;

     fn Str(string: &str, terminated: bool) -> Token {
         Token::Str(TokenStr { string, terminated })
     }
     fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
         Token::Raw(TokenRaw { text, backticks, terminated })
     }
     fn UE(sequence: &str, terminated: bool) -> Token {
         Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated })
     }

+    /// Building blocks for suffix testing.
+    ///
+    /// We extend each test case with a collection of different suffixes to make
+    /// sure tokens end at the correct position. These suffixes are split into
+    /// blocks, which can be disabled/enabled per test case. For example, when
+    /// testing identifiers we disable letter suffixes because these would
+    /// mingle with the identifiers.
+    ///
+    /// Suffix blocks:
+    /// - ' ': spacing
+    /// - 'a': letters
+    /// - '1': numbers
+    /// - '/': symbols
+    const BLOCKS: &str = " a1/";
+
+    /// Suffixes described by four-tuples of:
+    ///
+    /// - block the suffix is part of
+    /// - mode in which the suffix is applicable
+    /// - the suffix string
+    /// - the resulting suffix token
+    const SUFFIXES: &[(char, Option<TokenMode>, &str, Token)] = &[
+        // Whitespace suffixes.
+        (' ', None, " ", S(0)),
+        (' ', None, "\n", S(1)),
+        (' ', None, "\r", S(1)),
+        (' ', None, "\r\n", S(1)),
+        // Letter suffixes.
+        ('a', Some(Body), "hello", T("hello")),
+        ('a', Some(Body), "💚", T("💚")),
+        ('a', Some(Header), "val", Id("val")),
+        ('a', Some(Header), "α", Id("α")),
+        ('a', Some(Header), "_", Id("_")),
+        // Number suffixes.
+        ('1', Some(Header), "2", Int(2)),
+        ('1', Some(Header), ".2", Float(0.2)),
+        // Symbol suffixes.
+        ('/', None, "[", L),
+        ('/', None, "//", LC("")),
+        ('/', None, "/**/", BC("")),
+        ('/', Some(Body), "*", Star),
+        ('/', Some(Body), "_", Underscore),
+        ('/', Some(Body), r"\\", T(r"\")),
+        ('/', Some(Header), "(", LP),
+        ('/', Some(Header), ":", Colon),
+        ('/', Some(Header), "+", Plus),
+        ('/', Some(Header), "#123", Hex("123")),
+    ];
+
     macro_rules! t {
-        ($mode:expr, $src:expr => $($token:expr),*) => {
-            let exp = vec![$($token),*];
-            let found = Tokens::new($src, $mode).collect::<Vec<_>>();
-            check($src, exp, found, false);
-        }
+        (Both $($tts:tt)*) => {
+            t!(Body $($tts)*);
+            t!(Header $($tts)*);
+        };
+        ($mode:ident $([$blocks:literal])?: $src:expr => $($token:expr),*) => {{
+            // Test without suffix.
+            t!(@$mode: $src => $($token),*);
+
+            // Test with each applicable suffix.
+            for &(block, mode, suffix, token) in SUFFIXES {
+                let src = $src;
+                #[allow(unused)]
+                let mut blocks = BLOCKS;
+                $(blocks = $blocks;)?
+                assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
+                if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
+                    t!(@$mode: format!("{}{}", src, suffix) => $($token,)* token);
+                }
+            }
+        }};
+        (@$mode:ident: $src:expr => $($token:expr),*) => {{
+            let src = $src;
+            let exp = vec![$($token),*];
+            let found = Tokens::new(&src, $mode).collect::<Vec<_>>();
+            check(&src, exp, found, false);
+        }};
     }
     #[test]
     fn test_length_from_str_parses_correct_value_and_unit() {
         assert_eq!(parse_length("2.5cm"), Some((2.5, Cm)));
+        assert_eq!(parse_length("1.e+2cm"), Some((100.0, Cm)));
     }

     #[test]
@@ -356,157 +492,305 @@ mod tests {
     }

     #[test]
-    fn tokenize_whitespace() {
-        t!(Body, "" => );
-        t!(Body, " " => S(0));
-        t!(Body, "  " => S(0));
-        t!(Body, "\t" => S(0));
-        t!(Body, " \t" => S(0));
-        t!(Body, "\n" => S(1));
-        t!(Body, "\n " => S(1));
-        t!(Body, " \n" => S(1));
-        t!(Body, " \n " => S(1));
-        t!(Body, "\r\n" => S(1));
-        t!(Body, " \n\t \n " => S(2));
-        t!(Body, "\n\r" => S(2));
-        t!(Body, " \r\r\n \x0D" => S(3));
-        t!(Body, "a~b" => T("a"), Nbsp, T("b"));
+    fn test_tokenize_whitespace() {
+        // Test basic whitespace.
+        t!(Both["a1/"]: "" => );
+        t!(Both["a1/"]: " " => S(0));
+        t!(Both["a1/"]: "    " => S(0));
+        t!(Both["a1/"]: "\t" => S(0));
+        t!(Both["a1/"]: " \t" => S(0));
+        t!(Both["a1/"]: "\u{202F}" => S(0));
+
+        // Test newline counting.
+        t!(Both["a1/"]: "\n" => S(1));
+        t!(Both["a1/"]: "\n " => S(1));
+        t!(Both["a1/"]: " \n" => S(1));
+        t!(Both["a1/"]: " \n " => S(1));
+        t!(Both["a1/"]: "\r\n" => S(1));
+        t!(Both["a1/"]: " \n\t \n " => S(2));
+        t!(Both["a1/"]: "\n\r" => S(2));
+        t!(Both["a1/"]: " \r\r\n \x0D" => S(3));
     }

     #[test]
-    fn tokenize_comments() {
-        t!(Body, "a // bc\n " => T("a"), S(0), LC(" bc"), S(1));
-        t!(Body, "a //a//b\n " => T("a"), S(0), LC("a//b"), S(1));
-        t!(Body, "a //a//b\r\n" => T("a"), S(0), LC("a//b"), S(1));
-        t!(Body, "a //a//b\n\nhello" => T("a"), S(0), LC("a//b"), S(2), T("hello"));
-        t!(Body, "/**/" => BC(""));
-        t!(Body, "_/*_/*a*/*/" => Underscore, BC("_/*a*/"));
-        t!(Body, "/*/*/" => BC("/*/"));
-        t!(Body, "abc*/" => T("abc"), Invalid("*/"));
-        t!(Body, "/***/" => BC("*"));
-        t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/"));
-        t!(Body, "/*abc" => BC("abc"));
-        t!(Body, "/*/*abc*/" => BC("/*abc*/"));
+    fn test_tokenize_line_comments() {
+        // Test line comment with no trailing newline.
+        t!(Both[""]: "//" => LC(""));
+
+        // Test line comment ends at newline.
+        t!(Both["a1/"]: "//bc\n" => LC("bc"), S(1));
+        t!(Both["a1/"]: "// bc \n" => LC(" bc "), S(1));
+        t!(Both["a1/"]: "//bc\r\n" => LC("bc"), S(1));
+
+        // Test nested line comments.
+        t!(Both["a1/"]: "//a//b\n" => LC("a//b"), S(1));
     }

     #[test]
-    fn tokenize_body_tokens() {
-        t!(Body, "a_*" => T("a"), Underscore, Star);
-        t!(Body, "a***" => T("a"), Star, Star, Star);
-        t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
-        t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there"));
-        t!(Body, "# hi" => Hashtag, S(0), T("hi"));
-        t!(Body, "ab# hi" => T("ab"), Hashtag, S(0), T("hi"));
-        t!(Body, "#{}" => Hashtag, LB, RB);
-        t!(Body, "{text}" => LB, Text("text"), RB);
-        t!(Header, "_`" => Invalid("_`"));
+    fn test_tokenize_block_comments() {
+        // Test basic block comments.
+        t!(Both[""]: "/*" => BC(""));
+        t!(Both: "/**/" => BC(""));
+        t!(Both: "/*🏞*/" => BC("🏞"));
+        t!(Both: "/*\n*/" => BC("\n"));
+
+        // Test depth 1 and 2 nested block comments.
+        t!(Both: "/* /* */ */" => BC(" /* */ "));
+        t!(Both: "/*/*/**/*/*/" => BC("/*/**/*/"));
+
+        // Test two nested, one unclosed block comments.
+        t!(Both[""]: "/*/*/**/*/" => BC("/*/**/*/"));
+
+        // Test all combinations of up to two following slashes and stars.
+        t!(Both[""]: "/*" => BC(""));
+        t!(Both[""]: "/*/" => BC("/"));
+        t!(Both[""]: "/**" => BC("*"));
+        t!(Both[""]: "/*//" => BC("//"));
+        t!(Both[""]: "/*/*" => BC("/*"));
+        t!(Both[""]: "/**/" => BC(""));
+        t!(Both[""]: "/***" => BC("**"));
     }

     #[test]
-    fn test_tokenize_raw() {
-        // Basics.
-        t!(Body, "a`raw`" => T("a"), Raw("raw", 1, true));
-        t!(Body, "`[func]`" => Raw("[func]", 1, true));
-        t!(Body, "`]" => Raw("]", 1, false));
-        t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false));
-
-        // Language tag.
-        t!(Body, "``` hi```" => Raw(" hi", 3, true));
-        t!(Body, "```rust hi```" => Raw("rust hi", 3, true));
-        t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false));
-        t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true));
-        t!(Body, "```js \r\n document.write(\"go\")`"
-            => Raw("js \r\n document.write(\"go\")`", 3, false));
-
-        // More backticks.
-        t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi"));
-        t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true));
+    fn test_tokenize_body_tokens() {
+        // Test parentheses.
+        t!(Body: "[" => L);
+        t!(Body: "]" => R);
+        t!(Body: "{" => LB);
+        t!(Body: "}" => RB);
+
+        // Test markup tokens.
+        t!(Body[" a1"]: "*" => Star);
+        t!(Body: "_" => Underscore);
+        t!(Body: "~" => Tilde);
+        t!(Body: "#" => Hashtag);
+        t!(Body[" "]: r"\" => Backslash);
+
+        // Test header symbols.
+        t!(Body[" /"]: ":,=>>/+-" => T(":,=>>/+-"));
     }

     #[test]
-    fn tokenize_escaped_symbols() {
-        t!(Body, r"\\" => T(r"\"));
-        t!(Body, r"\[" => T("["));
-        t!(Body, r"\]" => T("]"));
-        t!(Body, r"\*" => T("*"));
-        t!(Body, r"\_" => T("_"));
-        t!(Body, r"\`" => T("`"));
-        t!(Body, r"\/" => T("/"));
-        t!(Body, r"\u{2603}" => UE("2603", true));
-        t!(Body, r"\u{26A4" => UE("26A4", false));
-        t!(Body, r#"\""# => T("\""));
+    fn test_tokenize_raw_blocks() {
+        // Test basic raw block.
+        t!(Body: "`raw`" => Raw("raw", 1, true));
+        t!(Body[""]: "`]" => Raw("]", 1, false));
+
+        // Test special symbols in raw block.
+        t!(Body: "`[func]`" => Raw("[func]", 1, true));
+        t!(Body[""]: r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false));
+
+        // Test more backticks.
+        t!(Body: "````🚀````" => Raw("🚀", 4, true));
+        t!(Body[""]: "````👩‍🚀``noend" => Raw("👩‍🚀``noend", 4, false));
+        t!(Body[""]: "````raw``````new" => Raw("raw", 4, true), Raw("new", 2, false));
+
+        // Test separated closing backticks.
+        t!(Body: "```not `y`e`t```" => Raw("not `y`e`t", 3, true));
     }

     #[test]
-    fn tokenize_unescapable_symbols() {
-        t!(Body, r"\a" => T("\\"), T("a"));
-        t!(Body, r"\:" => T(r"\"), T(":"));
-        t!(Body, r"\=" => T(r"\"), T("="));
-        t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
-        t!(Body, r"\u{ " => UE("", false), Space(0));
-        t!(Body, r"\u" => T("\\"), T("u"));
-        t!(Header, r"\\\\" => Invalid(r"\\\\"));
-        t!(Header, r"\a" => Invalid(r"\a"));
-        t!(Header, r"\:" => Invalid(r"\"), Colon);
-        t!(Header, r"\=" => Invalid(r"\"), Equals);
-        t!(Header, r"\," => Invalid(r"\"), Comma);
+    fn test_tokenize_escape_sequences() {
+        // Test escapable symbols.
+        t!(Body: r"\\" => T(r"\"));
+        t!(Body: r"\/" => T("/"));
+        t!(Body: r"\[" => T("["));
+        t!(Body: r"\]" => T("]"));
+        t!(Body: r"\{" => T("{"));
+        t!(Body: r"\}" => T("}"));
+        t!(Body: r"\*" => T("*"));
+        t!(Body: r"\_" => T("_"));
+        t!(Body: r"\#" => T("#"));
+        t!(Body: r"\~" => T("~"));
+        t!(Body: r"\`" => T("`"));
+
+        // Test unescapable symbols.
+        t!(Body[" /"]: r"\a" => T(r"\"), T("a"));
+        t!(Body[" /"]: r"\u" => T(r"\"), T("u"));
+        t!(Body[" /"]: r"\1" => T(r"\"), T("1"));
+        t!(Body[" /"]: r"\:" => T(r"\"), T(":"));
+        t!(Body[" /"]: r"\=" => T(r"\"), T("="));
+        t!(Body[" /"]: r#"\""# => T(r"\"), T("\""));
+
+        // Test basic unicode escapes.
+        t!(Body: r"\u{}" => UE("", true));
+        t!(Body: r"\u{2603}" => UE("2603", true));
+        t!(Body: r"\u{P}" => UE("P", true));
+
+        // Test unclosed unicode escapes.
+        t!(Body[" /"]: r"\u{" => UE("", false));
+        t!(Body[" /"]: r"\u{1" => UE("1", false));
+        t!(Body[" /"]: r"\u{26A4" => UE("26A4", false));
+        t!(Body[" /"]: r"\u{1Q3P" => UE("1Q3P", false));
+        t!(Body: r"\u{1🏕}" => UE("1", false), T("🏕"), RB);
     }

     #[test]
-    fn tokenize_header_tokens() {
-        t!(Header, "__main__" => Id("__main__"));
-        t!(Header, "_func_box" => Id("_func_box"));
-        t!(Header, ">main" => Invalid(">main"));
-        t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
-        t!(Header, "{abc}" => LB, Id("abc"), RB);
-        t!(Header, "(1,2)" => LP, Int(1), Comma, Int(2), RP);
-        t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Length(12.0, Pt));
-        t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
-        t!(Header, "=3.15" => Equals, Float(3.15));
-        t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
-        t!(Header, "a:b" => Id("a"), Colon, Id("b"));
-        t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
-        t!(Body, "c=d, " => T("c=d,"), S(0));
-        t!(Body, "a: b" => T("a:"), S(0), T("b"));
-        t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
-            Id("x"), Equals, Int(1));
+    fn test_tokenize_text() {
+        // Test basic text.
+        t!(Body[" /"]: "hello" => T("hello"));
+        t!(Body[" /"]: "hello-world" => T("hello-world"));
+
+        // Test header symbols in text.
+        t!(Body[" /"]: "a():\"b" => T("a():\"b"));
+
+        // Test text ends.
+        t!(Body[""]: "hello " => T("hello"), S(0));
+        t!(Body[""]: "hello~" => T("hello"), Tilde);
     }

     #[test]
-    fn tokenize_numeric_values() {
-        t!(Header, "12.3e5" => Float(12.3e5));
-        t!(Header, "120%" => Percent(120.0));
-        t!(Header, "12e4%" => Percent(120000.0));
-        t!(Header, "1e5in" => Length(100000.0, In));
-        t!(Header, "2.3cm" => Length(2.3, Cm));
-        t!(Header, "02.4mm" => Length(2.4, Mm));
-        t!(Header, "2.4.cm" => Invalid("2.4.cm"));
-        t!(Header, "#6ae6dd" => Hex("6ae6dd"));
-        t!(Header, "#8A083c" => Hex("8A083c"));
+    fn test_tokenize_header_tokens() {
+        // Test parentheses.
+        t!(Header: "[" => L);
+        t!(Header: "]" => R);
+        t!(Header: "{" => LB);
+        t!(Header: "}" => RB);
+        t!(Header: "(" => LP);
+        t!(Header: ")" => RP);
+
+        // Test structural tokens.
+        t!(Header: ":" => Colon);
+        t!(Header: "," => Comma);
+        t!(Header: "=" => Equals);
+        t!(Header: ">>" => Chain);
+        t!(Header: "+" => Plus);
+        t!(Header: "-" => Hyphen);
+        t!(Header[" a1"]: "*" => Star);
+        t!(Header[" a1"]: "/" => Slash);
+
+        // Test hyphen parsed as symbol.
+        t!(Header[" /"]: "-1" => Hyphen, Int(1));
+        t!(Header[" /"]: "-a" => Hyphen, Id("a"));
+        t!(Header[" /"]: "--1" => Hyphen, Hyphen, Int(1));
+        t!(Header[" /"]: "--_a" => Hyphen, Hyphen, Id("_a"));
+        t!(Header[" /"]: "a-b" => Id("a-b"));
+
+        // Test some operations.
+        t!(Header[" /"]: "1+3" => Int(1), Plus, Int(3));
+        t!(Header[" /"]: "1*3" => Int(1), Star, Int(3));
+        t!(Header[" /"]: "1/3" => Int(1), Slash, Int(3));
     }

     #[test]
-    fn tokenize_strings() {
-        t!(Body, "a \"hi\" string" => T("a"), S(0), T("\"hi\""), S(0), T("string"));
-        t!(Header, "\"hello" => Str("hello", false));
-        t!(Header, "\"hello world\"" => Str("hello world", true));
-        t!(Header, "\"hello\nworld\"" => Str("hello\nworld", true));
-        t!(Header, r#"1"hello\nworld"false"# => Int(1), Str("hello\\nworld", true), Bool(false));
-        t!(Header, r#""a\"bc""# => Str(r#"a\"bc"#, true));
-        t!(Header, r#""a\\"bc""# => Str(r#"a\\"#, true), Id("bc"), Str("", false));
-        t!(Header, r#""a\tbc"# => Str("a\\tbc", false));
-        t!(Header, "\"🌎\"" => Str("🌎", true));
+    fn test_tokenize_idents() {
+        // Test valid identifiers.
+        t!(Header[" /"]: "x" => Id("x"));
+        t!(Header[" /"]: "value" => Id("value"));
+        t!(Header[" /"]: "__main__" => Id("__main__"));
+        t!(Header[" /"]: "_snake_case" => Id("_snake_case"));

+        // Test non-ascii.
+        t!(Header[" /"]: "α" => Id("α"));
+        t!(Header[" /"]: "ម្តាយ" => Id("ម្តាយ"));
+
+        // Test hyphen parsed as identifier.
+        t!(Header[" /"]: "kebab-case" => Id("kebab-case"));
+        t!(Header[" /"]: "one-10" => Id("one-10"));
     }

     #[test]
-    fn tokenize_math() {
-        t!(Header, "12e-3in" => Length(12e-3, In));
-        t!(Header, "-1" => Min, Int(1));
-        t!(Header, "--1" => Min, Min, Int(1));
-        t!(Header, "- 1" => Min, S(0), Int(1));
-        t!(Header, "6.1cm + 4pt,a=1*2" => Length(6.1, Cm), S(0), Plus, S(0), Length(4.0, Pt),
-            Comma, Id("a"), Equals, Int(1), Star, Int(2));
-        t!(Header, "(5 - 1) / 2.1" => LP, Int(5), S(0), Min, S(0), Int(1), RP,
-            S(0), Slash, S(0), Float(2.1));
+    fn test_tokenize_bools() {
+        // Test valid bools.
+        t!(Header[" /"]: "false" => Bool(false));
+        t!(Header[" /"]: "true" => Bool(true));
+
+        // Test invalid bools.
+        t!(Header[" /"]: "True" => Id("True"));
+        t!(Header[" /"]: "falser" => Id("falser"));
+    }
+
+    #[test]
+    fn test_tokenize_numeric_values() {
+        let ints = [("7", 7), ("012", 12)];
+        let floats = [
+            (".3", 0.3),
+            ("0.3", 0.3),
+            ("3.", 3.0),
+            ("3.0", 3.0),
+            ("14.3", 14.3),
+            ("10e2", 1000.0),
+            ("10e+0", 10.0),
+            ("10e+1", 100.0),
+            ("10e-2", 0.1),
+            ("10.e1", 100.0),
+            ("10.e-1", 1.0),
+            (".1e1", 1.0),
+            ("10E2", 1000.0),
+        ];
+
+        // Test integers.
+        for &(s, v) in &ints {
+            t!(Header[" /"]: s => Int(v));
+        }
+
+        // Test floats.
+        for &(s, v) in &floats {
+            t!(Header[" /"]: s => Float(v));
+        }
+
+        // Test attached numbers.
+        t!(Header[" /"]: "1.2.3" => Float(1.2), Float(0.3));
+        t!(Header[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));
+
+        // Test float from too large integer.
+        let large = i64::MAX as f64 + 1.0;
+        t!(Header[" /"]: large.to_string() => Float(large));
+
+        // Combined integers and floats.
+        let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats.iter().copied());
+
+        // Test percentages.
+        for (s, v) in nums.clone() {
+            t!(Header[" /"]: format!("{}%", s) => Percent(v));
+        }
+
+        // Test lengths.
+        for &unit in &[Unit::Mm, Unit::Pt, Unit::Cm, Unit::In] {
+            for (s, v) in nums.clone() {
+                t!(Header[" /"]: format!("{}{}", s, unit) => Length(v, unit));
+            }
+        }
+    }
+
+    #[test]
+    fn test_tokenize_hex() {
+        // Test basic hex expressions.
+        t!(Header[" /"]: "#6ae6dd" => Hex("6ae6dd"));
+        t!(Header[" /"]: "#8A083c" => Hex("8A083c"));
+
+        // Test with non-hex letters.
+        t!(Header[" /"]: "#PQ" => Hex("PQ"));
+    }
+
+    #[test]
+    fn test_tokenize_strings() {
+        // Test basic strings.
+        t!(Header: "\"hi\"" => Str("hi", true));
+        t!(Header: "\"hi\nthere\"" => Str("hi\nthere", true));
+        t!(Header: "\"🌎\"" => Str("🌎", true));
+        t!(Header[""]: "\"hi" => Str("hi", false));
+
+        // Test escaped quote.
+        t!(Header: r#""a\"bc""# => Str(r#"a\"bc"#, true));
+        t!(Header[""]: r#""\""# => Str(r#"\""#, false));
+    }
+
+    #[test]
+    fn test_tokenize_invalid() {
+        // Test invalidly closed block comments.
+        t!(Both: "*/" => Invalid("*/"));
+        t!(Both: "/**/*/" => BC(""), Invalid("*/"));
+
+        // Test invalid expressions.
+        t!(Header: r"\" => Invalid(r"\"));
+        t!(Header: "🌓" => Invalid("🌓"));
+        t!(Header: r"\:" => Invalid(r"\"), Colon);
+        t!(Header: "meal⌚" => Id("meal"), Invalid("⌚"));
+        t!(Header[" /"]: r"\a" => Invalid(r"\"), Id("a"));
+        t!(Header[" /"]: ">main" => Invalid(">"), Id("main"));
+
+        // Test invalid number suffixes.
+        t!(Header[" /"]: "1foo" => Invalid("1foo"));
     }
 }
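
To make the macro above concrete: an invocation like `t!(Header[" /"]: "x" => Id("x"))` first checks the bare source, then re-checks it once per suffix whose mode matches and whose block is enabled. The blocks `" /"` enable only spacing and symbol suffixes, so letter and number suffixes cannot mingle with the identifier under test. Roughly hand-expanded, this runs the following (an illustrative sketch, not the literal macro expansion):

// Sketch of what `t!(Header[" /"]: "x" => Id("x"))` effectively runs.
{
    // Bare source.
    let found = Tokens::new("x", Header).collect::<Vec<_>>();
    check("x", vec![Id("x")], found, false);

    // A suffix from the ' ' block: the newline must come out as its own
    // S(1) token instead of being swallowed by the identifier.
    let found = Tokens::new("x\n", Header).collect::<Vec<_>>();
    check("x\n", vec![Id("x"), S(1)], found, false);

    // A suffix from the '/' block: `(` must terminate the identifier.
    let found = Tokens::new("x(", Header).collect::<Vec<_>>();
    check("x(", vec![Id("x"), LP], found, false);
}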


@@ -46,13 +46,17 @@ impl Deref for Ident {
 /// Whether the string is a valid identifier.
 pub fn is_ident(string: &str) -> bool {
     let mut chars = string.chars();
-    if matches!(chars.next(), Some(c) if c.is_xid_start() || is_also_ok(c)) {
-        chars.all(|c| c.is_xid_continue() || is_also_ok(c))
-    } else {
-        false
-    }
+    chars
+        .next()
+        .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
 }

-fn is_also_ok(c: char) -> bool {
-    c == '-' || c == '_'
+/// Whether the character can start an identifier.
+pub fn is_id_start(c: char) -> bool {
+    c.is_xid_start() || c == '_'
+}
+
+/// Whether the character can continue an identifier.
+pub fn is_id_continue(c: char) -> bool {
+    c.is_xid_continue() || c == '_' || c == '-'
 }
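
A few illustrative assertions for the new predicates (hypothetical, but consistent with the rules above and with `test_tokenize_idents` in this commit):

// `_` may start an identifier; `-` may only continue one.
assert!(is_ident("_snake_case"));
assert!(is_ident("kebab-case"));
assert!(is_ident("α"));          // XID_Start characters are fine.
assert!(!is_ident("-dash"));     // `-` cannot start an identifier.
assert!(!is_ident("one two"));   // Whitespace never continues one.
assert!(!is_ident(""));          // The empty string is not an identifier.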


@@ -24,16 +24,16 @@ pub enum Token<'s> {
     Star,
     /// An underscore: `_`.
     Underscore,
-    /// A backslash followed by whitespace: `\`.
-    Backslash,
     /// A hashtag indicating a section heading: `#`.
     Hashtag,
-    /// A non-breaking space: `~`.
-    NonBreakingSpace,
-    /// A raw block: `` `...` ``.
-    Raw(TokenRaw<'s>),
+    /// A tilde: `~`.
+    Tilde,
+    /// A backslash followed by whitespace: `\`.
+    Backslash,
     /// A unicode escape sequence: `\u{1F5FA}`.
     UnicodeEscape(TokenUnicodeEscape<'s>),
+    /// A raw block: `` `...` ``.
+    Raw(TokenRaw<'s>),

     /// A left bracket: `[`.
     LeftBracket,
@@ -134,7 +134,7 @@ impl<'s> Token<'s> {
             Self::Underscore => "underscore",
             Self::Backslash => "backslash",
             Self::Hashtag => "hashtag",
-            Self::NonBreakingSpace => "non-breaking space",
+            Self::Tilde => "tilde",
             Self::Raw { .. } => "raw block",
             Self::UnicodeEscape { .. } => "unicode escape sequence",