Better tokenization testing 🌋

- Better tokenization test coverage.
- Suffix testing: each test case is tokenized with many different suffixes to ensure that tokens end at the correct position (see the sketch below).
- Improves expression parsing (fixes #3).

Laurenz · 2020-12-16 15:42:02 +01:00
parent 0cfce1de7e · commit 6bbedeaa2c
5 changed files with 532 additions and 243 deletions
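
The suffix-testing idea, as a minimal self-contained sketch (the `check_with_suffixes` harness and its parameters are hypothetical, for illustration only; the commit's actual mechanism is the `t!` macro and `SUFFIXES` table in the tokenizer diff below):

/// Hypothetical harness illustrating suffix testing. `tokenize` stands in
/// for whatever turns source code into tokens; in this crate that would be
/// `Tokens::new(&src, mode).collect()`.
fn check_with_suffixes<T: Clone + PartialEq + std::fmt::Debug>(
    tokenize: impl Fn(&str) -> Vec<T>,
    src: &str,
    expected: &[T],
    suffixes: &[(&str, T)],
) {
    // The bare source must produce exactly the expected tokens.
    assert_eq!(tokenize(src), expected);

    // Appending a suffix may only append that suffix's own token: a
    // tokenizer that ends a token too late swallows the suffix and fails.
    for (suffix, suffix_token) in suffixes {
        let source = format!("{}{}", src, suffix);
        let mut want = expected.to_vec();
        want.push(suffix_token.clone());
        assert_eq!(tokenize(&source), want);
    }
}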

.gitignore

@@ -5,3 +5,4 @@ Cargo.lock
 bench/target
 tests/out
 _things
+tarpaulin-report.html

@@ -66,7 +66,6 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Spanned<SynNode>> {
         // Markup.
         Token::Star => SynNode::Strong,
         Token::Underscore => SynNode::Emph,
-        Token::Backslash => SynNode::Linebreak,
         Token::Hashtag => {
             if at_start {
                 SynNode::Heading(heading(p, start))
@@ -74,9 +73,10 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Spanned<SynNode>> {
                 SynNode::Text(p.eaten_from(start).into())
             }
         }
-        Token::NonBreakingSpace => SynNode::Text("\u{00A0}".into()),
-        Token::Raw(token) => SynNode::Raw(raw(p, token)),
+        Token::Tilde => SynNode::Text("\u{00A0}".into()),
+        Token::Backslash => SynNode::Linebreak,
         Token::UnicodeEscape(token) => SynNode::Text(unicode_escape(p, token, start)),
+        Token::Raw(token) => SynNode::Raw(raw(p, token)),

         // Functions.
         Token::LeftBracket => {


@@ -63,57 +63,79 @@ impl<'s> Iterator for Tokens<'s> {
     /// Parse the next token in the source code.
     fn next(&mut self) -> Option<Self::Item> {
         let start = self.s.index();
-        Some(match self.s.eat()? {
-            // Whitespace with fast path for just a single space.
-            ' ' if !self.s.check(|c| c.is_whitespace()) => Token::Space(0),
-            c if c.is_whitespace() => {
-                self.s.jump(start);
-                self.read_whitespace()
-            }
-
-            // Comments.
-            '/' if self.s.eat_if('/') => self.read_line_comment(),
-            '/' if self.s.eat_if('*') => self.read_block_comment(),
-            '*' if self.s.eat_if('/') => Token::Invalid("*/"),
-
-            // Functions.
-            '[' => Token::LeftBracket,
-            ']' => Token::RightBracket,
-            '{' => Token::LeftBrace,
-            '}' => Token::RightBrace,
-
-            // Syntactic elements in body text.
-            '*' if self.mode == Body => Token::Star,
-            '_' if self.mode == Body => Token::Underscore,
-            '#' if self.mode == Body => Token::Hashtag,
-            '~' if self.mode == Body => Token::NonBreakingSpace,
-            '`' if self.mode == Body => self.read_raw(),
-            '\\' if self.mode == Body => self.read_escaped(),
-
-            // Syntactic elements in headers.
-            '(' if self.mode == Header => Token::LeftParen,
-            ')' if self.mode == Header => Token::RightParen,
-            ':' if self.mode == Header => Token::Colon,
-            ',' if self.mode == Header => Token::Comma,
-            '=' if self.mode == Header => Token::Equals,
-            '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain,
-            '+' if self.mode == Header => Token::Plus,
-            '-' if self.mode == Header => Token::Hyphen,
-            '*' if self.mode == Header => Token::Star,
-            '/' if self.mode == Header => Token::Slash,
-
-            // Expressions in headers.
-            '#' if self.mode == Header => self.read_hex(),
-            '"' if self.mode == Header => self.read_string(),
-
-            // Expressions or just plain text.
-            _ => self.read_text_or_expr(start),
-        })
+        let c = self.s.eat()?;
+
+        // This never loops. It just exists to allow breaking out of it.
+        loop {
+            // Common elements.
+            return Some(match c {
+                // Whitespace.
+                c if c.is_whitespace() => self.whitespace(c, start),
+
+                // Comments.
+                '/' if self.s.eat_if('/') => self.line_comment(),
+                '/' if self.s.eat_if('*') => self.block_comment(),
+                '*' if self.s.eat_if('/') => Token::Invalid("*/"),
+
+                // Functions and blocks.
+                '[' => Token::LeftBracket,
+                ']' => Token::RightBracket,
+                '{' => Token::LeftBrace,
+                '}' => Token::RightBrace,
+
+                _ => break,
+            });
+        }
+
+        Some(match self.mode {
+            Body => match c {
+                // Markup.
+                '*' => Token::Star,
+                '_' => Token::Underscore,
+                '~' => Token::Tilde,
+                '#' => Token::Hashtag,
+                '`' => self.raw(),
+
+                // Escape sequences.
+                '\\' => self.escaped(),
+
+                // Plain text.
+                _ => self.text(start),
+            },
+
+            Header => match c {
+                // Syntactic elements in headers.
+                '(' => Token::LeftParen,
+                ')' => Token::RightParen,
+                ':' => Token::Colon,
+                ',' => Token::Comma,
+                '=' => Token::Equals,
+                '>' if self.s.eat_if('>') => Token::Chain,
+                '+' => Token::Plus,
+                '-' => Token::Hyphen,
+                '*' => Token::Star,
+                '/' => Token::Slash,
+
+                // Expressions in headers.
+                '#' => self.hex(),
+                '"' => self.string(),
+
+                // Expressions.
+                c => self.expr(c, start),
+            },
+        })
     }
 }

 impl<'s> Tokens<'s> {
-    fn read_whitespace(&mut self) -> Token<'s> {
+    fn whitespace(&mut self, first: char, start: usize) -> Token<'s> {
+        // Fast path for just a single space.
+        if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
+            return Token::Space(0);
+        }
+
+        self.s.jump(start);
+
         // Count the number of newlines.
         let mut newlines = 0;
         while let Some(c) = self.s.eat_merging_crlf() {
@@ -130,11 +152,11 @@ impl<'s> Tokens<'s> {
         Token::Space(newlines)
     }

-    fn read_line_comment(&mut self) -> Token<'s> {
+    fn line_comment(&mut self) -> Token<'s> {
         Token::LineComment(self.s.eat_until(is_newline))
     }

-    fn read_block_comment(&mut self) -> Token<'s> {
+    fn block_comment(&mut self) -> Token<'s> {
         let start = self.s.index();
         let mut state = '_';
@@ -164,7 +186,7 @@ impl<'s> Tokens<'s> {
         Token::BlockComment(self.s.get(start .. end))
     }

-    fn read_raw(&mut self) -> Token<'s> {
+    fn raw(&mut self) -> Token<'s> {
         let mut backticks = 1;
         while self.s.eat_if('`') {
             backticks += 1;
@@ -191,10 +213,15 @@ impl<'s> Tokens<'s> {
         })
     }

-    fn read_escaped(&mut self) -> Token<'s> {
+    fn escaped(&mut self) -> Token<'s> {
         if let Some(c) = self.s.peek() {
             match c {
-                '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => {
+                // Backslash and comments.
+                '\\' | '/' |
+                // Parenthesis.
+                '[' | ']' | '{' | '}' |
+                // Markup.
+                '*' | '_' | '~' | '#' | '`' => {
                     let start = self.s.index();
                     self.s.eat_assert(c);
                     Token::Text(&self.s.eaten_from(start))
@@ -203,7 +230,8 @@ impl<'s> Tokens<'s> {
                     self.s.eat_assert('u');
                     self.s.eat_assert('{');
                     Token::UnicodeEscape(TokenUnicodeEscape {
-                        sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()),
+                        // Allow more than `ascii_hexdigit` for better error recovery.
+                        sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
                         terminated: self.s.eat_if('}'),
                     })
                 }
@@ -215,13 +243,35 @@ impl<'s> Tokens<'s> {
         }
     }

-    fn read_hex(&mut self) -> Token<'s> {
-        // This parses more than the permissible 0-9, a-f, A-F character ranges
-        // to provide nicer error messages later.
+    fn text(&mut self, start: usize) -> Token<'s> {
+        while let Some(c) = self.s.eat() {
+            if match c {
+                // Whitespace.
+                c if c.is_whitespace() => true,
+                // Comments.
+                '/' if self.s.check(|c| c == '/' || c == '*') => true,
+                // Parenthesis.
+                '[' | ']' | '{' | '}' => true,
+                // Markup.
+                '*' | '_' | '#' | '~' | '`' => true,
+                // Escaping.
+                '\\' => true,
+                _ => false,
+            } {
+                self.s.uneat();
+                break;
+            }
+        }
+
+        Token::Text(self.s.eaten_from(start))
+    }
+
+    fn hex(&mut self) -> Token<'s> {
+        // Allow more than `ascii_hexdigit` for better error recovery.
         Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric()))
     }

-    fn read_string(&mut self) -> Token<'s> {
+    fn string(&mut self) -> Token<'s> {
         let mut escaped = false;
         Token::Str(TokenStr {
             string: self.s.eat_until(|c| {
@@ -236,29 +286,60 @@ impl<'s> Tokens<'s> {
             })
         })
     }

-    fn read_text_or_expr(&mut self, start: usize) -> Token<'s> {
-        let body = self.mode == Body;
-        let header = self.mode == Header;
-
-        let mut last_was_e = false;
-        self.s.eat_until(|c| {
-            let end = match c {
-                c if c.is_whitespace() => true,
-                '[' | ']' | '{' | '}' | '*' | '/' | '#' => true,
-                '_' | '`' | '~' | '\\' if body => true,
-                '(' | ')' | ':' | ',' | '=' | '"' if header => true,
-                '+' | '-' if header && !last_was_e => true,
-                _ => false,
-            };
-            last_was_e = c == 'e' || c == 'E';
-            end
-        });
-
-        let read = self.s.eaten_from(start);
-        if self.mode == Header {
-            parse_expr(read)
-        } else {
-            Token::Text(read)
-        }
-    }
+    fn expr(&mut self, first: char, start: usize) -> Token<'s> {
+        if is_id_start(first) {
+            self.ident(start)
+        } else if first.is_ascii_digit()
+            || (first == '.' && self.s.check(|c| c.is_ascii_digit()))
+        {
+            self.number(start)
+        } else {
+            Token::Invalid(self.s.eaten_from(start))
+        }
+    }
+
+    fn ident(&mut self, start: usize) -> Token<'s> {
+        self.s.eat_while(is_id_continue);
+        let string = self.s.eaten_from(start);
+        match string {
+            "true" => Token::Bool(true),
+            "false" => Token::Bool(false),
+            _ => Token::Ident(string),
+        }
+    }
+
+    fn number(&mut self, start: usize) -> Token<'s> {
+        self.s.jump(start);
+
+        // Read the integer part.
+        self.s.eat_while(|c| c.is_ascii_digit());
+
+        // Read the fractional part if present.
+        if self.s.eat_if('.') {
+            self.s.eat_while(|c| c.is_ascii_digit());
+        }
+
+        // Read the exponent.
+        if self.s.eat_if('e') || self.s.eat_if('E') {
+            let _ = self.s.eat_if('+') || self.s.eat_if('-');
+            self.s.eat_while(|c| c.is_ascii_digit());
+        }
+
+        // Read the suffix.
+        self.s.eat_while(|c| c == '%' || c.is_ascii_alphanumeric());
+
+        // Parse into one of the suitable types.
+        let string = self.s.eaten_from(start);
+        if let Some(percent) = parse_percent(string) {
+            Token::Percent(percent)
+        } else if let Some((val, unit)) = parse_length(string) {
+            Token::Length(val, unit)
+        } else if let Ok(int) = string.parse::<i64>() {
+            Token::Int(int)
+        } else if let Ok(float) = string.parse::<f64>() {
+            Token::Float(float)
+        } else {
+            Token::Invalid(string)
+        }
+    }
 }
@@ -269,30 +350,12 @@ impl Debug for Tokens<'_> {
     }
 }

-fn parse_expr(text: &str) -> Token<'_> {
-    if let Ok(b) = text.parse::<bool>() {
-        Token::Bool(b)
-    } else if let Ok(int) = text.parse::<i64>() {
-        Token::Int(int)
-    } else if let Ok(num) = text.parse::<f64>() {
-        Token::Float(num)
-    } else if let Some(percent) = parse_percent(text) {
-        Token::Percent(percent)
-    } else if let Some((val, unit)) = parse_length(text) {
-        Token::Length(val, unit)
-    } else if is_ident(text) {
-        Token::Ident(text)
-    } else {
-        Token::Invalid(text)
-    }
-}
-
-fn parse_percent(text: &str) -> Option<f64> {
-    text.strip_suffix('%').and_then(|num| num.parse::<f64>().ok())
+fn parse_percent(string: &str) -> Option<f64> {
+    string.strip_suffix('%').and_then(|prefix| prefix.parse::<f64>().ok())
 }

-fn parse_length(text: &str) -> Option<(f64, Unit)> {
-    let len = text.len();
+fn parse_length(string: &str) -> Option<(f64, Unit)> {
+    let len = string.len();

     // We need at least some number and the unit.
     if len <= 2 {
@@ -302,7 +365,7 @@ fn parse_length(string: &str) -> Option<(f64, Unit)> {
     // We can view the string as bytes since a multibyte UTF-8 char cannot
     // have valid ASCII chars as subbytes.
     let split = len - 2;
-    let bytes = text.as_bytes();
+    let bytes = string.as_bytes();
     let unit = match &bytes[split ..] {
         b"pt" => Unit::Pt,
         b"mm" => Unit::Mm,
@@ -311,7 +374,7 @@ fn parse_length(string: &str) -> Option<(f64, Unit)> {
         _ => return None,
     };

-    text[.. split].parse::<f64>().ok().map(|val| (val, unit))
+    string[.. split].parse::<f64>().ok().map(|val| (val, unit))
 }
 #[cfg(test)]
@@ -321,33 +384,106 @@ mod tests {
     use crate::parse::tests::check;

     use Token::{
-        BlockComment as BC, Hyphen as Min, Ident as Id, LeftBrace as LB,
-        LeftBracket as L, LeftParen as LP, LineComment as LC, NonBreakingSpace as Nbsp,
-        RightBrace as RB, RightBracket as R, RightParen as RP, Space as S, Text as T, *,
+        BlockComment as BC, Ident as Id, LeftBrace as LB, LeftBracket as L,
+        LeftParen as LP, LineComment as LC, RightBrace as RB, RightBracket as R,
+        RightParen as RP, Space as S, Text as T, *,
     };
     use Unit::*;

     fn Str(string: &str, terminated: bool) -> Token {
         Token::Str(TokenStr { string, terminated })
     }
     fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
         Token::Raw(TokenRaw { text, backticks, terminated })
     }
     fn UE(sequence: &str, terminated: bool) -> Token {
         Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated })
     }

+    /// Building blocks for suffix testing.
+    ///
+    /// We extend each test case with a collection of different suffixes to make
+    /// sure tokens end at the correct position. These suffixes are split into
+    /// blocks, which can be disabled/enabled per test case. For example, when
+    /// testing identifiers we disable letter suffixes because these would
+    /// mingle with the identifiers.
+    ///
+    /// Suffix blocks:
+    /// - ' ': spacing
+    /// - 'a': letters
+    /// - '1': numbers
+    /// - '/': symbols
+    const BLOCKS: &str = " a1/";
+
+    /// Suffixes described by four-tuples of:
+    ///
+    /// - block the suffix is part of
+    /// - mode in which the suffix is applicable
+    /// - the suffix string
+    /// - the resulting suffix token
+    const SUFFIXES: &[(char, Option<TokenMode>, &str, Token)] = &[
+        // Whitespace suffixes.
+        (' ', None, " ", S(0)),
+        (' ', None, "\n", S(1)),
+        (' ', None, "\r", S(1)),
+        (' ', None, "\r\n", S(1)),
+        // Letter suffixes.
+        ('a', Some(Body), "hello", T("hello")),
+        ('a', Some(Body), "💚", T("💚")),
+        ('a', Some(Header), "val", Id("val")),
+        ('a', Some(Header), "α", Id("α")),
+        ('a', Some(Header), "_", Id("_")),
+        // Number suffixes.
+        ('1', Some(Header), "2", Int(2)),
+        ('1', Some(Header), ".2", Float(0.2)),
+        // Symbol suffixes.
+        ('/', None, "[", L),
+        ('/', None, "//", LC("")),
+        ('/', None, "/**/", BC("")),
+        ('/', Some(Body), "*", Star),
+        ('/', Some(Body), "_", Underscore),
+        ('/', Some(Body), r"\\", T(r"\")),
+        ('/', Some(Header), "(", LP),
+        ('/', Some(Header), ":", Colon),
+        ('/', Some(Header), "+", Plus),
+        ('/', Some(Header), "#123", Hex("123")),
+    ];
+
     macro_rules! t {
-        ($mode:expr, $src:expr => $($token:expr),*) => {
-            let exp = vec![$($token),*];
-            let found = Tokens::new($src, $mode).collect::<Vec<_>>();
-            check($src, exp, found, false);
-        }
+        (Both $($tts:tt)*) => {
+            t!(Body $($tts)*);
+            t!(Header $($tts)*);
+        };
+        ($mode:ident $([$blocks:literal])?: $src:expr => $($token:expr),*) => {{
+            // Test without suffix.
+            t!(@$mode: $src => $($token),*);
+
+            // Test with each applicable suffix.
+            for &(block, mode, suffix, token) in SUFFIXES {
+                let src = $src;
+                #[allow(unused)]
+                let mut blocks = BLOCKS;
+                $(blocks = $blocks;)?
+                assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
+                if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
+                    t!(@$mode: format!("{}{}", src, suffix) => $($token,)* token);
+                }
+            }
+        }};
+        (@$mode:ident: $src:expr => $($token:expr),*) => {{
+            let src = $src;
+            let exp = vec![$($token),*];
+            let found = Tokens::new(&src, $mode).collect::<Vec<_>>();
+            check(&src, exp, found, false);
+        }};
     }
     #[test]
     fn test_length_from_str_parses_correct_value_and_unit() {
         assert_eq!(parse_length("2.5cm"), Some((2.5, Cm)));
+        assert_eq!(parse_length("1.e+2cm"), Some((100.0, Cm)));
     }

     #[test]
@@ -356,157 +492,305 @@ mod tests {
     }

     #[test]
-    fn tokenize_whitespace() {
-        t!(Body, "" => );
-        t!(Body, " " => S(0));
-        t!(Body, "  " => S(0));
-        t!(Body, "\t" => S(0));
-        t!(Body, " \t" => S(0));
-        t!(Body, "\n" => S(1));
-        t!(Body, "\n " => S(1));
-        t!(Body, " \n" => S(1));
-        t!(Body, " \n " => S(1));
-        t!(Body, "\r\n" => S(1));
-        t!(Body, " \n\t \n " => S(2));
-        t!(Body, "\n\r" => S(2));
-        t!(Body, " \r\r\n \x0D" => S(3));
-        t!(Body, "a~b" => T("a"), Nbsp, T("b"));
+    fn test_tokenize_whitespace() {
+        // Test basic whitespace.
+        t!(Both["a1/"]: "" => );
+        t!(Both["a1/"]: " " => S(0));
+        t!(Both["a1/"]: "    " => S(0));
+        t!(Both["a1/"]: "\t" => S(0));
+        t!(Both["a1/"]: " \t" => S(0));
+        t!(Both["a1/"]: "\u{202F}" => S(0));
+
+        // Test newline counting.
+        t!(Both["a1/"]: "\n" => S(1));
+        t!(Both["a1/"]: "\n " => S(1));
+        t!(Both["a1/"]: " \n" => S(1));
+        t!(Both["a1/"]: " \n " => S(1));
+        t!(Both["a1/"]: "\r\n" => S(1));
+        t!(Both["a1/"]: " \n\t \n " => S(2));
+        t!(Both["a1/"]: "\n\r" => S(2));
+        t!(Both["a1/"]: " \r\r\n \x0D" => S(3));
     }

     #[test]
-    fn tokenize_comments() {
-        t!(Body, "a // bc\n " => T("a"), S(0), LC(" bc"), S(1));
-        t!(Body, "a //a//b\n " => T("a"), S(0), LC("a//b"), S(1));
-        t!(Body, "a //a//b\r\n" => T("a"), S(0), LC("a//b"), S(1));
-        t!(Body, "a //a//b\n\nhello" => T("a"), S(0), LC("a//b"), S(2), T("hello"));
-        t!(Body, "/**/" => BC(""));
-        t!(Body, "_/*_/*a*/*/" => Underscore, BC("_/*a*/"));
-        t!(Body, "/*/*/" => BC("/*/"));
-        t!(Body, "abc*/" => T("abc"), Invalid("*/"));
-        t!(Body, "/***/" => BC("*"));
-        t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/"));
-        t!(Body, "/*abc" => BC("abc"));
-        t!(Body, "/*/*abc*/" => BC("/*abc*/"));
+    fn test_tokenize_line_comments() {
+        // Test line comment with no trailing newline.
+        t!(Both[""]: "//" => LC(""));
+
+        // Test line comment ends at newline.
+        t!(Both["a1/"]: "//bc\n" => LC("bc"), S(1));
+        t!(Both["a1/"]: "// bc \n" => LC(" bc "), S(1));
+        t!(Both["a1/"]: "//bc\r\n" => LC("bc"), S(1));
+
+        // Test nested line comments.
+        t!(Both["a1/"]: "//a//b\n" => LC("a//b"), S(1));
     }

     #[test]
-    fn tokenize_body_tokens() {
-        t!(Body, "a_*" => T("a"), Underscore, Star);
-        t!(Body, "a***" => T("a"), Star, Star, Star);
-        t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
-        t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there"));
-        t!(Body, "# hi" => Hashtag, S(0), T("hi"));
-        t!(Body, "ab# hi" => T("ab"), Hashtag, S(0), T("hi"));
-        t!(Body, "#{}" => Hashtag, LB, RB);
-        t!(Body, "{text}" => LB, Text("text"), RB);
-        t!(Header, "_`" => Invalid("_`"));
+    fn test_tokenize_block_comments() {
+        // Test basic block comments.
+        t!(Both[""]: "/*" => BC(""));
+        t!(Both: "/**/" => BC(""));
+        t!(Both: "/*🏞*/" => BC("🏞"));
+        t!(Both: "/*\n*/" => BC("\n"));
+
+        // Test depth 1 and 2 nested block comments.
+        t!(Both: "/* /* */ */" => BC(" /* */ "));
+        t!(Both: "/*/*/**/*/*/" => BC("/*/**/*/"));
+
+        // Test two nested, one unclosed block comments.
+        t!(Both[""]: "/*/*/**/*/" => BC("/*/**/*/"));
+
+        // Test all combinations of up to two following slashes and stars.
+        t!(Both[""]: "/*" => BC(""));
+        t!(Both[""]: "/*/" => BC("/"));
+        t!(Both[""]: "/**" => BC("*"));
+        t!(Both[""]: "/*//" => BC("//"));
+        t!(Both[""]: "/*/*" => BC("/*"));
+        t!(Both[""]: "/**/" => BC(""));
+        t!(Both[""]: "/***" => BC("**"));
     }

     #[test]
-    fn test_tokenize_raw() {
-        // Basics.
-        t!(Body, "a`raw`" => T("a"), Raw("raw", 1, true));
-        t!(Body, "`[func]`" => Raw("[func]", 1, true));
-        t!(Body, "`]" => Raw("]", 1, false));
-        t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false));
-
-        // Language tag.
-        t!(Body, "``` hi```" => Raw(" hi", 3, true));
-        t!(Body, "```rust hi```" => Raw("rust hi", 3, true));
-        t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false));
-        t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true));
-        t!(Body, "```js \r\n document.write(\"go\")`"
-            => Raw("js \r\n document.write(\"go\")`", 3, false));
-
-        // More backticks.
-        t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi"));
-        t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true));
+    fn test_tokenize_body_tokens() {
+        // Test parentheses.
+        t!(Body: "[" => L);
+        t!(Body: "]" => R);
+        t!(Body: "{" => LB);
+        t!(Body: "}" => RB);
+
+        // Test markup tokens.
+        t!(Body[" a1"]: "*" => Star);
+        t!(Body: "_" => Underscore);
+        t!(Body: "~" => Tilde);
+        t!(Body: "#" => Hashtag);
+        t!(Body[" "]: r"\" => Backslash);
+
+        // Test header symbols.
+        t!(Body[" /"]: ":,=>>/+-" => T(":,=>>/+-"));
     }

     #[test]
-    fn tokenize_escaped_symbols() {
-        t!(Body, r"\\" => T(r"\"));
-        t!(Body, r"\[" => T("["));
-        t!(Body, r"\]" => T("]"));
-        t!(Body, r"\*" => T("*"));
-        t!(Body, r"\_" => T("_"));
-        t!(Body, r"\`" => T("`"));
-        t!(Body, r"\/" => T("/"));
-        t!(Body, r"\u{2603}" => UE("2603", true));
-        t!(Body, r"\u{26A4" => UE("26A4", false));
-        t!(Body, r#"\""# => T("\""));
+    fn test_tokenize_raw_blocks() {
+        // Test basic raw block.
+        t!(Body: "`raw`" => Raw("raw", 1, true));
+        t!(Body[""]: "`]" => Raw("]", 1, false));
+
+        // Test special symbols in raw block.
+        t!(Body: "`[func]`" => Raw("[func]", 1, true));
+        t!(Body[""]: r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false));
+
+        // Test more backticks.
+        t!(Body: "````🚀````" => Raw("🚀", 4, true));
+        t!(Body[""]: "````👩‍🚀``noend" => Raw("👩‍🚀``noend", 4, false));
+        t!(Body[""]: "````raw``````new" => Raw("raw", 4, true), Raw("new", 2, false));
+
+        // Test separated closing backticks.
+        t!(Body: "```not `y`e`t```" => Raw("not `y`e`t", 3, true));
     }

     #[test]
-    fn tokenize_unescapable_symbols() {
-        t!(Body, r"\a" => T("\\"), T("a"));
-        t!(Body, r"\:" => T(r"\"), T(":"));
-        t!(Body, r"\=" => T(r"\"), T("="));
-        t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
-        t!(Body, r"\u{ " => UE("", false), Space(0));
-        t!(Body, r"\u" => T("\\"), T("u"));
-        t!(Header, r"\\\\" => Invalid(r"\\\\"));
-        t!(Header, r"\a" => Invalid(r"\a"));
-        t!(Header, r"\:" => Invalid(r"\"), Colon);
-        t!(Header, r"\=" => Invalid(r"\"), Equals);
-        t!(Header, r"\," => Invalid(r"\"), Comma);
+    fn test_tokenize_escape_sequences() {
+        // Test escapable symbols.
+        t!(Body: r"\\" => T(r"\"));
+        t!(Body: r"\/" => T("/"));
+        t!(Body: r"\[" => T("["));
+        t!(Body: r"\]" => T("]"));
+        t!(Body: r"\{" => T("{"));
+        t!(Body: r"\}" => T("}"));
+        t!(Body: r"\*" => T("*"));
+        t!(Body: r"\_" => T("_"));
+        t!(Body: r"\#" => T("#"));
+        t!(Body: r"\~" => T("~"));
+        t!(Body: r"\`" => T("`"));
+
+        // Test unescapable symbols.
+        t!(Body[" /"]: r"\a" => T(r"\"), T("a"));
+        t!(Body[" /"]: r"\u" => T(r"\"), T("u"));
+        t!(Body[" /"]: r"\1" => T(r"\"), T("1"));
+        t!(Body[" /"]: r"\:" => T(r"\"), T(":"));
+        t!(Body[" /"]: r"\=" => T(r"\"), T("="));
+        t!(Body[" /"]: r#"\""# => T(r"\"), T("\""));
+
+        // Test basic unicode escapes.
+        t!(Body: r"\u{}" => UE("", true));
+        t!(Body: r"\u{2603}" => UE("2603", true));
+        t!(Body: r"\u{P}" => UE("P", true));
+
+        // Test unclosed unicode escapes.
+        t!(Body[" /"]: r"\u{" => UE("", false));
+        t!(Body[" /"]: r"\u{1" => UE("1", false));
+        t!(Body[" /"]: r"\u{26A4" => UE("26A4", false));
+        t!(Body[" /"]: r"\u{1Q3P" => UE("1Q3P", false));
+        t!(Body: r"\u{1🏕}" => UE("1", false), T("🏕"), RB);
     }

     #[test]
-    fn tokenize_header_tokens() {
-        t!(Header, "__main__" => Id("__main__"));
-        t!(Header, "_func_box" => Id("_func_box"));
-        t!(Header, ">main" => Invalid(">main"));
-        t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
-        t!(Header, "{abc}" => LB, Id("abc"), RB);
-        t!(Header, "(1,2)" => LP, Int(1), Comma, Int(2), RP);
-        t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Length(12.0, Pt));
-        t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
-        t!(Header, "=3.15" => Equals, Float(3.15));
-        t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
-        t!(Header, "a:b" => Id("a"), Colon, Id("b"));
-        t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
-        t!(Body, "c=d, " => T("c=d,"), S(0));
-        t!(Body, "a: b" => T("a:"), S(0), T("b"));
-        t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
-            Id("x"), Equals, Int(1));
+    fn test_tokenize_text() {
+        // Test basic text.
+        t!(Body[" /"]: "hello" => T("hello"));
+        t!(Body[" /"]: "hello-world" => T("hello-world"));
+
+        // Test header symbols in text.
+        t!(Body[" /"]: "a():\"b" => T("a():\"b"));
+
+        // Test text ends.
+        t!(Body[""]: "hello " => T("hello"), S(0));
+        t!(Body[""]: "hello~" => T("hello"), Tilde);
     }

     #[test]
-    fn tokenize_numeric_values() {
-        t!(Header, "12.3e5" => Float(12.3e5));
-        t!(Header, "120%" => Percent(120.0));
-        t!(Header, "12e4%" => Percent(120000.0));
-        t!(Header, "1e5in" => Length(100000.0, In));
-        t!(Header, "2.3cm" => Length(2.3, Cm));
-        t!(Header, "02.4mm" => Length(2.4, Mm));
-        t!(Header, "2.4.cm" => Invalid("2.4.cm"));
-        t!(Header, "#6ae6dd" => Hex("6ae6dd"));
-        t!(Header, "#8A083c" => Hex("8A083c"));
+    fn test_tokenize_header_tokens() {
+        // Test parentheses.
+        t!(Header: "[" => L);
+        t!(Header: "]" => R);
+        t!(Header: "{" => LB);
+        t!(Header: "}" => RB);
+        t!(Header: "(" => LP);
+        t!(Header: ")" => RP);
+
+        // Test structural tokens.
+        t!(Header: ":" => Colon);
+        t!(Header: "," => Comma);
+        t!(Header: "=" => Equals);
+        t!(Header: ">>" => Chain);
+        t!(Header: "+" => Plus);
+        t!(Header: "-" => Hyphen);
+        t!(Header[" a1"]: "*" => Star);
+        t!(Header[" a1"]: "/" => Slash);
+
+        // Test hyphen parsed as symbol.
+        t!(Header[" /"]: "-1" => Hyphen, Int(1));
+        t!(Header[" /"]: "-a" => Hyphen, Id("a"));
+        t!(Header[" /"]: "--1" => Hyphen, Hyphen, Int(1));
+        t!(Header[" /"]: "--_a" => Hyphen, Hyphen, Id("_a"));
+        t!(Header[" /"]: "a-b" => Id("a-b"));
+
+        // Test some operations.
+        t!(Header[" /"]: "1+3" => Int(1), Plus, Int(3));
+        t!(Header[" /"]: "1*3" => Int(1), Star, Int(3));
+        t!(Header[" /"]: "1/3" => Int(1), Slash, Int(3));
     }

     #[test]
-    fn tokenize_strings() {
-        t!(Body, "a \"hi\" string" => T("a"), S(0), T("\"hi\""), S(0), T("string"));
-        t!(Header, "\"hello" => Str("hello", false));
-        t!(Header, "\"hello world\"" => Str("hello world", true));
-        t!(Header, "\"hello\nworld\"" => Str("hello\nworld", true));
-        t!(Header, r#"1"hello\nworld"false"# => Int(1), Str("hello\\nworld", true), Bool(false));
-        t!(Header, r#""a\"bc""# => Str(r#"a\"bc"#, true));
-        t!(Header, r#""a\\"bc""# => Str(r#"a\\"#, true), Id("bc"), Str("", false));
-        t!(Header, r#""a\tbc"# => Str("a\\tbc", false));
-        t!(Header, "\"🌎\"" => Str("🌎", true));
+    fn test_tokenize_idents() {
+        // Test valid identifiers.
+        t!(Header[" /"]: "x" => Id("x"));
+        t!(Header[" /"]: "value" => Id("value"));
+        t!(Header[" /"]: "__main__" => Id("__main__"));
+        t!(Header[" /"]: "_snake_case" => Id("_snake_case"));

+        // Test non-ascii.
+        t!(Header[" /"]: "α" => Id("α"));
+        t!(Header[" /"]: "ម្តាយ" => Id("ម្តាយ"));
+
+        // Test hyphen parsed as identifier.
+        t!(Header[" /"]: "kebab-case" => Id("kebab-case"));
+        t!(Header[" /"]: "one-10" => Id("one-10"));
     }

     #[test]
-    fn tokenize_math() {
-        t!(Header, "12e-3in" => Length(12e-3, In));
-        t!(Header, "-1" => Min, Int(1));
-        t!(Header, "--1" => Min, Min, Int(1));
-        t!(Header, "- 1" => Min, S(0), Int(1));
-        t!(Header, "6.1cm + 4pt,a=1*2" => Length(6.1, Cm), S(0), Plus, S(0), Length(4.0, Pt),
-            Comma, Id("a"), Equals, Int(1), Star, Int(2));
-        t!(Header, "(5 - 1) / 2.1" => LP, Int(5), S(0), Min, S(0), Int(1), RP,
-            S(0), Slash, S(0), Float(2.1));
+    fn test_tokenize_bools() {
+        // Test valid bools.
+        t!(Header[" /"]: "false" => Bool(false));
+        t!(Header[" /"]: "true" => Bool(true));
+
+        // Test invalid bools.
+        t!(Header[" /"]: "True" => Id("True"));
+        t!(Header[" /"]: "falser" => Id("falser"));
+    }
+
+    #[test]
+    fn test_tokenize_numeric_values() {
+        let ints = [("7", 7), ("012", 12)];
+        let floats = [
+            (".3", 0.3),
+            ("0.3", 0.3),
+            ("3.", 3.0),
+            ("3.0", 3.0),
+            ("14.3", 14.3),
+            ("10e2", 1000.0),
+            ("10e+0", 10.0),
+            ("10e+1", 100.0),
+            ("10e-2", 0.1),
+            ("10.e1", 100.0),
+            ("10.e-1", 1.0),
+            (".1e1", 1.0),
+            ("10E2", 1000.0),
+        ];
+
+        // Test integers.
+        for &(s, v) in &ints {
+            t!(Header[" /"]: s => Int(v));
+        }
+
+        // Test floats.
+        for &(s, v) in &floats {
+            t!(Header[" /"]: s => Float(v));
+        }
+
+        // Test attached numbers.
+        t!(Header[" /"]: "1.2.3" => Float(1.2), Float(0.3));
+        t!(Header[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));
+
+        // Test float from too large integer.
+        let large = i64::MAX as f64 + 1.0;
+        t!(Header[" /"]: large.to_string() => Float(large));
+
+        // Combined integers and floats.
+        let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats.iter().copied());
+
+        // Test percentages.
+        for (s, v) in nums.clone() {
+            t!(Header[" /"]: format!("{}%", s) => Percent(v));
+        }
+
+        // Test lengths.
+        for &unit in &[Unit::Mm, Unit::Pt, Unit::Cm, Unit::In] {
+            for (s, v) in nums.clone() {
+                t!(Header[" /"]: format!("{}{}", s, unit) => Length(v, unit));
+            }
+        }
+    }
+
+    #[test]
+    fn test_tokenize_hex() {
+        // Test basic hex expressions.
+        t!(Header[" /"]: "#6ae6dd" => Hex("6ae6dd"));
+        t!(Header[" /"]: "#8A083c" => Hex("8A083c"));
+
+        // Test with non-hex letters.
+        t!(Header[" /"]: "#PQ" => Hex("PQ"));
+    }
+
+    #[test]
+    fn test_tokenize_strings() {
+        // Test basic strings.
+        t!(Header: "\"hi\"" => Str("hi", true));
+        t!(Header: "\"hi\nthere\"" => Str("hi\nthere", true));
+        t!(Header: "\"🌎\"" => Str("🌎", true));
+        t!(Header[""]: "\"hi" => Str("hi", false));
+
+        // Test escaped quote.
+        t!(Header: r#""a\"bc""# => Str(r#"a\"bc"#, true));
+        t!(Header[""]: r#""\""# => Str(r#"\""#, false));
+    }
+
+    #[test]
+    fn test_tokenize_invalid() {
+        // Test invalidly closed block comments.
+        t!(Both: "*/" => Invalid("*/"));
+        t!(Both: "/**/*/" => BC(""), Invalid("*/"));
+
+        // Test invalid expressions.
+        t!(Header: r"\" => Invalid(r"\"));
+        t!(Header: "🌓" => Invalid("🌓"));
+        t!(Header: r"\:" => Invalid(r"\"), Colon);
+        t!(Header: "meal⌚" => Id("meal"), Invalid("⌚"));
+        t!(Header[" /"]: r"\a" => Invalid(r"\"), Id("a"));
+        t!(Header[" /"]: ">main" => Invalid(">"), Id("main"));
+
+        // Test invalid number suffixes.
+        t!(Header[" /"]: "1foo" => Invalid("1foo"));
     }
 }
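
To make the macro above concrete: an invocation like `t!(Header[" /"]: "x" => Id("x"))` first checks the bare source, then re-checks it once per suffix whose mode matches and whose block is enabled. The blocks `" /"` enable only spacing and symbol suffixes, so letter and number suffixes cannot mingle with the identifier under test. Roughly hand-expanded, this runs the following (an illustrative sketch, not the literal macro expansion):

// Sketch of what `t!(Header[" /"]: "x" => Id("x"))` effectively runs.
{
    // Bare source.
    let found = Tokens::new("x", Header).collect::<Vec<_>>();
    check("x", vec![Id("x")], found, false);

    // A suffix from the ' ' block: the newline must come out as its own
    // S(1) token instead of being swallowed by the identifier.
    let found = Tokens::new("x\n", Header).collect::<Vec<_>>();
    check("x\n", vec![Id("x"), S(1)], found, false);

    // A suffix from the '/' block: `(` must terminate the identifier.
    let found = Tokens::new("x(", Header).collect::<Vec<_>>();
    check("x(", vec![Id("x"), LP], found, false);
}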


@@ -46,13 +46,17 @@ impl Deref for Ident {
 /// Whether the string is a valid identifier.
 pub fn is_ident(string: &str) -> bool {
     let mut chars = string.chars();
-    if matches!(chars.next(), Some(c) if c.is_xid_start() || is_also_ok(c)) {
-        chars.all(|c| c.is_xid_continue() || is_also_ok(c))
-    } else {
-        false
-    }
+    chars
+        .next()
+        .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
 }

-fn is_also_ok(c: char) -> bool {
-    c == '-' || c == '_'
+/// Whether the character can start an identifier.
+pub fn is_id_start(c: char) -> bool {
+    c.is_xid_start() || c == '_'
+}
+
+/// Whether the character can continue an identifier.
+pub fn is_id_continue(c: char) -> bool {
+    c.is_xid_continue() || c == '_' || c == '-'
 }
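
A few illustrative assertions for the new predicates (hypothetical, but consistent with the rules above and with `test_tokenize_idents` in this commit):

// `_` may start an identifier; `-` may only continue one.
assert!(is_ident("_snake_case"));
assert!(is_ident("kebab-case"));
assert!(is_ident("α"));          // XID_Start characters are fine.
assert!(!is_ident("-dash"));     // `-` cannot start an identifier.
assert!(!is_ident("one two"));   // Whitespace never continues one.
assert!(!is_ident(""));          // The empty string is not an identifier.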


@@ -24,16 +24,16 @@ pub enum Token<'s> {
     Star,
     /// An underscore: `_`.
     Underscore,
-    /// A backslash followed by whitespace: `\`.
-    Backslash,
     /// A hashtag indicating a section heading: `#`.
     Hashtag,
-    /// A non-breaking space: `~`.
-    NonBreakingSpace,
-    /// A raw block: `` `...` ``.
-    Raw(TokenRaw<'s>),
+    /// A tilde: `~`.
+    Tilde,
+    /// A backslash followed by whitespace: `\`.
+    Backslash,
     /// A unicode escape sequence: `\u{1F5FA}`.
     UnicodeEscape(TokenUnicodeEscape<'s>),
+    /// A raw block: `` `...` ``.
+    Raw(TokenRaw<'s>),

     /// A left bracket: `[`.
     LeftBracket,
@@ -134,7 +134,7 @@ impl<'s> Token<'s> {
             Self::Underscore => "underscore",
             Self::Backslash => "backslash",
             Self::Hashtag => "hashtag",
-            Self::NonBreakingSpace => "non-breaking space",
+            Self::Tilde => "tilde",
             Self::Raw { .. } => "raw block",
             Self::UnicodeEscape { .. } => "unicode escape sequence",