// typst/src/parse/tokens.rs

use std::fmt::{self, Debug, Formatter};
use std::str::FromStr;

use super::{is_newline, Scanner};
use crate::color::RgbaColor;
use crate::geom::{AngularUnit, LengthUnit};
use crate::syntax::*;

/// An iterator over the tokens of a string of source code.
///
/// Which token a character turns into depends on the current [`TokenMode`],
/// which can be changed between calls to `next`.
#[derive(Clone)]
pub struct Tokens<'s> {
    /// The scanner that is advanced through the source string.
    s: Scanner<'s>,
    /// The mode used to tokenize everything that is not mode-independent.
    mode: TokenMode,
}

/// What kind of tokens to emit.
///
/// Brackets, braces, whitespace and comments are tokenized identically in
/// both modes; all other characters are handled per mode.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum TokenMode {
    /// Text and markup.
    Markup,
    /// Blocks and expressions.
    Code,
}

impl<'s> Tokens<'s> {
    /// Create a new token iterator with the given mode.
    ///
    /// A fresh [`Scanner`] is created over `src`.
    pub fn new(src: &'s str, mode: TokenMode) -> Self {
        Self { s: Scanner::new(src), mode }
    }

    /// Get the current token mode.
    pub fn mode(&self) -> TokenMode {
        self.mode
    }

    /// Change the token mode.
    ///
    /// Only the mode is replaced; the scanner is left untouched.
    pub fn set_mode(&mut self, mode: TokenMode) {
        self.mode = mode;
    }

    /// The index in the string at which the last token ends and next token
    /// will start.
    pub fn index(&self) -> usize {
        self.s.index()
    }

    /// Jump to the given index in the string.
    ///
    /// You need to know the correct column.
    // NOTE(review): this method only receives an index and forwards it to the
    // scanner; no column is passed anywhere here. The sentence above may be
    // left over from an earlier design -- confirm and reword or remove.
    pub fn jump(&mut self, index: usize) {
        self.s.jump(index);
    }

    /// The underlying scanner.
    ///
    /// The scanner is returned by value, not by reference.
    pub fn scanner(&self) -> Scanner<'s> {
        self.s
    }
}

impl<'s> Iterator for Tokens<'s> {
    type Item = Token<'s>;

    /// Produce the next token, or `None` once the scanner yields no more
    /// characters.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.s.index();
        let first = self.s.eat()?;

        let token = match first {
            // Delimiters of templates and blocks.
            '[' => Token::LeftBracket,
            ']' => Token::RightBracket,
            '{' => Token::LeftBrace,
            '}' => Token::RightBrace,

            // Everything that is introduced by a hash.
            '#' => self.hash(start),

            // Runs of whitespace.
            ws if ws.is_whitespace() => self.whitespace(ws),

            // Comments, and a comment terminator without an opener.
            '/' if self.s.eat_if('/') => self.line_comment(),
            '/' if self.s.eat_if('*') => self.block_comment(),
            '*' if self.s.eat_if('/') => {
                let stray = self.s.eaten_from(start);
                Token::Invalid(stray)
            }

            // All remaining characters are tokenized depending on the mode.
            other => match self.mode {
                TokenMode::Code => self.code(start, other),
                TokenMode::Markup => self.markup(start, other),
            },
        };

        Some(token)
    }
}

impl<'s> Tokens<'s> {
    /// Tokenize a construct in markup mode whose first character `c` has
    /// already been eaten and started at index `start`.
    fn markup(&mut self, start: usize, c: char) -> Token<'s> {
        // A dot or an ASCII digit may introduce a numbering.
        if c == '.' || c.is_ascii_digit() {
            return self.numbering(start, c);
        }

        match c {
            // Single-character markup symbols.
            '~' => Token::Tilde,
            '*' => Token::Star,
            '_' => Token::Underscore,

            // Constructs that need further scanning.
            '\\' => self.backslash(),
            '`' => self.raw(),
            '$' => self.math(),
            '-' => self.hyph(start),

            // Anything else is plain text.
            _ => self.text(start),
        }
    }

    /// Tokenize a construct in code mode whose first character `c` has
    /// already been eaten and started at index `start`.
    ///
    /// Characters that fit none of the rules become `Token::Invalid`.
    fn code(&mut self, start: usize, c: char) -> Token<'s> {
        match c {
            // Parentheses.
            '(' => Token::LeftParen,
            ')' => Token::RightParen,

            // Punctuation that has no longer form.
            ',' => Token::Comma,
            ';' => Token::Semicolon,
            ':' => Token::Colon,

            // An equals sign can grow into `==` or `=>`.
            '=' => {
                if self.s.eat_if('=') {
                    Token::EqEq
                } else if self.s.eat_if('>') {
                    Token::Arrow
                } else {
                    Token::Eq
                }
            }

            // Operators that can be directly followed by an equals sign.
            '!' => if self.s.eat_if('=') { Token::ExclEq } else { Token::Excl },
            '<' => if self.s.eat_if('=') { Token::LtEq } else { Token::Lt },
            '>' => if self.s.eat_if('=') { Token::GtEq } else { Token::Gt },
            '+' => if self.s.eat_if('=') { Token::PlusEq } else { Token::Plus },
            '-' => if self.s.eat_if('=') { Token::HyphEq } else { Token::Hyph },
            '*' => if self.s.eat_if('=') { Token::StarEq } else { Token::Star },
            '/' => if self.s.eat_if('=') { Token::SlashEq } else { Token::Slash },

            // Two dots. A single dot falls through to the rules below.
            '.' if self.s.eat_if('.') => Token::Dots,

            // Identifiers.
            first if is_id_start(first) => self.ident(start),

            // Numbers: a digit, or a dot that is directly followed by a digit.
            first
                if first.is_ascii_digit()
                    || (first == '.' && self.s.check(|next| next.is_ascii_digit())) =>
            {
                self.number(start, first)
            }

            // Strings.
            '"' => self.string(),

            // Nothing matched.
            _ => Token::Invalid(self.s.eaten_from(start)),
        }
    }

    /// Tokenize a run of whitespace whose first character `first` has already
    /// been eaten.
    ///
    /// The resulting `Space` token carries the number of newlines in the run.
    fn whitespace(&mut self, first: char) -> Token<'s> {
        // Fast path: a single space that is not followed by more whitespace.
        let lone_space = first == ' ' && !self.s.check(char::is_whitespace);
        if lone_space {
            return Token::Space(0);
        }

        // Put the first character back so that it takes part in the counting.
        self.s.uneat();

        let mut newlines = 0;
        loop {
            match self.s.eat_merging_crlf() {
                Some(c) if c.is_whitespace() => {
                    if is_newline(c) {
                        newlines += 1;
                    }
                }
                // The run is over; leave the character for the next token.
                Some(_) => {
                    self.s.uneat();
                    break;
                }
                None => break,
            }
        }

        Token::Space(newlines)
    }

    /// Tokenize plain text that began at index `start`.
    ///
    /// Characters are eaten until one that could begin a different token is
    /// found; that character is put back.
    fn text(&mut self, start: usize) -> Token<'s> {
        /// Whether `c` ends a run of plain text.
        fn ends_text(c: char) -> bool {
            match c {
                // Comments.
                '/' => true,
                // Brackets and braces.
                '[' | ']' | '{' | '}' => true,
                // Markup.
                '#' | '~' | '*' | '_' | '`' | '$' | '-' => true,
                // Escaping.
                '\\' => true,
                // Whitespace ends the text, everything else continues it.
                _ => c.is_whitespace(),
            }
        }

        loop {
            match self.s.eat() {
                Some(c) if ends_text(c) => {
                    self.s.uneat();
                    break;
                }
                Some(_) => {}
                None => break,
            }
        }

        Token::Text(self.s.eaten_from(start))
    }

    /// Tokenize what follows a backslash in markup; the backslash itself has
    /// already been eaten.
    ///
    /// - An escapable symbol is eaten and returned as `Text` without the
    ///   backslash.
    /// - `u{` starts a unicode escape, which records whether it was closed by
    ///   a `}`.
    /// - Before whitespace, or when nothing follows, a `Backslash` token is
    ///   returned and nothing further is eaten.
    /// - Before any other character, the backslash becomes `Text` on its own
    ///   and the character is left for the next token.
    fn backslash(&mut self) -> Token<'s> {
        if let Some(c) = self.s.peek() {
            match c {
                // Backslash and comments.
                '\\' | '/' |
                // Parenthesis and hashtag.
                '[' | ']' | '{' | '}' | '#' |
                // Markup.
                '*' | '_' | '=' | '~' | '`' | '$' => {
                    // Only the escaped character ends up in the text, so the
                    // slice starts after the backslash.
                    let start = self.s.index();
                    self.s.eat_assert(c);
                    Token::Text(self.s.eaten_from(start))
                }
                'u' if self.s.peek_nth(1) == Some('{') => {
                    self.s.eat_assert('u');
                    self.s.eat_assert('{');
                    Token::UnicodeEscape(UnicodeEscapeToken {
                        // Allow more than `ascii_hexdigit` for better error recovery.
                        sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
                        terminated: self.s.eat_if('}'),
                    })
                }
                c if c.is_whitespace() => Token::Backslash,
                _ => Token::Text("\\"),
            }
        } else {
            // Nothing follows the backslash.
            Token::Backslash
        }
    }

    /// Tokenize a construct introduced by a `#` that began at index `start`.
    ///
    /// In code mode the following identifier characters are parsed as a
    /// color, yielding `Invalid` on failure. In markup mode an identifier
    /// becomes a keyword or `Ident`, a hash before any other non-hash,
    /// non-whitespace character becomes `Text`, and everything else yields
    /// `Hashtag`.
    fn hash(&mut self, start: usize) -> Token<'s> {
        match self.mode {
            TokenMode::Code => {
                let digits = self.s.eat_while(is_id_continue);
                match RgbaColor::from_str(digits) {
                    Ok(color) => Token::Color(color),
                    Err(_) => Token::Invalid(self.s.eaten_from(start)),
                }
            }
            TokenMode::Markup => {
                // A keyword or an identifier.
                if self.s.check(is_id_start) {
                    let name = self.s.eat_while(is_id_continue);
                    return match keyword(name) {
                        Some(token) => token,
                        None => Token::Ident(name),
                    };
                }

                // A hash in front of something else is just text.
                if self.s.check(|c| !(c == '#' || c.is_whitespace())) {
                    Token::Text(self.s.eaten_from(start))
                } else {
                    Token::Hashtag
                }
            }
        }
    }

    /// Tokenize one to three hyphens in markup; the first hyphen has already
    /// been eaten and started at index `start`.
    ///
    /// A single hyphen directly followed by a non-whitespace character is
    /// plain text.
    fn hyph(&mut self, start: usize) -> Token<'s> {
        if !self.s.eat_if('-') {
            // Just one hyphen.
            return if self.s.check(|c| !c.is_whitespace()) {
                Token::Text(self.s.eaten_from(start))
            } else {
                Token::Hyph
            };
        }

        // Two hyphens so far, maybe a third one follows.
        if self.s.eat_if('-') {
            Token::HyphHyphHyph
        } else {
            Token::HyphHyph
        }
    }

    /// Tokenize a potential numbering in markup: either a lone dot or digits
    /// followed by a dot. The first character `c` has already been eaten and
    /// started at index `start`.
    ///
    /// Digits without a dot, or a dot that is directly followed by a
    /// non-whitespace character, result in plain text instead.
    fn numbering(&mut self, start: usize, c: char) -> Token<'s> {
        let number = if c == '.' {
            // A dot on its own carries no number.
            None
        } else {
            self.s.eat_while(|c| c.is_ascii_digit());
            let digits = self.s.eaten_from(start);
            if !self.s.eat_if('.') {
                return Token::Text(digits);
            }
            digits.parse().ok()
        };

        if self.s.check(|c| !c.is_whitespace()) {
            Token::Text(self.s.eaten_from(start))
        } else {
            Token::Numbering(number)
        }
    }

    /// Tokenize a raw block; the first opening backtick has already been
    /// eaten.
    ///
    /// The block ends at the first run of as many backticks as opened it. If
    /// no such run is found, the token is marked as unterminated and its text
    /// extends to where scanning stopped.
    fn raw(&mut self) -> Token<'s> {
        // Count the opening backticks, including the one already eaten.
        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Exactly two backticks form an empty inline block.
        if backticks == 2 {
            return Token::Raw(RawToken { text: "", backticks: 1, terminated: true });
        }

        let start = self.s.index();

        // Track the length of the current run of consecutive backticks.
        let mut run = 0;
        while run < backticks {
            let c = match self.s.eat() {
                Some(c) => c,
                None => break,
            };
            run = if c == '`' { run + 1 } else { 0 };
        }

        let terminated = run == backticks;

        // The closing backticks are not part of the text.
        let end = if terminated {
            self.s.index() - run
        } else {
            self.s.index()
        };

        Token::Raw(RawToken {
            text: self.s.get(start .. end),
            backticks,
            terminated,
        })
    }

    /// Tokenize a math formula; the opening dollar sign has already been
    /// eaten.
    ///
    /// A formula that opens with `$[` is a display formula and is closed by
    /// `]$`; any other formula is closed by the next unescaped `$`.
    fn math(&mut self) -> Token<'s> {
        let display = self.s.eat_if('[');
        let start = self.s.index();

        let mut escaped = false;
        // Whether a dollar sign at the current position would close the formula.
        let mut dollar = !display;

        let terminated = loop {
            let c = match self.s.eat() {
                Some(c) => c,
                None => break false,
            };

            match c {
                '$' if !escaped && dollar => break true,
                ']' if !escaped => dollar = true,
                _ => {
                    dollar = !display;
                    escaped = c == '\\' && !escaped;
                }
            }
        };

        // Length of the closing delimiter that was eaten, if any.
        let delimiter = if !terminated {
            0
        } else if display {
            2
        } else {
            1
        };
        let end = self.s.index() - delimiter;

        Token::Math(MathToken {
            formula: self.s.get(start .. end),
            display,
            terminated,
        })
    }

    /// Tokenize an identifier-like word in code mode that began at index
    /// `start`: a literal (`none`, `auto`, `true`, `false`), a keyword or a
    /// plain identifier.
    fn ident(&mut self, start: usize) -> Token<'s> {
        self.s.eat_while(is_id_continue);
        let word = self.s.eaten_from(start);

        match word {
            "none" => Token::None,
            "auto" => Token::Auto,
            "true" => Token::Bool(true),
            "false" => Token::Bool(false),
            _ => match keyword(word) {
                Some(token) => token,
                None => Token::Ident(word),
            },
        }
    }

    /// Tokenize a number in code mode, optionally with an exponent and a unit
    /// suffix. The first character `c` (a digit or a dot) has already been
    /// eaten and started at index `start`.
    ///
    /// Without a suffix this yields `Int` or `Float`; with a known suffix the
    /// matching numeric token. Everything else yields `Invalid` spanning the
    /// whole eaten text.
    fn number(&mut self, start: usize, c: char) -> Token<'s> {
        /// Whether `c` is an ASCII digit.
        fn is_digit(c: char) -> bool {
            c.is_ascii_digit()
        }

        // The digits of the first part: integer digits, or fractional digits
        // if `c` was a dot.
        self.s.eat_while(is_digit);

        // The fractional part, unless the number started with the dot.
        if c != '.' && self.s.eat_if('.') {
            self.s.eat_while(is_digit);
        }

        // The exponent with an optional sign.
        if self.s.eat_if('e') || self.s.eat_if('E') {
            let _ = self.s.eat_if('+') || self.s.eat_if('-');
            self.s.eat_while(is_digit);
        }

        // The suffix: a percent sign or a run of alphanumeric characters.
        let suffix_start = self.s.index();
        if !self.s.eat_if('%') {
            self.s.eat_while(|c| c.is_ascii_alphanumeric());
        }

        let number = self.s.get(start .. suffix_start);
        let suffix = self.s.eaten_from(suffix_start);
        let all = self.s.eaten_from(start);

        // A number without suffix is an integer if possible, else a float.
        if suffix.is_empty() {
            if let Ok(int) = number.parse::<i64>() {
                return Token::Int(int);
            }
            if let Ok(float) = number.parse::<f64>() {
                return Token::Float(float);
            }
        }

        // All suffixed numbers carry a float.
        let value = match number.parse::<f64>() {
            Ok(value) => value,
            Err(_) => return Token::Invalid(all),
        };

        match suffix {
            "%" => Token::Percent(value),
            "fr" => Token::Fraction(value),
            "pt" => Token::Length(value, LengthUnit::Pt),
            "mm" => Token::Length(value, LengthUnit::Mm),
            "cm" => Token::Length(value, LengthUnit::Cm),
            "in" => Token::Length(value, LengthUnit::In),
            "rad" => Token::Angle(value, AngularUnit::Rad),
            "deg" => Token::Angle(value, AngularUnit::Deg),
            _ => Token::Invalid(all),
        }
    }

    /// Tokenize a string; the opening quote has already been eaten.
    ///
    /// The string extends up to the next quote that is not escaped by a
    /// backslash. The token records whether such a closing quote was found.
    fn string(&mut self) -> Token<'s> {
        let mut escaped = false;
        let string = self.s.eat_until(|c| {
            let closing = c == '"' && !escaped;
            if !closing {
                escaped = c == '\\' && !escaped;
            }
            closing
        });
        let terminated = self.s.eat_if('"');

        Token::Str(StrToken { string, terminated })
    }

    /// Tokenize a line comment; the leading `//` has already been eaten.
    ///
    /// The comment extends up to, but not including, the next newline.
    fn line_comment(&mut self) -> Token<'s> {
        let comment = self.s.eat_until(is_newline);
        Token::LineComment(comment)
    }

    /// Tokenize a block comment; the leading `/*` has already been eaten.
    ///
    /// Block comments nest, so the comment ends at the `*/` that brings the
    /// nesting depth back to zero. If there is none, the comment extends to
    /// where scanning stopped.
    fn block_comment(&mut self) -> Token<'s> {
        let start = self.s.index();

        // The previous character, or `_` when it must not pair up with the
        // current one.
        let mut prev = '_';
        let mut depth = 1;

        while let Some(c) = self.s.eat() {
            match (prev, c) {
                // A closing `*/`.
                ('*', '/') => {
                    depth -= 1;
                    if depth == 0 {
                        break;
                    }
                    prev = '_';
                }
                // A nested opening `/*`.
                ('/', '*') => {
                    depth += 1;
                    prev = '_';
                }
                _ => prev = c,
            }
        }

        // A terminated comment ends before its closing `*/`.
        let end = if depth == 0 {
            self.s.index() - 2
        } else {
            self.s.index()
        };

        Token::BlockComment(self.s.get(start .. end))
    }
}

impl Debug for Tokens<'_> {
    /// Formats as `Tokens(eaten|rest)`, with both parts taken from the
    /// scanner.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        let eaten = self.s.eaten();
        let rest = self.s.rest();
        write!(f, "Tokens({}|{})", eaten, rest)
    }
}

/// Look up the keyword token for `id`, or return `None` if `id` is not a
/// keyword.
fn keyword(id: &str) -> Option<Token<'static>> {
    match id {
        "not" => Some(Token::Not),
        "and" => Some(Token::And),
        "or" => Some(Token::Or),
        "with" => Some(Token::With),
        "let" => Some(Token::Let),
        "if" => Some(Token::If),
        "else" => Some(Token::Else),
        "for" => Some(Token::For),
        "in" => Some(Token::In),
        "while" => Some(Token::While),
        "break" => Some(Token::Break),
        "continue" => Some(Token::Continue),
        "return" => Some(Token::Return),
        "import" => Some(Token::Import),
        "include" => Some(Token::Include),
        "using" => Some(Token::Using),
        _ => None,
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
    use super::*;

    use Option::None;
    use Token::{Ident, *};
    use TokenMode::{Code, Markup};

    /// Shorthand for building a `Token::UnicodeEscape` in test expectations.
    const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token {
        Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated })
    }

    /// Shorthand for building a `Token::Raw` in test expectations.
    const fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
        Token::Raw(RawToken { text, backticks, terminated })
    }

    /// Shorthand for building a `Token::Math` in test expectations.
    const fn Math(formula: &str, display: bool, terminated: bool) -> Token {
        Token::Math(MathToken { formula, display, terminated })
    }

    /// Shorthand for building a `Token::Color` from its four channels in test
    /// expectations.
    const fn Color(r: u8, g: u8, b: u8, a: u8) -> Token<'static> {
        Token::Color(RgbaColor { r, g, b, a })
    }

    /// Shorthand for building a `Token::Str` in test expectations.
    const fn Str(string: &str, terminated: bool) -> Token {
        Token::Str(StrToken { string, terminated })
    }

    /// Building blocks for suffix testing.
    ///
    /// We extend each test case with a collection of different suffixes to make
    /// sure tokens end at the correct position. These suffixes are split into
    /// blocks, which can be disabled/enabled per test case. For example, when
    /// testing identifiers we disable letter suffixes because these would
    /// mingle with the identifiers.
    ///
    /// Suffix blocks:
    /// - ' ': spacing
    /// - 'a': letters
    /// - '1': numbers
    /// - '/': symbols
    ///
    /// A test case that does not select blocks explicitly uses all of them.
    /// The `t!` macro asserts that an explicit selection only contains
    /// characters from this string.
    const BLOCKS: &str = " a1/";

    /// Suffixes described by four-tuples of:
    ///
    /// - block the suffix is part of
    /// - mode in which the suffix is applicable
    /// - the suffix string
    /// - the resulting suffix token
    ///
    /// A mode of `None` means that the suffix is applied in both modes.
    const SUFFIXES: &[(char, Option<TokenMode>, &str, Token)] = &[
        // Whitespace suffixes.
        (' ', None, " ", Space(0)),
        (' ', None, "\n", Space(1)),
        (' ', None, "\r", Space(1)),
        (' ', None, "\r\n", Space(1)),
        // Letter suffixes.
        ('a', Some(Markup), "hello", Text("hello")),
        ('a', Some(Markup), "💚", Text("💚")),
        ('a', Some(Code), "val", Ident("val")),
        ('a', Some(Code), "α", Ident("α")),
        ('a', Some(Code), "_", Ident("_")),
        // Number suffixes.
        ('1', Some(Code), "2", Int(2)),
        ('1', Some(Code), ".2", Float(0.2)),
        // Symbol suffixes.
        ('/', None, "[", LeftBracket),
        ('/', None, "//", LineComment("")),
        ('/', None, "/**/", BlockComment("")),
        ('/', Some(Markup), "*", Star),
        ('/', Some(Markup), "$ $", Math(" ", false, true)),
        ('/', Some(Markup), r"\\", Text(r"\")),
        ('/', Some(Markup), "#let", Let),
        ('/', Some(Code), "(", LeftParen),
        ('/', Some(Code), ":", Colon),
        ('/', Some(Code), "+=", PlusEq),
        ('/', Some(Code), "#123", Color(0x11, 0x22, 0x33, 0xff)),
    ];

    // Test macro of the form `t!(Mode[blocks]: source => tokens...)`.
    //
    // `Mode` is `Markup`, `Code` or `Both`. The optional `[blocks]` literal
    // selects which suffix blocks from `BLOCKS` are tried; without it, all
    // blocks are used.
    macro_rules! t {
        // Run the same test case in markup and in code mode.
        (Both $($tts:tt)*) => {
            t!(Markup $($tts)*);
            t!(Code $($tts)*);
        };
        // Run the test case as written and once more per applicable suffix.
        ($mode:ident $([$blocks:literal])?: $src:expr => $($token:expr),*) => {{
            // Test without suffix.
            t!(@$mode: $src => $($token),*);

            // Test with each applicable suffix.
            for &(block, mode, suffix, token) in SUFFIXES {
                let src = $src;
                #[allow(unused)]
                let mut blocks = BLOCKS;
                $(blocks = $blocks;)?
                // The selected blocks may only name blocks that exist.
                assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
                // A suffix applies if its mode matches (or it has none) and
                // its block is selected. The suffix token is then expected
                // after the tokens of the test case.
                if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
                    t!(@$mode: format!("{}{}", src, suffix) => $($token,)* token);
                }
            }
        }};
        // Tokenize the source in the given mode and compare with the
        // expected tokens.
        (@$mode:ident: $src:expr => $($token:expr),*) => {{
            let src = $src;
            let exp = vec![$($token),*];
            let found = Tokens::new(&src, $mode).collect::<Vec<_>>();
            check(&src, exp, found);
        }};
    }

    /// Compare the expected with the found value. On a mismatch, print the
    /// source along with both values and panic.
    #[track_caller]
    fn check<T>(src: &str, exp: T, found: T)
    where
        T: Debug + PartialEq,
    {
        if exp == found {
            return;
        }

        println!("source:   {:?}", src);
        println!("expected: {:#?}", exp);
        println!("found:    {:#?}", found);
        panic!("test failed");
    }

    /// Brackets and braces are tokens in both modes, whereas parentheses are
    /// only tokens in code mode.
    #[test]
    fn test_tokenize_brackets() {
        // Test in markup.
        t!(Markup: "["       => LeftBracket);
        t!(Markup: "]"       => RightBracket);
        t!(Markup: "{"       => LeftBrace);
        t!(Markup: "}"       => RightBrace);
        // Parentheses are plain text in markup. Letter suffixes are disabled
        // because they would merge into the same text token.
        t!(Markup[" /"]: "(" => Text("("));
        t!(Markup[" /"]: ")" => Text(")"));

        // Test in code.
        t!(Code: "[" => LeftBracket);
        t!(Code: "]" => RightBracket);
        t!(Code: "{" => LeftBrace);
        t!(Code: "}" => RightBrace);
        t!(Code: "(" => LeftParen);
        t!(Code: ")" => RightParen);
    }

    /// A run of whitespace becomes a single `Space` token that carries the
    /// number of newlines in the run.
    ///
    /// The spacing suffix block is disabled throughout because trailing
    /// whitespace would merge into the token under test.
    #[test]
    fn test_tokenize_whitespace() {
        // Test basic whitespace.
        t!(Both["a1/"]: ""         => );
        t!(Both["a1/"]: " "        => Space(0));
        t!(Both["a1/"]: "    "     => Space(0));
        t!(Both["a1/"]: "\t"       => Space(0));
        t!(Both["a1/"]: "  \t"     => Space(0));
        t!(Both["a1/"]: "\u{202F}" => Space(0));

        // Test newline counting.
        t!(Both["a1/"]: "\n"           => Space(1));
        t!(Both["a1/"]: "\n "          => Space(1));
        t!(Both["a1/"]: "  \n"         => Space(1));
        t!(Both["a1/"]: "  \n   "      => Space(1));
        // A carriage return directly followed by a line feed counts once.
        t!(Both["a1/"]: "\r\n"         => Space(1));
        t!(Both["a1/"]: "  \n\t \n  "  => Space(2));
        t!(Both["a1/"]: "\n\r"         => Space(2));
        t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
    }

    /// Plain text in markup runs until a character that could start another
    /// token.
    #[test]
    fn test_tokenize_text() {
        // Test basic text.
        t!(Markup[" /"]: "hello"       => Text("hello"));
        // A hyphen splits the text and is itself text if a non-whitespace
        // character follows it.
        t!(Markup[" /"]: "hello-world" => Text("hello"), Text("-"), Text("world"));

        // Test code symbols in text.
        t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
        t!(Markup[" /"]: ";:,|/+"  => Text(";:,|"), Text("/+"));
        t!(Markup[" /"]: "#-a"     => Text("#"), Text("-"), Text("a"));
        t!(Markup[" "]: "#123"     => Text("#"), Text("123"));

        // Test text ends.
        t!(Markup[""]: "hello " => Text("hello"), Space(0));
        t!(Markup[""]: "hello~" => Text("hello"), Tilde);
    }

    /// Backslash escapes in markup: escapable symbols, backslashes in front of
    /// unescapable characters, and unicode escapes.
    #[test]
    fn test_tokenize_escape_sequences() {
        // Test escapable symbols.
        t!(Markup: r"\\" => Text(r"\"));
        t!(Markup: r"\/" => Text("/"));
        t!(Markup: r"\[" => Text("["));
        t!(Markup: r"\]" => Text("]"));
        t!(Markup: r"\{" => Text("{"));
        t!(Markup: r"\}" => Text("}"));
        t!(Markup: r"\*" => Text("*"));
        t!(Markup: r"\_" => Text("_"));
        t!(Markup: r"\=" => Text("="));
        t!(Markup: r"\~" => Text("~"));
        t!(Markup: r"\`" => Text("`"));
        t!(Markup: r"\$" => Text("$"));
        t!(Markup: r"\#" => Text("#"));

        // Test unescapable symbols.
        // The backslash becomes text on its own and the character after it is
        // tokenized separately.
        t!(Markup[" /"]: r"\a"   => Text(r"\"), Text("a"));
        t!(Markup[" /"]: r"\u"   => Text(r"\"), Text("u"));
        t!(Markup[" /"]: r"\1"   => Text(r"\"), Text("1"));
        t!(Markup[" /"]: r"\:"   => Text(r"\"), Text(":"));
        t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));

        // Test basic unicode escapes.
        t!(Markup: r"\u{}"     => UnicodeEscape("", true));
        t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
        // Non-hex alphanumerics are accepted by the tokenizer as well.
        t!(Markup: r"\u{P}"    => UnicodeEscape("P", true));

        // Test unclosed unicode escapes.
        t!(Markup[" /"]: r"\u{"     => UnicodeEscape("", false));
        t!(Markup[" /"]: r"\u{1"    => UnicodeEscape("1", false));
        t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
        t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
        t!(Markup: r"\u{1🏕}"       => UnicodeEscape("1", false), Text("🏕"), RightBrace);
    }

    /// Symbols that only have a meaning in markup mode.
    #[test]
    fn test_tokenize_markup_symbols() {
        // Test markup tokens.
        t!(Markup[" a1"]: "*"   => Star);
        t!(Markup: "_"          => Underscore);
        t!(Markup[""]: "###"    => Hashtag, Hashtag, Hashtag);
        t!(Markup["a1/"]: "# "  => Hashtag, Space(0));
        t!(Markup: "~"          => Tilde);
        t!(Markup[" "]: r"\"    => Backslash);
        t!(Markup["a "]: r"a--" => Text("a"), HyphHyph);
        t!(Markup["a1/"]: "- "  => Hyph, Space(0));
        // Numberings need to be followed by whitespace or the end of the
        // source, otherwise they are text.
        t!(Markup[" "]: "."     => Numbering(None));
        t!(Markup[" "]: "1."    => Numbering(Some(1)));
        t!(Markup[" "]: "1.a"   => Text("1."), Text("a"));
        t!(Markup[" /"]: "a1."  => Text("a1."));
    }

    /// Operators and punctuation in code mode, including the interplay of
    /// hyphens with numbers and identifiers.
    #[test]
    fn test_tokenize_code_symbols() {
        // Test all symbols.
        t!(Code: ","        => Comma);
        t!(Code: ";"        => Semicolon);
        t!(Code: ":"        => Colon);
        t!(Code: "+"        => Plus);
        t!(Code: "-"        => Hyph);
        // Symbol suffixes are disabled for the star and the slash because
        // they could combine with the suffix into a comment delimiter.
        t!(Code[" a1"]: "*" => Star);
        t!(Code[" a1"]: "/" => Slash);
        // The lone exclamation mark was previously not covered here.
        t!(Code: "!"        => Excl);
        t!(Code: "="        => Eq);
        t!(Code: "=="       => EqEq);
        t!(Code: "!="       => ExclEq);
        t!(Code: "<"        => Lt);
        t!(Code: "<="       => LtEq);
        t!(Code: ">"        => Gt);
        t!(Code: ">="       => GtEq);
        t!(Code: "+="       => PlusEq);
        t!(Code: "-="       => HyphEq);
        t!(Code: "*="       => StarEq);
        t!(Code: "/="       => SlashEq);
        t!(Code: ".."       => Dots);
        t!(Code: "=>"       => Arrow);

        // Test combinations.
        t!(Code: "<=>"        => LtEq, Gt);
        t!(Code[" a/"]: "..." => Dots, Invalid("."));

        // Test hyphen as symbol vs part of identifier.
        t!(Code[" /"]: "-1"   => Hyph, Int(1));
        t!(Code[" /"]: "-a"   => Hyph, Ident("a"));
        t!(Code[" /"]: "--1"  => Hyph, Hyph, Int(1));
        t!(Code[" /"]: "--_a" => Hyph, Hyph, Ident("_a"));
        t!(Code[" /"]: "a-b"  => Ident("a-b"));
    }

    /// Keywords are recognized after a hash in markup and as bare words in
    /// code.
    #[test]
    fn test_tokenize_keywords() {
        // A list of a few (not all) keywords.
        let list = [
            ("not", Not),
            ("let", Let),
            ("if", If),
            ("else", Else),
            ("for", For),
            ("in", In),
            ("import", Import),
        ];

        // In markup, the keyword must directly follow the hash.
        for &(s, t) in &list {
            t!(Markup[" "]: format!("#{}", s) => t);
            t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
            t!(Markup[" /"]: format!("# {}", s) => Token::Hashtag, Space(0), Text(s));
        }

        // The bare word is a keyword in code, but just text in markup.
        for &(s, t) in &list {
            t!(Code[" "]: s => t);
            t!(Markup[" /"]: s => Text(s));
        }

        // Test simple identifier.
        t!(Markup[" "]: "#letter" => Ident("letter"));
        t!(Code[" /"]: "falser"   => Ident("falser"));
        t!(Code[" /"]: "None"     => Ident("None"));
        t!(Code[" /"]: "True"     => Ident("True"));
    }

    /// Raw blocks are delimited by equally long runs of backticks.
    #[test]
    fn test_tokenize_raw_blocks() {
        // Exactly two backticks form an empty raw block with one backtick.
        let empty = Raw("", 1, true);

        // Test basic raw block.
        t!(Markup: "``"     => empty);
        t!(Markup: "`raw`"  => Raw("raw", 1, true));
        t!(Markup[""]: "`]" => Raw("]", 1, false));

        // Test special symbols in raw block.
        t!(Markup: "`[brackets]`" => Raw("[brackets]", 1, true));
        t!(Markup[""]: r"`\`` "   => Raw(r"\", 1, true), Raw(" ", 1, false));

        // Test separated closing backticks.
        t!(Markup: "```not `y`e`t```" => Raw("not `y`e`t", 3, true));

        // Test more backticks.
        t!(Markup: "``nope``"             => empty, Text("nope"), empty);
        t!(Markup: "````🚀````"           => Raw("🚀", 4, true));
        t!(Markup[""]: "`````👩‍🚀````noend" => Raw("👩‍🚀````noend", 5, false));
        t!(Markup[""]: "````raw``````"    => Raw("raw", 4, true), empty);
    }

    /// Math formulas: inline formulas are delimited by `$`, display formulas
    /// by `$[` and `]$`.
    #[test]
    fn test_tokenize_math_formulas() {
        // Test basic formula.
        t!(Markup: "$$"        => Math("", false, true));
        t!(Markup: "$x$"       => Math("x", false, true));
        t!(Markup: r"$\\$"     => Math(r"\\", false, true));
        t!(Markup: "$[x + y]$" => Math("x + y", true, true));
        t!(Markup: r"$[\\]$"   => Math(r"\\", true, true));

        // Test unterminated.
        t!(Markup[""]: "$x"      => Math("x", false, false));
        t!(Markup[""]: "$[x"     => Math("x", true, false));
        // The dollar sign has to directly follow the closing bracket.
        t!(Markup[""]: "$[x]\n$" => Math("x]\n$", true, false));

        // Test escape sequences.
        t!(Markup: r"$\$x$"       => Math(r"\$x", false, true));
        t!(Markup: r"$[\\\]$]$"   => Math(r"\\\]$", true, true));
        t!(Markup[""]: r"$[ ]\\$" => Math(r" ]\\$", true, false));
    }

    /// Identifiers in code mode. Letter and number suffixes are disabled
    /// because they would become part of the identifier.
    #[test]
    fn test_tokenize_idents() {
        // Test valid identifiers.
        t!(Code[" /"]: "x"           => Ident("x"));
        t!(Code[" /"]: "value"       => Ident("value"));
        t!(Code[" /"]: "__main__"    => Ident("__main__"));
        t!(Code[" /"]: "_snake_case" => Ident("_snake_case"));

        // Test non-ascii.
        t!(Code[" /"]: "α"    => Ident("α"));
        t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ"));

        // Test hyphen parsed as identifier.
        t!(Code[" /"]: "kebab-case" => Ident("kebab-case"));
        t!(Code[" /"]: "one-10"     => Ident("one-10"));
    }

    /// Integers, floats and numbers with unit suffixes in code mode.
    #[test]
    fn test_tokenize_numeric() {
        // Pairs of source text and expected value.
        let ints = [("7", 7), ("012", 12)];
        let floats = [
            (".3", 0.3),
            ("0.3", 0.3),
            ("3.", 3.0),
            ("3.0", 3.0),
            ("14.3", 14.3),
            ("10e2", 1000.0),
            ("10e+0", 10.0),
            ("10e+1", 100.0),
            ("10e-2", 0.1),
            ("10.e1", 100.0),
            ("10.e-1", 1.0),
            (".1e1", 1.0),
            ("10E2", 1000.0),
        ];

        // Test integers.
        for &(s, v) in &ints {
            t!(Code[" /"]: s => Int(v));
        }

        // Test floats.
        for &(s, v) in &floats {
            t!(Code[" /"]: s => Float(v));
        }

        // Test attached numbers.
        t!(Code[" /"]: ".2.3"  => Float(0.2), Float(0.3));
        t!(Code[" /"]: "1.2.3"  => Float(1.2), Float(0.3));
        t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));

        // Test float from too large integer.
        let large = i64::MAX as f64 + 1.0;
        t!(Code[" /"]: large.to_string() => Float(large));

        // Combined integers and floats.
        let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats.iter().copied());

        // Pairs of suffix and constructor for the expected token.
        let suffixes = [
            ("%", Percent as fn(f64) -> Token<'static>),
            ("fr", Fraction as fn(f64) -> Token<'static>),
            ("mm", |x| Length(x, LengthUnit::Mm)),
            ("pt", |x| Length(x, LengthUnit::Pt)),
            ("cm", |x| Length(x, LengthUnit::Cm)),
            ("in", |x| Length(x, LengthUnit::In)),
            ("rad", |x| Angle(x, AngularUnit::Rad)),
            ("deg", |x| Angle(x, AngularUnit::Deg)),
        ];

        // Numeric types.
        // Every number is combined with every suffix.
        for &(suffix, build) in &suffixes {
            for (s, v) in nums.clone() {
                t!(Code[" /"]: format!("{}{}", s, suffix) => build(v));
            }
        }
    }

    /// Hexadecimal colors in code mode with three, six and eight digits.
    #[test]
    fn test_tokenize_color() {
        t!(Code[" /"]: "#ABC" => Color(0xAA, 0xBB, 0xCC, 0xff));
        t!(Code[" /"]: "#6ae6dd" => Color(0x6a, 0xe6, 0xdd, 0xff));
        t!(Code[" /"]: "#8A083caf" => Color(0x8A, 0x08, 0x3c, 0xaf));
    }

    /// Strings in code mode. The string token keeps escape sequences as
    /// written in the source.
    #[test]
    fn test_tokenize_strings() {
        // Test basic strings.
        t!(Code: "\"hi\""        => Str("hi", true));
        t!(Code: "\"hi\nthere\"" => Str("hi\nthere", true));
        t!(Code: "\"🌎\""        => Str("🌎", true));

        // Test unterminated.
        t!(Code[""]: "\"hi"      => Str("hi", false));

        // Test escaped quote.
        t!(Code: r#""a\"bc""# => Str(r#"a\"bc"#, true));
        t!(Code[""]: r#""\""# => Str(r#"\""#, false));
    }

    /// Line comments run up to the next newline, which is not part of the
    /// comment token.
    #[test]
    fn test_tokenize_line_comments() {
        // Test line comment with no trailing newline.
        t!(Both[""]: "//" => LineComment(""));

        // Test line comment ends at newline.
        t!(Both["a1/"]: "//bc\n"   => LineComment("bc"), Space(1));
        t!(Both["a1/"]: "// bc \n" => LineComment(" bc "), Space(1));
        t!(Both["a1/"]: "//bc\r\n" => LineComment("bc"), Space(1));

        // Test nested line comments.
        t!(Both["a1/"]: "//a//b\n" => LineComment("a//b"), Space(1));
    }

    /// Block comments nest; an unterminated comment extends to the end of the
    /// source.
    #[test]
    fn test_tokenize_block_comments() {
        // Test basic block comments.
        t!(Both[""]: "/*" => BlockComment(""));
        t!(Both: "/**/"   => BlockComment(""));
        t!(Both: "/*🏞*/" => BlockComment("🏞"));
        t!(Both: "/*\n*/" => BlockComment("\n"));

        // Test depth 1 and 2 nested block comments.
        t!(Both: "/* /* */ */"  => BlockComment(" /* */ "));
        t!(Both: "/*/*/**/*/*/" => BlockComment("/*/**/*/"));

        // Test two nested, one unclosed block comments.
        t!(Both[""]: "/*/*/**/*/" => BlockComment("/*/**/*/"));

        // Test all combinations of up to two following slashes and stars.
        t!(Both[""]: "/*"   => BlockComment(""));
        t!(Both[""]: "/*/"  => BlockComment("/"));
        t!(Both[""]: "/**"  => BlockComment("*"));
        t!(Both[""]: "/*//" => BlockComment("//"));
        t!(Both[""]: "/*/*" => BlockComment("/*"));
        t!(Both[""]: "/**/" => BlockComment(""));
        t!(Both[""]: "/***" => BlockComment("**"));
    }

    /// Input that cannot be tokenized meaningfully yields `Invalid` tokens
    /// that carry the offending source text.
    #[test]
    fn test_tokenize_invalid() {
        // Test invalidly closed block comments.
        t!(Both: "*/"     => Token::Invalid("*/"));
        t!(Both: "/**/*/" => BlockComment(""), Token::Invalid("*/"));

        // Test invalid expressions.
        t!(Code: r"\"        => Invalid(r"\"));
        t!(Code: "🌓"        => Invalid("🌓"));
        t!(Code: r"\:"       => Invalid(r"\"), Colon);
        t!(Code: "meal⌚"    => Ident("meal"), Invalid("⌚"));
        t!(Code[" /"]: r"\a" => Invalid(r"\"), Ident("a"));

        // Test invalid number suffixes.
        // An unknown suffix invalidates the number together with the suffix.
        t!(Code[" /"]: "1foo" => Invalid("1foo"));
        t!(Code: "1p%"        => Invalid("1p"), Invalid("%"));
        t!(Code: "1%%"        => Percent(1.0), Invalid("%"));

        // Test invalid color.
        t!(Code[" /"]: r"#letter" => Invalid(r"#letter"));
    }
}