Optimize scanner and tokenizer

2025-07-05 19:52:53 +08:00 · 2021-07-13 15:11:42 +02:00 · 2021-07-13 15:11:42 +02:00 · 81f2f8f4c3
commit 81f2f8f4c3
parent 0481192a77
3 changed files with 79 additions and 52 deletions
--- a/src/parse/lines.rs
+++ b/src/parse/lines.rs
@ -94,6 +94,7 @@ pub fn search_column(src: &str) -> usize {
 }
 /// Whether this character denotes a newline.
 #[inline]
 pub fn is_newline(character: char) -> bool {
    matches!(
        character,
--- a/src/parse/scanner.rs
+++ b/src/parse/scanner.rs
@ -13,11 +13,13 @@ pub struct Scanner<'s> {
 impl<'s> Scanner<'s> {
    /// Create a new char scanner over `src`.
    ///
    /// The scanner starts at index 0, i.e. at the beginning of the string.
    #[inline]
    pub fn new(src: &'s str) -> Self {
        Self { src, index: 0 }
    }
    /// Consume the next char.
    #[inline]
    pub fn eat(&mut self) -> Option<char> {
        let next = self.peek();
        if let Some(c) = next {
@ -29,6 +31,7 @@ impl<'s> Scanner<'s> {
    /// Consume the next char if it is the given one.
    ///
    /// Returns whether the char was consumed.
    #[inline]
    pub fn eat_if(&mut self, c: char) -> bool {
        let matches = self.peek() == Some(c);
        if matches {
@ -38,12 +41,14 @@ impl<'s> Scanner<'s> {
    }
    /// Consume the next char, debug-asserting that it is the given one.
    ///
    /// The char is always consumed; only the comparison against `c` is
    /// limited to builds with debug assertions enabled.
    #[inline]
    pub fn eat_assert(&mut self, c: char) {
        // Call `eat()` outside of the assertion macro: the arguments of
        // `debug_assert_eq!` are not evaluated when debug assertions are off,
        // so eating inside the macro would skip the consumption there.
        let next = self.eat();
        debug_assert_eq!(next, Some(c));
    }
    /// Consume the next char, coalescing `\r\n` to just `\n`.
    #[inline]
    pub fn eat_merging_crlf(&mut self) -> Option<char> {
        if self.rest().starts_with("\r\n") {
            self.index += 2;
@ -54,6 +59,7 @@ impl<'s> Scanner<'s> {
    }
    /// Eat chars while the condition is true.
    #[inline]
    pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
    where
        F: FnMut(char) -> bool,
@ -62,6 +68,7 @@ impl<'s> Scanner<'s> {
    }
    /// Eat chars until the condition is true.
    #[inline]
    pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
    where
        F: FnMut(char) -> bool,
@ -77,11 +84,13 @@ impl<'s> Scanner<'s> {
    }
    /// Uneat the last eaten char.
    ///
    /// Moves the current index back to the value reported by `last_index()`.
    #[inline]
    pub fn uneat(&mut self) {
        self.index = self.last_index();
    }
    /// Peek at the next char without consuming it.
    ///
    /// Returns the first char of `rest()`, or `None` if `rest()` is empty.
    #[inline]
    pub fn peek(&self) -> Option<char> {
        self.rest().chars().next()
    }
@ -89,6 +98,7 @@ impl<'s> Scanner<'s> {
    /// Checks whether the next char fulfills a condition.
    ///
    /// Returns `default` if there is no next char.
    #[inline]
    pub fn check_or<F>(&self, default: bool, f: F) -> bool
    where
        F: FnOnce(char) -> bool,
@ -97,6 +107,7 @@ impl<'s> Scanner<'s> {
    }
    /// The previous index in the source string.
    #[inline]
    pub fn last_index(&self) -> usize {
        self.eaten()
            .chars()
@ -105,43 +116,53 @@ impl<'s> Scanner<'s> {
    }
    /// The current index in the source string.
    ///
    /// This is the position at which `rest()` starts and `eaten()` ends.
    #[inline]
    pub fn index(&self) -> usize {
        self.index
    }
    /// Jump to an index in the source string.
    ///
    /// # Panics
    ///
    /// Panics with "jumped to invalid index" if `index ..` is not a valid
    /// range of the source string, i.e. if `index` is out of bounds or does
    /// not lie on a codepoint boundary.
    #[inline]
    pub fn jump(&mut self, index: usize) {
        // Make sure that the index is in bounds and on a codepoint boundary.
        // The resulting slice is discarded; only the check itself matters.
        self.src.get(index ..).expect("jumped to invalid index");
        self.index = index;
    }
-    /// Slice a part out of the source string.
+    /// Slice out part of the source string.
    #[inline]
    pub fn get<I>(&self, index: I) -> &'s str
    where
        I: SliceIndex<str, Output = str>,
    {
-        &self.src[index]
+        // See `eaten_from` for details about `unwrap_or_default`.
        self.src.get(index).unwrap_or_default()
    }
-    /// The full source string up to the current index.
+    /// The remaining source string after the current index.
-    pub fn eaten(&self) -> &'s str {
+    #[inline]
    pub fn rest(&self) -> &'s str {
        // Returns the part of `src` that starts at the current index, using
        // unchecked slicing to skip the bounds and boundary checks.
        //
        // SAFETY: The index is always in bounds and on a codepoint boundary
        // since it is:
        // - either increased by the length of a scanned character,
        // - or checked upon jumping.
        unsafe { self.src.get_unchecked(self.index ..) }
    }
    /// The full source string up to the current index.
    ///
    /// This is the part of `src` that lies before `index()`; together with
    /// `rest()` it makes up the whole source string.
    #[inline]
    pub fn eaten(&self) -> &'s str {
        // SAFETY: The index is always okay, for details see `rest()`.
        unsafe { self.src.get_unchecked(.. self.index) }
    }
    /// The source string from `start` to the current index.
    #[inline]
    pub fn eaten_from(&self, start: usize) -> &'s str {
-        &self.src[start .. self.index]
+        // Using `unwrap_or_default` is much faster than unwrap, probably
-    }
+        // because then the whole call to `eaten_from` is pure and can be
-
+        // optimized away in some cases.
-    /// The remaining source string after the current index.
+        self.src.get(start .. self.index).unwrap_or_default()
    pub fn rest(&self) -> &'s str {
        // SAFETY: The index is always okay, for details see `eaten()`.
        unsafe { self.src.get_unchecked(self.index ..) }
    }
 }
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@ -22,22 +22,26 @@ pub enum TokenMode {
 impl<'s> Tokens<'s> {
    /// Create a new token iterator with the given mode.
    ///
    /// The iterator reads from a fresh `Scanner` over `src`.
    #[inline]
    pub fn new(src: &'s str, mode: TokenMode) -> Self {
        Self { s: Scanner::new(src), mode }
    }
    /// Get the current token mode.
    ///
    /// This is the mode passed to `new` or last set through `set_mode`.
    #[inline]
    pub fn mode(&self) -> TokenMode {
        self.mode
    }
    /// Change the token mode.
    ///
    /// Only the stored mode is replaced; the scanner position is untouched.
    #[inline]
    pub fn set_mode(&mut self, mode: TokenMode) {
        self.mode = mode;
    }
    /// The index in the string at which the last token ends and next token
    /// will start.
    ///
    /// This is the current index of the underlying scanner.
    #[inline]
    pub fn index(&self) -> usize {
        self.s.index()
    }
@ -45,11 +49,13 @@ impl<'s> Tokens<'s> {
    /// Jump to the given index in the string.
    ///
    /// You need to know the correct column.
    ///
    /// This forwards to the underlying scanner's `jump`, which panics if the
    /// index is out of bounds or not on a codepoint boundary.
    #[inline]
    pub fn jump(&mut self, index: usize) {
        self.s.jump(index);
    }
    /// The underlying scanner.
    ///
    /// The scanner is returned by value, so the caller receives its own copy
    /// of the current scanning state rather than a reference into `self`.
    #[inline]
    pub fn scanner(&self) -> Scanner<'s> {
        self.s
    }
@ -59,6 +65,7 @@ impl<'s> Iterator for Tokens<'s> {
    type Item = Token<'s>;
    /// Parse the next token in the source code.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.s.index();
        let c = self.s.eat()?;
@ -70,7 +77,8 @@ impl<'s> Iterator for Tokens<'s> {
            '}' => Token::RightBrace,
            // Whitespace.
-            c if c.is_whitespace() => self.whitespace(c),
+            ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => Token::Space(0),
            c if c.is_whitespace() => self.whitespace(),
            // Comments with special case for URLs.
            '/' if self.s.eat_if('*') => self.block_comment(),
@ -87,6 +95,7 @@ impl<'s> Iterator for Tokens<'s> {
 }
 impl<'s> Tokens<'s> {
    #[inline]
    fn markup(&mut self, start: usize, c: char) -> Token<'s> {
        match c {
            // Escape sequences.
@ -158,54 +167,49 @@ impl<'s> Tokens<'s> {
        }
    }
-    fn whitespace(&mut self, first: char) -> Token<'s> {
+    #[inline]
-        // Fast path for just a single space
+    fn text(&mut self, start: usize) -> Token<'s> {
-        if first == ' ' && self.s.check_or(true, |c| !c.is_whitespace()) {
+        macro_rules! table {
-            Token::Space(0)
+            ($($c:literal)|*) => {{
-        } else {
+                let mut t = [false; 128];
-            self.s.uneat();
+                $(t[$c as usize] = true;)*
-
+                t
-            // Count the number of newlines.
+            }}
            let mut newlines = 0;
            while let Some(c) = self.s.eat_merging_crlf() {
                if !c.is_whitespace() {
                    self.s.uneat();
                    break;
                }
                if is_newline(c) {
                    newlines += 1;
                }
            }
            Token::Space(newlines)
        }
        const TABLE: [bool; 128] = table! {
            // Ascii whitespace.
            ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' |
            // Comments, parentheses, code.
            '/' | '[' | ']' | '{' | '}' | '#' |
            // Markup
            '~' | '*' | '_' | '`' | '$' | '-' | '\\'
        };
        self.s.eat_until(|c| {
            TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
        });
        Token::Text(self.s.eaten_from(start))
    }
-    fn text(&mut self, start: usize) -> Token<'s> {
+    fn whitespace(&mut self) -> Token<'s> {
-        while let Some(c) = self.s.eat() {
+        self.s.uneat();
-            if match c {
+
-                // Whitespace.
+        // Count the number of newlines.
-                c if c.is_whitespace() => true,
+        let mut newlines = 0;
-                // Comments.
+        while let Some(c) = self.s.eat_merging_crlf() {
-                '/' => true,
+            if !c.is_whitespace() {
                // Parentheses.
                '[' | ']' | '{' | '}' => true,
                // Code.
                '#' => true,
                // Markup.
                '~' | '*' | '_' | '`' | '$' | '-' => true,
                // Escaping.
                '\\' => true,
                // Just text.
                _ => false,
            } {
                self.s.uneat();
                break;
            }
            if is_newline(c) {
                newlines += 1;
            }
        }
-        Token::Text(self.s.eaten_from(start))
+        Token::Space(newlines)
    }
    fn backslash(&mut self) -> Token<'s> {
@ -238,6 +242,7 @@ impl<'s> Tokens<'s> {
        }
    }
    #[inline]
    fn hash(&mut self) -> Token<'s> {
        if self.s.check_or(false, is_id_start) {
            let read = self.s.eat_while(is_id_continue);