7. Return SyntaxNodes from the Lexer

2025-05-13 20:46:23 +08:00 · 2024-10-10 11:57:27 -04:00 · 2024-10-10 11:57:27 -04:00 · 1cecae0333
commit 1cecae0333
parent 01186779cd
2 changed files with 44 additions and 37 deletions
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -4,12 +4,12 @@ use unicode_script::{Script, UnicodeScript};
 use unicode_segmentation::UnicodeSegmentation;
 use unscanny::Scanner;

-use crate::{SyntaxError, SyntaxKind};
+use crate::{SyntaxError, SyntaxKind, SyntaxNode};

-/// Splits up a string of source code into tokens.
+/// An iterator over a source code string which returns tokens.
 #[derive(Clone)]
 pub(super) struct Lexer<'s> {
-    /// The underlying scanner.
+    /// The scanner: contains the underlying string and location as a "cursor".
    s: Scanner<'s>,
    /// The mode the lexer is in. This determines which kinds of tokens it
    /// produces.
@@ -73,11 +73,6 @@ impl<'s> Lexer<'s> {
    pub fn newline(&self) -> bool {
        self.newline
    }
-
-    /// Take out the last error, if any.
-    pub fn take_error(&mut self) -> Option<SyntaxError> {
-        self.error.take()
-    }
 }

 impl Lexer<'_> {
@@ -97,21 +92,24 @@ impl Lexer<'_> {

 /// Shared methods with all [`LexMode`].
 impl Lexer<'_> {
-    /// Proceed to the next token and return its [`SyntaxKind`]. Note the
-    /// token could be a [trivia](SyntaxKind::is_trivia).
-    pub fn next(&mut self) -> SyntaxKind {
+    /// Return the next token in our text. Returns both the [`SyntaxNode`]
+    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind.
+    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
+        debug_assert!(self.error.is_none());
+        let start = self.s.cursor();
        if self.mode == LexMode::Raw {
-            let Some((kind, end)) = self.raw.pop() else {
-                return SyntaxKind::End;
+            let kind = if let Some((kind, end)) = self.raw.pop() {
+                self.s.jump(end);
+                kind
+            } else {
+                SyntaxKind::End
            };
-            self.s.jump(end);
-            return kind;
+            let node = SyntaxNode::leaf(kind, self.s.from(start));
+            return (kind, node);
        }

        self.newline = false;
-        self.error = None;
-        let start = self.s.cursor();
-        match self.s.eat() {
+        let kind = match self.s.eat() {
            Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
            Some('/') if self.s.eat_if('/') => self.line_comment(),
            Some('/') if self.s.eat_if('*') => self.block_comment(),
@@ -132,13 +130,21 @@ impl Lexer<'_> {
            },

            None => SyntaxKind::End,
-        }
+        };
+
+        let text = self.s.from(start);
+        let node = match self.error.take() {
+            Some(error) => SyntaxNode::error(error, text),
+            None => SyntaxNode::leaf(kind, text),
+        };
+        (kind, node)
    }

    /// Eat whitespace characters greedily.
    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
        let more = self.s.eat_while(|c| is_space(c, self.mode));
        let newlines = match c {
+            // Optimize eating a single space.
            ' ' if more.is_empty() => 0,
            _ => count_newlines(self.s.from(start)),
        };
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -185,7 +185,7 @@ fn heading(p: &mut Parser) {
    whitespace_line(p);
    markup(p, false, usize::MAX, |p| {
        p.at_set(syntax_set!(Label, Space, RightBracket))
-            && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label)
+            && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label)
    });
    p.wrap(m, SyntaxKind::Heading);
 }
@@ -282,7 +282,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
            while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && {
                let mut copy = p.lexer.clone();
                let start = copy.cursor();
-                let next = copy.next();
+                let next = copy.next().0;
                let end = copy.cursor();
                matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text)
                    && is_ident(&p.text[start..end])
@@ -686,8 +686,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
            continue;
        }

-        let at_field_or_method =
-            p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident;
+        let at_field_or_method = p.directly_at(SyntaxKind::Dot)
+            && p.lexer.clone().next().0 == SyntaxKind::Ident;

        if atomic && !at_field_or_method {
            break;
@@ -947,9 +947,8 @@ fn for_loop(p: &mut Parser) {
    let mut seen = HashSet::new();
    pattern(p, false, &mut seen, None);

-    let m2 = p.marker();
-    if p.eat_if(SyntaxKind::Comma) {
-        let node = &mut p[m2];
+    if p.at(SyntaxKind::Comma) {
+        let node = p.eat_and_get();
        node.unexpected();
        node.hint("destructuring patterns must be wrapped in parentheses");
        if p.at_set(set::PATTERN) {
@@ -1563,6 +1562,9 @@ struct Parser<'s> {
    current_start: usize,
    /// The [`SyntaxKind`] of the current token.
    current: SyntaxKind,
+    /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed
+    /// onto the end of `nodes`.
+    current_node: SyntaxNode,
    /// Whether the parser has the expected set of open/close delimiters. This
    /// only ever transitions from `true` to `false`.
    balanced: bool,
@@ -1603,13 +1605,14 @@ impl<'s> Parser<'s> {
    fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
        let mut lexer = Lexer::new(text, mode);
        lexer.jump(offset);
-        let current = lexer.next();
+        let (current, current_node) = lexer.next();
        Self {
            lexer,
            text,
            prev_end: offset,
            current_start: offset,
            current,
+            current_node,
            balanced: true,
            nodes: vec![],
            modes: vec![],
@@ -1722,7 +1725,8 @@ impl<'s> Parser<'s> {

    /// Convert the current token's [`SyntaxKind`] and eat it.
    fn convert_and_eat(&mut self, kind: SyntaxKind) {
-        self.current = kind;
+        // Only need to replace the node here.
+        self.current_node.convert_to_kind(kind);
        self.eat();
    }

@@ -1848,13 +1852,7 @@ impl<'s> Parser<'s> {

    /// Save the current token to the `nodes` vector as an Inner or Error node.
    fn save(&mut self) {
-        let text = self.current_text();
-        if self.at(SyntaxKind::Error) {
-            let error = self.lexer.take_error().unwrap();
-            self.nodes.push(SyntaxNode::error(error, text));
-        } else {
-            self.nodes.push(SyntaxNode::leaf(self.current, text));
-        }
+        self.nodes.push(self.current_node.clone());

        if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() {
            self.prev_end = self.current_end();
@@ -1864,7 +1862,7 @@ impl<'s> Parser<'s> {
    /// Find the kind of the next non-trivia token in the lexer.
    fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind {
        loop {
-            let next = lexer.next();
+            let next = lexer.next().0;
            // Loop is terminable, because `SyntaxKind::End` is not a trivia.
            if !next.is_trivia() {
                break next;
@@ -1876,7 +1874,7 @@ impl<'s> Parser<'s> {
    /// might insert a temporary [`SyntaxKind::End`] based on our newline mode.
    fn lex(&mut self) {
        self.current_start = self.lexer.cursor();
-        self.current = self.lexer.next();
+        (self.current, self.current_node) = self.lexer.next();

        // Special cases to handle newlines in Code.
        if self.lexer.mode() == LexMode::Code
@@ -1931,6 +1929,7 @@ struct PartialState {
    prev_end: usize,
    current_start: usize,
    current: SyntaxKind,
+    current_node: SyntaxNode,
 }

 impl<'s> Parser<'s> {
@@ -1975,6 +1974,7 @@ impl<'s> Parser<'s> {
        self.prev_end = state.prev_end;
        self.current_start = state.current_start;
        self.current = state.current;
+        self.current_node = state.current_node;
    }

    /// Save a checkpoint of the parser state.
@@ -1986,6 +1986,7 @@ impl<'s> Parser<'s> {
            prev_end: self.prev_end,
            current_start: self.current_start,
            current: self.current,
+            current_node: self.current_node.clone(),
        };
        Checkpoint { node_len, state }
    }