diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 721225c6e..cdd4121c9 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -4,12 +4,12 @@ use unicode_script::{Script, UnicodeScript}; use unicode_segmentation::UnicodeSegmentation; use unscanny::Scanner; -use crate::{SyntaxError, SyntaxKind}; +use crate::{SyntaxError, SyntaxKind, SyntaxNode}; -/// Splits up a string of source code into tokens. +/// An iterator over a source code string which returns tokens. #[derive(Clone)] pub(super) struct Lexer<'s> { - /// The underlying scanner. + /// The scanner: contains the underlying string and location as a "cursor". s: Scanner<'s>, /// The mode the lexer is in. This determines which kinds of tokens it /// produces. @@ -73,11 +73,6 @@ impl<'s> Lexer<'s> { pub fn newline(&self) -> bool { self.newline } - - /// Take out the last error, if any. - pub fn take_error(&mut self) -> Option { - self.error.take() - } } impl Lexer<'_> { @@ -97,21 +92,24 @@ impl Lexer<'_> { /// Shared methods with all [`LexMode`]. impl Lexer<'_> { - /// Proceed to the next token and return its [`SyntaxKind`]. Note the - /// token could be a [trivia](SyntaxKind::is_trivia). - pub fn next(&mut self) -> SyntaxKind { + /// Return the next token in our text. Returns both the [`SyntaxNode`] + /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind + pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) { + debug_assert!(self.error.is_none()); + let start = self.s.cursor(); if self.mode == LexMode::Raw { - let Some((kind, end)) = self.raw.pop() else { - return SyntaxKind::End; + let kind = if let Some((kind, end)) = self.raw.pop() { + self.s.jump(end); + kind + } else { + SyntaxKind::End }; - self.s.jump(end); - return kind; + let node = SyntaxNode::leaf(kind, self.s.from(start)); + return (kind, node); } self.newline = false; - self.error = None; - let start = self.s.cursor(); - match self.s.eat() { + let kind = match self.s.eat() { Some(c) if is_space(c, self.mode) => self.whitespace(start, c), Some('/') if self.s.eat_if('/') => self.line_comment(), Some('/') if self.s.eat_if('*') => self.block_comment(), @@ -132,13 +130,21 @@ impl Lexer<'_> { }, None => SyntaxKind::End, - } + }; + + let text = self.s.from(start); + let node = match self.error.take() { + Some(error) => SyntaxNode::error(error, text), + None => SyntaxNode::leaf(kind, text), + }; + (kind, node) } /// Eat whitespace characters greedily. fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind { let more = self.s.eat_while(|c| is_space(c, self.mode)); let newlines = match c { + // Optimize eating a single space. ' ' if more.is_empty() => 0, _ => count_newlines(self.s.from(start)), }; diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 19e8adbbb..b69486411 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -185,7 +185,7 @@ fn heading(p: &mut Parser) { whitespace_line(p); markup(p, false, usize::MAX, |p| { p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label) + && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label) }); p.wrap(m, SyntaxKind::Heading); } @@ -282,7 +282,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { let mut copy = p.lexer.clone(); let start = copy.cursor(); - let next = copy.next(); + let next = copy.next().0; let end = copy.cursor(); matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) && is_ident(&p.text[start..end]) @@ -686,8 +686,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { continue; } - let at_field_or_method = - p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident; + let at_field_or_method = p.directly_at(SyntaxKind::Dot) + && p.lexer.clone().next().0 == SyntaxKind::Ident; if atomic && !at_field_or_method { break; @@ -947,9 +947,8 @@ fn for_loop(p: &mut Parser) { let mut seen = HashSet::new(); pattern(p, false, &mut seen, None); - let m2 = p.marker(); - if p.eat_if(SyntaxKind::Comma) { - let node = &mut p[m2]; + if p.at(SyntaxKind::Comma) { + let node = p.eat_and_get(); node.unexpected(); node.hint("destructuring patterns must be wrapped in parentheses"); if p.at_set(set::PATTERN) { @@ -1563,6 +1562,9 @@ struct Parser<'s> { current_start: usize, /// The [`SyntaxKind`] of the current token. current: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + current_node: SyntaxNode, /// Whether the parser has the expected set of open/close delimiters. This /// only ever transitions from `true` to `false`. balanced: bool, @@ -1603,13 +1605,14 @@ impl<'s> Parser<'s> { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let current = lexer.next(); + let (current, current_node) = lexer.next(); Self { lexer, text, prev_end: offset, current_start: offset, current, + current_node, balanced: true, nodes: vec![], modes: vec![], @@ -1722,7 +1725,8 @@ impl<'s> Parser<'s> { /// Convert the current token's [`SyntaxKind`] and eat it. fn convert_and_eat(&mut self, kind: SyntaxKind) { - self.current = kind; + // Only need to replace the node here. + self.current_node.convert_to_kind(kind); self.eat(); } @@ -1848,13 +1852,7 @@ impl<'s> Parser<'s> { /// Save the current token to the `nodes` vector as an Inner or Error node. fn save(&mut self) { - let text = self.current_text(); - if self.at(SyntaxKind::Error) { - let error = self.lexer.take_error().unwrap(); - self.nodes.push(SyntaxNode::error(error, text)); - } else { - self.nodes.push(SyntaxNode::leaf(self.current, text)); - } + self.nodes.push(self.current_node.clone()); if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { self.prev_end = self.current_end(); @@ -1864,7 +1862,7 @@ impl<'s> Parser<'s> { /// Find the kind of the next non-trivia token in the lexer. fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { - let next = lexer.next(); + let next = lexer.next().0; // Loop is terminable, because `SyntaxKind::End` is not a trivia. if !next.is_trivia() { break next; @@ -1876,7 +1874,7 @@ impl<'s> Parser<'s> { /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. fn lex(&mut self) { self.current_start = self.lexer.cursor(); - self.current = self.lexer.next(); + (self.current, self.current_node) = self.lexer.next(); // Special cases to handle newlines in Code. if self.lexer.mode() == LexMode::Code @@ -1931,6 +1929,7 @@ struct PartialState { prev_end: usize, current_start: usize, current: SyntaxKind, + current_node: SyntaxNode, } impl<'s> Parser<'s> { @@ -1975,6 +1974,7 @@ impl<'s> Parser<'s> { self.prev_end = state.prev_end; self.current_start = state.current_start; self.current = state.current; + self.current_node = state.current_node; } /// Save a checkpoint of the parser state. @@ -1986,6 +1986,7 @@ impl<'s> Parser<'s> { prev_end: self.prev_end, current_start: self.current_start, current: self.current, + current_node: self.current_node.clone(), }; Checkpoint { node_len, state } }