From a320e92cfad7b97cdf0e36b45db87863c128798e Mon Sep 17 00:00:00 2001
From: PgBiel <9021226+PgBiel@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:37:22 -0300
Subject: [PATCH] lexer now returns nodes

---
 crates/typst-syntax/src/lexer.rs  |  61 +++++++-------
 crates/typst-syntax/src/node.rs   |   5 ++
 crates/typst-syntax/src/parser.rs | 131 +++++++++++++-----------------
 3 files changed, 94 insertions(+), 103 deletions(-)

diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index e1fb00268..cd681d9c2 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -4,7 +4,7 @@ use unicode_script::{Script, UnicodeScript};
 use unicode_segmentation::UnicodeSegmentation;
 use unscanny::Scanner;
 
-use crate::{SyntaxError, SyntaxKind};
+use crate::{SyntaxError, SyntaxKind, SyntaxNode};
 
 /// Splits up a string of source code into tokens.
 #[derive(Clone)]
@@ -18,10 +18,6 @@ pub(super) struct Lexer<'s> {
     newline: bool,
     /// The state held by raw line lexing.
     raw: Vec<(SyntaxKind, usize)>,
-    /// The subtree of tokens associated with this token.
-    /// The parser is responsible for converting this subtree into syntax nodes
-    /// matching this structure.
-    subtree: Vec<(SyntaxKind, usize)>,
     /// An error for the last token.
     error: Option<SyntaxError>,
 }
@@ -49,7 +45,6 @@ impl<'s> Lexer<'s> {
             newline: false,
             error: None,
             raw: Vec::new(),
-            subtree: Vec::new(),
         }
     }
 
@@ -104,24 +99,24 @@ impl Lexer<'_> {
 impl Lexer<'_> {
     /// Proceed to the next token and return its [`SyntaxKind`]. Note the
     /// token could be a [trivia](SyntaxKind::is_trivia).
-    pub fn next(&mut self) -> SyntaxKind {
+    pub fn next(&mut self) -> SyntaxNode {
         if self.mode == LexMode::Raw {
             let Some((kind, end)) = self.raw.pop() else {
-                return SyntaxKind::End;
+                return SyntaxNode::end();
             };
+            let start = self.s.cursor();
             self.s.jump(end);
-            return kind;
+            return self.emit_token(kind, start);
         }
 
         self.newline = false;
         self.error = None;
-        self.subtree.clear();
         let start = self.s.cursor();
-        match self.s.eat() {
+        let token = match self.s.eat() {
             Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
             Some('/') if self.s.eat_if('/') => self.line_comment(),
             Some('/') if self.s.eat_if('*') => self.block_comment(),
-            Some('/') if self.s.eat_if('!') => self.decorator(),
+            Some('/') if self.s.eat_if('!') => return self.decorator(start),
             Some('*') if self.s.eat_if('/') => {
                 let kind = self.error("unexpected end of block comment");
                 self.hint(
@@ -138,12 +133,22 @@
             },
 
             None => SyntaxKind::End,
-        }
+        };
+
+        self.emit_token(token, start)
     }
 
-    /// Takes the subtree associated with the latest token.
-    pub fn take_subtree(&mut self) -> Vec<(SyntaxKind, usize)> {
-        std::mem::take(&mut self.subtree)
+    /// Converts a token into a syntax node based on its kind.
+    /// Produces an error node if there are errors.
+    fn emit_token(&mut self, kind: SyntaxKind, start: usize) -> SyntaxNode {
+        let text = self.s.from(start);
+        if kind == SyntaxKind::End {
+            SyntaxNode::end()
+        } else if let Some(error) = self.take_error() {
+            SyntaxNode::error(error, text)
+        } else {
+            SyntaxNode::leaf(kind, text)
+        }
     }
 
     /// Eat whitespace characters greedily.
@@ -192,34 +197,32 @@ impl Lexer<'_> {
         SyntaxKind::BlockComment
     }
 
-    fn decorator(&mut self) -> SyntaxKind {
-        let mut start = self.s.cursor();
+    fn decorator(&mut self, start: usize) -> SyntaxNode {
+        // TODO: DecoratorMarker node
+        let mut current_start = start;
+        let mut subtree = vec![];
         while !self.s.peek().is_some_and(is_newline) {
             let token = match self.s.eat() {
-                Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
+                Some(c) if is_space(c, self.mode) => self.whitespace(current_start, c),
                 Some('/') if self.s.eat_if('/') => break,
                 Some('/') if self.s.eat_if('*') => self.block_comment(),
                 Some('(') => SyntaxKind::LeftParen,
                 Some(')') => SyntaxKind::RightParen,
                 Some('"') => self.string(),
-                Some(c @ '0'..='9') => self.number(start, c),
+                Some(c @ '0'..='9') => self.number(current_start, c),
                 Some(',') => SyntaxKind::Comma,
-                Some(c) if is_id_start(c) => self.ident(start),
+                Some(c) if is_id_start(c) => self.ident(current_start),
                 Some(c) => self
                     .error(eco_format!("the character {c} is not valid in a decorator")),
                 None => break,
             };
 
-            if token.is_error() {
-                return token;
-            }
-
-            let end = self.s.cursor();
-            self.subtree.push((token, end));
-            start = end;
+            let node = self.emit_token(token, current_start);
+            subtree.push(node);
+            current_start = self.s.cursor();
         }
 
-        SyntaxKind::Decorator
+        SyntaxNode::inner(SyntaxKind::Decorator, subtree)
     }
 }
 
diff --git a/crates/typst-syntax/src/node.rs b/crates/typst-syntax/src/node.rs
index 6a7f470bc..1f0244cc9 100644
--- a/crates/typst-syntax/src/node.rs
+++ b/crates/typst-syntax/src/node.rs
@@ -39,6 +39,11 @@ impl SyntaxNode {
         Self(Repr::Error(Arc::new(ErrorNode::new(error, text))))
     }
 
+    /// Create a new end node. It is only used to terminate the token stream.
+    pub const fn end() -> Self {
+        Self::placeholder(SyntaxKind::End)
+    }
+
     /// Create a dummy node of the given kind.
     ///
     /// Panics if `kind` is `SyntaxKind::Error`.
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index e1d4bb951..ed81d0e20 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -44,7 +44,7 @@ fn markup(
     let m = p.marker();
     let mut nesting: usize = 0;
     while !p.end() {
-        match p.current() {
+        match p.current_kind() {
             SyntaxKind::LeftBracket => nesting += 1,
             SyntaxKind::RightBracket if nesting > 0 => nesting -= 1,
             _ if stop(p) => break,
@@ -79,10 +79,10 @@ pub(super) fn reparse_markup(
 ) -> Option<Vec<SyntaxNode>> {
     let mut p = Parser::new(text, range.start, LexMode::Markup);
     while !p.end() && p.current_start() < range.end {
-        match p.current() {
+        match p.current_kind() {
             SyntaxKind::LeftBracket => *nesting += 1,
             SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1,
-            _ if stop(p.current()) => break,
+            _ if stop(p.current_kind()) => break,
             _ => {}
         }
 
@@ -104,7 +104,7 @@ pub(super) fn reparse_markup(
 /// Parses a single markup expression: This includes markup elements like
 /// spaces, text, and headings, and embedded code expressions.
 fn markup_expr(p: &mut Parser, at_start: &mut bool) {
-    match p.current() {
+    match p.current_kind() {
         SyntaxKind::Space
         | SyntaxKind::Parbreak
         | SyntaxKind::LineComment
@@ -203,7 +203,8 @@ fn heading(p: &mut Parser) {
     whitespace_line(p);
     markup(p, false, usize::MAX, |p| {
         p.at_set(END)
-            && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label)
+            && (!p.at(SyntaxKind::Space)
+                || p.lexer.clone().next().kind() == SyntaxKind::Label)
     });
     p.wrap(m, SyntaxKind::Heading);
 }
@@ -256,7 +257,7 @@ fn reference(p: &mut Parser) {
 
 /// Consumes whitespace that does not contain a newline.
 fn whitespace_line(p: &mut Parser) {
-    while !p.newline() && p.current().is_trivia() {
+    while !p.newline() && p.current_kind().is_trivia() {
         p.eat();
     }
 }
@@ -295,7 +296,7 @@ fn math_expr(p: &mut Parser) {
 fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
     let m = p.marker();
     let mut continuable = false;
-    match p.current() {
+    match p.current_kind() {
         SyntaxKind::Hash => embedded_code_expr(p),
         SyntaxKind::MathIdent => {
            continuable = true;
@@ -305,7 +306,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
                 let start = copy.cursor();
                 let next = copy.next();
                 let end = copy.cursor();
-                matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text)
+                matches!(next.kind(), SyntaxKind::MathIdent | SyntaxKind::Text)
                     && is_ident(&p.text[start..end])
             } {
                 p.convert(SyntaxKind::Dot);
@@ -395,11 +396,11 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
     }
 
     // Separate primes and superscripts to different attachments.
-    if primed && p.current() == SyntaxKind::Hat {
+    if primed && p.current_kind() == SyntaxKind::Hat {
         p.wrap(m, SyntaxKind::MathAttach);
     }
 
-    let Some((kind, stop, assoc, mut prec)) = math_op(p.current()) else {
+    let Some((kind, stop, assoc, mut prec)) = math_op(p.current_kind()) else {
         // No attachments, so we need to wrap primes as attachment.
         if primed {
             p.wrap(m, SyntaxKind::MathAttach);
@@ -667,7 +668,7 @@ fn embedded_code_expr(p: &mut Parser) {
     code_expr_prec(p, true, 0);
 
     // Consume error for things like `#12p` or `#"abc\"`.#
-    if !at && !p.current().is_trivia() && !p.end() {
+    if !at && !p.current_kind().is_trivia() && !p.end() {
         p.unexpected();
     }
 
@@ -686,7 +687,7 @@ fn embedded_code_expr(p: &mut Parser) {
 fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
     let m = p.marker();
     if !atomic && p.at_set(set::UNARY_OP) {
-        let op = ast::UnOp::from_kind(p.current()).unwrap();
+        let op = ast::UnOp::from_kind(p.current_kind()).unwrap();
         p.eat();
         code_expr_prec(p, atomic, op.precedence());
         p.wrap(m, SyntaxKind::Unary);
@@ -702,8 +703,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
             continue;
         }
 
-        let at_field_or_method =
-            p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident;
+        let at_field_or_method = p.directly_at(SyntaxKind::Dot)
+            && p.lexer.clone().next().kind() == SyntaxKind::Ident;
 
         if atomic && !at_field_or_method {
             break;
@@ -716,7 +717,7 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
         }
 
         let binop = if p.at_set(set::BINARY_OP) {
-            ast::BinOp::from_kind(p.current())
+            ast::BinOp::from_kind(p.current_kind())
         } else if min_prec <= ast::BinOp::NotIn.precedence() && p.eat_if(SyntaxKind::Not)
         {
             if p.at(SyntaxKind::In) {
@@ -755,7 +756,7 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
 /// composed of.
 fn code_primary(p: &mut Parser, atomic: bool) {
     let m = p.marker();
-    match p.current() {
+    match p.current_kind() {
         SyntaxKind::Ident => {
             p.eat();
             if !atomic && p.at(SyntaxKind::Arrow) {
@@ -813,7 +814,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
 
 /// Parses a content or code block.
 fn block(p: &mut Parser) {
-    match p.current() {
+    match p.current_kind() {
         SyntaxKind::LeftBracket => content_block(p),
         SyntaxKind::LeftBrace => code_block(p),
         _ => p.expected("block"),
@@ -1004,7 +1005,7 @@ fn module_import(p: &mut Parser) {
 /// Parses items to import from a module: `a, b, c`.
 fn import_items(p: &mut Parser) {
     let m = p.marker();
-    while !p.current().is_terminator() {
+    while !p.current_kind().is_terminator() {
         let item_marker = p.marker();
         if !p.eat_if(SyntaxKind::Ident) {
             p.unexpected();
@@ -1023,7 +1024,7 @@ fn import_items(p: &mut Parser) {
             p.wrap(item_marker, SyntaxKind::RenamedImportItem);
         }
 
-        if !p.current().is_terminator() {
+        if !p.current_kind().is_terminator() {
             p.expect(SyntaxKind::Comma);
         }
     }
@@ -1148,7 +1149,7 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
         state.maybe_just_parens = false;
     }
 
-    while !p.current().is_terminator() {
+    while !p.current_kind().is_terminator() {
         if !p.at_set(set::ARRAY_OR_DICT_ITEM) {
             p.unexpected();
             continue;
@@ -1157,7 +1158,7 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
         array_or_dict_item(p, &mut state);
         state.count += 1;
 
-        if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
+        if !p.current_kind().is_terminator() && p.expect(SyntaxKind::Comma) {
             state.maybe_just_parens = false;
         }
     }
@@ -1248,7 +1249,7 @@ fn args(p: &mut Parser) {
     p.assert(SyntaxKind::LeftParen);
 
     let mut seen = HashSet::new();
-    while !p.current().is_terminator() {
+    while !p.current_kind().is_terminator() {
         if !p.at_set(set::ARG) {
             p.unexpected();
             continue;
@@ -1256,7 +1257,7 @@ fn args(p: &mut Parser) {
 
         arg(p, &mut seen);
 
-        if !p.current().is_terminator() {
+        if !p.current_kind().is_terminator() {
             p.expect(SyntaxKind::Comma);
         }
     }
@@ -1313,7 +1314,7 @@ fn params(p: &mut Parser) {
 
     let mut seen = HashSet::new();
     let mut sink = false;
-    while !p.current().is_terminator() {
+    while !p.current_kind().is_terminator() {
         if !p.at_set(set::PARAM) {
             p.unexpected();
             continue;
@@ -1321,7 +1322,7 @@ fn params(p: &mut Parser) {
 
         param(p, &mut seen, &mut sink);
 
-        if !p.current().is_terminator() {
+        if !p.current_kind().is_terminator() {
             p.expect(SyntaxKind::Comma);
         }
     }
@@ -1370,7 +1371,7 @@ fn pattern<'s>(
     seen: &mut HashSet<&'s str>,
     dupe: Option<&'s str>,
 ) {
-    match p.current() {
+    match p.current_kind() {
         SyntaxKind::Underscore => p.eat(),
         SyntaxKind::LeftParen => destructuring_or_parenthesized(p, reassignment, seen),
         _ => pattern_leaf(p, reassignment, seen, dupe),
@@ -1391,7 +1392,7 @@ fn destructuring_or_parenthesized<'s>(
     p.enter_newline_mode(NewlineMode::Continue);
     p.assert(SyntaxKind::LeftParen);
 
-    while !p.current().is_terminator() {
+    while !p.current_kind().is_terminator() {
         if !p.at_set(set::DESTRUCTURING_ITEM) {
             p.unexpected();
             continue;
@@ -1400,7 +1401,7 @@ fn destructuring_or_parenthesized<'s>(
         destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink);
         count += 1;
 
-        if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
+        if !p.current_kind().is_terminator() && p.expect(SyntaxKind::Comma) {
             maybe_just_parens = false;
         }
     }
@@ -1466,7 +1467,7 @@ fn pattern_leaf<'s>(
     seen: &mut HashSet<&'s str>,
     dupe: Option<&'s str>,
 ) {
-    if p.current().is_keyword() {
+    if p.current_kind().is_keyword() {
p.eat_and_get().expected("pattern"); return; } else if !p.at_set(set::PATTERN_LEAF) { @@ -1503,7 +1504,7 @@ struct Parser<'s> { lexer: Lexer<'s>, prev_end: usize, current_start: usize, - current: SyntaxKind, + current: SyntaxNode, balanced: bool, nodes: Vec, modes: Vec, @@ -1531,7 +1532,7 @@ struct Checkpoint<'s> { lexer: Lexer<'s>, prev_end: usize, current_start: usize, - current: SyntaxKind, + current: SyntaxNode, nodes: usize, } @@ -1563,8 +1564,8 @@ impl<'s> Parser<'s> { self.prev_end } - fn current(&self) -> SyntaxKind { - self.current + fn current_kind(&self) -> SyntaxKind { + self.current.kind() } fn current_start(&self) -> usize { @@ -1580,11 +1581,11 @@ impl<'s> Parser<'s> { } fn at(&self, kind: SyntaxKind) -> bool { - self.current == kind + self.current.kind() == kind } fn at_set(&self, set: SyntaxSet) -> bool { - set.contains(self.current) + set.contains(self.current.kind()) } fn end(&self) -> bool { @@ -1592,20 +1593,18 @@ impl<'s> Parser<'s> { } fn directly_at(&self, kind: SyntaxKind) -> bool { - self.current == kind && self.prev_end == self.current_start + self.current.kind() == kind && self.prev_end == self.current_start } fn eat(&mut self) { - self.save(); - self.lex(); + self.save_and_lex(); self.skip(); } #[track_caller] fn eat_and_get(&mut self) -> &mut SyntaxNode { let offset = self.nodes.len(); - self.save(); - self.lex(); + self.save_and_lex(); self.skip(); &mut self.nodes[offset] } @@ -1633,12 +1632,12 @@ impl<'s> Parser<'s> { #[track_caller] fn assert(&mut self, kind: SyntaxKind) { - assert_eq!(self.current, kind); + assert_eq!(self.current_kind(), kind); self.eat(); } fn convert(&mut self, kind: SyntaxKind) { - self.current = kind; + self.current.convert_to_kind(kind); self.eat(); } @@ -1727,7 +1726,7 @@ impl<'s> Parser<'s> { lexer: self.lexer.clone(), prev_end: self.prev_end, current_start: self.current_start, - current: self.current, + current: self.current.clone(), nodes: self.nodes.len(), } } @@ -1742,9 +1741,8 @@ impl<'s> Parser<'s> { fn skip(&mut self) { if self.lexer.mode() != LexMode::Markup { - while self.current.is_trivia() { - self.save(); - self.lex(); + while self.current_kind().is_trivia() { + self.save_and_lex(); } } } @@ -1760,40 +1758,25 @@ impl<'s> Parser<'s> { } } - fn save(&mut self) { - let text = self.current_text(); - let subtree = self.lexer.take_subtree(); - if self.at(SyntaxKind::Error) { - let error = self.lexer.take_error().unwrap(); - self.nodes.push(SyntaxNode::error(error, text)); - } else if !subtree.is_empty() { - let mut text_cursor = self.current_start; - let mut children = Vec::with_capacity(subtree.len()); + fn save_and_lex(&mut self) { + // Replace 'current' with a placeholder node until we lex. + let current = std::mem::replace(&mut self.current, SyntaxNode::end()); - for (kind, end) in subtree { - // Ensure no errors in the subtree - assert!(!kind.is_error()); - - children.push(SyntaxNode::leaf(kind, &self.text[text_cursor..end])); - text_cursor = end; - } - - self.nodes.push(SyntaxNode::inner(self.current, children)); - } else { - self.nodes.push(SyntaxNode::leaf(self.current, text)); - } - - if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { + if self.lexer.mode() == LexMode::Markup || !current.kind().is_trivia() { self.prev_end = self.current_end(); } + + self.nodes.push(current); + + self.lex(); } fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { let next = lexer.next(); // Loop is terminatable, because SyntaxKind::End is not a trivia. 
-            if !next.is_trivia() {
-                break next;
+            if !next.kind().is_trivia() {
+                break next.kind();
             }
         }
     }
@@ -1815,7 +1798,7 @@ impl<'s> Parser<'s> {
                 None => false,
             }
         {
-            self.current = SyntaxKind::End;
+            self.current = SyntaxNode::end();
         }
     }
 }
@@ -1826,7 +1809,7 @@ impl<'s> Parser<'s> {
         let at = self.at(kind);
         if at {
             self.eat();
-        } else if kind == SyntaxKind::Ident && self.current.is_keyword() {
+        } else if kind == SyntaxKind::Ident && self.current_kind().is_keyword() {
             self.trim_errors();
             self.eat_and_get().expected(kind.name());
         } else {
@@ -1872,7 +1855,7 @@ impl<'s> Parser<'s> {
     /// unexpected.
     fn unexpected(&mut self) {
         self.trim_errors();
-        self.balanced &= !self.current.is_grouping();
+        self.balanced &= !self.current_kind().is_grouping();
         self.eat_and_get().unexpected();
     }
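For readers skimming the diff, the following standalone Rust sketch distills the pattern the patch introduces: the lexer records at most one error per token and hands the parser a finished node (a leaf carrying the token's text, or an error node) instead of a bare kind. This is a minimal illustration only; Kind, Node, and the toy Lexer below are hypothetical stand-ins invented for this sketch, not the typst-syntax API, whose real counterparts are SyntaxNode::leaf, SyntaxNode::error, and SyntaxNode::end in the hunks above.

// Illustrative sketch only (hypothetical stand-in types, not the typst-syntax API).
// It mirrors the shape of `Lexer::next` + `emit_token` from the patch: lex one
// token, then convert it into a node, letting a recorded error take precedence.

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Kind {
    Word,
    Space,
    Error,
    End,
}

#[derive(Debug)]
enum Node {
    Leaf { kind: Kind, text: String },
    Error { message: String, text: String },
    End,
}

struct Lexer<'s> {
    source: &'s str,
    cursor: usize,
    error: Option<String>,
}

impl<'s> Lexer<'s> {
    fn new(source: &'s str) -> Self {
        Self { source, cursor: 0, error: None }
    }

    /// Advance past one token and return a node for it.
    fn next(&mut self) -> Node {
        let start = self.cursor;
        let kind = match self.source[self.cursor..].chars().next() {
            Some(c) if c.is_whitespace() => {
                self.eat_while(char::is_whitespace);
                Kind::Space
            }
            Some(c) if c.is_alphabetic() => {
                self.eat_while(char::is_alphabetic);
                Kind::Word
            }
            Some(c) => {
                // Consume the offending character and remember the error.
                self.cursor += c.len_utf8();
                self.error = Some(format!("unexpected character {c:?}"));
                Kind::Error
            }
            None => Kind::End,
        };
        self.emit_token(kind, start)
    }

    fn eat_while(&mut self, f: impl Fn(char) -> bool) {
        while let Some(c) = self.source[self.cursor..].chars().next().filter(|&c| f(c)) {
            self.cursor += c.len_utf8();
        }
    }

    /// Turn the finished token into a node: an error node if an error was
    /// recorded while lexing it, otherwise a leaf carrying its text.
    fn emit_token(&mut self, kind: Kind, start: usize) -> Node {
        let text = self.source[start..self.cursor].to_string();
        if kind == Kind::End {
            Node::End
        } else if let Some(message) = self.error.take() {
            Node::Error { message, text }
        } else {
            Node::Leaf { kind, text }
        }
    }
}

fn main() {
    let mut lexer = Lexer::new("hello world !");
    loop {
        let node = lexer.next();
        println!("{node:?}");
        if matches!(node, Node::End) {
            break;
        }
    }
}

Because the node already owns its text and any error, a parser built on such a lexer can push the returned node directly into its node list, which is what the save_and_lex change in parser.rs takes advantage of.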