diff --git a/Cargo.lock b/Cargo.lock index 4709fb5b4..5c148c117 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3018,6 +3018,7 @@ dependencies = [ "typst-pdf", "typst-render", "typst-svg", + "typst-syntax", "unscanny", "walkdir", ] diff --git a/crates/typst-syntax/README.md b/crates/typst-syntax/README.md new file mode 100644 index 000000000..ced4096ef --- /dev/null +++ b/crates/typst-syntax/README.md @@ -0,0 +1,40 @@ +# typst-syntax + +Welcome to the Typst Syntax crate! This crate manages the syntactical structure +of Typst by holding some core abstractions like assigning source file ids, +parsing Typst syntax, creating an Abstract Syntax Tree (AST), initializing +source "spans" (for linking AST elements to their outputs in a document), and +syntax highlighting. + +Below are quick descriptions of the files you might be editing if you find +yourself here :) + +- `lexer.rs`: The lexical foundation of the parser, which converts a string of + characters into tokens. +- `parser.rs`: The main parser definition, preparing a Concrete Syntax Tree made + of nested vectors of `SyntaxNode`s. +- `reparser.rs`: The algorithm for reparsing the minimal required amount of + source text for efficient incremental compilation. +- `ast.rs`: The conversion layer between the Concrete Syntax Tree of the parser + and the Abstract Syntax Tree used for code evaluation. +- `node.rs` & `span.rs`: The underlying data structure for the Concrete Syntax + Tree and the definitions of source spans used for efficiently pointing to a + syntax node in things like diagnostics. +- `kind.rs` & `set.rs`: An enum with all syntactical tokens and nodes and + bit-set data structure for sets of `SyntaxKind`s. +- `highlight.rs`: Extracting of syntax highlighting information out of the + Concrete Syntax Tree (and outputting as HTML). +- `path.rs`, `file.rs`, `package.rs`: The system for interning project and + package paths as unique file IDs and resolving them in a virtual filesystem + (not actually for _opening_ files). + +The structure of the parser is largely adapted from Rust Analyzer. Their +[documentation][ra] is a good reference for a number of the design decisions +around the parser and AST. + +The reparsing algorithm is explained in Section 4 of [Martin's thesis][thesis] +(though it changed a bit since). + +[ra]: https://github.com/rust-lang/rust-analyzer/blob/master/docs/dev/syntax.md +[thesis]: + https://www.researchgate.net/publication/364622490_Fast_Typesetting_with_Incremental_Compilation diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 721225c6e..1314016fa 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -4,20 +4,18 @@ use unicode_script::{Script, UnicodeScript}; use unicode_segmentation::UnicodeSegmentation; use unscanny::Scanner; -use crate::{SyntaxError, SyntaxKind}; +use crate::{SyntaxError, SyntaxKind, SyntaxNode}; -/// Splits up a string of source code into tokens. +/// An iterator over a source code string which returns tokens. #[derive(Clone)] pub(super) struct Lexer<'s> { - /// The underlying scanner. + /// The scanner: contains the underlying string and location as a "cursor". s: Scanner<'s>, /// The mode the lexer is in. This determines which kinds of tokens it /// produces. mode: LexMode, /// Whether the last token contained a newline. newline: bool, - /// The state held by raw line lexing. - raw: Vec<(SyntaxKind, usize)>, /// An error for the last token. 
error: Option, } @@ -31,8 +29,6 @@ pub(super) enum LexMode { Math, /// Keywords, literals and operators. Code, - /// The contents of a raw block. - Raw, } impl<'s> Lexer<'s> { @@ -44,7 +40,6 @@ impl<'s> Lexer<'s> { mode, newline: false, error: None, - raw: Vec::new(), } } @@ -74,9 +69,11 @@ impl<'s> Lexer<'s> { self.newline } - /// Take out the last error, if any. - pub fn take_error(&mut self) -> Option { - self.error.take() + /// The number of characters until the most recent newline from an index. + pub fn column(&self, index: usize) -> usize { + let mut s = self.s; // Make a new temporary scanner (cheap). + s.jump(index); + s.before().chars().rev().take_while(|&c| !is_newline(c)).count() } } @@ -97,21 +94,14 @@ impl Lexer<'_> { /// Shared methods with all [`LexMode`]. impl Lexer<'_> { - /// Proceed to the next token and return its [`SyntaxKind`]. Note the - /// token could be a [trivia](SyntaxKind::is_trivia). - pub fn next(&mut self) -> SyntaxKind { - if self.mode == LexMode::Raw { - let Some((kind, end)) = self.raw.pop() else { - return SyntaxKind::End; - }; - self.s.jump(end); - return kind; - } + /// Return the next token in our text. Returns both the [`SyntaxNode`] + /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind + pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) { + debug_assert!(self.error.is_none()); + let start = self.s.cursor(); self.newline = false; - self.error = None; - let start = self.s.cursor(); - match self.s.eat() { + let kind = match self.s.eat() { Some(c) if is_space(c, self.mode) => self.whitespace(start, c), Some('/') if self.s.eat_if('/') => self.line_comment(), Some('/') if self.s.eat_if('*') => self.block_comment(), @@ -123,22 +113,32 @@ impl Lexer<'_> { ); kind } - + Some('`') if self.mode != LexMode::Math => return self.raw(), Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), - LexMode::Math => self.math(start, c), + LexMode::Math => match self.math(start, c) { + (kind, None) => kind, + (kind, Some(node)) => return (kind, node), + }, LexMode::Code => self.code(start, c), - LexMode::Raw => unreachable!(), }, None => SyntaxKind::End, - } + }; + + let text = self.s.from(start); + let node = match self.error.take() { + Some(error) => SyntaxNode::error(error, text), + None => SyntaxNode::leaf(kind, text), + }; + (kind, node) } /// Eat whitespace characters greedily. fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind { let more = self.s.eat_while(|c| is_space(c, self.mode)); let newlines = match c { + // Optimize eating a single space. ' ' if more.is_empty() => 0, _ => count_newlines(self.s.from(start)), }; @@ -187,7 +187,6 @@ impl Lexer<'_> { fn markup(&mut self, start: usize, c: char) -> SyntaxKind { match c { '\\' => self.backslash(), - '`' => self.raw(), 'h' if self.s.eat_if("ttp://") => self.link(), 'h' if self.s.eat_if("ttps://") => self.link(), '<' if self.s.at(is_id_continue) => self.label(), @@ -252,9 +251,10 @@ impl Lexer<'_> { } } - fn raw(&mut self) -> SyntaxKind { + /// Lex an entire raw segment at once. This is a convenience to avoid going + /// to and from the parser for each raw section. + fn raw(&mut self) -> (SyntaxKind, SyntaxNode) { let start = self.s.cursor() - 1; - self.raw.clear(); // Determine number of opening backticks. let mut backticks = 1; @@ -264,9 +264,11 @@ impl Lexer<'_> { // Special case for ``. 
if backticks == 2 { - self.push_raw(SyntaxKind::RawDelim); - self.s.jump(start + 1); - return SyntaxKind::RawDelim; + let nodes = vec![ + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + ]; + return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)); } // Find end of raw text. @@ -275,43 +277,55 @@ impl Lexer<'_> { match self.s.eat() { Some('`') => found += 1, Some(_) => found = 0, - None => break, + None => { + let msg = SyntaxError::new("unclosed raw text"); + let error = SyntaxNode::error(msg, self.s.from(start)); + return (SyntaxKind::Error, error); + } } } - - if found != backticks { - return self.error("unclosed raw text"); - } - let end = self.s.cursor(); - if backticks >= 3 { - self.blocky_raw(start, end, backticks); - } else { - self.inline_raw(start, end, backticks); - } - // Closing delimiter. - self.push_raw(SyntaxKind::RawDelim); + let mut nodes = Vec::with_capacity(3); // Will have at least 3. - // The saved tokens will be removed in reverse. - self.raw.reverse(); + // A closure for pushing a node onto our raw vector. Assumes the caller + // will move the scanner to the next location at each step. + let mut prev_start = start; + let mut push_raw = |kind, s: &Scanner| { + nodes.push(SyntaxNode::leaf(kind, s.from(prev_start))); + prev_start = s.cursor(); + }; // Opening delimiter. self.s.jump(start + backticks); - SyntaxKind::RawDelim + push_raw(SyntaxKind::RawDelim, &self.s); + + if backticks >= 3 { + self.blocky_raw(end - backticks, &mut push_raw); + } else { + self.inline_raw(end - backticks, &mut push_raw); + } + + // Closing delimiter. + self.s.jump(end); + push_raw(SyntaxKind::RawDelim, &self.s); + + (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)) } - fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) { + fn blocky_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { // Language tag. - self.s.jump(start + backticks); if self.s.eat_if(is_id_start) { self.s.eat_while(is_id_continue); - self.push_raw(SyntaxKind::RawLang); + push_raw(SyntaxKind::RawLang, &self.s); } // Determine inner content between backticks. self.s.eat_if(' '); - let inner = self.s.to(end - backticks); + let inner = self.s.to(inner_end); // Determine dedent level. let mut lines = split_newlines(inner); @@ -357,41 +371,32 @@ impl Lexer<'_> { let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum(); self.s.eat_newline(); self.s.advance(offset); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); self.s.advance(line.len() - offset); - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); } // Add final trimmed. 
- if self.s.cursor() < end - backticks { - self.s.jump(end - backticks); - self.push_raw(SyntaxKind::RawTrimmed); + if self.s.cursor() < inner_end { + self.s.jump(inner_end); + push_raw(SyntaxKind::RawTrimmed, &self.s); } - self.s.jump(end); } - fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) { - self.s.jump(start + backticks); - - while self.s.cursor() < end - backticks { + fn inline_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { + while self.s.cursor() < inner_end { if self.s.at(is_newline) { - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); self.s.eat_newline(); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); continue; } self.s.eat(); } - self.push_raw(SyntaxKind::Text); - - self.s.jump(end); - } - - /// Push the current cursor that marks the end of a raw segment of - /// the given `kind`. - fn push_raw(&mut self, kind: SyntaxKind) { - let end = self.s.cursor(); - self.raw.push((kind, end)); + push_raw(SyntaxKind::Text, &self.s); } fn link(&mut self) -> SyntaxKind { @@ -512,8 +517,8 @@ impl Lexer<'_> { /// Math. impl Lexer<'_> { - fn math(&mut self, start: usize, c: char) -> SyntaxKind { - match c { + fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option) { + let kind = match c { '\\' => self.backslash(), '"' => self.string(), @@ -566,11 +571,41 @@ impl Lexer<'_> { // Identifiers. c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { self.s.eat_while(is_math_id_continue); - SyntaxKind::MathIdent + let (kind, node) = self.math_ident_or_field(start); + return (kind, Some(node)); } // Other math atoms. _ => self.math_text(start, c), + }; + (kind, None) + } + + /// Parse a single `MathIdent` or an entire `FieldAccess`. + fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) { + let mut kind = SyntaxKind::MathIdent; + let mut node = SyntaxNode::leaf(kind, self.s.from(start)); + while let Some(ident) = self.maybe_dot_ident() { + kind = SyntaxKind::FieldAccess; + let field_children = vec![ + node, + SyntaxNode::leaf(SyntaxKind::Dot, '.'), + SyntaxNode::leaf(SyntaxKind::Ident, ident), + ]; + node = SyntaxNode::inner(kind, field_children); + } + (kind, node) + } + + /// If at a dot and a math identifier, eat and return the identifier. + fn maybe_dot_ident(&mut self) -> Option<&str> { + if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') { + let ident_start = self.s.cursor(); + self.s.eat(); + self.s.eat_while(is_math_id_continue); + Some(self.s.from(ident_start)) + } else { + None } } @@ -599,7 +634,6 @@ impl Lexer<'_> { impl Lexer<'_> { fn code(&mut self, start: usize, c: char) -> SyntaxKind { match c { - '`' => self.raw(), '<' if self.s.at(is_id_continue) => self.label(), '0'..='9' => self.number(start, c), '.' if self.s.at(char::is_ascii_digit) => self.number(start, c), diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index a8bec6268..ea5b9155d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -6,66 +6,57 @@ use ecow::{eco_format, EcoString}; use unicode_math_class::MathClass; use crate::set::{syntax_set, SyntaxSet}; -use crate::{ - ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode, -}; +use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; -/// Parses a source file. +/// Parses a source file as top-level markup. 
pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup(&mut p, true, 0, |_| false); - p.finish().into_iter().next().unwrap() + markup_exprs(&mut p, true, syntax_set!(End)); + p.finish_into(SyntaxKind::Markup) } /// Parses top-level code. pub fn parse_code(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse code"); let mut p = Parser::new(text, 0, LexMode::Code); - let m = p.marker(); - p.skip(); - code_exprs(&mut p, |_| false); - p.wrap_all(m, SyntaxKind::Code); - p.finish().into_iter().next().unwrap() + code_exprs(&mut p, syntax_set!(End)); + p.finish_into(SyntaxKind::Code) } /// Parses top-level math. pub fn parse_math(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse math"); let mut p = Parser::new(text, 0, LexMode::Math); - math(&mut p, |_| false); - p.finish().into_iter().next().unwrap() + math_exprs(&mut p, syntax_set!(End)); + p.finish_into(SyntaxKind::Math) } -/// Parses the contents of a file or content block. -fn markup( - p: &mut Parser, - mut at_start: bool, - min_indent: usize, - mut stop: impl FnMut(&Parser) -> bool, -) { - let m = p.marker(); +/// Parses markup expressions until a stop condition is met. +fn markup(p: &mut Parser, at_start: bool, wrap_trivia: bool, stop_set: SyntaxSet) { + let m = if wrap_trivia { p.before_trivia() } else { p.marker() }; + markup_exprs(p, at_start, stop_set); + if wrap_trivia { + p.flush_trivia(); + } + p.wrap(m, SyntaxKind::Markup); +} + +/// Parses a sequence of markup expressions. +fn markup_exprs(p: &mut Parser, mut at_start: bool, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + at_start |= p.had_newline(); let mut nesting: usize = 0; - while !p.end() { + loop { match p.current() { SyntaxKind::LeftBracket => nesting += 1, SyntaxKind::RightBracket if nesting > 0 => nesting -= 1, - _ if stop(p) => break, + _ if p.at_set(stop_set) => break, _ => {} } - - if p.newline() { - at_start = true; - if min_indent > 0 && p.column(p.current_end()) < min_indent { - break; - } - p.eat(); - continue; - } - - markup_expr(p, &mut at_start); + markup_expr(p, at_start); + at_start = p.had_newline(); } - p.wrap(m, SyntaxKind::Markup); } /// Reparses a subsection of markup incrementally. @@ -74,40 +65,29 @@ pub(super) fn reparse_markup( range: Range, at_start: &mut bool, nesting: &mut usize, - mut stop: impl FnMut(SyntaxKind) -> bool, + top_level: bool, ) -> Option> { let mut p = Parser::new(text, range.start, LexMode::Markup); - while !p.end() && p.current_start() < range.end { + *at_start |= p.had_newline(); + while p.current_start() < range.end { match p.current() { SyntaxKind::LeftBracket => *nesting += 1, SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1, - _ if stop(p.current()) => break, + SyntaxKind::RightBracket if !top_level => break, + SyntaxKind::End => break, _ => {} } - - if p.newline() { - *at_start = true; - p.eat(); - continue; - } - - markup_expr(&mut p, at_start); + markup_expr(&mut p, *at_start); + *at_start = p.had_newline(); } (p.balanced && p.current_start() == range.end).then(|| p.finish()) } -/// Parses a single markup expression: This includes markup elements like -/// spaces, text, and headings, and embedded code expressions. -fn markup_expr(p: &mut Parser, at_start: &mut bool) { +/// Parses a single markup expression. This includes markup elements like text, +/// headings, strong/emph, lists/enums, etc. 
This is also the entry point for +/// parsing math equations and embedded code expressions. +fn markup_expr(p: &mut Parser, at_start: bool) { match p.current() { - SyntaxKind::Space - | SyntaxKind::Parbreak - | SyntaxKind::LineComment - | SyntaxKind::BlockComment => { - p.eat(); - return; - } - SyntaxKind::Text | SyntaxKind::Linebreak | SyntaxKind::Escape @@ -116,14 +96,15 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::Link | SyntaxKind::Label => p.eat(), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. + SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), - SyntaxKind::RawDelim => raw(p), - SyntaxKind::HeadingMarker if *at_start => heading(p), - SyntaxKind::ListMarker if *at_start => list_item(p), - SyntaxKind::EnumMarker if *at_start => enum_item(p), - SyntaxKind::TermMarker if *at_start => term_item(p), + SyntaxKind::HeadingMarker if at_start => heading(p), + SyntaxKind::ListMarker if at_start => list_item(p), + SyntaxKind::EnumMarker if at_start => enum_item(p), + SyntaxKind::TermMarker if at_start => term_item(p), SyntaxKind::RefMarker => reference(p), SyntaxKind::Dollar => equation(p), @@ -133,94 +114,76 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::ListMarker | SyntaxKind::EnumMarker | SyntaxKind::TermMarker - | SyntaxKind::Colon => p.convert(SyntaxKind::Text), + | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text), - _ => { - p.unexpected(); - return; // Don't set `at_start` - } + _ => p.unexpected(), } - - *at_start = false; } /// Parses strong content: `*Strong*`. fn strong(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Star); - markup(p, false, 0, |p| p.at_set(syntax_set!(Star, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Star); - p.wrap(m, SyntaxKind::Strong); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Star); + markup(p, false, true, syntax_set!(Star, RightBracket, End)); + p.expect_closing_delimiter(m, SyntaxKind::Star); + p.wrap(m, SyntaxKind::Strong); + }); } /// Parses emphasized content: `_Emphasized_`. fn emph(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Underscore); - markup(p, false, 0, |p| p.at_set(syntax_set!(Underscore, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Underscore); - p.wrap(m, SyntaxKind::Emph); -} - -/// Parses raw text with optional syntax highlighting: `` `...` ``. -fn raw(p: &mut Parser) { - let m = p.marker(); - p.enter(LexMode::Raw); - p.assert(SyntaxKind::RawDelim); - - // Eats until the closing delimiter. - while !p.end() && !p.at(SyntaxKind::RawDelim) { - p.eat(); - } - - p.expect(SyntaxKind::RawDelim); - p.exit(); - p.wrap(m, SyntaxKind::Raw); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Underscore); + markup(p, false, true, syntax_set!(Underscore, RightBracket, End)); + p.expect_closing_delimiter(m, SyntaxKind::Underscore); + p.wrap(m, SyntaxKind::Emph); + }); } /// Parses a section heading: `= Introduction`. 
fn heading(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::HeadingMarker); - whitespace_line(p); - markup(p, false, usize::MAX, |p| { - p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label) + p.with_nl_mode(AtNewline::Stop, |p| { + let m = p.marker(); + p.assert(SyntaxKind::HeadingMarker); + markup(p, false, false, syntax_set!(Label, RightBracket, End)); + p.wrap(m, SyntaxKind::Heading); }); - p.wrap(m, SyntaxKind::Heading); } /// Parses an item in a bullet list: `- ...`. fn list_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::ListMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::ListItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::ListMarker); + markup(p, false, false, syntax_set!(RightBracket, End)); + p.wrap(m, SyntaxKind::ListItem); + }); } /// Parses an item in an enumeration (numbered list): `+ ...` or `1. ...`. fn enum_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::EnumMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::EnumItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::EnumMarker); + markup(p, false, false, syntax_set!(RightBracket, End)); + p.wrap(m, SyntaxKind::EnumItem); + }); } /// Parses an item in a term list: `/ Term: Details`. fn term_item(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::TermMarker); - let min_indent = p.column(p.prev_end()); - whitespace_line(p); - markup(p, false, usize::MAX, |p| p.at_set(syntax_set!(Colon, RightBracket))); - p.expect(SyntaxKind::Colon); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::TermItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.with_nl_mode(AtNewline::Stop, |p| { + p.assert(SyntaxKind::TermMarker); + markup(p, false, false, syntax_set!(Colon, RightBracket, End)); + }); + p.expect(SyntaxKind::Colon); + markup(p, false, false, syntax_set!(RightBracket, End)); + p.wrap(m, SyntaxKind::TermItem); + }); } /// Parses a reference: `@target`, `@target[..]`. @@ -233,35 +196,34 @@ fn reference(p: &mut Parser) { p.wrap(m, SyntaxKind::Ref); } -/// Consumes whitespace that does not contain a newline. -fn whitespace_line(p: &mut Parser) { - while !p.newline() && p.current().is_trivia() { - p.eat(); - } -} - /// Parses a mathematical equation: `$x$`, `$ x^2 $`. fn equation(p: &mut Parser) { let m = p.marker(); - p.enter(LexMode::Math); - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); - p.exit(); + p.enter_modes(LexMode::Math, AtNewline::Continue, |p| { + p.assert(SyntaxKind::Dollar); + math(p, syntax_set!(Dollar, RightBracket, End)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + }); p.wrap(m, SyntaxKind::Equation); } /// Parses the contents of a mathematical equation: `x^2 + 1`. 
-fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { +fn math(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - while !p.end() && !stop(p) { + math_exprs(p, stop_set); + p.wrap(m, SyntaxKind::Math); +} + +/// Parses a sequence of math expressions. +fn math_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { if p.at_set(set::MATH_EXPR) { math_expr(p); } else { p.unexpected(); } } - p.wrap(m, SyntaxKind::Math); } /// Parses a single math expression: This includes math elements like @@ -276,21 +238,11 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { let mut continuable = false; match p.current() { SyntaxKind::Hash => embedded_code_expr(p), - SyntaxKind::MathIdent => { + // The lexer manages creating full FieldAccess nodes if needed. + SyntaxKind::MathIdent | SyntaxKind::FieldAccess => { continuable = true; p.eat(); - while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { - let mut copy = p.lexer.clone(); - let start = copy.cursor(); - let next = copy.next(); - let end = copy.cursor(); - matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) - && is_ident(&p.text[start..end]) - } { - p.convert(SyntaxKind::Dot); - p.convert(SyntaxKind::Ident); - p.wrap(m, SyntaxKind::FieldAccess); - } + // Parse a function call for an identifier or field access. if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" { math_args(p); @@ -340,11 +292,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { _ => p.expected("expression"), } - if continuable - && min_prec < 3 - && p.prev_end() == p.current_start() - && maybe_delimited(p) - { + if continuable && min_prec < 3 && !p.had_trivia() && maybe_delimited(p) { p.wrap(m, SyntaxKind::Math); } @@ -414,6 +362,23 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { } } +/// Precedence and wrapper kinds for the binary math operators. +fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { + match kind { + SyntaxKind::Underscore => { + Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) + } + SyntaxKind::Hat => { + Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) + } + SyntaxKind::Slash => { + Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) + } + _ => None, + } +} + +/// Try to parse delimiters based on the current token's unicode math class. fn maybe_delimited(p: &mut Parser) -> bool { let open = math_class(p.current_text()) == Some(MathClass::Opening); if open { @@ -422,11 +387,12 @@ fn maybe_delimited(p: &mut Parser) -> bool { open } +/// Parse matched delimiters in math: `[x + y]`. fn math_delimited(p: &mut Parser) { let m = p.marker(); p.eat(); let m2 = p.marker(); - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if math_class(p.current_text()) == Some(MathClass::Closing) { p.wrap(m2, SyntaxKind::Math); p.eat(); @@ -444,6 +410,8 @@ fn math_delimited(p: &mut Parser) { p.wrap(m, SyntaxKind::Math); } +/// Remove one set of parentheses (if any) from a previously parsed expression +/// by converting to non-expression SyntaxKinds. fn math_unparen(p: &mut Parser, m: Marker) { let Some(node) = p.nodes.get_mut(m.0) else { return }; if node.kind() != SyntaxKind::MathDelimited { @@ -460,6 +428,10 @@ fn math_unparen(p: &mut Parser, m: Marker) { node.convert_to_kind(SyntaxKind::Math); } +/// The unicode math class of a string. 
Only returns `Some` if `text` has +/// exactly one unicode character or is a math shorthand string (currently just +/// `[|`, `||`, `|]`) and then only returns `Some` if there is a math class +/// defined for that character. fn math_class(text: &str) -> Option { match text { "[|" => return Some(MathClass::Opening), @@ -475,38 +447,26 @@ fn math_class(text: &str) -> Option { .and_then(unicode_math_class::class) } -fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { - match kind { - SyntaxKind::Underscore => { - Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) - } - SyntaxKind::Hat => { - Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) - } - SyntaxKind::Slash => { - Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) - } - _ => None, - } -} - +/// Parse an argument list in math: `(a, b; c, d; size: #50%)`. fn math_args(p: &mut Parser) { let m = p.marker(); - p.convert(SyntaxKind::LeftParen); + p.convert_and_eat(SyntaxKind::LeftParen); let mut namable = true; let mut named = None; let mut has_arrays = false; let mut array = p.marker(); let mut arg = p.marker(); + // The number of math expressions per argument. + let mut count = 0; - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if namable && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) && p.text[p.current_end()..].starts_with(':') { - p.convert(SyntaxKind::Ident); - p.convert(SyntaxKind::Colon); + p.convert_and_eat(SyntaxKind::Ident); + p.convert_and_eat(SyntaxKind::Colon); named = Some(arg); arg = p.marker(); array = p.marker(); @@ -515,20 +475,22 @@ fn math_args(p: &mut Parser) { match p.current_text() { ")" => break, ";" => { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); p.wrap(array, SyntaxKind::Array); - p.convert(SyntaxKind::Semicolon); + p.convert_and_eat(SyntaxKind::Semicolon); array = p.marker(); arg = p.marker(); + count = 0; namable = true; named = None; has_arrays = true; continue; } "," => { - maybe_wrap_in_math(p, arg, named); - p.convert(SyntaxKind::Comma); + maybe_wrap_in_math(p, arg, count, named); + p.convert_and_eat(SyntaxKind::Comma); arg = p.marker(); + count = 0; namable = true; if named.is_some() { array = p.marker(); @@ -541,6 +503,7 @@ fn math_args(p: &mut Parser) { if p.at_set(set::MATH_EXPR) { math_expr(p); + count += 1; } else { p.unexpected(); } @@ -549,7 +512,7 @@ fn math_args(p: &mut Parser) { } if arg != p.marker() { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); if named.is_some() { array = p.marker(); } @@ -560,7 +523,7 @@ fn math_args(p: &mut Parser) { } if p.at(SyntaxKind::Text) && p.current_text() == ")" { - p.convert(SyntaxKind::RightParen); + p.convert_and_eat(SyntaxKind::RightParen); } else { p.expected("closing paren"); p.balanced = false; @@ -569,23 +532,26 @@ fn math_args(p: &mut Parser) { p.wrap(m, SyntaxKind::Args); } -/// Wrap math function arguments in a "Math" SyntaxKind to combine adjacent expressions -/// or create blank content. +/// Wrap math function arguments to join adjacent math content or create an +/// empty 'Math' node for when we have 0 args. /// -/// We don't wrap when `exprs == 1`, as there is only one expression, so the grouping -/// isn't needed, and this would change the type of the expression from potentially -/// non-content to content. -/// -/// Note that `exprs` might be 0 if we have whitespace or trivia before a comma i.e. 
-/// `mat(; ,)` or `sin(x, , , ,)`. This would create an empty Math element before that -/// trivia if we called `p.wrap()` -- breaking the expected AST for 2-d arguments -- so -/// we instead manually wrap to our current marker using `p.wrap_within()`. -fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { - let exprs = p.post_process(arg).filter(|node| node.is::()).count(); - if exprs != 1 { - // Convert 0 exprs into a blank math element (so empty arguments are allowed). - // Convert 2+ exprs into a math element (so they become a joined sequence). - p.wrap_within(arg, p.marker(), SyntaxKind::Math); +/// We don't wrap when `count == 1`, since wrapping would change the type of the +/// expression from potentially non-content to content. Ex: `$ func(#12pt) $` +/// would change the type from size to content if wrapped. +fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option) { + if count == 0 { + // Flush trivia so that the new empty Math node will be wrapped _inside_ + // any `SyntaxKind::Array` elements created in `math_args`. + // (And if we don't follow by wrapping in an array, it has no effect.) + // The difference in node layout without this would look like: + // Expression: `$ mat( ;) $` + // - Correct: [ .., Space(" "), Array[Math[], ], Semicolon(";"), .. ] + // - Incorrect: [ .., Math[], Array[], Space(" "), Semicolon(";"), .. ] + p.flush_trivia(); + } + + if count != 1 { + p.wrap(arg, SyntaxKind::Math); } if let Some(m) = named { @@ -594,34 +560,58 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { } /// Parses the contents of a code block. -fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { +fn code(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - code_exprs(p, stop); + code_exprs(p, stop_set); p.wrap(m, SyntaxKind::Code); } /// Parses a sequence of code expressions. -fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { - while !p.end() && !stop(p) { - p.enter_newline_mode(NewlineMode::Contextual); - - let at_expr = p.at_set(set::CODE_EXPR); - if at_expr { +fn code_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { + p.with_nl_mode(AtNewline::ContextualContinue, |p| { + if !p.at_set(set::CODE_EXPR) { + p.unexpected(); + return; + } code_expr(p); - if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { + if !p.at_set(stop_set) && !p.eat_if(SyntaxKind::Semicolon) { p.expected("semicolon or line break"); if p.at(SyntaxKind::Label) { p.hint("labels can only be applied in markup mode"); p.hint("try wrapping your code in a markup block (`[ ]`)"); } } + }); + } +} + +/// Parses an atomic code expression embedded in markup or math. +fn embedded_code_expr(p: &mut Parser) { + p.enter_modes(LexMode::Code, AtNewline::Stop, |p| { + p.assert(SyntaxKind::Hash); + if p.had_trivia() || p.end() { + p.expected("expression"); + return; } - p.exit_newline_mode(); - if !at_expr && !p.end() { + let stmt = p.at_set(set::STMT); + let at = p.at_set(set::ATOMIC_CODE_EXPR); + code_expr_prec(p, true, 0); + + // Consume error for things like `#12p` or `#"abc\"`.# + if !at { p.unexpected(); } - } + + let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) + && p.eat_if(SyntaxKind::Semicolon); + + if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } + }); } /// Parses a single code expression. 
@@ -629,33 +619,6 @@ fn code_expr(p: &mut Parser) { code_expr_prec(p, false, 0) } -/// Parses a code expression embedded in markup or math. -fn embedded_code_expr(p: &mut Parser) { - p.enter_newline_mode(NewlineMode::Stop); - p.enter(LexMode::Code); - p.assert(SyntaxKind::Hash); - p.unskip(); - - let stmt = p.at_set(set::STMT); - let at = p.at_set(set::ATOMIC_CODE_EXPR); - code_expr_prec(p, true, 0); - - // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.current().is_trivia() && !p.end() { - p.unexpected(); - } - - let semi = - (stmt || p.directly_at(SyntaxKind::Semicolon)) && p.eat_if(SyntaxKind::Semicolon); - - if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { - p.expected("semicolon or line break"); - } - - p.exit(); - p.exit_newline_mode(); -} - /// Parses a code expression with at least the given precedence. fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { let m = p.marker(); @@ -676,8 +639,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { continue; } - let at_field_or_method = - p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident; + let at_field_or_method = p.directly_at(SyntaxKind::Dot) + && p.lexer.clone().next().0 == SyntaxKind::Ident; if atomic && !at_field_or_method { break; @@ -757,7 +720,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::LeftBrace => code_block(p), SyntaxKind::LeftBracket => content_block(p), SyntaxKind::LeftParen => expr_with_paren(p, atomic), - SyntaxKind::RawDelim => raw(p), SyntaxKind::Dollar => equation(p), SyntaxKind::Let => let_binding(p), SyntaxKind::Set => set_rule(p), @@ -772,6 +734,8 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::Continue => continue_stmt(p), SyntaxKind::Return => return_stmt(p), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. + SyntaxKind::None | SyntaxKind::Auto | SyntaxKind::Int @@ -785,15 +749,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { } } -/// Parses a content or code block. -fn block(p: &mut Parser) { - match p.current() { - SyntaxKind::LeftBracket => content_block(p), - SyntaxKind::LeftBrace => code_block(p), - _ => p.expected("block"), - } -} - /// Reparses a full content or code block. pub(super) fn reparse_block(text: &str, range: Range) -> Option { let mut p = Parser::new(text, range.start, LexMode::Code); @@ -803,27 +758,34 @@ pub(super) fn reparse_block(text: &str, range: Range) -> Option content_block(p), + SyntaxKind::LeftBrace => code_block(p), + _ => p.expected("block"), + } +} + /// Parses a code block: `{ let x = 1; x + 2 }`. fn code_block(p: &mut Parser) { let m = p.marker(); - p.enter(LexMode::Code); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftBrace); - code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); - p.expect_closing_delimiter(m, SyntaxKind::RightBrace); - p.exit(); - p.exit_newline_mode(); + p.enter_modes(LexMode::Code, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBrace); + code(p, syntax_set!(RightBrace, RightBracket, RightParen, End)); + p.expect_closing_delimiter(m, SyntaxKind::RightBrace); + }); p.wrap(m, SyntaxKind::CodeBlock); } /// Parses a content block: `[*Hi* there!]`. 
fn content_block(p: &mut Parser) { let m = p.marker(); - p.enter(LexMode::Markup); - p.assert(SyntaxKind::LeftBracket); - markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket)); - p.expect_closing_delimiter(m, SyntaxKind::RightBracket); - p.exit(); + p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBracket); + markup(p, true, true, syntax_set!(RightBracket, End)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + }); p.wrap(m, SyntaxKind::ContentBlock); } @@ -937,9 +899,8 @@ fn for_loop(p: &mut Parser) { let mut seen = HashSet::new(); pattern(p, false, &mut seen, None); - let m2 = p.marker(); - if p.eat_if(SyntaxKind::Comma) { - let node = &mut p[m2]; + if p.at(SyntaxKind::Comma) { + let node = p.eat_and_get(); node.unexpected(); node.hint("destructuring patterns must be wrapped in parentheses"); if p.at_set(set::PATTERN) { @@ -967,14 +928,14 @@ fn module_import(p: &mut Parser) { if p.eat_if(SyntaxKind::Colon) { if p.at(SyntaxKind::LeftParen) { - let m1 = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + let m2 = p.marker(); + p.assert(SyntaxKind::LeftParen); - import_items(p); + import_items(p); - p.expect_closing_delimiter(m1, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m2, SyntaxKind::RightParen); + }); } else if !p.eat_if(SyntaxKind::Star) { import_items(p); } @@ -1047,29 +1008,25 @@ fn return_stmt(p: &mut Parser) { /// An expression that starts with a parenthesis. fn expr_with_paren(p: &mut Parser, atomic: bool) { - // If we've seen this position before and have a memoized result, just use - // it. See below for more explanation about this memoization. - let start = p.current_start(); - if let Some((range, end_point)) = p.memo.get(&start).cloned() { - // Restore the end point first, so that it doesn't truncate our freshly - // pushed nodes. If the current length of `p.nodes` doesn't match what - // we had in the memoized run, this might otherwise happen. - p.restore(end_point); - p.nodes.extend(p.memo_arena[range].iter().cloned()); + if atomic { + // Atomic expressions aren't modified by operators that follow them, so + // our first guess of array/dict will be correct. + parenthesized_or_array_or_dict(p); return; } - let m = p.marker(); - let checkpoint = p.checkpoint(); + // If we've seen this position before and have a memoized result, restore it + // and return. Otherwise, get a key to this position and a checkpoint to + // restart from in case we make a wrong prediction. + let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return }; + // The node length from when we restored. + let prev_len = checkpoint.node_len; // When we reach a '(', we can't be sure what it is. First, we attempt to // parse as a simple parenthesized expression, array, or dictionary as // these are the most likely things. We can handle all of those in a single // pass. let kind = parenthesized_or_array_or_dict(p); - if atomic { - return; - } // If, however, '=>' or '=' follows, we must backtrack and reparse as either // a parameter list or a destructuring. To be able to do that, we created a @@ -1090,6 +1047,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { // case running time of O(2n). 
if p.at(SyntaxKind::Arrow) { p.restore(checkpoint); + let m = p.marker(); params(p); if !p.expect(SyntaxKind::Arrow) { return; @@ -1098,6 +1056,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { p.wrap(m, SyntaxKind::Closure); } else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized { p.restore(checkpoint); + let m = p.marker(); destructuring_or_parenthesized(p, true, &mut HashSet::new()); if !p.expect(SyntaxKind::Eq) { return; @@ -1109,9 +1068,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { } // Memoize result if we backtracked. - let offset = p.memo_arena.len(); - p.memo_arena.extend(p.nodes[m.0..].iter().cloned()); - p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint())); + p.memoize_parsed_nodes(memo_key, prev_len); } /// Parses either @@ -1119,10 +1076,6 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { /// - an array: `(1, "hi", 12cm)`, or /// - a dictionary: `(thickness: 3pt, pattern: dashed)`. fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind { - let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); - let mut state = GroupState { count: 0, maybe_just_parens: true, @@ -1130,27 +1083,44 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind { seen: HashSet::new(), }; - if p.eat_if(SyntaxKind::Colon) { - state.kind = Some(SyntaxKind::Dict); - state.maybe_just_parens = false; - } - - while !p.current().is_terminator() { - if !p.at_set(set::ARRAY_OR_DICT_ITEM) { - p.unexpected(); - continue; + // An edge case with parens is whether we can interpret a leading spread + // expression as a dictionary, e.g. if we want `(..dict1, ..dict2)` to join + // the two dicts. + // + // The issue is that we decide on the type of the parenthesized expression + // here in the parser by the `SyntaxKind` we wrap with, instead of in eval + // based on the type of the spread item. + // + // The current fix is that we allow a leading colon to force the + // parenthesized value into a dict: + // - `(..arr1, ..arr2)` is wrapped as an `Array`. + // - `(: ..dict1, ..dict2)` is wrapped as a `Dict`. + // + // This does allow some unexpected expressions, such as `(: key: val)`, but + // it's currently intentional. + let m = p.marker(); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); + if p.eat_if(SyntaxKind::Colon) { + state.kind = Some(SyntaxKind::Dict); } - array_or_dict_item(p, &mut state); - state.count += 1; + while !p.current().is_terminator() { + if !p.at_set(set::ARRAY_OR_DICT_ITEM) { + p.unexpected(); + continue; + } - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - state.maybe_just_parens = false; + array_or_dict_item(p, &mut state); + state.count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + state.maybe_just_parens = false; + } } - } - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); let kind = if state.maybe_just_parens && state.count == 1 { SyntaxKind::Parenthesized @@ -1165,8 +1135,13 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind { /// State for array/dictionary parsing. struct GroupState { count: usize, + /// Whether this is just a single expression in parens: `(a)`. Single + /// element arrays require an explicit comma: `(a,)`, unless we're + /// spreading: `(..a)`. maybe_just_parens: bool, + /// The `SyntaxKind` to wrap as (if we've figured it out yet). 
kind: Option, + /// Store named arguments so we can give an error if they're repeated. seen: HashSet, } @@ -1234,25 +1209,25 @@ fn args(p: &mut Parser) { let m = p.marker(); if p.at(SyntaxKind::LeftParen) { let m2 = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - while !p.current().is_terminator() { - if !p.at_set(set::ARG) { - p.unexpected(); - continue; + let mut seen = HashSet::new(); + while !p.current().is_terminator() { + if !p.at_set(set::ARG) { + p.unexpected(); + continue; + } + + arg(p, &mut seen); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - arg(p, &mut seen); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m2, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m2, SyntaxKind::RightParen); + }); } while p.directly_at(SyntaxKind::LeftBracket) { @@ -1297,27 +1272,27 @@ fn arg<'s>(p: &mut Parser<'s>, seen: &mut HashSet<&'s str>) { /// Parses a closure's parameters: `(x, y)`. fn params(p: &mut Parser) { let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - let mut sink = false; + let mut seen = HashSet::new(); + let mut sink = false; - while !p.current().is_terminator() { - if !p.at_set(set::PARAM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::PARAM) { + p.unexpected(); + continue; + } + + param(p, &mut seen, &mut sink); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - param(p, &mut seen, &mut sink); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); p.wrap(m, SyntaxKind::Params); } @@ -1378,25 +1353,25 @@ fn destructuring_or_parenthesized<'s>( let mut maybe_just_parens = true; let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - while !p.current().is_terminator() { - if !p.at_set(set::DESTRUCTURING_ITEM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::DESTRUCTURING_ITEM) { + p.unexpected(); + continue; + } + + destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); + count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + maybe_just_parens = false; + } } - destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); - count += 1; - - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - maybe_just_parens = false; - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); if maybe_just_parens && count == 1 && !sink { p.wrap(m, SyntaxKind::Parenthesized); @@ -1429,6 +1404,9 @@ fn destructuring_item<'s>( // Parse a normal positional pattern or a destructuring key. let was_at_pat = p.at_set(set::PATTERN); + + // We must use a full checkpoint here (can't just clone the lexer) because + // there may be trivia between the identifier and the colon we need to skip. 
let checkpoint = p.checkpoint(); if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) { p.restore(checkpoint); @@ -1487,122 +1465,274 @@ fn pattern_leaf<'s>( } } -/// Manages parsing of a stream of tokens. +/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s. +/// +/// The implementation presents an interface that investigates a current `token` +/// with a [`SyntaxKind`] and can take one of the following actions: +/// +/// 1. Eat a token: push `token` onto the `nodes` vector as a [leaf +/// node](`SyntaxNode::leaf`) and prepare a new `token` by calling into the +/// lexer. +/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `token` and any +/// attached trivia) into an [inner node](`SyntaxNode::inner`) of a specific +/// `SyntaxKind`. +/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when +/// something expected is missing or something unexpected is found. +/// +/// Overall the parser produces a nested tree of SyntaxNodes as a "_Concrete_ +/// Syntax Tree." The raw Concrete Syntax Tree should contain the entire source +/// text, and is used as-is for e.g. syntax highlighting and IDE features. In +/// `ast.rs` the CST is interpreted as a lazy view over an "_Abstract_ Syntax +/// Tree." The AST module skips over irrelevant tokens -- whitespace, comments, +/// code parens, commas in function args, etc. -- as it iterates through the +/// tree. +/// +/// ### Modes +/// +/// The parser manages the transitions between the three modes of Typst through +/// [lexer modes](`LexMode`) and [newline modes](`AtNewline`). +/// +/// The lexer modes map to the three Typst modes and are stored in the lexer, +/// changing which`SyntaxKind`s it will generate. +/// +/// The newline mode is used to determine whether a newline should end the +/// current expression. If so, the parser temporarily changes `token`'s kind to +/// a fake [`SyntaxKind::End`]. When the parser exits the mode the original +/// `SyntaxKind` is restored. struct Parser<'s> { + /// The source text shared with the lexer. text: &'s str, + /// A lexer over the source text with multiple modes. Defines the boundaries + /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`] + /// defining our current Typst mode. lexer: Lexer<'s>, - prev_end: usize, - current_start: usize, - current: SyntaxKind, + /// The newline mode: whether to insert a temporary end at newlines. + nl_mode: AtNewline, + /// The current token under inspection, not yet present in `nodes`. This + /// acts like a single item of lookahead for the parser. + /// + /// When wrapping, this is _not_ included in the wrapped nodes. + token: Token, + /// Whether the parser has the expected set of open/close delimiters. This + /// only ever transitions from `true` to `false`. balanced: bool, + /// Nodes representing the concrete syntax tree of previously parsed text. + /// In Code and Math, includes previously parsed trivia, but not `token`. nodes: Vec, - modes: Vec, - newline_modes: Vec, - memo: HashMap, Checkpoint<'s>)>, - memo_arena: Vec, + /// Parser checkpoints for a given text index. Used for efficient parser + /// backtracking similar to packrat parsing. See comments above in + /// [`expr_with_paren`]. + memo: MemoArena, } -/// How to proceed with parsing when seeing a newline. -#[derive(Clone)] -enum NewlineMode { - /// Stop always. - Stop, - /// Proceed if there is no continuation with `else` or `.` - Contextual, - /// Just proceed like with normal whitespace. 
+/// A single token returned from the lexer with a cached [`SyntaxKind`] and a +/// record of preceding trivia. +#[derive(Debug, Clone)] +struct Token { + /// The [`SyntaxKind`] of the current token. + kind: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + node: SyntaxNode, + /// The number of preceding trivia before this token. + n_trivia: usize, + /// Whether this token's preceding trivia contained a newline. + newline: Option, + /// The index into `text` of the start of our current token (the end is + /// stored as the lexer's cursor). + start: usize, + /// The index into `text` of the end of the previous token. + prev_end: usize, +} + +/// Information about a newline if present (currently only relevant in Markup). +#[derive(Debug, Clone, Copy)] +struct Newline { + /// The column of the start of our token in its line. + column: Option, + /// Whether any of our newlines were paragraph breaks. + parbreak: bool, +} + +/// How to proceed with parsing when at a newline. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AtNewline { + /// Continue at newlines. Continue, + /// Stop at any newline. + Stop, + /// Continue only if there is no continuation with `else` or `.` (Code only). + ContextualContinue, + /// Stop only at a parbreak, not normal newlines (Markup only). + StopParBreak, + /// Require that the token's column be greater or equal to a column (Markup + /// only). If this is `0`, acts like `Continue`; if this is `usize::MAX`, + /// acts like `Stop`. + RequireColumn(usize), } +impl AtNewline { + /// Whether to stop at a newline or continue based on the current context. + fn stop_at(self, Newline { column, parbreak }: Newline, kind: SyntaxKind) -> bool { + #[allow(clippy::match_like_matches_macro)] + match self { + AtNewline::Continue => false, + AtNewline::Stop => true, + AtNewline::ContextualContinue => match kind { + SyntaxKind::Else | SyntaxKind::Dot => false, + _ => true, + }, + AtNewline::StopParBreak => parbreak, + AtNewline::RequireColumn(min_col) => match column { + Some(column) => column <= min_col, + None => false, // Don't stop if we had no column. + }, + } + } +} + +/// A marker representing a node's position in the parser. Mainly used for +/// wrapping, but can also index into the parser to access the node, like +/// `p[m]`. #[derive(Debug, Copy, Clone, Eq, PartialEq)] struct Marker(usize); -#[derive(Clone)] -struct Checkpoint<'s> { - lexer: Lexer<'s>, - prev_end: usize, - current_start: usize, - current: SyntaxKind, - nodes: usize, +// Index into the parser with markers. +impl Index for Parser<'_> { + type Output = SyntaxNode; + + fn index(&self, m: Marker) -> &Self::Output { + &self.nodes[m.0] + } } +impl IndexMut for Parser<'_> { + fn index_mut(&mut self, m: Marker) -> &mut Self::Output { + &mut self.nodes[m.0] + } +} + +/// Creating/Consuming the parser and getting info about the current token. impl<'s> Parser<'s> { + /// Create a new parser starting from the given text offset and lexer mode. 
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let current = lexer.next(); + let nl_mode = AtNewline::Continue; + let mut nodes = vec![]; + let token = Self::lex(&mut nodes, &mut lexer, nl_mode); Self { - lexer, text, - prev_end: offset, - current_start: offset, - current, + lexer, + nl_mode, + token, balanced: true, - nodes: vec![], - modes: vec![], - newline_modes: vec![], - memo: HashMap::new(), - memo_arena: vec![], + nodes, + memo: Default::default(), } } + /// Consume the parser, yielding the full vector of parsed SyntaxNodes. fn finish(self) -> Vec { self.nodes } - fn prev_end(&self) -> usize { - self.prev_end + /// Consume the parser, generating a single top-level node. + fn finish_into(self, kind: SyntaxKind) -> SyntaxNode { + assert!(self.at(SyntaxKind::End)); + SyntaxNode::inner(kind, self.finish()) } + /// Similar to a `peek()` function: returns the `kind` of the next token to + /// be eaten. fn current(&self) -> SyntaxKind { - self.current - } - - fn current_start(&self) -> usize { - self.current_start - } - - fn current_end(&self) -> usize { - self.lexer.cursor() - } - - fn current_text(&self) -> &'s str { - &self.text[self.current_start..self.current_end()] + self.token.kind } + /// Whether the current token is a given [`SyntaxKind`]. fn at(&self, kind: SyntaxKind) -> bool { - self.current == kind + self.token.kind == kind } + /// Whether the current token is contained in a [`SyntaxSet`]. fn at_set(&self, set: SyntaxSet) -> bool { - set.contains(self.current) + set.contains(self.token.kind) } + /// Whether we're at the end of the token stream. + /// + /// Note: This might be a fake end due to the newline mode. fn end(&self) -> bool { self.at(SyntaxKind::End) } + /// If we're at the given `kind` with no preceding trivia tokens. fn directly_at(&self, kind: SyntaxKind) -> bool { - self.current == kind && self.prev_end == self.current_start + self.token.kind == kind && !self.had_trivia() } - fn eat(&mut self) { - self.save(); - self.lex(); - self.skip(); + /// Whether `token` had any preceding trivia. + fn had_trivia(&self) -> bool { + self.token.n_trivia > 0 } + /// Whether `token` had a newline among any of its preceding trivia. + fn had_newline(&self) -> bool { + self.token.newline.is_some() + } + + /// The number of characters until the most recent newline from the current + /// token, or 0 if it did not follow a newline. + fn current_column(&self) -> usize { + self.token.newline.and_then(|newline| newline.column).unwrap_or(0) + } + + /// The current token's text. + fn current_text(&self) -> &'s str { + &self.text[self.token.start..self.current_end()] + } + + /// The offset into `text` of the current token's start. + fn current_start(&self) -> usize { + self.token.start + } + + /// The offset into `text` of the current token's end. + fn current_end(&self) -> usize { + self.lexer.cursor() + } + + /// The offset into `text` of the previous token's end. + fn prev_end(&self) -> usize { + self.token.prev_end + } +} + +// The main parsing interface for generating tokens and eating/modifying nodes. +impl<'s> Parser<'s> { + /// A marker that will point to the current token in the parser once it's + /// been eaten. + fn marker(&self) -> Marker { + Marker(self.nodes.len()) + } + + /// A marker that will point to first trivia before this token in the + /// parser (or the token itself if no trivia precede it). 
+    fn before_trivia(&self) -> Marker {
+        Marker(self.nodes.len() - self.token.n_trivia)
+    }
+
+    /// Eat the current node and return a reference for in-place mutation.
     #[track_caller]
     fn eat_and_get(&mut self) -> &mut SyntaxNode {
         let offset = self.nodes.len();
-        self.save();
-        self.lex();
-        self.skip();
+        self.eat();
         &mut self.nodes[offset]
     }
 
-    /// Eats if at `kind`.
+    /// Eat the token if at `kind`. Returns `true` if eaten.
     ///
-    /// Note: In math and code mode, this will ignore trivia in front of the
+    /// Note: In Math and Code, this will ignore trivia in front of the
     /// `kind`. To forbid skipping trivia, consider using `eat_if_direct`.
     fn eat_if(&mut self, kind: SyntaxKind) -> bool {
         let at = self.at(kind);
@@ -1612,7 +1742,8 @@ impl<'s> Parser<'s> {
         at
     }
 
-    /// Eats only if currently at the start of `kind`.
+    /// Eat the token only if at `kind` with no preceding trivia. Returns `true`
+    /// if eaten.
     fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool {
         let at = self.directly_at(kind);
         if at {
@@ -1621,188 +1752,228 @@ impl<'s> Parser<'s> {
         at
     }
 
+    /// Assert that we are at the given [`SyntaxKind`] and eat it. This should
+    /// be used when moving between functions that expect to start with a
+    /// specific token.
     #[track_caller]
     fn assert(&mut self, kind: SyntaxKind) {
-        assert_eq!(self.current, kind);
+        assert_eq!(self.token.kind, kind);
         self.eat();
     }
 
-    fn convert(&mut self, kind: SyntaxKind) {
-        self.current = kind;
+    /// Convert the current token's [`SyntaxKind`] and eat it.
+    fn convert_and_eat(&mut self, kind: SyntaxKind) {
+        // Only need to replace the node here.
+        self.token.node.convert_to_kind(kind);
         self.eat();
     }
 
-    fn newline(&mut self) -> bool {
-        self.lexer.newline()
+    /// Eat the current token by saving it to the `nodes` vector, then move
+    /// the lexer forward to prepare a new token.
+    fn eat(&mut self) {
+        self.nodes.push(std::mem::take(&mut self.token.node));
+        self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode);
     }
 
-    fn column(&self, at: usize) -> usize {
-        self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
-    }
-
-    fn marker(&self) -> Marker {
-        Marker(self.nodes.len())
-    }
-
-    /// Get a marker after the last non-trivia node.
-    fn before_trivia(&self) -> Marker {
-        let mut i = self.nodes.len();
-        if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
-            while i > 0 && self.nodes[i - 1].kind().is_trivia() {
-                i -= 1;
-            }
-        }
-        Marker(i)
-    }
-
-    /// Whether the last non-trivia node is an error.
-    fn after_error(&mut self) -> bool {
-        let m = self.before_trivia();
-        m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
-    }
-
-    #[track_caller]
-    fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> {
-        self.nodes[m.0..]
-            .iter_mut()
-            .filter(|child| !child.kind().is_error() && !child.kind().is_trivia())
+    /// Detach the parsed trivia nodes from this token (but not newline info) so
+    /// that subsequent wrapping will include the trivia.
+    fn flush_trivia(&mut self) {
+        self.token.n_trivia = 0;
+        self.token.prev_end = self.token.start;
     }
 
+    /// Wrap the nodes from a marker up to (but excluding) the current token in
+    /// a new [inner node](`SyntaxNode::inner`) of the given kind. This is an
+    /// easy interface for creating nested syntax nodes _after_ having parsed
+    /// their children.
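+    ///
+    /// A hypothetical use in a grammar rule (illustrative, not from this
+    /// change):
+    /// ```ignore
+    /// let m = p.marker();            // Before the first operand.
+    /// // ... eat the `1`, `+`, and `2` tokens ...
+    /// p.wrap(m, SyntaxKind::Binary); // Nest them under one `Binary` node.
+    /// ```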
fn wrap(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, self.before_trivia(), kind); - } - - fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, Marker(self.nodes.len()), kind) - } - - fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { - let len = self.nodes.len(); - let to = to.0.min(len); + let to = self.before_trivia().0; let from = from.0.min(to); let children = self.nodes.drain(from..to).collect(); self.nodes.insert(from, SyntaxNode::inner(kind, children)); } - fn enter(&mut self, mode: LexMode) { - self.modes.push(self.lexer.mode()); + /// Parse within the [`LexMode`] for subsequent tokens (does not change the + /// current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn enter_modes( + &mut self, + mode: LexMode, + stop: AtNewline, + func: impl FnOnce(&mut Parser<'s>), + ) { + let previous = self.lexer.mode(); self.lexer.set_mode(mode); - } - - fn exit(&mut self) { - let mode = self.modes.pop().unwrap(); - if mode != self.lexer.mode() { - self.unskip(); - self.lexer.set_mode(mode); - self.lexer.jump(self.current_start); - self.lex(); - self.skip(); + self.with_nl_mode(stop, func); + if mode != previous { + self.lexer.set_mode(previous); + self.lexer.jump(self.token.prev_end); + self.nodes.truncate(self.nodes.len() - self.token.n_trivia); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } } - fn enter_newline_mode(&mut self, stop: NewlineMode) { - self.newline_modes.push(stop); - } - - fn exit_newline_mode(&mut self) { - self.unskip(); - self.newline_modes.pop(); - self.lexer.jump(self.prev_end); - self.lex(); - self.skip(); - } - - fn checkpoint(&self) -> Checkpoint<'s> { - Checkpoint { - lexer: self.lexer.clone(), - prev_end: self.prev_end, - current_start: self.current_start, - current: self.current, - nodes: self.nodes.len(), - } - } - - fn restore(&mut self, checkpoint: Checkpoint<'s>) { - self.lexer = checkpoint.lexer; - self.prev_end = checkpoint.prev_end; - self.current_start = checkpoint.current_start; - self.current = checkpoint.current; - self.nodes.truncate(checkpoint.nodes); - } - - fn skip(&mut self) { - if self.lexer.mode() != LexMode::Markup { - while self.current.is_trivia() { - self.save(); - self.lex(); + /// Parse within the [`AtNewline`] mode for subsequent tokens (does not + /// change the current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn with_nl_mode(&mut self, mode: AtNewline, func: impl FnOnce(&mut Parser<'s>)) { + let previous = self.nl_mode; + self.nl_mode = mode; + func(self); + self.nl_mode = previous; + if let Some(newline) = self.token.newline { + if mode != previous { + // Restore our actual token's kind or insert a fake end. + let actual_kind = self.token.node.kind(); + if self.nl_mode.stop_at(newline, actual_kind) { + self.token.kind = SyntaxKind::End; + } else { + self.token.kind = actual_kind; + } } } } - fn unskip(&mut self) { - if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { - while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) { - self.nodes.pop(); - } + /// Move the lexer forward and prepare the current token. In Code, this + /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. + /// + /// This is not a method on `self` because we need a valid token before we + /// can initialize the parser. 
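+    ///
+    /// For example (illustrative): in Code, lexing past a line comment and a
+    /// newline pushes both trivia onto `nodes` and returns a `Token` with
+    /// `n_trivia == 2` and `newline: Some(..)`; if the newline mode says to
+    /// stop, `kind` is temporarily set to `SyntaxKind::End`.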
+    fn lex(nodes: &mut Vec<SyntaxNode>, lexer: &mut Lexer, nl_mode: AtNewline) -> Token {
+        let prev_end = lexer.cursor();
+        let mut start = prev_end;
+        let (mut kind, mut node) = lexer.next();
+        let mut n_trivia = 0;
+        let mut had_newline = false;
+        let mut parbreak = false;
 
-        self.lexer.jump(self.prev_end);
-        self.lex();
+        while kind.is_trivia() {
+            had_newline |= lexer.newline(); // Newlines are always trivia.
+            parbreak |= kind == SyntaxKind::Parbreak;
+            n_trivia += 1;
+            nodes.push(node);
+            start = lexer.cursor();
+            (kind, node) = lexer.next();
         }
-    }
 
-    fn save(&mut self) {
-        let text = self.current_text();
-        if self.at(SyntaxKind::Error) {
-            let error = self.lexer.take_error().unwrap();
-            self.nodes.push(SyntaxNode::error(error, text));
+        let newline = if had_newline {
+            let column = (lexer.mode() == LexMode::Markup).then(|| lexer.column(start));
+            let newline = Newline { column, parbreak };
+            if nl_mode.stop_at(newline, kind) {
+                // Insert a temporary `SyntaxKind::End` to halt the parser.
+                // The actual kind will be restored from `node` later.
+                kind = SyntaxKind::End;
+            }
+            Some(newline)
         } else {
-            self.nodes.push(SyntaxNode::leaf(self.current, text));
-        }
+            None
+        };
 
-        if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() {
-            self.prev_end = self.current_end();
-        }
-    }
-
-    fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind {
-        loop {
-            let next = lexer.next();
-            // Loop is terminable, because SyntaxKind::End is not a trivia.
-            if !next.is_trivia() {
-                break next;
-            }
-        }
-    }
-
-    fn lex(&mut self) {
-        self.current_start = self.lexer.cursor();
-        self.current = self.lexer.next();
-
-        // Special cases to handle newlines in code mode.
-        if self.lexer.mode() == LexMode::Code
-            && self.lexer.newline()
-            && match self.newline_modes.last() {
-                Some(NewlineMode::Continue) => false,
-                Some(NewlineMode::Contextual) => !matches!(
-                    Self::next_non_trivia(&mut self.lexer.clone()),
-                    SyntaxKind::Else | SyntaxKind::Dot
-                ),
-                Some(NewlineMode::Stop) => true,
-                None => false,
-            }
-        {
-            self.current = SyntaxKind::End;
-        }
+        Token { kind, node, n_trivia, newline, start, prev_end }
     }
 }
 
+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena {
+    /// A single arena of previously parsed nodes (to reduce allocations).
+    /// Memoized ranges refer to unique sections of the arena.
+    arena: Vec<SyntaxNode>,
+    /// A map from the parser's current position to a range of previously parsed
+    /// nodes in the arena and a checkpoint of the parser's state. These allow
+    /// us to reset the parser to avoid parsing the same location again.
+    memo_map: HashMap<MemoKey, (Range<usize>, PartialState)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
+
+/// A checkpoint of the parser which can fully restore it to a previous state.
+struct Checkpoint {
+    node_len: usize,
+    state: PartialState,
+}
+
+/// State needed to restore the parser's current token and the lexer (but not
+/// the nodes vector).
+#[derive(Clone)]
+struct PartialState {
+    cursor: usize,
+    lex_mode: LexMode,
+    token: Token,
+}
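+
+// A sketch of the expected round trip (illustrative; see `expr_with_paren`):
+//
+//     let Some((key, checkpoint)) = p.restore_memo_or_checkpoint() else {
+//         return; // A memoized parse was replayed from the arena.
+//     };
+//     let prev_len = checkpoint.node_len;
+//     // ... attempt a parse, possibly rolling back via `p.restore(checkpoint)` ...
+//     p.memoize_parsed_nodes(key, prev_len);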
+
+/// The memoization interface.
 impl<'s> Parser<'s> {
-    /// Consume the given syntax `kind` or produce an error.
+    /// Store the already parsed nodes and the parser state into the memo map by
+    /// extending the arena and storing the extended range and a checkpoint.
+    fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+        let Checkpoint { state, node_len } = self.checkpoint();
+        let memo_start = self.memo.arena.len();
+        self.memo.arena.extend_from_slice(&self.nodes[prev_len..node_len]);
+        let arena_range = memo_start..self.memo.arena.len();
+        self.memo.memo_map.insert(key, (arena_range, state));
+    }
+
+    /// Try to load a memoized result: returns `None` if a previous parse was
+    /// restored, or `Some` (with a key for the memo map and a checkpoint) if
+    /// there was nothing to restore.
+    fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint)> {
+        // We use the starting index of the current token as our key.
+        let key: MemoKey = self.current_start();
+        match self.memo.memo_map.get(&key).cloned() {
+            Some((range, state)) => {
+                self.nodes.extend_from_slice(&self.memo.arena[range]);
+                // It's important that we don't truncate the nodes vector since
+                // it may have grown or shrunk (due to other memoization or
+                // error reporting) since we made this checkpoint.
+                self.restore_partial(state);
+                None
+            }
+            None => Some((key, self.checkpoint())),
+        }
+    }
+
+    /// Restore the parser to the state at a checkpoint.
+    fn restore(&mut self, checkpoint: Checkpoint) {
+        self.nodes.truncate(checkpoint.node_len);
+        self.restore_partial(checkpoint.state);
+    }
+
+    /// Restore parts of the checkpoint excluding the nodes vector.
+    fn restore_partial(&mut self, state: PartialState) {
+        self.lexer.jump(state.cursor);
+        self.lexer.set_mode(state.lex_mode);
+        self.token = state.token;
+    }
+
+    /// Save a checkpoint of the parser state.
+    fn checkpoint(&self) -> Checkpoint {
+        let node_len = self.nodes.len();
+        let state = PartialState {
+            cursor: self.lexer.cursor(),
+            lex_mode: self.lexer.mode(),
+            token: self.token.clone(),
+        };
+        Checkpoint { node_len, state }
+    }
+}
+
+/// Functions for eating expected or unexpected tokens and generating errors if
+/// we don't get what we expect.
+impl<'s> Parser<'s> {
+    /// Consume the given `kind` or produce an error.
     fn expect(&mut self, kind: SyntaxKind) -> bool {
         let at = self.at(kind);
         if at {
             self.eat();
-        } else if kind == SyntaxKind::Ident && self.current.is_keyword() {
+        } else if kind == SyntaxKind::Ident && self.token.kind.is_keyword() {
             self.trim_errors();
             self.eat_and_get().expected(kind.name());
         } else {
@@ -1828,6 +1999,12 @@ impl<'s> Parser<'s> {
         }
     }
 
+    /// Whether the last non-trivia node is an error.
+    fn after_error(&mut self) -> bool {
+        let m = self.before_trivia();
+        m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
+    }
+
     /// Produce an error that the given `thing` was expected at the position
     /// of the marker `m`.
     fn expected_at(&mut self, m: Marker, thing: &str) {
@@ -1836,7 +2013,7 @@ impl<'s> Parser<'s> {
         self.nodes.insert(m.0, error);
     }
 
-    /// Produce a hint.
+    /// Add a hint to a trailing error.
     fn hint(&mut self, hint: &str) {
         let m = self.before_trivia();
         if let Some(error) = self.nodes.get_mut(m.0 - 1) {
@@ -1848,7 +2025,7 @@ impl<'s> Parser<'s> {
     /// unexpected.
     fn unexpected(&mut self) {
         self.trim_errors();
-        self.balanced &= !self.current.is_grouping();
+        self.balanced &= !self.token.kind.is_grouping();
         self.eat_and_get().unexpected();
     }
 
@@ -1865,17 +2042,3 @@ impl<'s> Parser<'s> {
         self.nodes.drain(start..end);
     }
 }
-
-impl Index<Marker> for Parser<'_> {
-    type Output = SyntaxNode;
-
-    fn index(&self, m: Marker) -> &Self::Output {
-        &self.nodes[m.0]
-    }
-}
-
-impl IndexMut<Marker> for Parser<'_> {
-    fn index_mut(&mut self, m: Marker) -> &mut Self::Output {
-        &mut self.nodes[m.0]
-    }
-}
diff --git a/crates/typst-syntax/src/reparser.rs b/crates/typst-syntax/src/reparser.rs
index 7a9704906..c20d8314f 100644
--- a/crates/typst-syntax/src/reparser.rs
+++ b/crates/typst-syntax/src/reparser.rs
@@ -157,19 +157,13 @@ fn try_reparse(
     let new_range = shifted..shifted + new_len;
     let at_end = end == children.len();
 
-    // Stop parsing early if this kind is encountered.
-    let stop_kind = match parent_kind {
-        Some(_) => SyntaxKind::RightBracket,
-        None => SyntaxKind::End,
-    };
-
     // Reparse!
     let reparsed = reparse_markup(
         text,
         new_range.clone(),
         &mut at_start,
         &mut nesting,
-        |kind| kind == stop_kind,
+        parent_kind.is_none(),
     );
 
     if let Some(newborns) = reparsed {
diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs
index eaee7ef28..014aaf2f7 100644
--- a/crates/typst-syntax/src/set.rs
+++ b/crates/typst-syntax/src/set.rs
@@ -58,6 +58,7 @@ pub const STMT: SyntaxSet = syntax_set!(Let, Set, Show, Import, Include, Return)
 pub const MATH_EXPR: SyntaxSet = syntax_set!(
     Hash,
     MathIdent,
+    FieldAccess,
     Text,
     MathShorthand,
     Linebreak,
@@ -104,7 +105,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = syntax_set!(
     Numeric,
     Str,
     Label,
-    RawDelim,
+    Raw,
 );
 
 /// Syntax kinds that are unary operators.
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index b1855b496..eed093eb6 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -11,14 +11,31 @@ name = "tests"
 path = "src/tests.rs"
 harness = false
 
+[features]
+# Allow just compiling the parser when only testing typst-syntax. To do so,
+# pass '--no-default-features' to 'cargo test'.
+default = [
+    # "typst-syntax" intentionally not present
+    "typst",
+    "typst-assets",
+    "typst-dev-assets",
+    "typst-library",
+    "typst-pdf",
+    "typst-render",
+    "typst-svg",
+]
+
 [dependencies]
-typst = { workspace = true }
-typst-assets = { workspace = true, features = ["fonts"] }
-typst-dev-assets = { workspace = true }
-typst-library = { workspace = true }
-typst-pdf = { workspace = true }
-typst-render = { workspace = true }
-typst-svg = { workspace = true }
+typst-syntax = { workspace = true }
+# Mark other Typst crates as optional so we can use '--no-default-features'
+# to decrease compile times for parser testing.
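+#
+# For example (illustrative), a parser-only run of the suite might look like:
+#
+#     cargo test --no-default-features -- --parser-compare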
+typst = { workspace = true, optional = true }
+typst-assets = { workspace = true, features = ["fonts"], optional = true }
+typst-dev-assets = { workspace = true, optional = true }
+typst-library = { workspace = true, optional = true }
+typst-pdf = { workspace = true, optional = true }
+typst-render = { workspace = true, optional = true }
+typst-svg = { workspace = true, optional = true }
 clap = { workspace = true }
 comemo = { workspace = true }
 ecow = { workspace = true }
diff --git a/tests/ref/single-right-bracket.png b/tests/ref/single-right-bracket.png
new file mode 100644
index 000000000..9867424dd
Binary files /dev/null and b/tests/ref/single-right-bracket.png differ
diff --git a/tests/src/args.rs b/tests/src/args.rs
index 786733cce..db5d1a9ba 100644
--- a/tests/src/args.rs
+++ b/tests/src/args.rs
@@ -43,7 +43,9 @@ pub struct CliArguments {
     /// Runs SVG export.
     #[arg(long)]
     pub svg: bool,
-    /// Displays the syntax tree.
+    /// Displays the syntax tree before running tests.
+    ///
+    /// Note: This is ignored if using '--parser-compare'.
     #[arg(long)]
     pub syntax: bool,
     /// Displays only one line per test, hiding details about failures.
@@ -55,6 +57,29 @@ pub struct CliArguments {
     /// How many threads to spawn when running the tests.
     #[arg(short = 'j', long)]
     pub num_threads: Option<usize>,
+    /// Changes testing behavior for debugging the parser: With no argument,
+    /// outputs the concrete syntax trees of tests as files in
+    /// 'tests/store/syntax/'. With a directory as argument, will treat it as a
+    /// reference of correct syntax tree files and will print which output
+    /// syntax trees differ (viewing the diffs is on you).
+    ///
+    /// This overrides the normal testing system. It parses, but does not run
+    /// the test suite.
+    ///
+    /// If `cargo test` is run with `--no-default-features`, then compiling will
+    /// not include Typst's core crates, only typst-syntax, greatly speeding up
+    /// debugging when changing the parser.
+    ///
+    /// You can generate a correct reference directory by running on a known
+    /// good commit and copying the generated outputs to a new directory.
+    /// `_things` may be a good location as it is in the top-level gitignore.
+    ///
+    /// You can view diffs in VS Code with: `code --diff <ref-dir>/<test>.syntax
+    /// tests/store/syntax/<test>.syntax`
+    #[arg(long)]
+    pub parser_compare: Option<Option<PathBuf>>,
+    // ^ I'm not using a subcommand here because then test patterns don't parse
+    // how you would expect and I'm too lazy to try to fix it.
 }
 
 impl CliArguments {
diff --git a/tests/src/collect.rs b/tests/src/collect.rs
index 80e5e5a8b..5c7327f13 100644
--- a/tests/src/collect.rs
+++ b/tests/src/collect.rs
@@ -6,8 +6,8 @@ use std::str::FromStr;
 use std::sync::LazyLock;
 
 use ecow::{eco_format, EcoString};
-use typst::syntax::package::PackageVersion;
-use typst::syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath};
+use typst_syntax::package::PackageVersion;
+use typst_syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath};
 use unscanny::Scanner;
 
 /// Collects all tests from all files.
diff --git a/tests/src/logger.rs b/tests/src/logger.rs
index 45c9f0981..48bad451b 100644
--- a/tests/src/logger.rs
+++ b/tests/src/logger.rs
@@ -2,7 +2,16 @@ use std::io::{self, IsTerminal, StderrLock, Write};
 use std::time::{Duration, Instant};
 
 use crate::collect::Test;
-use crate::run::TestResult;
+
+/// The result of running a single test.
+pub struct TestResult {
+    /// The error log for this test. If empty, the test passed.
+    pub errors: String,
+    /// The info log for this test.
+    pub infos: String,
+    /// Whether the image was mismatched.
+    pub mismatched_image: bool,
+}
 
 /// Receives status updates by individual test runs.
 pub struct Logger<'a> {
@@ -58,7 +67,7 @@ impl<'a> Logger<'a> {
             }
         };
 
-        if result.is_ok() {
+        if result.errors.is_empty() {
             self.passed += 1;
         } else {
             self.failed += 1;
diff --git a/tests/src/run.rs b/tests/src/run.rs
index caa078c4b..1ea19a16a 100644
--- a/tests/src/run.rs
+++ b/tests/src/run.rs
@@ -12,6 +12,7 @@ use typst::WorldExt;
 use typst_pdf::PdfOptions;
 
 use crate::collect::{FileSize, NoteKind, Test};
+use crate::logger::TestResult;
 use crate::world::TestWorld;
 
 /// Runs a single test.
@@ -21,23 +22,6 @@ pub fn run(test: &Test) -> TestResult {
     Runner::new(test).run()
 }
 
-/// The result of running a single test.
-pub struct TestResult {
-    /// The error log for this test. If empty, the test passed.
-    pub errors: String,
-    /// The info log for this test.
-    pub infos: String,
-    /// Whether the image was mismatched.
-    pub mismatched_image: bool,
-}
-
-impl TestResult {
-    /// Whether the test passed.
-    pub fn is_ok(&self) -> bool {
-        self.errors.is_empty()
-    }
-}
-
 /// Write a line to a log sink, defaulting to the test's error log.
 macro_rules! log {
     (into: $sink:expr, $($tts:tt)*) => {
diff --git a/tests/src/tests.rs b/tests/src/tests.rs
index 940c9e3c4..2b09b29c0 100644
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@@ -1,13 +1,19 @@
 //! Typst's test runner.
 
+#![cfg_attr(not(feature = "default"), allow(dead_code, unused_imports))]
+
 mod args;
 mod collect;
-mod custom;
 mod logger;
+
+#[cfg(feature = "default")]
+mod custom;
+#[cfg(feature = "default")]
 mod run;
+#[cfg(feature = "default")]
 mod world;
 
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::LazyLock;
 use std::time::Duration;
 
@@ -16,7 +22,8 @@ use parking_lot::Mutex;
 use rayon::iter::{ParallelBridge, ParallelIterator};
 
 use crate::args::{CliArguments, Command};
-use crate::logger::Logger;
+use crate::collect::Test;
+use crate::logger::{Logger, TestResult};
 
 /// The parsed command line arguments.
 static ARGS: LazyLock<CliArguments> = LazyLock::new(CliArguments::parse);
@@ -27,6 +34,9 @@ const SUITE_PATH: &str = "tests/suite";
 /// The directory where the full test results are stored.
 const STORE_PATH: &str = "tests/store";
 
+/// The directory where syntax trees are stored.
+const SYNTAX_PATH: &str = "tests/store/syntax";
+
 /// The directory where the reference images are stored.
 const REF_PATH: &str = "tests/ref";
 
@@ -89,6 +99,21 @@ fn test() {
         return;
     }
 
+    let parser_dirs = ARGS.parser_compare.clone().map(create_syntax_store);
+    #[cfg(not(feature = "default"))]
+    let parser_dirs = parser_dirs.or_else(|| Some(create_syntax_store(None)));
+
+    let runner = |test: &Test| {
+        if let Some((live_path, ref_path)) = &parser_dirs {
+            run_parser_test(test, live_path, ref_path)
+        } else {
+            #[cfg(feature = "default")]
+            return run::run(test);
+            #[cfg(not(feature = "default"))]
+            unreachable!();
+        }
+    };
+
     // Run the tests.
     let logger = Mutex::new(Logger::new(selected, skipped));
     std::thread::scope(|scope| {
@@ -112,7 +137,7 @@ fn test() {
         // to `typst::utils::Deferred` yielding.
         tests.iter().par_bridge().for_each(|test| {
             logger.lock().start(test);
-            let result = std::panic::catch_unwind(|| run::run(test));
+            let result = std::panic::catch_unwind(|| runner(test));
             logger.lock().end(test, result);
         });
 
@@ -142,3 +167,46 @@ fn undangle() {
         }
     }
 }
+
+fn create_syntax_store(ref_path: Option<PathBuf>) -> (&'static Path, Option<PathBuf>) {
+    if ref_path.as_ref().is_some_and(|p| !p.exists()) {
+        eprintln!("syntax reference path doesn't exist");
+        std::process::exit(1);
+    }
+
+    let live_path = Path::new(SYNTAX_PATH);
+    std::fs::remove_dir_all(live_path).ok();
+    std::fs::create_dir_all(live_path).unwrap();
+    (live_path, ref_path)
+}
+
+fn run_parser_test(
+    test: &Test,
+    live_path: &Path,
+    ref_path: &Option<PathBuf>,
+) -> TestResult {
+    let mut result = TestResult {
+        errors: String::new(),
+        infos: String::new(),
+        mismatched_image: false,
+    };
+
+    let syntax_file = live_path.join(format!("{}.syntax", test.name));
+    let tree = format!("{:#?}\n", test.source.root());
+    std::fs::write(syntax_file, &tree).unwrap();
+
+    let Some(ref_path) = ref_path else { return result };
+    let ref_file = ref_path.join(format!("{}.syntax", test.name));
+    match std::fs::read_to_string(&ref_file) {
+        Ok(ref_tree) => {
+            if tree != ref_tree {
+                result.errors = "differs".to_string();
+            }
+        }
+        Err(_) => {
+            result.errors = format!("missing reference: {}", ref_file.display());
+        }
+    }
+
+    result
+}
diff --git a/tests/suite/math/symbols.typ b/tests/suite/math/symbols.typ
new file mode 100644
index 000000000..65a483162
--- /dev/null
+++ b/tests/suite/math/symbols.typ
@@ -0,0 +1,29 @@
+// Test math symbol edge cases.
+
+--- math-symbol-basic ---
+#let sym = symbol("s", ("basic", "s"))
+#test($sym.basic$, $#"s"$)
+
+--- math-symbol-underscore ---
+#let sym = symbol("s", ("test_underscore", "s"))
+// Error: 6-10 unknown symbol modifier
+$sym.test_underscore$
+
+--- math-symbol-dash ---
+#let sym = symbol("s", ("test-dash", "s"))
+// Error: 6-10 unknown symbol modifier
+$sym.test-dash$
+
+--- math-symbol-double ---
+#let sym = symbol("s", ("test.basic", "s"))
+#test($sym.test.basic$, $#"s"$)
+
+--- math-symbol-double-underscore ---
+#let sym = symbol("s", ("one.test_underscore", "s"))
+// Error: 10-14 unknown symbol modifier
+$sym.one.test_underscore$
+
+--- math-symbol-double-dash ---
+#let sym = symbol("s", ("one.test-dash", "s"))
+// Error: 10-14 unknown symbol modifier
+$sym.one.test-dash$
diff --git a/tests/suite/model/heading.typ b/tests/suite/model/heading.typ
index 884f203d2..d182724c8 100644
--- a/tests/suite/model/heading.typ
+++ b/tests/suite/model/heading.typ
@@ -38,7 +38,7 @@ multiline.
 --- heading-trailing-whitespace ---
 // Whether headings contain trailing whitespace with or without comments/labels.
 // Labels are special cased to immediately end headings in the parser, but also
-// have unique whitespace behavior.
+// #strike[have unique whitespace behavior] Now their behavior is consistent!
 
 #let join(..xs) = xs.pos().join()
 #let head(h) = heading(depth: 1, h)
@@ -49,19 +49,20 @@ multiline.
 #test(head[h], [= h])
 #test(head[h], [= h/**/])
 
-// Label behaves differently than normal trailing space and comment.
-#test(head(join[h][ ]), [= h ])
-#test(head(join[h][ ]), [= h /**/])
+// #strike[Label behaves differently than normal trailing space and comment.]
+// Now they behave the same!
+#test(join(head[h])[ ], [= h ])
+#test(join(head[h])[ ], [= h /**/])
 #test(join(head[h])[ ], [= h ])
 
 // Combinations.
-#test(head(join[h][ ][ ]), [= h /**/ ]) +#test(join(head[h])[ ][ ], [= h /**/ ]) #test(join(head[h])[ ][ ], [= h ]) -#test(head(join[h][ ]), [= h /**/]) +#test(join(head[h])[ ], [= h /**/]) #test(join(head[h])[ ], [= h/**/ ]) -// The first space attaches, but not the second -#test(join(head(join[h][ ]))[ ], [= h /**/ ]) +// #strike[The first space attaches, but not the second] Now neither attaches! +#test(join(head(join[h]))[ ][ ], [= h /**/ ]) --- heading-leading-whitespace --- // Test that leading whitespace and comments don't matter. diff --git a/tests/suite/model/list.typ b/tests/suite/model/list.typ index 46f4621f5..c3c123de1 100644 --- a/tests/suite/model/list.typ +++ b/tests/suite/model/list.typ @@ -34,6 +34,51 @@ _Shopping list_ - C - D +--- list-indent-trivia-nesting --- +// Test indent nesting behavior with odd trivia (comments and spaces). + +#let indented = [ +- a + /**/- b +/**/ - c + /*spanning + multiple + lines */ - d + - e +/**/ - f +/**/ - g +] +// Current behavior is that list columns are based on the first non-whitespace +// element in their line, so the block comments here determine the column the +// list starts at + +#let item = list.item +#let manual = { + [ ] + item({ + [a] + [ ] + item[b] + [ ]; [ ] + item({ + [c] + [ ]; [ ] + item[d] + }) + [ ] + item({ + [e] + [ ]; [ ] + item[f] + [ ]; [ ] + item[g] + }) + }) + [ ] +} + +#test(indented, manual) + --- list-tabs --- // This works because tabs are used consistently. - A with 1 tab diff --git a/tests/suite/scripting/blocks.typ b/tests/suite/scripting/blocks.typ index f139b8c62..ba1d9c89c 100644 --- a/tests/suite/scripting/blocks.typ +++ b/tests/suite/scripting/blocks.typ @@ -135,6 +135,9 @@ // Error: 2-3 unexpected closing brace #} +--- single-right-bracket --- +] + --- content-block-in-markup-scope --- // Content blocks also create a scope. #[#let x = 1]