From da83f33c4c015f927920b437610153c029c8291b Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Sun, 22 Sep 2024 17:38:38 -0400
Subject: [PATCH 01/18] 1. Add test-runner option to compare parser output

---
 tests/src/args.rs  | 23 ++++++++++++++++-
 tests/src/tests.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/tests/src/args.rs b/tests/src/args.rs
index 786733cce..e94986ced 100644
--- a/tests/src/args.rs
+++ b/tests/src/args.rs
@@ -43,7 +43,9 @@ pub struct CliArguments {
     /// Runs SVG export.
     #[arg(long)]
     pub svg: bool,
-    /// Displays the syntax tree.
+    /// Displays the syntax tree before running tests.
+    ///
+    /// Note: This is ignored if using '--parser-compare'.
     #[arg(long)]
     pub syntax: bool,
     /// Displays only one line per test, hiding details about failures.
@@ -55,6 +57,25 @@ pub struct CliArguments {
     /// How many threads to spawn when running the tests.
     #[arg(short = 'j', long)]
     pub num_threads: Option<usize>,
+    /// Changes testing behavior for debugging the parser: With no argument,
+    /// outputs the concrete syntax trees of tests as files in
+    /// 'tests/store/syntax/'. With a directory as argument, will treat it as a
+    /// reference of correct syntax tree files and will print which output
+    /// syntax trees differ (viewing the diffs is on you).
+    ///
+    /// This overrides the normal testing system. It parses, but does not run,
+    /// the test suite.
+    ///
+    /// You can generate a correct reference directory by running on a known
+    /// good commit and copying the generated outputs to a new directory.
+    /// `_things` may be a good location as it is in the top-level gitignore.
+    ///
+    /// You can view diffs in VS Code with: `code --diff <ref-dir>/<test>.syntax
+    /// tests/store/syntax/<test>.syntax`
+    #[arg(long)]
+    pub parser_compare: Option<Option<PathBuf>>,
+    // ^ I'm not using a subcommand here because then test patterns don't parse
+    // how you would expect and I'm too lazy to try to fix it.
 }

 impl CliArguments {
diff --git a/tests/src/tests.rs b/tests/src/tests.rs
index 940c9e3c4..eb2cfd796 100644
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@@ -7,7 +7,7 @@ mod logger;
 mod run;
 mod world;

-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::LazyLock;
 use std::time::Duration;

@@ -16,7 +16,9 @@ use parking_lot::Mutex;
 use rayon::iter::{ParallelBridge, ParallelIterator};

 use crate::args::{CliArguments, Command};
+use crate::collect::Test;
 use crate::logger::Logger;
+use crate::run::TestResult;

 /// The parsed command line arguments.
 static ARGS: LazyLock<CliArguments> = LazyLock::new(CliArguments::parse);
@@ -27,6 +29,9 @@ const SUITE_PATH: &str = "tests/suite";
 /// The directory where the full test results are stored.
 const STORE_PATH: &str = "tests/store";

+/// The directory where syntax trees are stored.
+const SYNTAX_PATH: &str = "tests/store/syntax";
+
 /// The directory where the reference images are stored.
 const REF_PATH: &str = "tests/ref";

@@ -89,6 +94,16 @@ fn test() {
         return;
     }

+    let parser_dirs = ARGS.parser_compare.clone().map(create_syntax_store);
+
+    let runner = |test: &Test| {
+        if let Some((live_path, ref_path)) = &parser_dirs {
+            run_parser_test(test, live_path, ref_path)
+        } else {
+            run::run(test)
+        }
+    };
+
     // Run the tests.
     let logger = Mutex::new(Logger::new(selected, skipped));
     std::thread::scope(|scope| {
@@ -112,7 +127,7 @@ fn test() {
             // to `typst::utils::Deferred` yielding.
             tests.iter().par_bridge().for_each(|test| {
                 logger.lock().start(test);
-                let result = std::panic::catch_unwind(|| run::run(test));
+                let result = std::panic::catch_unwind(|| runner(test));
                 logger.lock().end(test, result);
             });

@@ -142,3 +157,46 @@ fn undangle() {
         }
     }
 }
+
+fn create_syntax_store(ref_path: Option<PathBuf>) -> (&'static Path, Option<PathBuf>) {
+    if ref_path.as_ref().is_some_and(|p| !p.exists()) {
+        eprintln!("syntax reference path doesn't exist");
+        std::process::exit(1);
+    }
+
+    let live_path = Path::new(SYNTAX_PATH);
+    std::fs::remove_dir_all(live_path).ok();
+    std::fs::create_dir_all(live_path).unwrap();
+    (live_path, ref_path)
+}
+
+fn run_parser_test(
+    test: &Test,
+    live_path: &Path,
+    ref_path: &Option<PathBuf>,
+) -> TestResult {
+    let mut result = TestResult {
+        errors: String::new(),
+        infos: String::new(),
+        mismatched_image: false,
+    };
+
+    let syntax_file = live_path.join(format!("{}.syntax", test.name));
+    let tree = format!("{:#?}\n", test.source.root());
+    std::fs::write(syntax_file, &tree).unwrap();
+
+    let Some(ref_path) = ref_path else { return result };
+    let ref_file = ref_path.join(format!("{}.syntax", test.name));
+    match std::fs::read_to_string(&ref_file) {
+        Ok(ref_tree) => {
+            if tree != ref_tree {
+                result.errors = "differs".to_string();
+            }
+        }
+        Err(_) => {
+            result.errors = format!("missing reference: {}", ref_file.display());
+        }
+    }
+
+    result
+}
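As a usage sketch (not part of the patch): running `cargo test --workspace -- --parser-compare` writes each test's tree under `tests/store/syntax/`, while `cargo test --workspace -- --parser-compare=<ref-dir>` reports which trees differ from a reference directory. Each `.syntax` file is just the debug-formatted concrete syntax tree, so something along these lines (function name hypothetical) reproduces one file's contents:

```rust
/// Sketch: build the same text that `run_parser_test` writes to a
/// `<test>.syntax` file, via the public `typst_syntax::parse` entry point.
fn syntax_file_contents(source_text: &str) -> String {
    let root = typst_syntax::parse(source_text);
    // Mirrors the runner's `format!("{:#?}\n", test.source.root())`.
    format!("{root:#?}\n")
}
```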
From a2761ab75ac4038edff8be1c4dc66b3770e74d38 Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Sun, 22 Sep 2024 17:38:38 -0400
Subject: [PATCH 02/18] 2. Allow compiling test-runner with only typst_syntax

---
 Cargo.lock           |  1 +
 tests/Cargo.toml     | 32 +++++++++++++++++++++++-------
 tests/src/args.rs    |  4 ++++
 tests/src/collect.rs |  4 ++--
 tests/src/logger.rs  | 13 +++++++++++--
 tests/src/run.rs     | 18 +-----------------
 tests/src/tests.rs   | 18 ++++++++++++++----
 7 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4709fb5b4..5c148c117 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3018,6 +3018,7 @@ dependencies = [
  "typst-pdf",
  "typst-render",
  "typst-svg",
+ "typst-syntax",
  "unscanny",
  "walkdir",
 ]

diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index b1855b496..eed093eb6 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -11,14 +11,32 @@ name = "tests"
 path = "src/tests.rs"
 harness = false

+[features]
+# Allow just compiling the parser when only testing typst-syntax. To do so,
+# pass '--no-default-features' to 'cargo test'.
+default = [
+    # "typst-syntax" intentionally not present
+    "typst",
+    "typst-assets",
+    "typst-dev-assets",
+    "typst-library",
+    "typst-pdf",
+    "typst-render",
+    "typst-svg",
+]
+
 [dependencies]
-typst = { workspace = true }
-typst-assets = { workspace = true, features = ["fonts"] }
-typst-dev-assets = { workspace = true }
-typst-library = { workspace = true }
-typst-pdf = { workspace = true }
-typst-render = { workspace = true }
-typst-svg = { workspace = true }
+typst-syntax = { workspace = true }
+# Mark other Typst crates as optional so we can use '--no-default-features'
+# to decrease compile times for parser testing.
+typst = { workspace = true, optional = true } +typst-assets = { workspace = true, features = ["fonts"], optional = true } +typst-dev-assets = { workspace = true, optional = true } +typst-library = { workspace = true, optional = true } +typst-pdf = { workspace = true, optional = true } +typst-render = { workspace = true, optional = true } +typst-svg = { workspace = true, optional = true } clap = { workspace = true } comemo = { workspace = true } ecow = { workspace = true } diff --git a/tests/src/args.rs b/tests/src/args.rs index e94986ced..db5d1a9ba 100644 --- a/tests/src/args.rs +++ b/tests/src/args.rs @@ -66,6 +66,10 @@ pub struct CliArguments { /// This overrides the normal testing system. It parses, but does not run /// the test suite. /// + /// If `cargo test` is run with `--no-default-features`, then compiling will + /// not include Typst's core crates, only typst-syntax, greatly speeding up + /// debugging when changing the parser. + /// /// You can generate a correct reference directory by running on a known /// good commit and copying the generated outputs to a new directory. /// `_things` may be a good location as it is in the top-level gitignore. diff --git a/tests/src/collect.rs b/tests/src/collect.rs index 80e5e5a8b..5c7327f13 100644 --- a/tests/src/collect.rs +++ b/tests/src/collect.rs @@ -6,8 +6,8 @@ use std::str::FromStr; use std::sync::LazyLock; use ecow::{eco_format, EcoString}; -use typst::syntax::package::PackageVersion; -use typst::syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath}; +use typst_syntax::package::PackageVersion; +use typst_syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath}; use unscanny::Scanner; /// Collects all tests from all files. diff --git a/tests/src/logger.rs b/tests/src/logger.rs index 45c9f0981..48bad451b 100644 --- a/tests/src/logger.rs +++ b/tests/src/logger.rs @@ -2,7 +2,16 @@ use std::io::{self, IsTerminal, StderrLock, Write}; use std::time::{Duration, Instant}; use crate::collect::Test; -use crate::run::TestResult; + +/// The result of running a single test. +pub struct TestResult { + /// The error log for this test. If empty, the test passed. + pub errors: String, + /// The info log for this test. + pub infos: String, + /// Whether the image was mismatched. + pub mismatched_image: bool, +} /// Receives status updates by individual test runs. pub struct Logger<'a> { @@ -58,7 +67,7 @@ impl<'a> Logger<'a> { } }; - if result.is_ok() { + if result.errors.is_empty() { self.passed += 1; } else { self.failed += 1; diff --git a/tests/src/run.rs b/tests/src/run.rs index caa078c4b..1ea19a16a 100644 --- a/tests/src/run.rs +++ b/tests/src/run.rs @@ -12,6 +12,7 @@ use typst::WorldExt; use typst_pdf::PdfOptions; use crate::collect::{FileSize, NoteKind, Test}; +use crate::logger::TestResult; use crate::world::TestWorld; /// Runs a single test. @@ -21,23 +22,6 @@ pub fn run(test: &Test) -> TestResult { Runner::new(test).run() } -/// The result of running a single test. -pub struct TestResult { - /// The error log for this test. If empty, the test passed. - pub errors: String, - /// The info log for this test. - pub infos: String, - /// Whether the image was mismatched. - pub mismatched_image: bool, -} - -impl TestResult { - /// Whether the test passed. - pub fn is_ok(&self) -> bool { - self.errors.is_empty() - } -} - /// Write a line to a log sink, defaulting to the test's error log. macro_rules! 
log { (into: $sink:expr, $($tts:tt)*) => { diff --git a/tests/src/tests.rs b/tests/src/tests.rs index eb2cfd796..2b09b29c0 100644 --- a/tests/src/tests.rs +++ b/tests/src/tests.rs @@ -1,10 +1,16 @@ //! Typst's test runner. +#![cfg_attr(not(feature = "default"), allow(dead_code, unused_imports))] + mod args; mod collect; -mod custom; mod logger; + +#[cfg(feature = "default")] +mod custom; +#[cfg(feature = "default")] mod run; +#[cfg(feature = "default")] mod world; use std::path::{Path, PathBuf}; @@ -17,8 +23,7 @@ use rayon::iter::{ParallelBridge, ParallelIterator}; use crate::args::{CliArguments, Command}; use crate::collect::Test; -use crate::logger::Logger; -use crate::run::TestResult; +use crate::logger::{Logger, TestResult}; /// The parsed command line arguments. static ARGS: LazyLock = LazyLock::new(CliArguments::parse); @@ -95,12 +100,17 @@ fn test() { } let parser_dirs = ARGS.parser_compare.clone().map(create_syntax_store); + #[cfg(not(feature = "default"))] + let parser_dirs = parser_dirs.or_else(|| Some(create_syntax_store(None))); let runner = |test: &Test| { if let Some((live_path, ref_path)) = &parser_dirs { run_parser_test(test, live_path, ref_path) } else { - run::run(test) + #[cfg(feature = "default")] + return run::run(test); + #[cfg(not(feature = "default"))] + unreachable!(); } }; From a764aa419209d2d46d27d46c00c46cc12a371f08 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 03/18] 3. Add typst-syntax README and parser comments --- crates/typst-syntax/README.md | 40 +++++++ crates/typst-syntax/src/parser.rs | 170 +++++++++++++++++++++++++++--- 2 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 crates/typst-syntax/README.md diff --git a/crates/typst-syntax/README.md b/crates/typst-syntax/README.md new file mode 100644 index 000000000..ced4096ef --- /dev/null +++ b/crates/typst-syntax/README.md @@ -0,0 +1,40 @@ +# typst-syntax + +Welcome to the Typst Syntax crate! This crate manages the syntactical structure +of Typst by holding some core abstractions like assigning source file ids, +parsing Typst syntax, creating an Abstract Syntax Tree (AST), initializing +source "spans" (for linking AST elements to their outputs in a document), and +syntax highlighting. + +Below are quick descriptions of the files you might be editing if you find +yourself here :) + +- `lexer.rs`: The lexical foundation of the parser, which converts a string of + characters into tokens. +- `parser.rs`: The main parser definition, preparing a Concrete Syntax Tree made + of nested vectors of `SyntaxNode`s. +- `reparser.rs`: The algorithm for reparsing the minimal required amount of + source text for efficient incremental compilation. +- `ast.rs`: The conversion layer between the Concrete Syntax Tree of the parser + and the Abstract Syntax Tree used for code evaluation. +- `node.rs` & `span.rs`: The underlying data structure for the Concrete Syntax + Tree and the definitions of source spans used for efficiently pointing to a + syntax node in things like diagnostics. +- `kind.rs` & `set.rs`: An enum with all syntactical tokens and nodes and + bit-set data structure for sets of `SyntaxKind`s. +- `highlight.rs`: Extracting of syntax highlighting information out of the + Concrete Syntax Tree (and outputting as HTML). +- `path.rs`, `file.rs`, `package.rs`: The system for interning project and + package paths as unique file IDs and resolving them in a virtual filesystem + (not actually for _opening_ files). 
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 8c783ffed..afa47257f 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -10,7 +10,7 @@ use crate::{
     ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode,
 };

-/// Parses a source file.
+/// Parses a source file as top-level markup.
 pub fn parse(text: &str) -> SyntaxNode {
     let _scope = typst_timing::TimingScope::new("parse");
     let mut p = Parser::new(text, 0, LexMode::Markup);
@@ -37,7 +37,7 @@ pub fn parse_math(text: &str) -> SyntaxNode {
     p.finish().into_iter().next().unwrap()
 }

-/// Parses the contents of a file or content block.
+/// Parses markup expressions until a stop condition is met.
 fn markup(
     p: &mut Parser,
     mut at_start: bool,
@@ -96,7 +96,7 @@ pub(super) fn reparse_markup(
     (p.balanced && p.current_start() == range.end).then(|| p.finish())
 }

-/// Parses a single markup expression: This includes markup elements like
+/// Parses a single markup expression. This includes markup elements like
 /// spaces, text, and headings, and embedded code expressions.
 fn markup_expr(p: &mut Parser, at_start: &mut bool) {
     match p.current() {
@@ -414,6 +414,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
     }
 }

+/// Try to parse delimiters based on the current token's unicode math class.
 fn maybe_delimited(p: &mut Parser) -> bool {
     let open = math_class(p.current_text()) == Some(MathClass::Opening);
     if open {
@@ -422,6 +423,7 @@ fn maybe_delimited(p: &mut Parser) -> bool {
     open
 }

+/// Parse matched delimiters in math: `[x + y]`.
 fn math_delimited(p: &mut Parser) {
     let m = p.marker();
     p.eat();
@@ -444,6 +446,8 @@ fn math_delimited(p: &mut Parser) {
     p.wrap(m, SyntaxKind::Math);
 }

+/// Remove one set of parentheses (if any) from a previously parsed expression
+/// by converting to non-expression SyntaxKinds.
 fn math_unparen(p: &mut Parser, m: Marker) {
     let Some(node) = p.nodes.get_mut(m.0) else { return };
     if node.kind() != SyntaxKind::MathDelimited {
@@ -460,6 +464,10 @@ fn math_unparen(p: &mut Parser, m: Marker) {
     node.convert_to_kind(SyntaxKind::Math);
 }

+/// The unicode math class of a string. Only returns `Some` if `text` has
+/// exactly one unicode character or is a math shorthand string (currently just
+/// `[|`, `||`, `|]`) and then only returns `Some` if there is a math class
+/// defined for that character.
 fn math_class(text: &str) -> Option<MathClass> {
     match text {
         "[|" => return Some(MathClass::Opening),
@@ -475,6 +483,7 @@ fn math_class(text: &str) -> Option<MathClass> {
         .and_then(unicode_math_class::class)
 }

+/// Precedence and wrapper kinds for the binary math operators.
 fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> {
     match kind {
         SyntaxKind::Underscore => {
@@ -490,6 +499,7 @@ fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usiz
     }
 }

+/// Parse an argument list in math: `(a, b; c, d; size: #50%)`.
fn math_args(p: &mut Parser) {
     let m = p.marker();
     p.convert(SyntaxKind::LeftParen);
@@ -629,7 +639,7 @@ fn code_expr(p: &mut Parser) {
     code_expr_prec(p, false, 0)
 }

-/// Parses a code expression embedded in markup or math.
+/// Parses an atomic code expression embedded in markup or math.
 fn embedded_code_expr(p: &mut Parser) {
     p.enter_newline_mode(NewlineMode::Stop);
     p.enter(LexMode::Code);
@@ -1130,6 +1140,21 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
         seen: HashSet::new(),
     };

+    // An edge case with parens is whether we can interpret a leading spread
+    // expression as a dictionary, e.g. if we want `(..dict1, ..dict2)` to join
+    // the two dicts.
+    //
+    // The issue is that we decide on the type of the parenthesized expression
+    // here in the parser by the `SyntaxKind` we wrap with, instead of in eval
+    // based on the type of the spread item.
+    //
+    // The current fix is that we allow a leading colon to force the
+    // parenthesized value into a dict:
+    // - `(..arr1, ..arr2)` is wrapped as an `Array`.
+    // - `(: ..dict1, ..dict2)` is wrapped as a `Dict`.
+    //
+    // This does allow some unexpected expressions, such as `(: key: val)`, but
+    // it's currently intentional.
     if p.eat_if(SyntaxKind::Colon) {
         state.kind = Some(SyntaxKind::Dict);
         state.maybe_just_parens = false;
@@ -1165,8 +1190,13 @@
 /// State for array/dictionary parsing.
 struct GroupState {
     count: usize,
+    /// Whether this is just a single expression in parens: `(a)`. Single
+    /// element arrays require an explicit comma: `(a,)`, unless we're
+    /// spreading: `(..a)`.
     maybe_just_parens: bool,
+    /// The `SyntaxKind` to wrap as (if we've figured it out yet).
     kind: Option<SyntaxKind>,
+    /// Store named arguments so we can give an error if they're repeated.
     seen: HashSet<EcoString>,
 }
@@ -1484,32 +1514,90 @@ fn pattern_leaf<'s>(
     }
 }

-/// Manages parsing of a stream of tokens.
+/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s.
+///
+/// The implementation presents an interface that investigates a `current` token
+/// and can take one of the following actions:
+///
+/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf
+///    node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the
+///    lexer.
+/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into
+///    an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`].
+/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when
+///    something expected is missing or something unexpected is found.
+///
+/// Overall the parser produces a nested tree of SyntaxNodes as a "_Concrete_
+/// Syntax Tree." The raw Concrete Syntax Tree should contain the entire source
+/// text, and is used as-is for e.g. syntax highlighting and IDE features. In
+/// `ast.rs` the CST is interpreted as a lazy view over an "_Abstract_ Syntax
+/// Tree." The AST module skips over irrelevant tokens -- whitespace, comments,
+/// code parens, commas in function args, etc. -- as it iterates through the
+/// tree.
+///
+/// ### Modes
+///
+/// The parser manages the transitions between the three modes of Typst through
+/// stacks of [lexer modes](`LexMode`) and [newline modes](`NewlineMode`).
+///
+/// The lexer modes map to the three Typst modes and are stored in the lexer,
+/// changing which `SyntaxKind`s it will generate. The mode also affects how the
+/// parser treats trivia tokens (comments and whitespace).
In Markup, trivia is
+/// handled manually to deal with list indentation and must be explicitly eaten.
+/// In Code and Math, trivia is managed internally and is implicitly eaten by
+/// pushing onto the end of the `nodes` vector until a non-trivia kind is found.
+///
+/// The newline mode is used in Code to determine whether a newline should end
+/// the current expression. If so, the parser temporarily changes the current
+/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode
+/// the original `SyntaxKind` is restored.
 struct Parser<'s> {
+    /// The source text shared with the lexer.
     text: &'s str,
+    /// A lexer over the source text with multiple modes. Defines the boundaries
+    /// of tokens and determines their [`SyntaxKind`].
     lexer: Lexer<'s>,
+    /// The index into `text` of the end of the previous token.
     prev_end: usize,
+    /// The index into `text` of the start of our current token (the end is
+    /// stored as the lexer's cursor).
     current_start: usize,
+    /// The [`SyntaxKind`] of the current token.
     current: SyntaxKind,
+    /// Whether the parser has the expected set of open/close delimiters. This
+    /// only ever transitions from `true` to `false`.
     balanced: bool,
+    /// Nodes representing the concrete syntax tree of previously parsed text.
+    /// In Code and Math, includes previously parsed trivia, but not `current`.
     nodes: Vec<SyntaxNode>,
+    /// Stack of lexer modes to be pushed/popped. The current mode is implicitly
+    /// stored in the lexer.
     modes: Vec<LexMode>,
+    /// Stack of newline modes to be pushed/popped. The current mode is the tail
+    /// of the vector.
     newline_modes: Vec<NewlineMode>,
+    /// Parser checkpoints for a given text index. Used for efficient parser
+    /// backtracking similar to packrat parsing. See comments above in
+    /// [`expr_with_paren`].
     memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
+    /// The stored parse results at each checkpoint.
     memo_arena: Vec<SyntaxNode>,
 }

-/// How to proceed with parsing when seeing a newline.
+/// How to proceed with parsing when at a newline in Code.
 #[derive(Clone)]
 enum NewlineMode {
-    /// Stop always.
+    /// Stop at any newline.
     Stop,
-    /// Proceed if there is no continuation with `else` or `.`
+    /// Continue only if there is no continuation with `else` or `.`.
     Contextual,
-    /// Just proceed like with normal whitespace.
+    /// Continue at newlines.
     Continue,
 }

+/// A marker representing a node's position in the parser. Mainly used for
+/// wrapping, but can also index into the parser to access the node, like
+/// `p[m]`.
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 struct Marker(usize);

@@ -1523,6 +1611,7 @@
 struct Checkpoint<'s> {
     lexer: Lexer<'s>,
     prev_end: usize,
     current_start: usize,
     current: SyntaxKind,
     nodes: usize,
 }

 impl<'s> Parser<'s> {
+    /// Create a new parser starting from the given text offset and lexer mode.
     fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
         let mut lexer = Lexer::new(text, mode);
         lexer.jump(offset);
@@ -1542,52 +1631,68 @@
     }

+    /// Consume the parser, yielding the full vector of parsed SyntaxNodes.
     fn finish(self) -> Vec<SyntaxNode> {
         self.nodes
     }

+    /// The offset into `text` of the previous token's end.
     fn prev_end(&self) -> usize {
         self.prev_end
     }

+    /// Similar to a `peek()` function: returns the `kind` of the next token to
+    /// be eaten.
     fn current(&self) -> SyntaxKind {
         self.current
     }

+    /// The offset into `text` of the current token's start.
     fn current_start(&self) -> usize {
         self.current_start
     }

+    /// The offset into `text` of the current token's end.
     fn current_end(&self) -> usize {
         self.lexer.cursor()
     }

+    /// The current token's text.
fn current_text(&self) -> &'s str {
         &self.text[self.current_start..self.current_end()]
     }

+    /// Whether the current token is a given [`SyntaxKind`].
     fn at(&self, kind: SyntaxKind) -> bool {
         self.current == kind
     }

+    /// Whether the current token is contained in a [`SyntaxSet`].
     fn at_set(&self, set: SyntaxSet) -> bool {
         set.contains(self.current)
     }

+    /// Whether we're at the end of the token stream.
+    ///
+    /// Note: This might be a fake end due to the newline mode.
     fn end(&self) -> bool {
         self.at(SyntaxKind::End)
     }

+    /// If we're at the given `kind` with no preceding trivia tokens.
     fn directly_at(&self, kind: SyntaxKind) -> bool {
         self.current == kind && self.prev_end == self.current_start
     }

+    /// Eat the current token by saving it to the `nodes` vector, then move
+    /// the lexer forward to prepare a new token.
     fn eat(&mut self) {
         self.save();
         self.lex();
         self.skip();
     }

+    /// Eat the current node and return a reference for in-place mutation.
     #[track_caller]
     fn eat_and_get(&mut self) -> &mut SyntaxNode {
         let offset = self.nodes.len();
@@ -1597,9 +1702,9 @@
         &mut self.nodes[offset]
     }

-    /// Eats if at `kind`.
+    /// Eat the token if at `kind`. Returns `true` if eaten.
     ///
-    /// Note: In math and code mode, this will ignore trivia in front of the
+    /// Note: In Math and Code, this will ignore trivia in front of the
     /// `kind`. To forbid skipping trivia, consider using `eat_if_direct`.
     fn eat_if(&mut self, kind: SyntaxKind) -> bool {
         let at = self.at(kind);
         if at {
@@ -1609,7 +1714,8 @@
         at
     }

-    /// Eats only if currently at the start of `kind`.
+    /// Eat the token only if at `kind` with no preceding trivia. Returns `true`
+    /// if eaten.
     fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool {
         let at = self.directly_at(kind);
         if at {
@@ -1618,30 +1724,39 @@
         at
     }

+    /// Assert that we are at the given [`SyntaxKind`] and eat it. This should
+    /// be used when moving between functions that expect to start with a
+    /// specific token.
     #[track_caller]
     fn assert(&mut self, kind: SyntaxKind) {
         assert_eq!(self.current, kind);
         self.eat();
     }

+    /// Convert the current token's [`SyntaxKind`] and eat it.
     fn convert(&mut self, kind: SyntaxKind) {
         self.current = kind;
         self.eat();
     }

+    /// Whether the current token is a newline, only used in Markup.
     fn newline(&mut self) -> bool {
         self.lexer.newline()
     }

+    /// The number of characters until the most recent newline in `text`.
     fn column(&self, at: usize) -> usize {
         self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
     }

+    /// A marker that will point to the current token in the parser once it's
+    /// been eaten.
     fn marker(&self) -> Marker {
         Marker(self.nodes.len())
     }

-    /// Get a marker after the last non-trivia node.
+    /// A marker that will point to the first trivia before this token in the
+    /// parser (or the token itself if no trivia precede it).
     fn before_trivia(&self) -> Marker {
         let mut i = self.nodes.len();
         if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
@@ -1658,6 +1773,7 @@
         m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
     }

+    /// Iterate over the non-trivia tokens following the marker.
     #[track_caller]
     fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> {
         self.nodes[m.0..]
             .iter_mut()
             .filter(|child| !child.kind().is_error() && !child.kind().is_trivia())
     }

+    /// Wrap the nodes from a marker up to (but excluding) the current token in
+    /// a new [inner node](`SyntaxNode::inner`) of the given kind.
This is an + /// easy interface for creating nested syntax nodes _after_ having parsed + /// their children. fn wrap(&mut self, from: Marker, kind: SyntaxKind) { self.wrap_within(from, self.before_trivia(), kind); } + /// Wrap including any trailing trivia nodes. fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { self.wrap_within(from, Marker(self.nodes.len()), kind) } @@ -1681,11 +1802,14 @@ impl<'s> Parser<'s> { self.nodes.insert(from, SyntaxNode::inner(kind, children)); } + /// Enter a new [`LexMode`] that will affect subsequent tokens (does not + /// modify the current token). fn enter(&mut self, mode: LexMode) { self.modes.push(self.lexer.mode()); self.lexer.set_mode(mode); } + /// Exit the current [`LexMode`], possibly re-lexing the current token. fn exit(&mut self) { let mode = self.modes.pop().unwrap(); if mode != self.lexer.mode() { @@ -1697,10 +1821,13 @@ impl<'s> Parser<'s> { } } + /// Enter a new [`NewlineMode`] that will affect subsequent tokens (does not + /// modify the current token). fn enter_newline_mode(&mut self, stop: NewlineMode) { self.newline_modes.push(stop); } + /// Exit the current [`NewlineMode`], possibly re-lexing the current token. fn exit_newline_mode(&mut self) { self.unskip(); self.newline_modes.pop(); @@ -1709,6 +1836,7 @@ impl<'s> Parser<'s> { self.skip(); } + /// Save a checkpoint of the parser state. fn checkpoint(&self) -> Checkpoint<'s> { Checkpoint { lexer: self.lexer.clone(), @@ -1719,6 +1847,7 @@ impl<'s> Parser<'s> { } } + /// Reset the parser from a checkpoint. fn restore(&mut self, checkpoint: Checkpoint<'s>) { self.lexer = checkpoint.lexer; self.prev_end = checkpoint.prev_end; @@ -1727,6 +1856,7 @@ impl<'s> Parser<'s> { self.nodes.truncate(checkpoint.nodes); } + /// Move past trivia nodes in Code/Math. fn skip(&mut self) { if self.lexer.mode() != LexMode::Markup { while self.current.is_trivia() { @@ -1736,6 +1866,8 @@ impl<'s> Parser<'s> { } } + /// Move the parser back to the start of this token or its leading trivia + /// (in Code/Math). fn unskip(&mut self) { if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) { @@ -1747,6 +1879,7 @@ impl<'s> Parser<'s> { } } + /// Save the current token to the `nodes` vector as an Inner or Error node. fn save(&mut self) { let text = self.current_text(); if self.at(SyntaxKind::Error) { @@ -1761,21 +1894,24 @@ impl<'s> Parser<'s> { } } + /// Find the kind of the next non-trivia token in the lexer. fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { let next = lexer.next(); - // Loop is terminable, because SyntaxKind::End is not a trivia. + // Loop is terminable, because `SyntaxKind::End` is not a trivia. if !next.is_trivia() { break next; } } } + /// Move the lexer forward and prepare the current token. In Code, this + /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. fn lex(&mut self) { self.current_start = self.lexer.cursor(); self.current = self.lexer.next(); - // Special cases to handle newlines in code mode. + // Special cases to handle newlines in Code. if self.lexer.mode() == LexMode::Code && self.lexer.newline() && match self.newline_modes.last() { @@ -1794,7 +1930,7 @@ impl<'s> Parser<'s> { } impl<'s> Parser<'s> { - /// Consume the given syntax `kind` or produce an error. + /// Consume the given `kind` or produce an error. 
fn expect(&mut self, kind: SyntaxKind) -> bool { let at = self.at(kind); if at { @@ -1833,7 +1969,7 @@ impl<'s> Parser<'s> { self.nodes.insert(m.0, error); } - /// Produce a hint. + /// Add a hint to a trailing error. fn hint(&mut self, hint: &str) { let m = self.before_trivia(); if let Some(error) = self.nodes.get_mut(m.0 - 1) { From 54eadb65a9a9133b64a3ace7605f3f2852a69373 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sat, 26 Oct 2024 04:03:54 -0400 Subject: [PATCH 04/18] 4. Rename convert to convert_and_eat --- crates/typst-syntax/src/parser.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index afa47257f..50277fab9 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -133,7 +133,7 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::ListMarker | SyntaxKind::EnumMarker | SyntaxKind::TermMarker - | SyntaxKind::Colon => p.convert(SyntaxKind::Text), + | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text), _ => { p.unexpected(); @@ -287,8 +287,8 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) && is_ident(&p.text[start..end]) } { - p.convert(SyntaxKind::Dot); - p.convert(SyntaxKind::Ident); + p.convert_and_eat(SyntaxKind::Dot); + p.convert_and_eat(SyntaxKind::Ident); p.wrap(m, SyntaxKind::FieldAccess); } if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" @@ -502,7 +502,7 @@ fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usiz /// Parse an argument list in math: `(a, b; c, d; size: #50%)`. fn math_args(p: &mut Parser) { let m = p.marker(); - p.convert(SyntaxKind::LeftParen); + p.convert_and_eat(SyntaxKind::LeftParen); let mut namable = true; let mut named = None; @@ -515,8 +515,8 @@ fn math_args(p: &mut Parser) { && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) && p.text[p.current_end()..].starts_with(':') { - p.convert(SyntaxKind::Ident); - p.convert(SyntaxKind::Colon); + p.convert_and_eat(SyntaxKind::Ident); + p.convert_and_eat(SyntaxKind::Colon); named = Some(arg); arg = p.marker(); array = p.marker(); @@ -527,7 +527,7 @@ fn math_args(p: &mut Parser) { ";" => { maybe_wrap_in_math(p, arg, named); p.wrap(array, SyntaxKind::Array); - p.convert(SyntaxKind::Semicolon); + p.convert_and_eat(SyntaxKind::Semicolon); array = p.marker(); arg = p.marker(); namable = true; @@ -537,7 +537,7 @@ fn math_args(p: &mut Parser) { } "," => { maybe_wrap_in_math(p, arg, named); - p.convert(SyntaxKind::Comma); + p.convert_and_eat(SyntaxKind::Comma); arg = p.marker(); namable = true; if named.is_some() { @@ -570,7 +570,7 @@ fn math_args(p: &mut Parser) { } if p.at(SyntaxKind::Text) && p.current_text() == ")" { - p.convert(SyntaxKind::RightParen); + p.convert_and_eat(SyntaxKind::RightParen); } else { p.expected("closing paren"); p.balanced = false; @@ -1734,7 +1734,7 @@ impl<'s> Parser<'s> { } /// Convert the current token's [`SyntaxKind`] and eat it. - fn convert(&mut self, kind: SyntaxKind) { + fn convert_and_eat(&mut self, kind: SyntaxKind) { self.current = kind; self.eat(); } From 16cc7eb472c91470ae91f78ea67943b34be203f8 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 21:54:43 -0400 Subject: [PATCH 05/18] 5. 
Refactor parser memoization to localize functionality --- crates/typst-syntax/src/parser.rs | 153 ++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 51 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 50277fab9..2a7e4611c 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -1057,29 +1057,25 @@ fn return_stmt(p: &mut Parser) { /// An expression that starts with a parenthesis. fn expr_with_paren(p: &mut Parser, atomic: bool) { - // If we've seen this position before and have a memoized result, just use - // it. See below for more explanation about this memoization. - let start = p.current_start(); - if let Some((range, end_point)) = p.memo.get(&start).cloned() { - // Restore the end point first, so that it doesn't truncate our freshly - // pushed nodes. If the current length of `p.nodes` doesn't match what - // we had in the memoized run, this might otherwise happen. - p.restore(end_point); - p.nodes.extend(p.memo_arena[range].iter().cloned()); + if atomic { + // Atomic expressions aren't modified by operators that follow them, so + // our first guess of array/dict will be correct. + parenthesized_or_array_or_dict(p); return; } - let m = p.marker(); - let checkpoint = p.checkpoint(); + // If we've seen this position before and have a memoized result, restore it + // and return. Otherwise, get a key to this position and a checkpoint to + // restart from in case we make a wrong prediction. + let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return }; + // The node length from when we restored. + let prev_len = checkpoint.node_len; // When we reach a '(', we can't be sure what it is. First, we attempt to // parse as a simple parenthesized expression, array, or dictionary as // these are the most likely things. We can handle all of those in a single // pass. let kind = parenthesized_or_array_or_dict(p); - if atomic { - return; - } // If, however, '=>' or '=' follows, we must backtrack and reparse as either // a parameter list or a destructuring. To be able to do that, we created a @@ -1100,6 +1096,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { // case running time of O(2n). if p.at(SyntaxKind::Arrow) { p.restore(checkpoint); + let m = p.marker(); params(p); if !p.expect(SyntaxKind::Arrow) { return; @@ -1108,6 +1105,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { p.wrap(m, SyntaxKind::Closure); } else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized { p.restore(checkpoint); + let m = p.marker(); destructuring_or_parenthesized(p, true, &mut HashSet::new()); if !p.expect(SyntaxKind::Eq) { return; @@ -1119,9 +1117,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { } // Memoize result if we backtracked. - let offset = p.memo_arena.len(); - p.memo_arena.extend(p.nodes[m.0..].iter().cloned()); - p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint())); + p.memoize_parsed_nodes(memo_key, prev_len); } /// Parses either @@ -1456,6 +1452,9 @@ fn destructuring_item<'s>( // Parse a normal positional pattern or a destructuring key. let was_at_pat = p.at_set(set::PATTERN); + + // We must use a full checkpoint here (can't just clone the lexer) because + // there may be trivia between the identifier and the colon we need to skip. let checkpoint = p.checkpoint(); if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) { p.restore(checkpoint); @@ -1579,9 +1578,7 @@ struct Parser<'s> { /// Parser checkpoints for a given text index. 
Used for efficient parser
     /// backtracking similar to packrat parsing. See comments above in
     /// [`expr_with_paren`].
-    memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
-    /// The stored parse results at each checkpoint.
-    memo_arena: Vec<SyntaxNode>,
+    memo: MemoArena<'s>,
 }

 /// How to proceed with parsing when at a newline in Code.
@@ -1601,15 +1598,6 @@
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 struct Marker(usize);

-#[derive(Clone)]
-struct Checkpoint<'s> {
-    lexer: Lexer<'s>,
-    prev_end: usize,
-    current_start: usize,
-    current: SyntaxKind,
-    nodes: usize,
-}
-
 impl<'s> Parser<'s> {
     /// Create a new parser starting from the given text offset and lexer mode.
     fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
         let mut lexer = Lexer::new(text, mode);
         lexer.jump(offset);
@@ -1626,8 +1614,7 @@
             nodes: vec![],
             modes: vec![],
             newline_modes: vec![],
-            memo: HashMap::new(),
-            memo_arena: vec![],
+            memo: Default::default(),
         }
     }
@@ -1836,26 +1823,6 @@
         self.skip();
     }

-    /// Save a checkpoint of the parser state.
-    fn checkpoint(&self) -> Checkpoint<'s> {
-        Checkpoint {
-            lexer: self.lexer.clone(),
-            prev_end: self.prev_end,
-            current_start: self.current_start,
-            current: self.current,
-            nodes: self.nodes.len(),
-        }
-    }
-
-    /// Reset the parser from a checkpoint.
-    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
-        self.lexer = checkpoint.lexer;
-        self.prev_end = checkpoint.prev_end;
-        self.current_start = checkpoint.current_start;
-        self.current = checkpoint.current;
-        self.nodes.truncate(checkpoint.nodes);
-    }
-
     /// Move past trivia nodes in Code/Math.
     fn skip(&mut self) {
         if self.lexer.mode() != LexMode::Markup {
@@ -1929,6 +1896,90 @@
     }
 }

+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena<'s> {
+    /// A single arena of previously parsed nodes (to reduce allocations).
+    /// Memoized ranges refer to unique sections of the arena.
+    arena: Vec<SyntaxNode>,
+    /// A map from the parser's current position to a range of previously parsed
+    /// nodes in the arena and a checkpoint of the parser's state. These allow
+    /// us to reset the parser to avoid parsing the same location again.
+    memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
+
+/// A checkpoint of the parser which can fully restore it to a previous state.
+#[derive(Clone)]
+struct Checkpoint<'s> {
+    lexer: Lexer<'s>,
+    prev_end: usize,
+    current_start: usize,
+    current: SyntaxKind,
+    node_len: usize,
+}
+
+impl<'s> Parser<'s> {
+    /// Store the already parsed nodes and the parser state into the memo map by
+    /// extending the arena and storing the extended range and a checkpoint.
+    fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+        let memo_start = self.memo.arena.len();
+        self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
+        let arena_range = memo_start..self.memo.arena.len();
+        self.memo.memo_map.insert(key, (arena_range, self.checkpoint()));
+    }
+
+    /// Try to load a memoized result, return `None` if we did or `Some` (with a
+    /// checkpoint and a key for the memo map) if we didn't.
+    fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> {
+        // We use the starting index of the current token as our key.
+        let key: MemoKey = self.current_start();
+        match self.memo.memo_map.get(&key).cloned() {
+            Some((range, checkpoint)) => {
+                self.nodes.extend_from_slice(&self.memo.arena[range]);
+                // It's important that we don't truncate the nodes vector since
+                // it may have grown or shrunk (due to other memoization or
+                // error reporting) since we made this checkpoint.
+                self.restore_partial(checkpoint);
+                None
+            }
+            None => Some((key, self.checkpoint())),
+        }
+    }
+
+    /// Restore the parser to the state at a checkpoint.
+    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
+        self.nodes.truncate(checkpoint.node_len);
+        self.restore_partial(checkpoint);
+    }
+
+    /// Restore parts of the checkpoint excluding the nodes vector.
+    fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) {
+        self.lexer = checkpoint.lexer;
+        self.prev_end = checkpoint.prev_end;
+        self.current_start = checkpoint.current_start;
+        self.current = checkpoint.current;
+    }
+
+    /// Save a checkpoint of the parser state.
+    fn checkpoint(&self) -> Checkpoint<'s> {
+        Checkpoint {
+            lexer: self.lexer.clone(),
+            prev_end: self.prev_end,
+            current_start: self.current_start,
+            current: self.current,
+            node_len: self.nodes.len(),
+        }
+    }
+}
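To make the motivation concrete, a hypothetical stress test (not part of this series) for the case the memo arena exists to handle: every `(` may turn out to begin a closure parameter list, so without memoization each backtrack would reparse everything that follows:

```rust
#[test]
fn deeply_nested_parens_terminate_quickly() {
    // Each `(` is first parsed as a parenthesized expression/array/dict and
    // only reinterpreted on seeing `=>` or `=`. The memo map keeps this
    // near-linear instead of exponential in the nesting depth.
    let depth = 512;
    let src = format!("#{}1{}", "(".repeat(depth), ")".repeat(depth));
    let _root = typst_syntax::parse(&src);
}
```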
From 01186779cd92a7bad6ebff9154a85c6ab86cf7cb Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Mon, 21 Oct 2024 21:24:44 -0400
Subject: [PATCH 06/18] 6. Reduce size of memoization map state

---
 crates/typst-syntax/src/parser.rs | 57 ++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 2a7e4611c..19e8adbbb 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -1578,7 +1578,7 @@ struct Parser<'s> {
     /// Parser checkpoints for a given text index. Used for efficient parser
     /// backtracking similar to packrat parsing. See comments above in
     /// [`expr_with_paren`].
-    memo: MemoArena<'s>,
+    memo: MemoArena,
 }

 /// How to proceed with parsing when at a newline in Code.
@@ -1901,14 +1901,14 @@
 /// This is the same idea as packrat parsing, but we use it only in the limited
 /// case of parenthesized structures. See [`expr_with_paren`] for more.
 #[derive(Default)]
-struct MemoArena<'s> {
+struct MemoArena {
     /// A single arena of previously parsed nodes (to reduce allocations).
     /// Memoized ranges refer to unique sections of the arena.
     arena: Vec<SyntaxNode>,
     /// A map from the parser's current position to a range of previously parsed
     /// nodes in the arena and a checkpoint of the parser's state. These allow
     /// us to reset the parser to avoid parsing the same location again.
-    memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+    memo_map: HashMap<MemoKey, (Range<usize>, PartialState)>,
 }

 /// A type alias for the memo key so it doesn't get confused with other usizes.
 ///
 /// The memo is keyed by the index into `text` of the current token's start.
 type MemoKey = usize;

 /// A checkpoint of the parser which can fully restore it to a previous state.
struct Checkpoint {
    node_len: usize,
    state: PartialState,
}

/// State needed to restore the parser's current token and the lexer (but not
/// the nodes vector).
#[derive(Clone)] -struct Checkpoint<'s> { - lexer: Lexer<'s>, +struct PartialState { + cursor: usize, + lex_mode: LexMode, prev_end: usize, current_start: usize, current: SyntaxKind, - node_len: usize, } impl<'s> Parser<'s> { /// Store the already parsed nodes and the parser state into the memo map by /// extending the arena and storing the extended range and a checkpoint. fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) { + let Checkpoint { state, node_len } = self.checkpoint(); let memo_start = self.memo.arena.len(); - self.memo.arena.extend_from_slice(&self.nodes[prev_len..]); + self.memo.arena.extend_from_slice(&self.nodes[prev_len..node_len]); let arena_range = memo_start..self.memo.arena.len(); - self.memo.memo_map.insert(key, (arena_range, self.checkpoint())); + self.memo.memo_map.insert(key, (arena_range, state)); } /// Try to load a memoized result, return `None` if we did or `Some` (with a /// checkpoint and a key for the memo map) if we didn't. - fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> { + fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint)> { // We use the starting index of the current token as our key. let key: MemoKey = self.current_start(); match self.memo.memo_map.get(&key).cloned() { - Some((range, checkpoint)) => { + Some((range, state)) => { self.nodes.extend_from_slice(&self.memo.arena[range]); // It's important that we don't truncate the nodes vector since // it may have grown or shrunk (due to other memoization or // error reporting) since we made this checkpoint. - self.restore_partial(checkpoint); + self.restore_partial(state); None } None => Some((key, self.checkpoint())), @@ -1955,28 +1963,31 @@ impl<'s> Parser<'s> { } /// Restore the parser to the state at a checkpoint. - fn restore(&mut self, checkpoint: Checkpoint<'s>) { + fn restore(&mut self, checkpoint: Checkpoint) { self.nodes.truncate(checkpoint.node_len); - self.restore_partial(checkpoint); + self.restore_partial(checkpoint.state); } /// Restore parts of the checkpoint excluding the nodes vector. - fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) { - self.lexer = checkpoint.lexer; - self.prev_end = checkpoint.prev_end; - self.current_start = checkpoint.current_start; - self.current = checkpoint.current; + fn restore_partial(&mut self, state: PartialState) { + self.lexer.jump(state.cursor); + self.lexer.set_mode(state.lex_mode); + self.prev_end = state.prev_end; + self.current_start = state.current_start; + self.current = state.current; } /// Save a checkpoint of the parser state. - fn checkpoint(&self) -> Checkpoint<'s> { - Checkpoint { - lexer: self.lexer.clone(), + fn checkpoint(&self) -> Checkpoint { + let node_len = self.nodes.len(); + let state = PartialState { + cursor: self.lexer.cursor(), + lex_mode: self.lexer.mode(), prev_end: self.prev_end, current_start: self.current_start, current: self.current, - node_len: self.nodes.len(), - } + }; + Checkpoint { node_len, state } } } From 1cecae0333efcdfcfcca8e4e97ef590297808c2e Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 07/18] 7. 
Return SyntaxNodes from the Lexer

---
 crates/typst-syntax/src/lexer.rs  | 44 ++++++++++++++++++-------------
 crates/typst-syntax/src/parser.rs | 37 +++++++++++++-------------
 2 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 721225c6e..cdd4121c9 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -4,12 +4,12 @@ use unicode_script::{Script, UnicodeScript};
 use unicode_segmentation::UnicodeSegmentation;
 use unscanny::Scanner;

-use crate::{SyntaxError, SyntaxKind};
+use crate::{SyntaxError, SyntaxKind, SyntaxNode};

-/// Splits up a string of source code into tokens.
+/// An iterator over a source code string which returns tokens.
 #[derive(Clone)]
 pub(super) struct Lexer<'s> {
-    /// The underlying scanner.
+    /// The scanner: contains the underlying string and location as a "cursor".
     s: Scanner<'s>,
     /// The mode the lexer is in. This determines which kinds of tokens it
     /// produces.
     mode: LexMode,
     /// Whether the last token contained a newline.
     newline: bool,
@@ -73,11 +73,6 @@ impl<'s> Lexer<'s> {
     pub fn newline(&self) -> bool {
         self.newline
     }
-
-    /// Take out the last error, if any.
-    pub fn take_error(&mut self) -> Option<SyntaxError> {
-        self.error.take()
-    }
 }

 impl Lexer<'_> {
@@ -97,21 +92,24 @@ impl Lexer<'_> {

 /// Shared methods with all [`LexMode`].
 impl Lexer<'_> {
-    /// Proceed to the next token and return its [`SyntaxKind`]. Note the
-    /// token could be a [trivia](SyntaxKind::is_trivia).
-    pub fn next(&mut self) -> SyntaxKind {
+    /// Return the next token in our text. Returns both the [`SyntaxNode`]
+    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind.
+    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
+        debug_assert!(self.error.is_none());
+        let start = self.s.cursor();
         if self.mode == LexMode::Raw {
-            let Some((kind, end)) = self.raw.pop() else {
-                return SyntaxKind::End;
+            let kind = if let Some((kind, end)) = self.raw.pop() {
+                self.s.jump(end);
+                kind
+            } else {
+                SyntaxKind::End
             };
-            self.s.jump(end);
-            return kind;
+            let node = SyntaxNode::leaf(kind, self.s.from(start));
+            return (kind, node);
         }

         self.newline = false;
-        self.error = None;
-        let start = self.s.cursor();
-        match self.s.eat() {
+        let kind = match self.s.eat() {
             Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
             Some('/') if self.s.eat_if('/') => self.line_comment(),
             Some('/') if self.s.eat_if('*') => self.block_comment(),
@@ -132,13 +130,21 @@
             Some(c) => match self.mode {
                 LexMode::Markup => self.markup(start, c),
                 LexMode::Math => self.math(start, c),
                 LexMode::Code => self.code(start, c),
                 LexMode::Raw => unreachable!(),
             },

             None => SyntaxKind::End,
-        }
+        };
+
+        let text = self.s.from(start);
+        let node = match self.error.take() {
+            Some(error) => SyntaxNode::error(error, text),
+            None => SyntaxNode::leaf(kind, text),
+        };
+        (kind, node)
     }

     /// Eat whitespace characters greedily.
     fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
         let more = self.s.eat_while(|c| is_space(c, self.mode));
         let newlines = match c {
+            // Optimize eating a single space.
' ' if more.is_empty() => 0, _ => count_newlines(self.s.from(start)), }; diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 19e8adbbb..b69486411 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -185,7 +185,7 @@ fn heading(p: &mut Parser) { whitespace_line(p); markup(p, false, usize::MAX, |p| { p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label) + && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label) }); p.wrap(m, SyntaxKind::Heading); } @@ -282,7 +282,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { let mut copy = p.lexer.clone(); let start = copy.cursor(); - let next = copy.next(); + let next = copy.next().0; let end = copy.cursor(); matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) && is_ident(&p.text[start..end]) @@ -686,8 +686,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { continue; } - let at_field_or_method = - p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident; + let at_field_or_method = p.directly_at(SyntaxKind::Dot) + && p.lexer.clone().next().0 == SyntaxKind::Ident; if atomic && !at_field_or_method { break; @@ -947,9 +947,8 @@ fn for_loop(p: &mut Parser) { let mut seen = HashSet::new(); pattern(p, false, &mut seen, None); - let m2 = p.marker(); - if p.eat_if(SyntaxKind::Comma) { - let node = &mut p[m2]; + if p.at(SyntaxKind::Comma) { + let node = p.eat_and_get(); node.unexpected(); node.hint("destructuring patterns must be wrapped in parentheses"); if p.at_set(set::PATTERN) { @@ -1563,6 +1562,9 @@ struct Parser<'s> { current_start: usize, /// The [`SyntaxKind`] of the current token. current: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + current_node: SyntaxNode, /// Whether the parser has the expected set of open/close delimiters. This /// only ever transitions from `true` to `false`. balanced: bool, @@ -1603,13 +1605,14 @@ impl<'s> Parser<'s> { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let current = lexer.next(); + let (current, current_node) = lexer.next(); Self { lexer, text, prev_end: offset, current_start: offset, current, + current_node, balanced: true, nodes: vec![], modes: vec![], @@ -1722,7 +1725,8 @@ impl<'s> Parser<'s> { /// Convert the current token's [`SyntaxKind`] and eat it. fn convert_and_eat(&mut self, kind: SyntaxKind) { - self.current = kind; + // Only need to replace the node here. + self.current_node.convert_to_kind(kind); self.eat(); } @@ -1848,13 +1852,7 @@ impl<'s> Parser<'s> { /// Save the current token to the `nodes` vector as an Inner or Error node. fn save(&mut self) { - let text = self.current_text(); - if self.at(SyntaxKind::Error) { - let error = self.lexer.take_error().unwrap(); - self.nodes.push(SyntaxNode::error(error, text)); - } else { - self.nodes.push(SyntaxNode::leaf(self.current, text)); - } + self.nodes.push(self.current_node.clone()); if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { self.prev_end = self.current_end(); @@ -1864,7 +1862,7 @@ impl<'s> Parser<'s> { /// Find the kind of the next non-trivia token in the lexer. 
fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { - let next = lexer.next(); + let next = lexer.next().0; // Loop is terminable, because `SyntaxKind::End` is not a trivia. if !next.is_trivia() { break next; @@ -1876,7 +1874,7 @@ impl<'s> Parser<'s> { /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. fn lex(&mut self) { self.current_start = self.lexer.cursor(); - self.current = self.lexer.next(); + (self.current, self.current_node) = self.lexer.next(); // Special cases to handle newlines in Code. if self.lexer.mode() == LexMode::Code @@ -1931,6 +1929,7 @@ struct PartialState { prev_end: usize, current_start: usize, current: SyntaxKind, + current_node: SyntaxNode, } impl<'s> Parser<'s> { @@ -1975,6 +1974,7 @@ impl<'s> Parser<'s> { self.prev_end = state.prev_end; self.current_start = state.current_start; self.current = state.current; + self.current_node = state.current_node; } /// Save a checkpoint of the parser state. @@ -1986,6 +1986,7 @@ impl<'s> Parser<'s> { prev_end: self.prev_end, current_start: self.current_start, current: self.current, + current_node: self.current_node.clone(), }; Checkpoint { node_len, state } } From 09975d113385067302a4abbc1f5cf905e78915ad Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 20:30:33 -0400 Subject: [PATCH 08/18] 8. Create Raw nodes entirely within the lexer --- crates/typst-syntax/src/lexer.rs | 120 ++++++++++++++---------------- crates/typst-syntax/src/parser.rs | 22 +----- crates/typst-syntax/src/set.rs | 2 +- 3 files changed, 59 insertions(+), 85 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index cdd4121c9..d2173f505 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -16,8 +16,6 @@ pub(super) struct Lexer<'s> { mode: LexMode, /// Whether the last token contained a newline. newline: bool, - /// The state held by raw line lexing. - raw: Vec<(SyntaxKind, usize)>, /// An error for the last token. error: Option, } @@ -31,8 +29,6 @@ pub(super) enum LexMode { Math, /// Keywords, literals and operators. Code, - /// The contents of a raw block. - Raw, } impl<'s> Lexer<'s> { @@ -44,7 +40,6 @@ impl<'s> Lexer<'s> { mode, newline: false, error: None, - raw: Vec::new(), } } @@ -97,16 +92,6 @@ impl Lexer<'_> { pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) { debug_assert!(self.error.is_none()); let start = self.s.cursor(); - if self.mode == LexMode::Raw { - let kind = if let Some((kind, end)) = self.raw.pop() { - self.s.jump(end); - kind - } else { - SyntaxKind::End - }; - let node = SyntaxNode::leaf(kind, self.s.from(start)); - return (kind, node); - } self.newline = false; let kind = match self.s.eat() { @@ -121,12 +106,11 @@ impl Lexer<'_> { ); kind } - + Some('`') if self.mode != LexMode::Math => return self.raw(), Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), LexMode::Math => self.math(start, c), LexMode::Code => self.code(start, c), - LexMode::Raw => unreachable!(), }, None => SyntaxKind::End, @@ -193,7 +177,6 @@ impl Lexer<'_> { fn markup(&mut self, start: usize, c: char) -> SyntaxKind { match c { '\\' => self.backslash(), - '`' => self.raw(), 'h' if self.s.eat_if("ttp://") => self.link(), 'h' if self.s.eat_if("ttps://") => self.link(), '<' if self.s.at(is_id_continue) => self.label(), @@ -258,9 +241,10 @@ impl Lexer<'_> { } } - fn raw(&mut self) -> SyntaxKind { + /// Lex an entire raw segment at once. 
This is a convenience to avoid going + /// to and from the parser for each raw section. + fn raw(&mut self) -> (SyntaxKind, SyntaxNode) { let start = self.s.cursor() - 1; - self.raw.clear(); // Determine number of opening backticks. let mut backticks = 1; @@ -270,9 +254,11 @@ impl Lexer<'_> { // Special case for ``. if backticks == 2 { - self.push_raw(SyntaxKind::RawDelim); - self.s.jump(start + 1); - return SyntaxKind::RawDelim; + let nodes = vec![ + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + ]; + return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)); } // Find end of raw text. @@ -281,43 +267,55 @@ impl Lexer<'_> { match self.s.eat() { Some('`') => found += 1, Some(_) => found = 0, - None => break, + None => { + let msg = SyntaxError::new("unclosed raw text"); + let error = SyntaxNode::error(msg, self.s.from(start)); + return (SyntaxKind::Error, error); + } } } - - if found != backticks { - return self.error("unclosed raw text"); - } - let end = self.s.cursor(); - if backticks >= 3 { - self.blocky_raw(start, end, backticks); - } else { - self.inline_raw(start, end, backticks); - } - // Closing delimiter. - self.push_raw(SyntaxKind::RawDelim); + let mut nodes = Vec::with_capacity(3); // Will have at least 3. - // The saved tokens will be removed in reverse. - self.raw.reverse(); + // A closure for pushing a node onto our raw vector. Assumes the caller + // will move the scanner to the next location at each step. + let mut prev_start = start; + let mut push_raw = |kind, s: &Scanner| { + nodes.push(SyntaxNode::leaf(kind, s.from(prev_start))); + prev_start = s.cursor(); + }; // Opening delimiter. self.s.jump(start + backticks); - SyntaxKind::RawDelim + push_raw(SyntaxKind::RawDelim, &self.s); + + if backticks >= 3 { + self.blocky_raw(end - backticks, &mut push_raw); + } else { + self.inline_raw(end - backticks, &mut push_raw); + } + + // Closing delimiter. + self.s.jump(end); + push_raw(SyntaxKind::RawDelim, &self.s); + + (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)) } - fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) { + fn blocky_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { // Language tag. - self.s.jump(start + backticks); if self.s.eat_if(is_id_start) { self.s.eat_while(is_id_continue); - self.push_raw(SyntaxKind::RawLang); + push_raw(SyntaxKind::RawLang, &self.s); } // Determine inner content between backticks. self.s.eat_if(' '); - let inner = self.s.to(end - backticks); + let inner = self.s.to(inner_end); // Determine dedent level. let mut lines = split_newlines(inner); @@ -363,41 +361,32 @@ impl Lexer<'_> { let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum(); self.s.eat_newline(); self.s.advance(offset); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); self.s.advance(line.len() - offset); - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); } // Add final trimmed. 
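Counting the opening backticks up front is what selects among the three raw shapes: exactly two backticks is the special-cased empty literal, one gives inline raw, and three or more give blocky raw with an optional language tag. The counting step as a freestanding sketch (illustrative only):

    // Number of backticks opening a raw segment, given text starting at the
    // first backtick: "`code`" gives 1, "```rust ...```" gives 3.
    fn opening_backticks(text: &str) -> usize {
        text.chars().take_while(|&c| c == '`').count()
    }
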
- if self.s.cursor() < end - backticks { - self.s.jump(end - backticks); - self.push_raw(SyntaxKind::RawTrimmed); + if self.s.cursor() < inner_end { + self.s.jump(inner_end); + push_raw(SyntaxKind::RawTrimmed, &self.s); } - self.s.jump(end); } - fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) { - self.s.jump(start + backticks); - - while self.s.cursor() < end - backticks { + fn inline_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { + while self.s.cursor() < inner_end { if self.s.at(is_newline) { - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); self.s.eat_newline(); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); continue; } self.s.eat(); } - self.push_raw(SyntaxKind::Text); - - self.s.jump(end); - } - - /// Push the current cursor that marks the end of a raw segment of - /// the given `kind`. - fn push_raw(&mut self, kind: SyntaxKind) { - let end = self.s.cursor(); - self.raw.push((kind, end)); + push_raw(SyntaxKind::Text, &self.s); } fn link(&mut self) -> SyntaxKind { @@ -605,7 +594,6 @@ impl Lexer<'_> { impl Lexer<'_> { fn code(&mut self, start: usize, c: char) -> SyntaxKind { match c { - '`' => self.raw(), '<' if self.s.at(is_id_continue) => self.label(), '0'..='9' => self.number(start, c), '.' if self.s.at(char::is_ascii_digit) => self.number(start, c), diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index b69486411..6fd0878df 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -116,10 +116,11 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::Link | SyntaxKind::Label => p.eat(), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. + SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), - SyntaxKind::RawDelim => raw(p), SyntaxKind::HeadingMarker if *at_start => heading(p), SyntaxKind::ListMarker if *at_start => list_item(p), SyntaxKind::EnumMarker if *at_start => enum_item(p), @@ -162,22 +163,6 @@ fn emph(p: &mut Parser) { p.wrap(m, SyntaxKind::Emph); } -/// Parses raw text with optional syntax highlighting: `` `...` ``. -fn raw(p: &mut Parser) { - let m = p.marker(); - p.enter(LexMode::Raw); - p.assert(SyntaxKind::RawDelim); - - // Eats until the closing delimiter. - while !p.end() && !p.at(SyntaxKind::RawDelim) { - p.eat(); - } - - p.expect(SyntaxKind::RawDelim); - p.exit(); - p.wrap(m, SyntaxKind::Raw); -} - /// Parses a section heading: `= Introduction`. fn heading(p: &mut Parser) { let m = p.marker(); @@ -767,7 +752,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::LeftBrace => code_block(p), SyntaxKind::LeftBracket => content_block(p), SyntaxKind::LeftParen => expr_with_paren(p, atomic), - SyntaxKind::RawDelim => raw(p), SyntaxKind::Dollar => equation(p), SyntaxKind::Let => let_binding(p), SyntaxKind::Set => set_rule(p), @@ -782,6 +766,8 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::Continue => continue_stmt(p), SyntaxKind::Return => return_stmt(p), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. 
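The trimming above depends on a dedent level: the smallest leading-whitespace count among the lines that matter. A condensed model of that computation (a sketch; the real code additionally always counts the line holding the closing backticks):

    // How much leading whitespace to strip from each line of blocky raw text.
    fn dedent_level(lines: &[&str]) -> usize {
        lines
            .iter()
            .skip(1) // The first line starts right after the opening delimiter.
            .filter(|line| !line.chars().all(char::is_whitespace))
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0)
    }
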
+ SyntaxKind::None | SyntaxKind::Auto | SyntaxKind::Int diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs index eaee7ef28..f3f1ba240 100644 --- a/crates/typst-syntax/src/set.rs +++ b/crates/typst-syntax/src/set.rs @@ -104,7 +104,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = syntax_set!( Numeric, Str, Label, - RawDelim, + Raw, ); /// Syntax kinds that are unary operators. From 88d86714a1e8c2f9ef8b77d4bcf7d44fa4e4dd26 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Mon, 21 Oct 2024 22:18:23 -0400 Subject: [PATCH 09/18] 9. Parse math field access in the lexer --- crates/typst-syntax/src/lexer.rs | 41 ++++++++++++++++++++++++++++--- crates/typst-syntax/src/parser.rs | 20 +++------------ crates/typst-syntax/src/set.rs | 1 + tests/suite/math/symbols.typ | 29 ++++++++++++++++++++++ 4 files changed, 71 insertions(+), 20 deletions(-) create mode 100644 tests/suite/math/symbols.typ diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index d2173f505..4a43c15ff 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -109,7 +109,10 @@ impl Lexer<'_> { Some('`') if self.mode != LexMode::Math => return self.raw(), Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), - LexMode::Math => self.math(start, c), + LexMode::Math => match self.math(start, c) { + (kind, None) => kind, + (kind, Some(node)) => return (kind, node), + }, LexMode::Code => self.code(start, c), }, @@ -507,8 +510,8 @@ impl Lexer<'_> { /// Math. impl Lexer<'_> { - fn math(&mut self, start: usize, c: char) -> SyntaxKind { - match c { + fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option) { + let kind = match c { '\\' => self.backslash(), '"' => self.string(), @@ -561,11 +564,41 @@ impl Lexer<'_> { // Identifiers. c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { self.s.eat_while(is_math_id_continue); - SyntaxKind::MathIdent + let (kind, node) = self.math_ident_or_field(start); + return (kind, Some(node)); } // Other math atoms. _ => self.math_text(start, c), + }; + (kind, None) + } + + /// Parse a single `MathIdent` or an entire `FieldAccess`. + fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) { + let mut kind = SyntaxKind::MathIdent; + let mut node = SyntaxNode::leaf(kind, self.s.from(start)); + while let Some(ident) = self.maybe_dot_ident() { + kind = SyntaxKind::FieldAccess; + let field_children = vec![ + node, + SyntaxNode::leaf(SyntaxKind::Dot, '.'), + SyntaxNode::leaf(SyntaxKind::Ident, ident), + ]; + node = SyntaxNode::inner(kind, field_children); + } + (kind, node) + } + + /// If at a dot and a math identifier, eat and return the identifier. + fn maybe_dot_ident(&mut self) -> Option<&str> { + if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') { + let ident_start = self.s.cursor(); + self.s.eat(); + self.s.eat_while(is_math_id_continue); + Some(self.s.from(ident_start)) + } else { + None } } diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 6fd0878df..be065ca60 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -6,9 +6,7 @@ use ecow::{eco_format, EcoString}; use unicode_math_class::MathClass; use crate::set::{syntax_set, SyntaxSet}; -use crate::{ - ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode, -}; +use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; /// Parses a source file as top-level markup. 
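The `math_ident_or_field` loop above builds the `FieldAccess` chain bottom-up inside the lexer, so `$a.b.c$` reaches the parser as one left-nested token. The resulting shape is easy to inspect from outside (a sketch assuming a dependency on typst-syntax and its public `parse` function):

    // Debug-print the tree: one FieldAccess node wrapping `a.b` nested inside
    // another wrapping `.c`, rather than loose Dot/Ident tokens.
    fn main() {
        println!("{:#?}", typst_syntax::parse("$a.b.c$"));
    }
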
pub fn parse(text: &str) -> SyntaxNode { @@ -261,21 +259,11 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { let mut continuable = false; match p.current() { SyntaxKind::Hash => embedded_code_expr(p), - SyntaxKind::MathIdent => { + // The lexer manages creating full FieldAccess nodes if needed. + SyntaxKind::MathIdent | SyntaxKind::FieldAccess => { continuable = true; p.eat(); - while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { - let mut copy = p.lexer.clone(); - let start = copy.cursor(); - let next = copy.next().0; - let end = copy.cursor(); - matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) - && is_ident(&p.text[start..end]) - } { - p.convert_and_eat(SyntaxKind::Dot); - p.convert_and_eat(SyntaxKind::Ident); - p.wrap(m, SyntaxKind::FieldAccess); - } + // Parse a function call for an identifier or field access. if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" { math_args(p); diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs index f3f1ba240..014aaf2f7 100644 --- a/crates/typst-syntax/src/set.rs +++ b/crates/typst-syntax/src/set.rs @@ -58,6 +58,7 @@ pub const STMT: SyntaxSet = syntax_set!(Let, Set, Show, Import, Include, Return) pub const MATH_EXPR: SyntaxSet = syntax_set!( Hash, MathIdent, + FieldAccess, Text, MathShorthand, Linebreak, diff --git a/tests/suite/math/symbols.typ b/tests/suite/math/symbols.typ new file mode 100644 index 000000000..65a483162 --- /dev/null +++ b/tests/suite/math/symbols.typ @@ -0,0 +1,29 @@ +// Test math symbol edge cases. + +--- math-symbol-basic --- +#let sym = symbol("s", ("basic", "s")) +#test($sym.basic$, $#"s"$) + +--- math-symbol-underscore --- +#let sym = symbol("s", ("test_underscore", "s")) +// Error: 6-10 unknown symbol modifier +$sym.test_underscore$ + +--- math-symbol-dash --- +#let sym = symbol("s", ("test-dash", "s")) +// Error: 6-10 unknown symbol modifier +$sym.test-dash$ + +--- math-symbol-double --- +#let sym = symbol("s", ("test.basic", "s")) +#test($sym.test.basic$, $#"s"$) + +--- math-symbol-double-underscore --- +#let sym = symbol("s", ("one.test_underscore", "s")) +// Error: 10-14 unknown symbol modifier +$sym.one.test_underscore$ + +--- math-symbol-double-dash --- +#let sym = symbol("s", ("one.test-dash", "s")) +// Error: 10-14 unknown symbol modifier +$sym.one.test-dash$ From 2ae1e1627f09ce8dfe76dd3e4b1b70fc95943f97 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 20:30:33 -0400 Subject: [PATCH 10/18] 10. Change parser modes using closures instead of manual stacks --- crates/typst-syntax/src/parser.rs | 323 +++++++++++++++--------------- 1 file changed, 158 insertions(+), 165 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index be065ca60..44a388c56 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -226,11 +226,11 @@ fn whitespace_line(p: &mut Parser) { /// Parses a mathematical equation: `$x$`, `$ x^2 $`. 
fn equation(p: &mut Parser) { let m = p.marker(); - p.enter(LexMode::Math); - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); - p.exit(); + p.with_mode(LexMode::Math, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + }); p.wrap(m, SyntaxKind::Equation); } @@ -586,10 +586,11 @@ fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { /// Parses a sequence of code expressions. fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { - p.enter_newline_mode(NewlineMode::Contextual); - - let at_expr = p.at_set(set::CODE_EXPR); - if at_expr { + p.with_nl_mode(AtNewline::Contextual, |p| { + if !p.at_set(set::CODE_EXPR) { + p.unexpected(); + return; + } code_expr(p); if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { p.expected("semicolon or line break"); @@ -598,12 +599,7 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { p.hint("try wrapping your code in a markup block (`[ ]`)"); } } - } - - p.exit_newline_mode(); - if !at_expr && !p.end() { - p.unexpected(); - } + }); } } @@ -614,29 +610,28 @@ fn code_expr(p: &mut Parser) { /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { - p.enter_newline_mode(NewlineMode::Stop); - p.enter(LexMode::Code); - p.assert(SyntaxKind::Hash); - p.unskip(); + p.with_mode(LexMode::Code, |p| { + p.with_nl_mode(AtNewline::Stop, |p| { + p.assert(SyntaxKind::Hash); + p.unskip(); - let stmt = p.at_set(set::STMT); - let at = p.at_set(set::ATOMIC_CODE_EXPR); - code_expr_prec(p, true, 0); + let stmt = p.at_set(set::STMT); + let at = p.at_set(set::ATOMIC_CODE_EXPR); + code_expr_prec(p, true, 0); - // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.current().is_trivia() && !p.end() { - p.unexpected(); - } + // Consume error for things like `#12p` or `#"abc\"`.# + if !at && !p.current().is_trivia() && !p.end() { + p.unexpected(); + } - let semi = - (stmt || p.directly_at(SyntaxKind::Semicolon)) && p.eat_if(SyntaxKind::Semicolon); + let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) + && p.eat_if(SyntaxKind::Semicolon); - if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { - p.expected("semicolon or line break"); - } - - p.exit(); - p.exit_newline_mode(); + if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } + }); + }); } /// Parses a code expression with at least the given precedence. @@ -790,24 +785,24 @@ pub(super) fn reparse_block(text: &str, range: Range) -> Option SyntaxKind { - let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); - let mut state = GroupState { count: 0, maybe_just_parens: true, @@ -1124,27 +1115,29 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind { // // This does allow some unexpected expressions, such as `(: key: val)`, but // it's currently intentional. 
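`with_mode` and `with_nl_mode` replace the pushed/popped mode stacks: the saved mode lives in a local and the call stack does the bookkeeping, so a mode cannot be left un-restored when a parsing function returns early out of the closure. The core pattern in miniature (a generic sketch, not the parser's actual signatures):

    // Run `f` with `slot` temporarily set to `value`, restoring it afterward.
    // The parser's versions additionally re-lex the current token when the
    // mode change is observable.
    fn with_scoped<T: Copy, R>(slot: &mut T, value: T, f: impl FnOnce(&mut T) -> R) -> R {
        let previous = *slot;
        *slot = value;
        let result = f(slot);
        *slot = previous;
        result
    }
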
- if p.eat_if(SyntaxKind::Colon) { - state.kind = Some(SyntaxKind::Dict); - state.maybe_just_parens = false; - } - - while !p.current().is_terminator() { - if !p.at_set(set::ARRAY_OR_DICT_ITEM) { - p.unexpected(); - continue; + let m = p.marker(); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); + if p.eat_if(SyntaxKind::Colon) { + state.kind = Some(SyntaxKind::Dict); } - array_or_dict_item(p, &mut state); - state.count += 1; + while !p.current().is_terminator() { + if !p.at_set(set::ARRAY_OR_DICT_ITEM) { + p.unexpected(); + continue; + } - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - state.maybe_just_parens = false; + array_or_dict_item(p, &mut state); + state.count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + state.maybe_just_parens = false; + } } - } - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); let kind = if state.maybe_just_parens && state.count == 1 { SyntaxKind::Parenthesized @@ -1230,25 +1223,25 @@ fn args(p: &mut Parser) { let m = p.marker(); if p.at(SyntaxKind::LeftParen) { let m2 = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - while !p.current().is_terminator() { - if !p.at_set(set::ARG) { - p.unexpected(); - continue; + let mut seen = HashSet::new(); + while !p.current().is_terminator() { + if !p.at_set(set::ARG) { + p.unexpected(); + continue; + } + + arg(p, &mut seen); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - arg(p, &mut seen); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m2, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m2, SyntaxKind::RightParen); + }); } while p.directly_at(SyntaxKind::LeftBracket) { @@ -1293,27 +1286,27 @@ fn arg<'s>(p: &mut Parser<'s>, seen: &mut HashSet<&'s str>) { /// Parses a closure's parameters: `(x, y)`. 
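The `GroupState` above resolves the `(...)` ambiguity only once the group closes: a single item with no trailing comma stays `Parenthesized`, a leading colon or a named item forces `Dict`, and everything else defaults to `Array`. So `(a)` is parenthesized, `(a,)` and `()` are arrays, and `(:)` and `(a: 1)` are dicts. The final decision as a condensed, hypothetical standalone function:

    // Mirror of the kind resolution at the end of the function above.
    fn group_kind(count: usize, maybe_just_parens: bool, kind: Option<SyntaxKind>) -> SyntaxKind {
        if maybe_just_parens && count == 1 {
            SyntaxKind::Parenthesized
        } else {
            kind.unwrap_or(SyntaxKind::Array)
        }
    }
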
fn params(p: &mut Parser) { let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - let mut sink = false; + let mut seen = HashSet::new(); + let mut sink = false; - while !p.current().is_terminator() { - if !p.at_set(set::PARAM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::PARAM) { + p.unexpected(); + continue; + } + + param(p, &mut seen, &mut sink); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - param(p, &mut seen, &mut sink); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); p.wrap(m, SyntaxKind::Params); } @@ -1374,25 +1367,25 @@ fn destructuring_or_parenthesized<'s>( let mut maybe_just_parens = true; let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - while !p.current().is_terminator() { - if !p.at_set(set::DESTRUCTURING_ITEM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::DESTRUCTURING_ITEM) { + p.unexpected(); + continue; + } + + destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); + count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + maybe_just_parens = false; + } } - destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); - count += 1; - - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - maybe_just_parens = false; - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); if maybe_just_parens && count == 1 && !sink { p.wrap(m, SyntaxKind::Parenthesized); @@ -1510,7 +1503,7 @@ fn pattern_leaf<'s>( /// ### Modes /// /// The parser manages the transitions between the three modes of Typst through -/// stacks of [lexer modes](`LexMode`) and [newline modes](`NewlineMode`). +/// [lexer modes](`LexMode`) and [newline modes](`AtNewline`). /// /// The lexer modes map to the three Typst modes and are stored in the lexer, /// changing which`SyntaxKind`s it will generate. The mode also affects how the @@ -1527,8 +1520,11 @@ struct Parser<'s> { /// The source text shared with the lexer. text: &'s str, /// A lexer over the source text with multiple modes. Defines the boundaries - /// of tokens and determines their [`SyntaxKind`]. + /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`] + /// defining our current Typst mode. lexer: Lexer<'s>, + /// The newline mode: whether to insert a temporary end at newlines in Code. + nl_mode: AtNewline, /// The index into `text` of the end of the previous token. prev_end: usize, /// The index into `text` of the start of our current token (the end is @@ -1545,12 +1541,6 @@ struct Parser<'s> { /// Nodes representing the concrete syntax tree of previously parsed text. /// In Code and Math, includes previously parsed trivia, but not `current`. nodes: Vec, - /// Stack of lexer modes to be pushed/popped. The current mode is implicitly - /// stored in the lexer. - modes: Vec, - /// Stack of newline modes to be pushed/popped. The current mode is the tail - /// of the vector. 
- newline_modes: Vec, /// Parser checkpoints for a given text index. Used for efficient parser /// backtracking similar to packrat parsing. See comments above in /// [`expr_with_paren`]. @@ -1558,14 +1548,28 @@ struct Parser<'s> { } /// How to proceed with parsing when at a newline in Code. -#[derive(Clone)] -enum NewlineMode { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AtNewline { + /// Continue at newlines. + Continue, /// Stop at any newline. Stop, /// Continue only if there is no continuation with `else` or `.`. Contextual, - /// Continue at newlines. - Continue, +} + +impl AtNewline { + /// Whether to stop at a newline or continue based on the current context. + fn stop(self, kind: impl FnOnce() -> SyntaxKind) -> bool { + match self { + AtNewline::Continue => false, + AtNewline::Stop => true, + AtNewline::Contextual => match kind() { + SyntaxKind::Else | SyntaxKind::Dot => false, + _ => true, + }, + } + } } /// A marker representing a node's position in the parser. Mainly used for @@ -1581,16 +1585,15 @@ impl<'s> Parser<'s> { lexer.jump(offset); let (current, current_node) = lexer.next(); Self { - lexer, text, + lexer, + nl_mode: AtNewline::Continue, prev_end: offset, current_start: offset, current, current_node, balanced: true, nodes: vec![], - modes: vec![], - newline_modes: vec![], memo: Default::default(), } } @@ -1767,35 +1770,33 @@ impl<'s> Parser<'s> { self.nodes.insert(from, SyntaxNode::inner(kind, children)); } - /// Enter a new [`LexMode`] that will affect subsequent tokens (does not - /// modify the current token). - fn enter(&mut self, mode: LexMode) { - self.modes.push(self.lexer.mode()); + /// Parse within the [`LexMode`] for subsequent tokens (does not change the + /// current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn with_mode(&mut self, mode: LexMode, func: impl FnOnce(&mut Parser<'s>)) { + let previous = self.lexer.mode(); self.lexer.set_mode(mode); - } - - /// Exit the current [`LexMode`], possibly re-lexing the current token. - fn exit(&mut self) { - let mode = self.modes.pop().unwrap(); - if mode != self.lexer.mode() { + func(self); + if mode != previous { self.unskip(); - self.lexer.set_mode(mode); + self.lexer.set_mode(previous); self.lexer.jump(self.current_start); self.lex(); self.skip(); } } - /// Enter a new [`NewlineMode`] that will affect subsequent tokens (does not - /// modify the current token). - fn enter_newline_mode(&mut self, stop: NewlineMode) { - self.newline_modes.push(stop); - } - - /// Exit the current [`NewlineMode`], possibly re-lexing the current token. - fn exit_newline_mode(&mut self) { + /// Parse within the [`AtNewline`] mode for subsequent tokens (does not + /// change the current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn with_nl_mode(&mut self, mode: AtNewline, func: impl FnOnce(&mut Parser<'s>)) { + let previous = self.nl_mode; + self.nl_mode = mode; + func(self); self.unskip(); - self.newline_modes.pop(); + self.nl_mode = previous; self.lexer.jump(self.prev_end); self.lex(); self.skip(); @@ -1853,15 +1854,7 @@ impl<'s> Parser<'s> { // Special cases to handle newlines in Code. 
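In terms of behavior at a newline in code: `Continue` never halts, `Stop` always halts, and `Contextual` halts unless the next token is `else` or `.`, which keeps an `if c { .. }` and a next-line `else { .. }` together as one expression. A hypothetical in-crate test of `stop` as defined at this point in the series (it still takes a closure here; a later patch passes the kind directly):

    #[test]
    fn at_newline_stop() {
        assert!(!AtNewline::Continue.stop(|| SyntaxKind::Ident));
        assert!(AtNewline::Stop.stop(|| SyntaxKind::Ident));
        assert!(!AtNewline::Contextual.stop(|| SyntaxKind::Else));
        assert!(AtNewline::Contextual.stop(|| SyntaxKind::Ident));
    }
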
if self.lexer.mode() == LexMode::Code && self.lexer.newline() - && match self.newline_modes.last() { - Some(NewlineMode::Continue) => false, - Some(NewlineMode::Contextual) => !matches!( - Self::next_non_trivia(&mut self.lexer.clone()), - SyntaxKind::Else | SyntaxKind::Dot - ), - Some(NewlineMode::Stop) => true, - None => false, - } + && self.nl_mode.stop(|| Self::next_non_trivia(&mut self.lexer.clone())) { self.current = SyntaxKind::End; } From c466080fb2c3fc8bca895fc3ead0b0a7deb9b80d Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 11/18] 11. Add Parser::finish_into --- crates/typst-syntax/src/parser.rs | 47 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 44a388c56..34c65820d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -12,37 +12,45 @@ use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, Synta pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup(&mut p, true, 0, |_| false); - p.finish().into_iter().next().unwrap() + markup_exprs(&mut p, true, 0, |_| false); + p.finish_into(SyntaxKind::Markup) } /// Parses top-level code. pub fn parse_code(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse code"); let mut p = Parser::new(text, 0, LexMode::Code); - let m = p.marker(); - p.skip(); code_exprs(&mut p, |_| false); - p.wrap_all(m, SyntaxKind::Code); - p.finish().into_iter().next().unwrap() + p.finish_into(SyntaxKind::Code) } /// Parses top-level math. pub fn parse_math(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse math"); let mut p = Parser::new(text, 0, LexMode::Math); - math(&mut p, |_| false); - p.finish().into_iter().next().unwrap() + math_exprs(&mut p, |_| false); + p.finish_into(SyntaxKind::Math) } /// Parses markup expressions until a stop condition is met. fn markup( + p: &mut Parser, + at_start: bool, + min_indent: usize, + stop: impl FnMut(&Parser) -> bool, +) { + let m = p.marker(); + markup_exprs(p, at_start, min_indent, stop); + p.wrap(m, SyntaxKind::Markup); +} + +/// Parses a sequence of markup expressions. +fn markup_exprs( p: &mut Parser, mut at_start: bool, min_indent: usize, mut stop: impl FnMut(&Parser) -> bool, ) { - let m = p.marker(); let mut nesting: usize = 0; while !p.end() { match p.current() { @@ -63,7 +71,6 @@ fn markup( markup_expr(p, &mut at_start); } - p.wrap(m, SyntaxKind::Markup); } /// Reparses a subsection of markup incrementally. @@ -235,8 +242,14 @@ fn equation(p: &mut Parser) { } /// Parses the contents of a mathematical equation: `x^2 + 1`. -fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { +fn math(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { let m = p.marker(); + math_exprs(p, stop); + p.wrap(m, SyntaxKind::Math); +} + +/// Parses a sequence of math expressions. +fn math_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { if p.at_set(set::MATH_EXPR) { math_expr(p); @@ -244,7 +257,6 @@ fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { p.unexpected(); } } - p.wrap(m, SyntaxKind::Math); } /// Parses a single math expression: This includes math elements like @@ -1603,6 +1615,12 @@ impl<'s> Parser<'s> { self.nodes } + /// Consume the parser, generating a single top-level node. 
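With `finish_into`, the three entry points share one shape: run the matching expression loop, then wrap everything in a single root node. That contract is visible from the public API (a sketch assuming a dependency on typst-syntax):

    // Each entry point yields a root node of the corresponding kind.
    fn main() {
        use typst_syntax::{parse, parse_code, parse_math, SyntaxKind};
        assert_eq!(parse("hello").kind(), SyntaxKind::Markup);
        assert_eq!(parse_code("let x = 1").kind(), SyntaxKind::Code);
        assert_eq!(parse_math("x^2").kind(), SyntaxKind::Math);
    }
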
+ fn finish_into(self, kind: SyntaxKind) -> SyntaxNode { + assert!(self.at(SyntaxKind::End)); + SyntaxNode::inner(kind, self.finish()) + } + /// The offset into `text` of the previous token's end. fn prev_end(&self) -> usize { self.prev_end @@ -1757,11 +1775,6 @@ impl<'s> Parser<'s> { self.wrap_within(from, self.before_trivia(), kind); } - /// Wrap including any trailing trivia nodes. - fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, Marker(self.nodes.len()), kind) - } - fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { let len = self.nodes.len(); let to = to.0.min(len); From 91b384ad7b83fd7098d2a90306982b12affe1ca5 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 12/18] 12. Add the Token type and replace lex/skip/save methods --- crates/typst-syntax/src/parser.rs | 244 ++++++++++++++---------------- 1 file changed, 112 insertions(+), 132 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 34c65820d..405e3e5c5 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -325,11 +325,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { _ => p.expected("expression"), } - if continuable - && min_prec < 3 - && p.prev_end() == p.current_start() - && maybe_delimited(p) - { + if continuable && min_prec < 3 && !p.had_trivia() && maybe_delimited(p) { p.wrap(m, SyntaxKind::Math); } @@ -581,6 +577,8 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { // Convert 0 exprs into a blank math element (so empty arguments are allowed). // Convert 2+ exprs into a math element (so they become a joined sequence). p.wrap_within(arg, p.marker(), SyntaxKind::Math); + // We need to update `n_trivia` since we no longer have any. + p.token.n_trivia = 0; // TODO: Maybe create a `flush_trivia()` method? } if let Some(m) = named { @@ -625,14 +623,17 @@ fn embedded_code_expr(p: &mut Parser) { p.with_mode(LexMode::Code, |p| { p.with_nl_mode(AtNewline::Stop, |p| { p.assert(SyntaxKind::Hash); - p.unskip(); + if p.had_trivia() { + p.expected("expression"); + return; + } let stmt = p.at_set(set::STMT); let at = p.at_set(set::ATOMIC_CODE_EXPR); code_expr_prec(p, true, 0); // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.current().is_trivia() && !p.end() { + if !at && !p.end() { p.unexpected(); } @@ -1493,14 +1494,15 @@ fn pattern_leaf<'s>( /// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s. /// -/// The implementation presents an interface that investigates a `current` token -/// and can take one of the following actions: +/// The implementation presents an interface that investigates a current `token` +/// with a [`SyntaxKind`] and can take one of the following actions: /// -/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf -/// node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the +/// 1. Eat a token: push `token` onto the `nodes` vector as a [leaf +/// node](`SyntaxNode::leaf`) and prepare a new `token` by calling into the /// lexer. -/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into -/// an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`]. +/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `token` and any +/// attached trivia) into an [inner node](`SyntaxNode::inner`) of a specific +/// `SyntaxKind`. /// 3. 
Produce or convert nodes into an [error node](`SyntaxNode::error`) when /// something expected is missing or something unexpected is found. /// @@ -1525,9 +1527,9 @@ fn pattern_leaf<'s>( /// pushing onto the end of the `nodes` vector until a non-trivia kind is found. /// /// The newline mode is used in Code to determine whether a newline should end -/// the current expression. If so, the parser temporarily changes the current -/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode -/// the original `SyntaxKind` is restored. +/// the current expression. If so, the parser temporarily changes `token`'s kind +/// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original +/// `SyntaxKind` is restored. struct Parser<'s> { /// The source text shared with the lexer. text: &'s str, @@ -1537,21 +1539,16 @@ struct Parser<'s> { lexer: Lexer<'s>, /// The newline mode: whether to insert a temporary end at newlines in Code. nl_mode: AtNewline, - /// The index into `text` of the end of the previous token. - prev_end: usize, - /// The index into `text` of the start of our current token (the end is - /// stored as the lexer's cursor). - current_start: usize, - /// The [`SyntaxKind`] of the current token. - current: SyntaxKind, - /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed - /// onto the end of `nodes`. - current_node: SyntaxNode, + /// The current token under inspection, not yet present in `nodes`. This + /// acts like a single item of lookahead for the parser. + /// + /// When wrapping, this is _not_ included in the wrapped nodes. + token: Token, /// Whether the parser has the expected set of open/close delimiters. This /// only ever transitions from `true` to `false`. balanced: bool, /// Nodes representing the concrete syntax tree of previously parsed text. - /// In Code and Math, includes previously parsed trivia, but not `current`. + /// In Code and Math, includes previously parsed trivia, but not `token`. nodes: Vec, /// Parser checkpoints for a given text index. Used for efficient parser /// backtracking similar to packrat parsing. See comments above in @@ -1559,6 +1556,26 @@ struct Parser<'s> { memo: MemoArena, } +/// A single token returned from the lexer with a cached [`SyntaxKind`] and a +/// record of preceding trivia. +#[derive(Debug, Clone)] +struct Token { + /// The [`SyntaxKind`] of the current token. + kind: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + node: SyntaxNode, + /// The number of preceding trivia before this token. + n_trivia: usize, + /// Whether this token's preceding trivia contained a newline. + had_newline: bool, + /// The index into `text` of the start of our current token (the end is + /// stored as the lexer's cursor). + start: usize, + /// The index into `text` of the end of the previous token. + prev_end: usize, +} + /// How to proceed with parsing when at a newline in Code. #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum AtNewline { @@ -1572,11 +1589,12 @@ enum AtNewline { impl AtNewline { /// Whether to stop at a newline or continue based on the current context. 
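The `Token` struct makes the lookahead invariant explicit: in Code and Math, trivia are pushed onto `nodes` eagerly while being counted, so wrapping can always rewind past the trivia attached to the still-uneaten token. The bookkeeping in toy form (hypothetical types, just to show the arithmetic):

    struct MiniParser {
        nodes: Vec<&'static str>, // stand-in for Vec<SyntaxNode>
        n_trivia: usize,          // trivia already pushed for the current token
    }

    impl MiniParser {
        // Mirror of `before_trivia`: where wrapping must stop so that trivia
        // belonging to the unconsumed token stay outside the new inner node.
        fn before_trivia(&self) -> usize {
            self.nodes.len() - self.n_trivia
        }
    }

    fn main() {
        let p = MiniParser { nodes: vec!["Ident", "Space", "Comment"], n_trivia: 2 };
        assert_eq!(p.before_trivia(), 1);
    }
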
- fn stop(self, kind: impl FnOnce() -> SyntaxKind) -> bool { + fn stop(self, kind: SyntaxKind) -> bool { + #[allow(clippy::match_like_matches_macro)] match self { AtNewline::Continue => false, AtNewline::Stop => true, - AtNewline::Contextual => match kind() { + AtNewline::Contextual => match kind { SyntaxKind::Else | SyntaxKind::Dot => false, _ => true, }, @@ -1595,17 +1613,16 @@ impl<'s> Parser<'s> { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let (current, current_node) = lexer.next(); + let nl_mode = AtNewline::Continue; + let mut nodes = vec![]; + let token = Self::lex(&mut nodes, &mut lexer, nl_mode); Self { text, lexer, - nl_mode: AtNewline::Continue, - prev_end: offset, - current_start: offset, - current, - current_node, + nl_mode, + token, balanced: true, - nodes: vec![], + nodes, memo: Default::default(), } } @@ -1623,18 +1640,18 @@ impl<'s> Parser<'s> { /// The offset into `text` of the previous token's end. fn prev_end(&self) -> usize { - self.prev_end + self.token.prev_end } /// Similar to a `peek()` function: returns the `kind` of the next token to /// be eaten. fn current(&self) -> SyntaxKind { - self.current + self.token.kind } /// The offset into `text` of the current token's start. fn current_start(&self) -> usize { - self.current_start + self.token.start } /// The offset into `text` of the current token's end. @@ -1644,17 +1661,17 @@ impl<'s> Parser<'s> { /// The current token's text. fn current_text(&self) -> &'s str { - &self.text[self.current_start..self.current_end()] + &self.text[self.token.start..self.current_end()] } /// Whether the current token is a given [`SyntaxKind`]. fn at(&self, kind: SyntaxKind) -> bool { - self.current == kind + self.token.kind == kind } /// Whether the current token is contained in a [`SyntaxSet`]. fn at_set(&self, set: SyntaxSet) -> bool { - set.contains(self.current) + set.contains(self.token.kind) } /// Whether we're at the end of the token stream. @@ -1666,24 +1683,21 @@ impl<'s> Parser<'s> { /// If we're at the given `kind` with no preceding trivia tokens. fn directly_at(&self, kind: SyntaxKind) -> bool { - self.current == kind && self.prev_end == self.current_start + self.token.kind == kind && !self.had_trivia() } /// Eat the current token by saving it to the `nodes` vector, then move /// the lexer forward to prepare a new token. fn eat(&mut self) { - self.save(); - self.lex(); - self.skip(); + self.nodes.push(std::mem::take(&mut self.token.node)); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } /// Eat the current node and return a reference for in-place mutation. #[track_caller] fn eat_and_get(&mut self) -> &mut SyntaxNode { let offset = self.nodes.len(); - self.save(); - self.lex(); - self.skip(); + self.eat(); &mut self.nodes[offset] } @@ -1714,20 +1728,25 @@ impl<'s> Parser<'s> { /// specific token. #[track_caller] fn assert(&mut self, kind: SyntaxKind) { - assert_eq!(self.current, kind); + assert_eq!(self.token.kind, kind); self.eat(); } /// Convert the current token's [`SyntaxKind`] and eat it. fn convert_and_eat(&mut self, kind: SyntaxKind) { // Only need to replace the node here. - self.current_node.convert_to_kind(kind); + self.token.node.convert_to_kind(kind); self.eat(); } /// Whether the current token is a newline, only used in Markup. - fn newline(&mut self) -> bool { - self.lexer.newline() + fn newline(&self) -> bool { + self.token.had_newline + } + + /// Whether `token` had any trivia before it in Code/Math. 
+ fn had_trivia(&self) -> bool { + self.token.n_trivia > 0 } /// The number of characters until the most recent newline in `text`. @@ -1744,13 +1763,7 @@ impl<'s> Parser<'s> { /// A marker that will point to first trivia before this token in the /// parser (or the token itself if no trivia precede it). fn before_trivia(&self) -> Marker { - let mut i = self.nodes.len(); - if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { - while i > 0 && self.nodes[i - 1].kind().is_trivia() { - i -= 1; - } - } - Marker(i) + Marker(self.nodes.len() - self.token.n_trivia) } /// Whether the last non-trivia node is an error. @@ -1792,11 +1805,10 @@ impl<'s> Parser<'s> { self.lexer.set_mode(mode); func(self); if mode != previous { - self.unskip(); self.lexer.set_mode(previous); - self.lexer.jump(self.current_start); - self.lex(); - self.skip(); + self.lexer.jump(self.token.prev_end); + self.nodes.truncate(self.nodes.len() - self.token.n_trivia); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } } @@ -1808,69 +1820,46 @@ impl<'s> Parser<'s> { let previous = self.nl_mode; self.nl_mode = mode; func(self); - self.unskip(); self.nl_mode = previous; - self.lexer.jump(self.prev_end); - self.lex(); - self.skip(); - } - - /// Move past trivia nodes in Code/Math. - fn skip(&mut self) { - if self.lexer.mode() != LexMode::Markup { - while self.current.is_trivia() { - self.save(); - self.lex(); - } - } - } - - /// Move the parser back to the start of this token or its leading trivia - /// (in Code/Math). - fn unskip(&mut self) { - if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { - while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) { - self.nodes.pop(); - } - - self.lexer.jump(self.prev_end); - self.lex(); - } - } - - /// Save the current token to the `nodes` vector as an Inner or Error node. - fn save(&mut self) { - self.nodes.push(self.current_node.clone()); - - if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { - self.prev_end = self.current_end(); - } - } - - /// Find the kind of the next non-trivia token in the lexer. - fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { - loop { - let next = lexer.next().0; - // Loop is terminable, because `SyntaxKind::End` is not a trivia. - if !next.is_trivia() { - break next; + if mode != previous && self.token.had_newline { + let actual_kind = self.token.node.kind(); + if self.nl_mode.stop(actual_kind) { + self.token.kind = SyntaxKind::End; + } else { + self.token.kind = actual_kind; } } } /// Move the lexer forward and prepare the current token. In Code, this /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. - fn lex(&mut self) { - self.current_start = self.lexer.cursor(); - (self.current, self.current_node) = self.lexer.next(); + /// + /// This is not a method on `self` because we need a valid token before we + /// can initialize the parser. + fn lex(nodes: &mut Vec, lexer: &mut Lexer, nl_mode: AtNewline) -> Token { + let prev_end = lexer.cursor(); + let mut start = prev_end; + let (mut kind, mut node) = lexer.next(); + let mut n_trivia = 0; + let mut had_newline = lexer.newline(); - // Special cases to handle newlines in Code. 
- if self.lexer.mode() == LexMode::Code - && self.lexer.newline() - && self.nl_mode.stop(|| Self::next_non_trivia(&mut self.lexer.clone())) - { - self.current = SyntaxKind::End; + if lexer.mode() != LexMode::Markup { + while kind.is_trivia() { + n_trivia += 1; + nodes.push(node); + start = lexer.cursor(); + (kind, node) = lexer.next(); + had_newline |= lexer.newline(); + } + if lexer.mode() == LexMode::Code && had_newline { + // Insert a temporary ['SyntaxKind::End'] to halt the parser. + // The actual `SyntaxKind` will be restored from `node` later. + if nl_mode.stop(kind) { + kind = SyntaxKind::End; + } + } } + Token { kind, node, n_trivia, had_newline, start, prev_end } } } @@ -1906,10 +1895,7 @@ struct Checkpoint { struct PartialState { cursor: usize, lex_mode: LexMode, - prev_end: usize, - current_start: usize, - current: SyntaxKind, - current_node: SyntaxNode, + token: Token, } impl<'s> Parser<'s> { @@ -1951,10 +1937,7 @@ impl<'s> Parser<'s> { fn restore_partial(&mut self, state: PartialState) { self.lexer.jump(state.cursor); self.lexer.set_mode(state.lex_mode); - self.prev_end = state.prev_end; - self.current_start = state.current_start; - self.current = state.current; - self.current_node = state.current_node; + self.token = state.token; } /// Save a checkpoint of the parser state. @@ -1963,10 +1946,7 @@ impl<'s> Parser<'s> { let state = PartialState { cursor: self.lexer.cursor(), lex_mode: self.lexer.mode(), - prev_end: self.prev_end, - current_start: self.current_start, - current: self.current, - current_node: self.current_node.clone(), + token: self.token.clone(), }; Checkpoint { node_len, state } } @@ -1978,7 +1958,7 @@ impl<'s> Parser<'s> { let at = self.at(kind); if at { self.eat(); - } else if kind == SyntaxKind::Ident && self.current.is_keyword() { + } else if kind == SyntaxKind::Ident && self.token.kind.is_keyword() { self.trim_errors(); self.eat_and_get().expected(kind.name()); } else { @@ -2024,7 +2004,7 @@ impl<'s> Parser<'s> { /// unexpected. fn unexpected(&mut self) { self.trim_errors(); - self.balanced &= !self.current.is_grouping(); + self.balanced &= !self.token.kind.is_grouping(); self.eat_and_get().unexpected(); } From 97229d24e44505b373632a51e29b4b844c6c6ee6 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sun, 27 Oct 2024 11:17:23 -0400 Subject: [PATCH 13/18] 13. Reorder functions to avoid jumping around when reading code. No actual changes. --- crates/typst-syntax/src/parser.rs | 224 ++++++++++++++++-------------- 1 file changed, 116 insertions(+), 108 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 405e3e5c5..8a1c8f76d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -395,6 +395,22 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { } } +/// Precedence and wrapper kinds for the binary math operators. +fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { + match kind { + SyntaxKind::Underscore => { + Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) + } + SyntaxKind::Hat => { + Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) + } + SyntaxKind::Slash => { + Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) + } + _ => None, + } +} + /// Try to parse delimiters based on the current token's unicode math class. 
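Delimiter matching in math hinges on the Unicode math class of one-character tokens, via the `unicode-math-class` crate already imported here. A few concrete classifications (a hypothetical test; the exact variant names are assumed from that crate's `MathClass` enum):

    #[test]
    fn math_classes() {
        use unicode_math_class::{class, MathClass};
        assert_eq!(class('('), Some(MathClass::Opening));
        assert_eq!(class(')'), Some(MathClass::Closing));
        assert_eq!(class('x'), Some(MathClass::Alphabetic));
    }
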
fn maybe_delimited(p: &mut Parser) -> bool { let open = math_class(p.current_text()) == Some(MathClass::Opening); @@ -464,22 +480,6 @@ fn math_class(text: &str) -> Option { .and_then(unicode_math_class::class) } -/// Precedence and wrapper kinds for the binary math operators. -fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { - match kind { - SyntaxKind::Underscore => { - Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) - } - SyntaxKind::Hat => { - Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) - } - SyntaxKind::Slash => { - Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) - } - _ => None, - } -} - /// Parse an argument list in math: `(a, b; c, d; size: #50%)`. fn math_args(p: &mut Parser) { let m = p.marker(); @@ -613,11 +613,6 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { } } -/// Parses a single code expression. -fn code_expr(p: &mut Parser) { - code_expr_prec(p, false, 0) -} - /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { p.with_mode(LexMode::Code, |p| { @@ -647,6 +642,11 @@ fn embedded_code_expr(p: &mut Parser) { }); } +/// Parses a single code expression. +fn code_expr(p: &mut Parser) { + code_expr_prec(p, false, 0) +} + /// Parses a code expression with at least the given precedence. fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { let m = p.marker(); @@ -777,15 +777,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { } } -/// Parses a content or code block. -fn block(p: &mut Parser) { - match p.current() { - SyntaxKind::LeftBracket => content_block(p), - SyntaxKind::LeftBrace => code_block(p), - _ => p.expected("block"), - } -} - /// Reparses a full content or code block. pub(super) fn reparse_block(text: &str, range: Range) -> Option { let mut p = Parser::new(text, range.start, LexMode::Code); @@ -795,6 +786,15 @@ pub(super) fn reparse_block(text: &str, range: Range) -> Option content_block(p), + SyntaxKind::LeftBrace => code_block(p), + _ => p.expected("block"), + } +} + /// Parses a code block: `{ let x = 1; x + 2 }`. fn code_block(p: &mut Parser) { let m = p.marker(); @@ -1608,6 +1608,22 @@ impl AtNewline { #[derive(Debug, Copy, Clone, Eq, PartialEq)] struct Marker(usize); +// Index into the parser with markers. +impl Index for Parser<'_> { + type Output = SyntaxNode; + + fn index(&self, m: Marker) -> &Self::Output { + &self.nodes[m.0] + } +} + +impl IndexMut for Parser<'_> { + fn index_mut(&mut self, m: Marker) -> &mut Self::Output { + &mut self.nodes[m.0] + } +} + +/// Creating/Consuming the parser and getting info about the current token. impl<'s> Parser<'s> { /// Create a new parser starting from the given text offset and lexer mode. fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { @@ -1638,32 +1654,12 @@ impl<'s> Parser<'s> { SyntaxNode::inner(kind, self.finish()) } - /// The offset into `text` of the previous token's end. - fn prev_end(&self) -> usize { - self.token.prev_end - } - /// Similar to a `peek()` function: returns the `kind` of the next token to /// be eaten. fn current(&self) -> SyntaxKind { self.token.kind } - /// The offset into `text` of the current token's start. - fn current_start(&self) -> usize { - self.token.start - } - - /// The offset into `text` of the current token's end. - fn current_end(&self) -> usize { - self.lexer.cursor() - } - - /// The current token's text. 
- fn current_text(&self) -> &'s str { - &self.text[self.token.start..self.current_end()] - } - /// Whether the current token is a given [`SyntaxKind`]. fn at(&self, kind: SyntaxKind) -> bool { self.token.kind == kind @@ -1686,11 +1682,62 @@ impl<'s> Parser<'s> { self.token.kind == kind && !self.had_trivia() } - /// Eat the current token by saving it to the `nodes` vector, then move - /// the lexer forward to prepare a new token. - fn eat(&mut self) { - self.nodes.push(std::mem::take(&mut self.token.node)); - self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); + /// Whether `token` had any trivia before it in Code/Math. + fn had_trivia(&self) -> bool { + self.token.n_trivia > 0 + } + + /// Whether the current token is a newline, only used in Markup. + fn newline(&self) -> bool { + self.token.had_newline + } + + /// The number of characters until the most recent newline in `text`. + fn column(&self, at: usize) -> usize { + self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count() + } + + /// The current token's text. + fn current_text(&self) -> &'s str { + &self.text[self.token.start..self.current_end()] + } + + /// The offset into `text` of the current token's start. + fn current_start(&self) -> usize { + self.token.start + } + + /// The offset into `text` of the current token's end. + fn current_end(&self) -> usize { + self.lexer.cursor() + } + + /// The offset into `text` of the previous token's end. + fn prev_end(&self) -> usize { + self.token.prev_end + } +} + +// The main parsing interface for generating tokens and eating/modifying nodes. +impl<'s> Parser<'s> { + /// A marker that will point to the current token in the parser once it's + /// been eaten. + fn marker(&self) -> Marker { + Marker(self.nodes.len()) + } + + /// A marker that will point to first trivia before this token in the + /// parser (or the token itself if no trivia precede it). + fn before_trivia(&self) -> Marker { + Marker(self.nodes.len() - self.token.n_trivia) + } + + /// Iterate over the non-trivia tokens following the marker. + #[track_caller] + fn post_process(&mut self, m: Marker) -> impl Iterator { + self.nodes[m.0..] + .iter_mut() + .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) } /// Eat the current node and return a reference for in-place mutation. @@ -1739,45 +1786,11 @@ impl<'s> Parser<'s> { self.eat(); } - /// Whether the current token is a newline, only used in Markup. - fn newline(&self) -> bool { - self.token.had_newline - } - - /// Whether `token` had any trivia before it in Code/Math. - fn had_trivia(&self) -> bool { - self.token.n_trivia > 0 - } - - /// The number of characters until the most recent newline in `text`. - fn column(&self, at: usize) -> usize { - self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count() - } - - /// A marker that will point to the current token in the parser once it's - /// been eaten. - fn marker(&self) -> Marker { - Marker(self.nodes.len()) - } - - /// A marker that will point to first trivia before this token in the - /// parser (or the token itself if no trivia precede it). - fn before_trivia(&self) -> Marker { - Marker(self.nodes.len() - self.token.n_trivia) - } - - /// Whether the last non-trivia node is an error. - fn after_error(&mut self) -> bool { - let m = self.before_trivia(); - m.0 > 0 && self.nodes[m.0 - 1].kind().is_error() - } - - /// Iterate over the non-trivia tokens following the marker. - #[track_caller] - fn post_process(&mut self, m: Marker) -> impl Iterator { - self.nodes[m.0..] 
- .iter_mut() - .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) + /// Eat the current token by saving it to the `nodes` vector, then move + /// the lexer forward to prepare a new token. + fn eat(&mut self) { + self.nodes.push(std::mem::take(&mut self.token.node)); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } /// Wrap the nodes from a marker up to (but excluding) the current token in @@ -1898,6 +1911,7 @@ struct PartialState { token: Token, } +/// The Memoization interface. impl<'s> Parser<'s> { /// Store the already parsed nodes and the parser state into the memo map by /// extending the arena and storing the extended range and a checkpoint. @@ -1952,6 +1966,8 @@ impl<'s> Parser<'s> { } } +/// Functions for eating expected or unexpected tokens and generating errors if +/// we don't get what we expect. impl<'s> Parser<'s> { /// Consume the given `kind` or produce an error. fn expect(&mut self, kind: SyntaxKind) -> bool { @@ -1984,6 +2000,12 @@ impl<'s> Parser<'s> { } } + /// Whether the last non-trivia node is an error. + fn after_error(&mut self) -> bool { + let m = self.before_trivia(); + m.0 > 0 && self.nodes[m.0 - 1].kind().is_error() + } + /// Produce an error that the given `thing` was expected at the position /// of the marker `m`. fn expected_at(&mut self, m: Marker, thing: &str) { @@ -2021,17 +2043,3 @@ impl<'s> Parser<'s> { self.nodes.drain(start..end); } } - -impl Index for Parser<'_> { - type Output = SyntaxNode; - - fn index(&self, m: Marker) -> &Self::Output { - &self.nodes[m.0] - } -} - -impl IndexMut for Parser<'_> { - fn index_mut(&mut self, m: Marker) -> &mut Self::Output { - &mut self.nodes[m.0] - } -} From 4ce0b069f6478163eed2d2fd1860905bd47a5f46 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 17:51:05 -0400 Subject: [PATCH 14/18] 14. Update 'maybe_wrap_in_math' to remove 'wrap_within' and 'post_process'! --- crates/typst-syntax/src/parser.rs | 70 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 8a1c8f76d..67d34b239 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -490,6 +490,8 @@ fn math_args(p: &mut Parser) { let mut has_arrays = false; let mut array = p.marker(); let mut arg = p.marker(); + // The number of math expressions per argument. 
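The per-argument expression count introduced below drives the 2D argument syntax: commas separate cells, semicolons close an `Array` row, and any count other than one is wrapped in (or, when empty, replaced by) a blank `Math` node. The resulting shape can be checked from the public API (a sketch):

    // Args contains two Array rows of two cells each; an omitted cell, as in
    // `mat(1, ;)`, still produces an empty Math node in its row.
    fn main() {
        println!("{:#?}", typst_syntax::parse("$ mat(1, 2; 3, 4) $"));
    }
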
+ let mut count = 0; while !p.end() && !p.at(SyntaxKind::Dollar) { if namable @@ -506,20 +508,22 @@ fn math_args(p: &mut Parser) { match p.current_text() { ")" => break, ";" => { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); p.wrap(array, SyntaxKind::Array); p.convert_and_eat(SyntaxKind::Semicolon); array = p.marker(); arg = p.marker(); + count = 0; namable = true; named = None; has_arrays = true; continue; } "," => { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); p.convert_and_eat(SyntaxKind::Comma); arg = p.marker(); + count = 0; namable = true; if named.is_some() { array = p.marker(); @@ -532,6 +536,7 @@ fn math_args(p: &mut Parser) { if p.at_set(set::MATH_EXPR) { math_expr(p); + count += 1; } else { p.unexpected(); } @@ -540,7 +545,7 @@ fn math_args(p: &mut Parser) { } if arg != p.marker() { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); if named.is_some() { array = p.marker(); } @@ -560,25 +565,26 @@ fn math_args(p: &mut Parser) { p.wrap(m, SyntaxKind::Args); } -/// Wrap math function arguments in a "Math" SyntaxKind to combine adjacent expressions -/// or create blank content. +/// Wrap math function arguments to join adjacent math content or create an +/// empty 'Math' node for when we have 0 args. /// -/// We don't wrap when `exprs == 1`, as there is only one expression, so the grouping -/// isn't needed, and this would change the type of the expression from potentially -/// non-content to content. -/// -/// Note that `exprs` might be 0 if we have whitespace or trivia before a comma i.e. -/// `mat(; ,)` or `sin(x, , , ,)`. This would create an empty Math element before that -/// trivia if we called `p.wrap()` -- breaking the expected AST for 2-d arguments -- so -/// we instead manually wrap to our current marker using `p.wrap_within()`. -fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { - let exprs = p.post_process(arg).filter(|node| node.is::()).count(); - if exprs != 1 { - // Convert 0 exprs into a blank math element (so empty arguments are allowed). - // Convert 2+ exprs into a math element (so they become a joined sequence). - p.wrap_within(arg, p.marker(), SyntaxKind::Math); - // We need to update `n_trivia` since we no longer have any. - p.token.n_trivia = 0; // TODO: Maybe create a `flush_trivia()` method? +/// We don't wrap when `count == 1`, since wrapping would change the type of the +/// expression from potentially non-content to content. Ex: `$ func(#12pt) $` +/// would change the type from size to content if wrapped. +fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option) { + if count == 0 { + // Flush trivia so that the new empty Math node will be wrapped _inside_ + // any `SyntaxKind::Array` elements created in `math_args`. + // (And if we don't follow by wrapping in an array, it has no effect.) + // The difference in node layout without this would look like: + // Expression: `$ mat( ;) $` + // - Correct: [ .., Space(" "), Array[Math[], ], Semicolon(";"), .. ] + // - Incorrect: [ .., Math[], Array[], Space(" "), Semicolon(";"), .. ] + p.flush_trivia(); + } + + if count != 1 { + p.wrap(arg, SyntaxKind::Math); } if let Some(m) = named { @@ -1732,14 +1738,6 @@ impl<'s> Parser<'s> { Marker(self.nodes.len() - self.token.n_trivia) } - /// Iterate over the non-trivia tokens following the marker. - #[track_caller] - fn post_process(&mut self, m: Marker) -> impl Iterator { - self.nodes[m.0..] 
- .iter_mut() - .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) - } - /// Eat the current node and return a reference for in-place mutation. #[track_caller] fn eat_and_get(&mut self) -> &mut SyntaxNode { @@ -1793,17 +1791,19 @@ impl<'s> Parser<'s> { self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } + /// Detach the parsed trivia nodes from this token (but not newline info) so + /// that subsequent wrapping will include the trivia. + fn flush_trivia(&mut self) { + self.token.n_trivia = 0; + self.token.prev_end = self.token.start; + } + /// Wrap the nodes from a marker up to (but excluding) the current token in /// a new [inner node](`SyntaxNode::inner`) of the given kind. This is an /// easy interface for creating nested syntax nodes _after_ having parsed /// their children. fn wrap(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, self.before_trivia(), kind); - } - - fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { - let len = self.nodes.len(); - let to = to.0.min(len); + let to = self.before_trivia().0; let from = from.0.min(to); let children = self.nodes.drain(from..to).collect(); self.nodes.insert(from, SyntaxNode::inner(kind, children)); From 26c61be1dc761306ea7f256b73344a22d843b622 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Tue, 22 Oct 2024 00:13:56 -0400 Subject: [PATCH 15/18] 15. Convert Markup mode to use newline modes (And break out Newline info into separate struct) --- crates/typst-syntax/src/lexer.rs | 5 + crates/typst-syntax/src/parser.rs | 293 ++++++++++++++++-------------- tests/suite/model/heading.typ | 17 +- 3 files changed, 166 insertions(+), 149 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 4a43c15ff..d09c6f842 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -68,6 +68,11 @@ impl<'s> Lexer<'s> { pub fn newline(&self) -> bool { self.newline } + + /// The number of characters until the most recent newline. + pub fn column(&self) -> usize { + self.s.before().chars().rev().take_while(|&c| !is_newline(c)).count() + } } impl Lexer<'_> { diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 67d34b239..6e59f45e6 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -6,13 +6,13 @@ use ecow::{eco_format, EcoString}; use unicode_math_class::MathClass; use crate::set::{syntax_set, SyntaxSet}; -use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; +use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; /// Parses a source file as top-level markup. 
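+/// (Parsing never fails: any syntax errors are embedded in the returned tree
+/// as error nodes.)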
pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup_exprs(&mut p, true, 0, |_| false); + markup_exprs(&mut p, true, |_| false); p.finish_into(SyntaxKind::Markup) } @@ -36,11 +36,14 @@ pub fn parse_math(text: &str) -> SyntaxNode { fn markup( p: &mut Parser, at_start: bool, - min_indent: usize, + wrap_trivia: bool, stop: impl FnMut(&Parser) -> bool, ) { - let m = p.marker(); - markup_exprs(p, at_start, min_indent, stop); + let m = if wrap_trivia { p.before_trivia() } else { p.marker() }; + markup_exprs(p, at_start, stop); + if wrap_trivia { + p.flush_trivia(); + } p.wrap(m, SyntaxKind::Markup); } @@ -48,9 +51,9 @@ fn markup( fn markup_exprs( p: &mut Parser, mut at_start: bool, - min_indent: usize, mut stop: impl FnMut(&Parser) -> bool, ) { + at_start |= p.had_newline(); let mut nesting: usize = 0; while !p.end() { match p.current() { @@ -59,17 +62,8 @@ fn markup_exprs( _ if stop(p) => break, _ => {} } - - if p.newline() { - at_start = true; - if min_indent > 0 && p.column(p.current_end()) < min_indent { - break; - } - p.eat(); - continue; - } - - markup_expr(p, &mut at_start); + markup_expr(p, at_start); + at_start = p.had_newline(); } } @@ -82,6 +76,7 @@ pub(super) fn reparse_markup( mut stop: impl FnMut(SyntaxKind) -> bool, ) -> Option> { let mut p = Parser::new(text, range.start, LexMode::Markup); + *at_start |= p.had_newline(); while !p.end() && p.current_start() < range.end { match p.current() { SyntaxKind::LeftBracket => *nesting += 1, @@ -89,30 +84,17 @@ pub(super) fn reparse_markup( _ if stop(p.current()) => break, _ => {} } - - if p.newline() { - *at_start = true; - p.eat(); - continue; - } - - markup_expr(&mut p, at_start); + markup_expr(&mut p, *at_start); + *at_start = p.had_newline(); } (p.balanced && p.current_start() == range.end).then(|| p.finish()) } -/// Parses a single markup expression. This includes markup elements like -/// spaces, text, and headings, and embedded code expressions. -fn markup_expr(p: &mut Parser, at_start: &mut bool) { +/// Parses a single markup expression. This includes markup elements like text, +/// headings, strong/emph, lists/enums, etc. This is also the entry point for +/// parsing math equations and embedded code expressions. 
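+///
+/// Note that headings, lists, enums, and terms are only recognized when
+/// `at_start` is true, i.e. when their marker is the first non-trivia token
+/// on its line; otherwise the marker is treated as plain text.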
+fn markup_expr(p: &mut Parser, at_start: bool) { match p.current() { - SyntaxKind::Space - | SyntaxKind::Parbreak - | SyntaxKind::LineComment - | SyntaxKind::BlockComment => { - p.eat(); - return; - } - SyntaxKind::Text | SyntaxKind::Linebreak | SyntaxKind::Escape @@ -126,10 +108,10 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), - SyntaxKind::HeadingMarker if *at_start => heading(p), - SyntaxKind::ListMarker if *at_start => list_item(p), - SyntaxKind::EnumMarker if *at_start => enum_item(p), - SyntaxKind::TermMarker if *at_start => term_item(p), + SyntaxKind::HeadingMarker if at_start => heading(p), + SyntaxKind::ListMarker if at_start => list_item(p), + SyntaxKind::EnumMarker if at_start => enum_item(p), + SyntaxKind::TermMarker if at_start => term_item(p), SyntaxKind::RefMarker => reference(p), SyntaxKind::Dollar => equation(p), @@ -141,76 +123,74 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::TermMarker | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text), - _ => { - p.unexpected(); - return; // Don't set `at_start` - } + _ => p.unexpected(), } - - *at_start = false; } /// Parses strong content: `*Strong*`. fn strong(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Star); - markup(p, false, 0, |p| p.at_set(syntax_set!(Star, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Star); - p.wrap(m, SyntaxKind::Strong); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Star); + markup(p, false, true, |p| p.at_set(syntax_set!(Star, RightBracket))); + p.expect_closing_delimiter(m, SyntaxKind::Star); + p.wrap(m, SyntaxKind::Strong); + }); } /// Parses emphasized content: `_Emphasized_`. fn emph(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Underscore); - markup(p, false, 0, |p| p.at_set(syntax_set!(Underscore, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Underscore); - p.wrap(m, SyntaxKind::Emph); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Underscore); + markup(p, false, true, |p| p.at_set(syntax_set!(Underscore, RightBracket))); + p.expect_closing_delimiter(m, SyntaxKind::Underscore); + p.wrap(m, SyntaxKind::Emph); + }); } /// Parses a section heading: `= Introduction`. fn heading(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::HeadingMarker); - whitespace_line(p); - markup(p, false, usize::MAX, |p| { - p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label) + p.with_nl_mode(AtNewline::Stop, |p| { + let m = p.marker(); + p.assert(SyntaxKind::HeadingMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(Label, RightBracket))); + p.wrap(m, SyntaxKind::Heading); }); - p.wrap(m, SyntaxKind::Heading); } /// Parses an item in a bullet list: `- ...`. 
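+/// (Content on following lines only continues the item if its column is
+/// beyond the marker's, enforced via `AtNewline::RequireColumn` below.)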
fn list_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::ListMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::ListItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::ListMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(RightBracket))); + p.wrap(m, SyntaxKind::ListItem); + }); } /// Parses an item in an enumeration (numbered list): `+ ...` or `1. ...`. fn enum_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::EnumMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::EnumItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::EnumMarker); + markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::EnumItem); + }); } /// Parses an item in a term list: `/ Term: Details`. fn term_item(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::TermMarker); - let min_indent = p.column(p.prev_end()); - whitespace_line(p); - markup(p, false, usize::MAX, |p| p.at_set(syntax_set!(Colon, RightBracket))); - p.expect(SyntaxKind::Colon); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::TermItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.with_nl_mode(AtNewline::Stop, |p| { + p.assert(SyntaxKind::TermMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(Colon, RightBracket))); + }); + p.expect(SyntaxKind::Colon); + markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::TermItem); + }); } /// Parses a reference: `@target`, `@target[..]`. @@ -223,20 +203,15 @@ fn reference(p: &mut Parser) { p.wrap(m, SyntaxKind::Ref); } -/// Consumes whitespace that does not contain a newline. -fn whitespace_line(p: &mut Parser) { - while !p.newline() && p.current().is_trivia() { - p.eat(); - } -} - /// Parses a mathematical equation: `$x$`, `$ x^2 $`. fn equation(p: &mut Parser) { let m = p.marker(); p.with_mode(LexMode::Math, |p| { - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + }); }); p.wrap(m, SyntaxKind::Equation); } @@ -602,7 +577,7 @@ fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { /// Parses a sequence of code expressions. 
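+/// (Expressions are separated by semicolons or line breaks.)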
fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { - p.with_nl_mode(AtNewline::Contextual, |p| { + p.with_nl_mode(AtNewline::ContextualContinue, |p| { if !p.at_set(set::CODE_EXPR) { p.unexpected(); return; @@ -818,9 +793,11 @@ fn code_block(p: &mut Parser) { fn content_block(p: &mut Parser) { let m = p.marker(); p.with_mode(LexMode::Markup, |p| { - p.assert(SyntaxKind::LeftBracket); - markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket)); - p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBracket); + markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + }); }); p.wrap(m, SyntaxKind::ContentBlock); } @@ -1526,15 +1503,11 @@ fn pattern_leaf<'s>( /// [lexer modes](`LexMode`) and [newline modes](`AtNewline`). /// /// The lexer modes map to the three Typst modes and are stored in the lexer, -/// changing which`SyntaxKind`s it will generate. The mode also affects how the -/// parser treats trivia tokens (comments and whitespace). In Markup, trivia is -/// handled manually to deal with list indentation and must be explicitly eaten. -/// In Code and Math, trivia is managed internally and is implicitly eaten by -/// pushing onto the end of the `nodes` vector until a non-trivia kind is found. +/// changing which`SyntaxKind`s it will generate. /// -/// The newline mode is used in Code to determine whether a newline should end -/// the current expression. If so, the parser temporarily changes `token`'s kind -/// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original +/// The newline mode is used to determine whether a newline should end the +/// current expression. If so, the parser temporarily changes `token`'s kind to +/// a fake [`SyntaxKind::End`]. When the parser exits the mode the original /// `SyntaxKind` is restored. struct Parser<'s> { /// The source text shared with the lexer. @@ -1543,7 +1516,7 @@ struct Parser<'s> { /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`] /// defining our current Typst mode. lexer: Lexer<'s>, - /// The newline mode: whether to insert a temporary end at newlines in Code. + /// The newline mode: whether to insert a temporary end at newlines. nl_mode: AtNewline, /// The current token under inspection, not yet present in `nodes`. This /// acts like a single item of lookahead for the parser. @@ -1574,7 +1547,7 @@ struct Token { /// The number of preceding trivia before this token. n_trivia: usize, /// Whether this token's preceding trivia contained a newline. - had_newline: bool, + newline: Option, /// The index into `text` of the start of our current token (the end is /// stored as the lexer's cursor). start: usize, @@ -1582,28 +1555,52 @@ struct Token { prev_end: usize, } -/// How to proceed with parsing when at a newline in Code. +/// Information about a newline if present (currently only relevant in Markup). +#[derive(Debug, Clone, Copy)] +struct Newline { + /// The column of our token in its line. + /// + /// Note that this is actually the column of the first non-whitespace + /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) + /// because the block comment is the first non-space kind. + column: Option, + /// Whether any of our newlines were paragraph breaks. + parbreak: bool, +} + +/// How to proceed with parsing when at a newline. 
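+///
+/// For example, headings are parsed with `AtNewline::Stop` so they end at the
+/// line break, while strong and emph content use `AtNewline::StopParBreak` so
+/// that `*strong text*` may span single newlines but not paragraph breaks.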
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum AtNewline {
     /// Continue at newlines.
     Continue,
     /// Stop at any newline.
     Stop,
-    /// Continue only if there is no continuation with `else` or `.`.
-    Contextual,
+    /// Continue only if there is no continuation with `else` or `.` (Code only).
+    ContextualContinue,
+    /// Stop only at a parbreak, not normal newlines (Markup only).
+    StopParBreak,
+    /// Require that the token's column be strictly greater than the given
+    /// column (Markup only), i.e. stop at any token at or to the left of it.
+    /// If this is `usize::MAX`, acts like `Stop`.
+    RequireColumn(usize),
 }

 impl AtNewline {
     /// Whether to stop at a newline or continue based on the current context.
-    fn stop(self, kind: SyntaxKind) -> bool {
+    fn stop_at(self, Newline { column, parbreak }: Newline, kind: SyntaxKind) -> bool {
         #[allow(clippy::match_like_matches_macro)]
         match self {
             AtNewline::Continue => false,
             AtNewline::Stop => true,
-            AtNewline::Contextual => match kind {
+            AtNewline::ContextualContinue => match kind {
                 SyntaxKind::Else | SyntaxKind::Dot => false,
                 _ => true,
             },
+            AtNewline::StopParBreak => parbreak,
+            AtNewline::RequireColumn(min_col) => match column {
+                Some(column) => column <= min_col,
+                None => false, // Don't stop if we had no column.
+            },
         }
     }
 }
@@ -1688,19 +1685,24 @@ impl<'s> Parser<'s> {
         self.token.kind == kind && !self.had_trivia()
     }

-    /// Whether `token` had any trivia before it in Code/Math.
+    /// Whether `token` had any preceding trivia.
     fn had_trivia(&self) -> bool {
         self.token.n_trivia > 0
     }

-    /// Whether the current token is a newline, only used in Markup.
-    fn newline(&self) -> bool {
-        self.token.had_newline
+    /// Whether `token` had a newline among any of its preceding trivia.
+    fn had_newline(&self) -> bool {
+        self.token.newline.is_some()
     }

-    /// The number of characters until the most recent newline in `text`.
-    fn column(&self, at: usize) -> usize {
-        self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
+    /// The number of characters until the most recent newline from the current
+    /// token, or 0 if it did not follow a newline.
+    ///
+    /// Note that this is actually the column of the first non-whitespace
+    /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6)
+    /// because the block comment is the first non-space kind.
+    fn current_column(&self) -> usize {
+        self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
     }

     /// The current token's text.
@@ -1834,12 +1836,15 @@ impl<'s> Parser<'s> {
         self.nl_mode = mode;
         func(self);
         self.nl_mode = previous;
-        if mode != previous && self.token.had_newline {
-            let actual_kind = self.token.node.kind();
-            if self.nl_mode.stop(actual_kind) {
-                self.token.kind = SyntaxKind::End;
-            } else {
-                self.token.kind = actual_kind;
+        if let Some(newline) = self.token.newline {
+            if mode != previous {
+                // Restore our actual token's kind or insert a fake end.
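+                // (The token was lexed while the inner mode was active, so its
+                // kind must be re-evaluated under the restored mode.)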
+ let actual_kind = self.token.node.kind(); + if self.nl_mode.stop_at(newline, actual_kind) { + self.token.kind = SyntaxKind::End; + } else { + self.token.kind = actual_kind; + } } } } @@ -1854,25 +1859,31 @@ impl<'s> Parser<'s> { let mut start = prev_end; let (mut kind, mut node) = lexer.next(); let mut n_trivia = 0; - let mut had_newline = lexer.newline(); + let mut had_newline = false; + let mut newline = Newline { column: None, parbreak: false }; - if lexer.mode() != LexMode::Markup { - while kind.is_trivia() { - n_trivia += 1; - nodes.push(node); - start = lexer.cursor(); - (kind, node) = lexer.next(); - had_newline |= lexer.newline(); - } - if lexer.mode() == LexMode::Code && had_newline { - // Insert a temporary ['SyntaxKind::End'] to halt the parser. - // The actual `SyntaxKind` will be restored from `node` later. - if nl_mode.stop(kind) { - kind = SyntaxKind::End; + while kind.is_trivia() { + if lexer.newline() { + // Newlines are always trivia. + had_newline = true; + newline.parbreak |= kind == SyntaxKind::Parbreak; + if lexer.mode() == LexMode::Markup { + newline.column = Some(lexer.column()); } } + n_trivia += 1; + nodes.push(node); + start = lexer.cursor(); + (kind, node) = lexer.next(); } - Token { kind, node, n_trivia, had_newline, start, prev_end } + if had_newline && nl_mode.stop_at(newline, kind) { + // Insert a temporary `SyntaxKind::End` to halt the parser. + // The actual kind will be restored from `node` later. + kind = SyntaxKind::End; + } + + let newline = had_newline.then_some(newline); + Token { kind, node, n_trivia, newline, start, prev_end } } } diff --git a/tests/suite/model/heading.typ b/tests/suite/model/heading.typ index 884f203d2..d182724c8 100644 --- a/tests/suite/model/heading.typ +++ b/tests/suite/model/heading.typ @@ -38,7 +38,7 @@ multiline. --- heading-trailing-whitespace --- // Whether headings contain trailing whitespace with or without comments/labels. // Labels are special cased to immediately end headings in the parser, but also -// have unique whitespace behavior. +// #strike[have unique whitespace behavior] Now their behavior is consistent! #let join(..xs) = xs.pos().join() #let head(h) = heading(depth: 1, h) @@ -49,19 +49,20 @@ multiline. #test(head[h], [= h]) #test(head[h], [= h/**/]) -// Label behaves differently than normal trailing space and comment. -#test(head(join[h][ ]), [= h ]) -#test(head(join[h][ ]), [= h /**/]) +// #strike[Label behaves differently than normal trailing space and comment.] +// Now they behave the same! +#test(join(head[h])[ ], [= h ]) +#test(join(head[h])[ ], [= h /**/]) #test(join(head[h])[ ], [= h ]) // Combinations. -#test(head(join[h][ ][ ]), [= h /**/ ]) +#test(join(head[h])[ ][ ], [= h /**/ ]) #test(join(head[h])[ ][ ], [= h ]) -#test(head(join[h][ ]), [= h /**/]) +#test(join(head[h])[ ], [= h /**/]) #test(join(head[h])[ ], [= h/**/ ]) -// The first space attaches, but not the second -#test(join(head(join[h][ ]))[ ], [= h /**/ ]) +// #strike[The first space attaches, but not the second] Now neither attaches! +#test(join(head(join[h]))[ ][ ], [= h /**/ ]) --- heading-leading-whitespace --- // Test that leading whitespace and comments don't matter. From 86ce443806d166f3b75c2a792b0461aa35552dec Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 24 Oct 2024 22:03:35 -0400 Subject: [PATCH 16/18] 16. 
Compress with_mode and with_nl_mode to reduce rightward drift --- crates/typst-syntax/src/parser.rs | 77 +++++++++++++++---------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 6e59f45e6..761cea029 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -206,12 +206,10 @@ fn reference(p: &mut Parser) { /// Parses a mathematical equation: `$x$`, `$ x^2 $`. fn equation(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Math, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); - }); + p.enter_modes(LexMode::Math, AtNewline::Continue, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); }); p.wrap(m, SyntaxKind::Equation); } @@ -596,30 +594,28 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { - p.with_mode(LexMode::Code, |p| { - p.with_nl_mode(AtNewline::Stop, |p| { - p.assert(SyntaxKind::Hash); - if p.had_trivia() { - p.expected("expression"); - return; - } + p.enter_modes(LexMode::Code, AtNewline::Stop, |p| { + p.assert(SyntaxKind::Hash); + if p.had_trivia() { + p.expected("expression"); + return; + } - let stmt = p.at_set(set::STMT); - let at = p.at_set(set::ATOMIC_CODE_EXPR); - code_expr_prec(p, true, 0); + let stmt = p.at_set(set::STMT); + let at = p.at_set(set::ATOMIC_CODE_EXPR); + code_expr_prec(p, true, 0); - // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.end() { - p.unexpected(); - } + // Consume error for things like `#12p` or `#"abc\"`.# + if !at && !p.end() { + p.unexpected(); + } - let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) - && p.eat_if(SyntaxKind::Semicolon); + let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) + && p.eat_if(SyntaxKind::Semicolon); - if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { - p.expected("semicolon or line break"); - } - }); + if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } }); } @@ -779,12 +775,10 @@ fn block(p: &mut Parser) { /// Parses a code block: `{ let x = 1; x + 2 }`. fn code_block(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Code, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::LeftBrace); - code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); - p.expect_closing_delimiter(m, SyntaxKind::RightBrace); - }); + p.enter_modes(LexMode::Code, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBrace); + code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); + p.expect_closing_delimiter(m, SyntaxKind::RightBrace); }); p.wrap(m, SyntaxKind::CodeBlock); } @@ -792,12 +786,10 @@ fn code_block(p: &mut Parser) { /// Parses a content block: `[*Hi* there!]`. 
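+/// (The body is parsed in Markup mode with `AtNewline::Continue`, so line
+/// breaks never end the block early.)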
fn content_block(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Markup, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::LeftBracket); - markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); - p.expect_closing_delimiter(m, SyntaxKind::RightBracket); - }); + p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBracket); + markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); }); p.wrap(m, SyntaxKind::ContentBlock); } @@ -1815,10 +1807,15 @@ impl<'s> Parser<'s> { /// current token). This may re-lex the final token on exit. /// /// This function effectively repurposes the call stack as a stack of modes. - fn with_mode(&mut self, mode: LexMode, func: impl FnOnce(&mut Parser<'s>)) { + fn enter_modes( + &mut self, + mode: LexMode, + stop: AtNewline, + func: impl FnOnce(&mut Parser<'s>), + ) { let previous = self.lexer.mode(); self.lexer.set_mode(mode); - func(self); + self.with_nl_mode(stop, func); if mode != previous { self.lexer.set_mode(previous); self.lexer.jump(self.token.prev_end); From 9d9a1b1e33cdc379200c1d3881c34fe05c496894 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sat, 26 Oct 2024 00:37:14 -0400 Subject: [PATCH 17/18] 17. Replace while loop closures and Parser::end() to use SyntaxSet --- crates/typst-syntax/src/parser.rs | 81 +++++++++++++--------------- crates/typst-syntax/src/reparser.rs | 8 +-- tests/ref/single-right-bracket.png | Bin 0 -> 118 bytes tests/suite/scripting/blocks.typ | 3 ++ 4 files changed, 42 insertions(+), 50 deletions(-) create mode 100644 tests/ref/single-right-bracket.png diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 761cea029..b26cc0020 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -12,7 +12,7 @@ use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup_exprs(&mut p, true, |_| false); + markup_exprs(&mut p, true, syntax_set!(End)); p.finish_into(SyntaxKind::Markup) } @@ -20,7 +20,7 @@ pub fn parse(text: &str) -> SyntaxNode { pub fn parse_code(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse code"); let mut p = Parser::new(text, 0, LexMode::Code); - code_exprs(&mut p, |_| false); + code_exprs(&mut p, syntax_set!(End)); p.finish_into(SyntaxKind::Code) } @@ -28,19 +28,14 @@ pub fn parse_code(text: &str) -> SyntaxNode { pub fn parse_math(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse math"); let mut p = Parser::new(text, 0, LexMode::Math); - math_exprs(&mut p, |_| false); + math_exprs(&mut p, syntax_set!(End)); p.finish_into(SyntaxKind::Math) } /// Parses markup expressions until a stop condition is met. -fn markup( - p: &mut Parser, - at_start: bool, - wrap_trivia: bool, - stop: impl FnMut(&Parser) -> bool, -) { +fn markup(p: &mut Parser, at_start: bool, wrap_trivia: bool, stop_set: SyntaxSet) { let m = if wrap_trivia { p.before_trivia() } else { p.marker() }; - markup_exprs(p, at_start, stop); + markup_exprs(p, at_start, stop_set); if wrap_trivia { p.flush_trivia(); } @@ -48,18 +43,15 @@ fn markup( } /// Parses a sequence of markup expressions. 
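+/// (Every `stop_set` must contain `SyntaxKind::End` so that parsing always
+/// halts at the end of input.)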
-fn markup_exprs( - p: &mut Parser, - mut at_start: bool, - mut stop: impl FnMut(&Parser) -> bool, -) { +fn markup_exprs(p: &mut Parser, mut at_start: bool, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); at_start |= p.had_newline(); let mut nesting: usize = 0; - while !p.end() { + loop { match p.current() { SyntaxKind::LeftBracket => nesting += 1, SyntaxKind::RightBracket if nesting > 0 => nesting -= 1, - _ if stop(p) => break, + _ if p.at_set(stop_set) => break, _ => {} } markup_expr(p, at_start); @@ -73,15 +65,16 @@ pub(super) fn reparse_markup( range: Range, at_start: &mut bool, nesting: &mut usize, - mut stop: impl FnMut(SyntaxKind) -> bool, + top_level: bool, ) -> Option> { let mut p = Parser::new(text, range.start, LexMode::Markup); *at_start |= p.had_newline(); - while !p.end() && p.current_start() < range.end { + while p.current_start() < range.end { match p.current() { SyntaxKind::LeftBracket => *nesting += 1, SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1, - _ if stop(p.current()) => break, + SyntaxKind::RightBracket if !top_level => break, + SyntaxKind::End => break, _ => {} } markup_expr(&mut p, *at_start); @@ -132,7 +125,7 @@ fn strong(p: &mut Parser) { p.with_nl_mode(AtNewline::StopParBreak, |p| { let m = p.marker(); p.assert(SyntaxKind::Star); - markup(p, false, true, |p| p.at_set(syntax_set!(Star, RightBracket))); + markup(p, false, true, syntax_set!(Star, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Star); p.wrap(m, SyntaxKind::Strong); }); @@ -143,7 +136,7 @@ fn emph(p: &mut Parser) { p.with_nl_mode(AtNewline::StopParBreak, |p| { let m = p.marker(); p.assert(SyntaxKind::Underscore); - markup(p, false, true, |p| p.at_set(syntax_set!(Underscore, RightBracket))); + markup(p, false, true, syntax_set!(Underscore, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Underscore); p.wrap(m, SyntaxKind::Emph); }); @@ -154,7 +147,7 @@ fn heading(p: &mut Parser) { p.with_nl_mode(AtNewline::Stop, |p| { let m = p.marker(); p.assert(SyntaxKind::HeadingMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(Label, RightBracket))); + markup(p, false, false, syntax_set!(Label, RightBracket, End)); p.wrap(m, SyntaxKind::Heading); }); } @@ -164,7 +157,7 @@ fn list_item(p: &mut Parser) { p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { let m = p.marker(); p.assert(SyntaxKind::ListMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(RightBracket))); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::ListItem); }); } @@ -174,7 +167,7 @@ fn enum_item(p: &mut Parser) { p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { let m = p.marker(); p.assert(SyntaxKind::EnumMarker); - markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::EnumItem); }); } @@ -185,10 +178,10 @@ fn term_item(p: &mut Parser) { let m = p.marker(); p.with_nl_mode(AtNewline::Stop, |p| { p.assert(SyntaxKind::TermMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(Colon, RightBracket))); + markup(p, false, false, syntax_set!(Colon, RightBracket, End)); }); p.expect(SyntaxKind::Colon); - markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::TermItem); }); } @@ -208,22 +201,23 @@ fn equation(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Math, AtNewline::Continue, |p| { p.assert(SyntaxKind::Dollar); 
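+        // `RightBracket` is in the stop set so an unclosed equation inside a
+        // content block stops at the block's closing bracket.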
- math(p, |p| p.at(SyntaxKind::Dollar)); + math(p, syntax_set!(Dollar, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Dollar); }); p.wrap(m, SyntaxKind::Equation); } /// Parses the contents of a mathematical equation: `x^2 + 1`. -fn math(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { +fn math(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - math_exprs(p, stop); + math_exprs(p, stop_set); p.wrap(m, SyntaxKind::Math); } /// Parses a sequence of math expressions. -fn math_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { - while !p.end() && !stop(p) { +fn math_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { if p.at_set(set::MATH_EXPR) { math_expr(p); } else { @@ -398,7 +392,7 @@ fn math_delimited(p: &mut Parser) { let m = p.marker(); p.eat(); let m2 = p.marker(); - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if math_class(p.current_text()) == Some(MathClass::Closing) { p.wrap(m2, SyntaxKind::Math); p.eat(); @@ -466,7 +460,7 @@ fn math_args(p: &mut Parser) { // The number of math expressions per argument. let mut count = 0; - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if namable && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) && p.text[p.current_end()..].starts_with(':') @@ -566,22 +560,23 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option bool) { +fn code(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - code_exprs(p, stop); + code_exprs(p, stop_set); p.wrap(m, SyntaxKind::Code); } /// Parses a sequence of code expressions. -fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { - while !p.end() && !stop(p) { +fn code_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { p.with_nl_mode(AtNewline::ContextualContinue, |p| { if !p.at_set(set::CODE_EXPR) { p.unexpected(); return; } code_expr(p); - if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { + if !p.at_set(stop_set) && !p.eat_if(SyntaxKind::Semicolon) { p.expected("semicolon or line break"); if p.at(SyntaxKind::Label) { p.hint("labels can only be applied in markup mode"); @@ -596,7 +591,7 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { fn embedded_code_expr(p: &mut Parser) { p.enter_modes(LexMode::Code, AtNewline::Stop, |p| { p.assert(SyntaxKind::Hash); - if p.had_trivia() { + if p.had_trivia() || p.end() { p.expected("expression"); return; } @@ -606,7 +601,7 @@ fn embedded_code_expr(p: &mut Parser) { code_expr_prec(p, true, 0); // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.end() { + if !at { p.unexpected(); } @@ -777,7 +772,7 @@ fn code_block(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Code, AtNewline::Continue, |p| { p.assert(SyntaxKind::LeftBrace); - code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); + code(p, syntax_set!(RightBrace, RightBracket, RightParen, End)); p.expect_closing_delimiter(m, SyntaxKind::RightBrace); }); p.wrap(m, SyntaxKind::CodeBlock); @@ -788,7 +783,7 @@ fn content_block(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| { p.assert(SyntaxKind::LeftBracket); - markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + markup(p, true, true, syntax_set!(RightBracket, End)); p.expect_closing_delimiter(m, 
SyntaxKind::RightBracket); }); p.wrap(m, SyntaxKind::ContentBlock); diff --git a/crates/typst-syntax/src/reparser.rs b/crates/typst-syntax/src/reparser.rs index 7a9704906..c20d8314f 100644 --- a/crates/typst-syntax/src/reparser.rs +++ b/crates/typst-syntax/src/reparser.rs @@ -157,19 +157,13 @@ fn try_reparse( let new_range = shifted..shifted + new_len; let at_end = end == children.len(); - // Stop parsing early if this kind is encountered. - let stop_kind = match parent_kind { - Some(_) => SyntaxKind::RightBracket, - None => SyntaxKind::End, - }; - // Reparse! let reparsed = reparse_markup( text, new_range.clone(), &mut at_start, &mut nesting, - |kind| kind == stop_kind, + parent_kind.is_none(), ); if let Some(newborns) = reparsed { diff --git a/tests/ref/single-right-bracket.png b/tests/ref/single-right-bracket.png new file mode 100644 index 0000000000000000000000000000000000000000..9867424ddfa324301c82cc4dde8072d9dfaa899f GIT binary patch literal 118 zcmeAS@N?(olHy`uVBq!ia0vp^6+kS_0VEhE<%|3RQnsEhjv*Ddl7HAcG$dYm6xi*q zE9WORe@;PCL(QJexeYq|46;71IC}Wpqgv*ak0;k1UYz=M#nHuL{ZT$l3=9kQOnGR} Rb7?8aKu=dcmvv4FO#p!jD Date: Sun, 3 Nov 2024 20:35:21 -0500 Subject: [PATCH 18/18] 18. Restore list indent behavior --- crates/typst-syntax/src/lexer.rs | 8 +++--- crates/typst-syntax/src/parser.rs | 41 ++++++++++++---------------- tests/suite/model/list.typ | 45 +++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index d09c6f842..1314016fa 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -69,9 +69,11 @@ impl<'s> Lexer<'s> { self.newline } - /// The number of characters until the most recent newline. - pub fn column(&self) -> usize { - self.s.before().chars().rev().take_while(|&c| !is_newline(c)).count() + /// The number of characters until the most recent newline from an index. + pub fn column(&self, index: usize) -> usize { + let mut s = self.s; // Make a new temporary scanner (cheap). + s.jump(index); + s.before().chars().rev().take_while(|&c| !is_newline(c)).count() } } diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index b26cc0020..5fc621d6d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -1545,11 +1545,7 @@ struct Token { /// Information about a newline if present (currently only relevant in Markup). #[derive(Debug, Clone, Copy)] struct Newline { - /// The column of our token in its line. - /// - /// Note that this is actually the column of the first non-whitespace - /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) - /// because the block comment is the first non-space kind. + /// The column of the start of our token in its line. column: Option, /// Whether any of our newlines were paragraph breaks. parbreak: bool, @@ -1684,10 +1680,6 @@ impl<'s> Parser<'s> { /// The number of characters until the most recent newline from the current /// token, or 0 if it did not follow a newline. - /// - /// Note that this is actually the column of the first non-whitespace - /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) - /// because the block comment is the first non-space kind. 
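+    ///
+    /// (This is now the raw column of the token itself: every character
+    /// between the last newline and the token's start is counted, including
+    /// comment text.)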
     fn current_column(&self) -> usize {
         self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
     }
@@ -1852,29 +1844,30 @@ impl<'s> Parser<'s> {
         let (mut kind, mut node) = lexer.next();
         let mut n_trivia = 0;
         let mut had_newline = false;
-        let mut newline = Newline { column: None, parbreak: false };
+        let mut parbreak = false;

         while kind.is_trivia() {
-            if lexer.newline() {
-                // Newlines are always trivia.
-                had_newline = true;
-                newline.parbreak |= kind == SyntaxKind::Parbreak;
-                if lexer.mode() == LexMode::Markup {
-                    newline.column = Some(lexer.column());
-                }
-            }
+            had_newline |= lexer.newline(); // Newlines are always trivia.
+            parbreak |= kind == SyntaxKind::Parbreak;
             n_trivia += 1;
             nodes.push(node);
             start = lexer.cursor();
             (kind, node) = lexer.next();
         }
-        if had_newline && nl_mode.stop_at(newline, kind) {
-            // Insert a temporary `SyntaxKind::End` to halt the parser.
-            // The actual kind will be restored from `node` later.
-            kind = SyntaxKind::End;
-        }

-        let newline = had_newline.then_some(newline);
+        let newline = if had_newline {
+            let column = (lexer.mode() == LexMode::Markup).then(|| lexer.column(start));
+            let newline = Newline { column, parbreak };
+            if nl_mode.stop_at(newline, kind) {
+                // Insert a temporary `SyntaxKind::End` to halt the parser.
+                // The actual kind will be restored from `node` later.
+                kind = SyntaxKind::End;
+            }
+            Some(newline)
+        } else {
+            None
+        };
+
         Token { kind, node, n_trivia, newline, start, prev_end }
     }
 }
diff --git a/tests/suite/model/list.typ b/tests/suite/model/list.typ
index 46f4621f5..c3c123de1 100644
--- a/tests/suite/model/list.typ
+++ b/tests/suite/model/list.typ
@@ -34,6 +34,51 @@ _Shopping list_
 - C
 - D

+--- list-indent-trivia-nesting ---
+// Test indent nesting behavior with odd trivia (comments and spaces).
+
+#let indented = [
+- a
+  /**/- b
+/**/  - c
+  /*spanning
+  multiple
+  lines */ - d
+  - e
+/**/  - f
+/**/ - g
+]
+// Current behavior is that a list item's column counts every character before
+// its marker in the line (comments included), so the block comments here shift
+// the columns these lists start at.
+
+#let item = list.item
+#let manual = {
+  [ ]
+  item({
+    [a]
+    [ ]
+    item[b]
+    [ ]; [ ]
+    item({
+      [c]
+      [ ]; [ ]
+      item[d]
+    })
+    [ ]
+    item({
+      [e]
+      [ ]; [ ]
+      item[f]
+      [ ]; [ ]
+      item[g]
+    })
+  })
+  [ ]
+}
+
+#test(indented, manual)
+
 --- list-tabs ---
 // This works because tabs are used consistently.
 - A with 1 tab