From da83f33c4c015f927920b437610153c029c8291b Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Sun, 22 Sep 2024 17:38:38 -0400
Subject: [PATCH 01/18] 1. Add test-runner option to compare parser output

---
 tests/src/args.rs  | 23 ++++++++++++++++-
 tests/src/tests.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/tests/src/args.rs b/tests/src/args.rs
index 786733cce..e94986ced 100644
--- a/tests/src/args.rs
+++ b/tests/src/args.rs
@@ -43,7 +43,9 @@ pub struct CliArguments {
     /// Runs SVG export.
     #[arg(long)]
     pub svg: bool,
-    /// Displays the syntax tree.
+    /// Displays the syntax tree before running tests.
+    ///
+    /// Note: This is ignored if using '--parser-compare'.
     #[arg(long)]
     pub syntax: bool,
     /// Displays only one line per test, hiding details about failures.
@@ -55,6 +57,25 @@ pub struct CliArguments {
     /// How many threads to spawn when running the tests.
     #[arg(short = 'j', long)]
     pub num_threads: Option<usize>,
+    /// Changes testing behavior for debugging the parser: With no argument,
+    /// outputs the concrete syntax trees of tests as files in
+    /// 'tests/store/syntax/'. With a directory as argument, will treat it as a
+    /// reference of correct syntax tree files and will print which output
+    /// syntax trees differ (viewing the diffs is on you).
+    ///
+    /// This overrides the normal testing system. It parses, but does not run,
+    /// the test suite.
+    ///
+    /// You can generate a correct reference directory by running on a known
+    /// good commit and copying the generated outputs to a new directory.
+    /// `_things` may be a good location as it is in the top-level gitignore.
+    ///
+    /// You can view diffs in VS Code with: `code --diff <ref-dir>/<test>.syntax
+    /// tests/store/syntax/<test>.syntax`
+    #[arg(long)]
+    pub parser_compare: Option<Option<PathBuf>>,
+    // ^ I'm not using a subcommand here because then test patterns don't parse
+    // how you would expect and I'm too lazy to try to fix it.
 }

 impl CliArguments {
diff --git a/tests/src/tests.rs b/tests/src/tests.rs
index 940c9e3c4..eb2cfd796 100644
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@@ -7,7 +7,7 @@ mod logger;
 mod run;
 mod world;

-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::LazyLock;
 use std::time::Duration;

@@ -16,7 +16,9 @@ use parking_lot::Mutex;
 use rayon::iter::{ParallelBridge, ParallelIterator};

 use crate::args::{CliArguments, Command};
+use crate::collect::Test;
 use crate::logger::Logger;
+use crate::run::TestResult;

 /// The parsed command line arguments.
 static ARGS: LazyLock<CliArguments> = LazyLock::new(CliArguments::parse);
@@ -27,6 +29,9 @@ const SUITE_PATH: &str = "tests/suite";
 /// The directory where the full test results are stored.
 const STORE_PATH: &str = "tests/store";

+/// The directory where syntax trees are stored.
+const SYNTAX_PATH: &str = "tests/store/syntax";
+
 /// The directory where the reference images are stored.
 const REF_PATH: &str = "tests/ref";

@@ -89,6 +94,16 @@ fn test() {
         return;
     }

+    let parser_dirs = ARGS.parser_compare.clone().map(create_syntax_store);
+
+    let runner = |test: &Test| {
+        if let Some((live_path, ref_path)) = &parser_dirs {
+            run_parser_test(test, live_path, ref_path)
+        } else {
+            run::run(test)
+        }
+    };
+
     // Run the tests.
     let logger = Mutex::new(Logger::new(selected, skipped));
     std::thread::scope(|scope| {
@@ -112,7 +127,7 @@ fn test() {
             // to `typst::utils::Deferred` yielding.
             tests.iter().par_bridge().for_each(|test| {
                 logger.lock().start(test);
-                let result = std::panic::catch_unwind(|| run::run(test));
+                let result = std::panic::catch_unwind(|| runner(test));
                 logger.lock().end(test, result);
             });

@@ -142,3 +157,46 @@ fn undangle() {
         }
     }
 }
+
+fn create_syntax_store(ref_path: Option<PathBuf>) -> (&'static Path, Option<PathBuf>) {
+    if ref_path.as_ref().is_some_and(|p| !p.exists()) {
+        eprintln!("syntax reference path doesn't exist");
+        std::process::exit(1);
+    }
+
+    let live_path = Path::new(SYNTAX_PATH);
+    std::fs::remove_dir_all(live_path).ok();
+    std::fs::create_dir_all(live_path).unwrap();
+    (live_path, ref_path)
+}
+
+fn run_parser_test(
+    test: &Test,
+    live_path: &Path,
+    ref_path: &Option<PathBuf>,
+) -> TestResult {
+    let mut result = TestResult {
+        errors: String::new(),
+        infos: String::new(),
+        mismatched_image: false,
+    };
+
+    let syntax_file = live_path.join(format!("{}.syntax", test.name));
+    let tree = format!("{:#?}\n", test.source.root());
+    std::fs::write(syntax_file, &tree).unwrap();
+
+    let Some(ref_path) = ref_path else { return result };
+    let ref_file = ref_path.join(format!("{}.syntax", test.name));
+    match std::fs::read_to_string(&ref_file) {
+        Ok(ref_tree) => {
+            if tree != ref_tree {
+                result.errors = "differs".to_string();
+            }
+        }
+        Err(_) => {
+            result.errors = format!("missing reference: {}", ref_file.display());
+        }
+    }
+
+    result
+}
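As a usage sketch (not part of the patch): running `cargo test --workspace -- --parser-compare` writes each test's tree under `tests/store/syntax/`, while `cargo test --workspace -- --parser-compare=<ref-dir>` reports which trees differ from a reference directory. Each `.syntax` file is just the debug-formatted concrete syntax tree, so something along these lines (function name hypothetical) reproduces one file's contents:

```rust
/// Sketch: build the same text that `run_parser_test` writes to a
/// `<test>.syntax` file, via the public `typst_syntax::parse` entry point.
fn syntax_file_contents(source_text: &str) -> String {
    let root = typst_syntax::parse(source_text);
    // Mirrors the runner's `format!("{:#?}\n", test.source.root())`.
    format!("{root:#?}\n")
}
```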
From a2761ab75ac4038edff8be1c4dc66b3770e74d38 Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Sun, 22 Sep 2024 17:38:38 -0400
Subject: [PATCH 02/18] 2. Allow compiling test-runner with only typst_syntax

---
 Cargo.lock           |  1 +
 tests/Cargo.toml     | 32 +++++++++++++++++++++++-------
 tests/src/args.rs    |  4 ++++
 tests/src/collect.rs |  4 ++--
 tests/src/logger.rs  | 13 +++++++++++--
 tests/src/run.rs     | 18 +-----------------
 tests/src/tests.rs   | 18 ++++++++++++++----
 7 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4709fb5b4..5c148c117 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3018,6 +3018,7 @@ dependencies = [
  "typst-pdf",
  "typst-render",
  "typst-svg",
+ "typst-syntax",
  "unscanny",
  "walkdir",
 ]

diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index b1855b496..eed093eb6 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -11,14 +11,32 @@ name = "tests"
 path = "src/tests.rs"
 harness = false

+[features]
+# Allow just compiling the parser when only testing typst-syntax. To do so,
+# pass '--no-default-features' to 'cargo test'.
+default = [
+    # "typst-syntax" intentionally not present
+    "typst",
+    "typst-assets",
+    "typst-dev-assets",
+    "typst-library",
+    "typst-pdf",
+    "typst-render",
+    "typst-svg",
+]
+
 [dependencies]
-typst = { workspace = true }
-typst-assets = { workspace = true, features = ["fonts"] }
-typst-dev-assets = { workspace = true }
-typst-library = { workspace = true }
-typst-pdf = { workspace = true }
-typst-render = { workspace = true }
-typst-svg = { workspace = true }
+typst-syntax = { workspace = true }
+# Mark other Typst crates as optional so we can use '--no-default-features'
+# to decrease compile times for parser testing.
+typst = { workspace = true, optional = true } +typst-assets = { workspace = true, features = ["fonts"], optional = true } +typst-dev-assets = { workspace = true, optional = true } +typst-library = { workspace = true, optional = true } +typst-pdf = { workspace = true, optional = true } +typst-render = { workspace = true, optional = true } +typst-svg = { workspace = true, optional = true } clap = { workspace = true } comemo = { workspace = true } ecow = { workspace = true } diff --git a/tests/src/args.rs b/tests/src/args.rs index e94986ced..db5d1a9ba 100644 --- a/tests/src/args.rs +++ b/tests/src/args.rs @@ -66,6 +66,10 @@ pub struct CliArguments { /// This overrides the normal testing system. It parses, but does not run /// the test suite. /// + /// If `cargo test` is run with `--no-default-features`, then compiling will + /// not include Typst's core crates, only typst-syntax, greatly speeding up + /// debugging when changing the parser. + /// /// You can generate a correct reference directory by running on a known /// good commit and copying the generated outputs to a new directory. /// `_things` may be a good location as it is in the top-level gitignore. diff --git a/tests/src/collect.rs b/tests/src/collect.rs index 80e5e5a8b..5c7327f13 100644 --- a/tests/src/collect.rs +++ b/tests/src/collect.rs @@ -6,8 +6,8 @@ use std::str::FromStr; use std::sync::LazyLock; use ecow::{eco_format, EcoString}; -use typst::syntax::package::PackageVersion; -use typst::syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath}; +use typst_syntax::package::PackageVersion; +use typst_syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath}; use unscanny::Scanner; /// Collects all tests from all files. diff --git a/tests/src/logger.rs b/tests/src/logger.rs index 45c9f0981..48bad451b 100644 --- a/tests/src/logger.rs +++ b/tests/src/logger.rs @@ -2,7 +2,16 @@ use std::io::{self, IsTerminal, StderrLock, Write}; use std::time::{Duration, Instant}; use crate::collect::Test; -use crate::run::TestResult; + +/// The result of running a single test. +pub struct TestResult { + /// The error log for this test. If empty, the test passed. + pub errors: String, + /// The info log for this test. + pub infos: String, + /// Whether the image was mismatched. + pub mismatched_image: bool, +} /// Receives status updates by individual test runs. pub struct Logger<'a> { @@ -58,7 +67,7 @@ impl<'a> Logger<'a> { } }; - if result.is_ok() { + if result.errors.is_empty() { self.passed += 1; } else { self.failed += 1; diff --git a/tests/src/run.rs b/tests/src/run.rs index caa078c4b..1ea19a16a 100644 --- a/tests/src/run.rs +++ b/tests/src/run.rs @@ -12,6 +12,7 @@ use typst::WorldExt; use typst_pdf::PdfOptions; use crate::collect::{FileSize, NoteKind, Test}; +use crate::logger::TestResult; use crate::world::TestWorld; /// Runs a single test. @@ -21,23 +22,6 @@ pub fn run(test: &Test) -> TestResult { Runner::new(test).run() } -/// The result of running a single test. -pub struct TestResult { - /// The error log for this test. If empty, the test passed. - pub errors: String, - /// The info log for this test. - pub infos: String, - /// Whether the image was mismatched. - pub mismatched_image: bool, -} - -impl TestResult { - /// Whether the test passed. - pub fn is_ok(&self) -> bool { - self.errors.is_empty() - } -} - /// Write a line to a log sink, defaulting to the test's error log. macro_rules! 
log { (into: $sink:expr, $($tts:tt)*) => { diff --git a/tests/src/tests.rs b/tests/src/tests.rs index eb2cfd796..2b09b29c0 100644 --- a/tests/src/tests.rs +++ b/tests/src/tests.rs @@ -1,10 +1,16 @@ //! Typst's test runner. +#![cfg_attr(not(feature = "default"), allow(dead_code, unused_imports))] + mod args; mod collect; -mod custom; mod logger; + +#[cfg(feature = "default")] +mod custom; +#[cfg(feature = "default")] mod run; +#[cfg(feature = "default")] mod world; use std::path::{Path, PathBuf}; @@ -17,8 +23,7 @@ use rayon::iter::{ParallelBridge, ParallelIterator}; use crate::args::{CliArguments, Command}; use crate::collect::Test; -use crate::logger::Logger; -use crate::run::TestResult; +use crate::logger::{Logger, TestResult}; /// The parsed command line arguments. static ARGS: LazyLock = LazyLock::new(CliArguments::parse); @@ -95,12 +100,17 @@ fn test() { } let parser_dirs = ARGS.parser_compare.clone().map(create_syntax_store); + #[cfg(not(feature = "default"))] + let parser_dirs = parser_dirs.or_else(|| Some(create_syntax_store(None))); let runner = |test: &Test| { if let Some((live_path, ref_path)) = &parser_dirs { run_parser_test(test, live_path, ref_path) } else { - run::run(test) + #[cfg(feature = "default")] + return run::run(test); + #[cfg(not(feature = "default"))] + unreachable!(); } }; From a764aa419209d2d46d27d46c00c46cc12a371f08 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 03/18] 3. Add typst-syntax README and parser comments --- crates/typst-syntax/README.md | 40 +++++++ crates/typst-syntax/src/parser.rs | 170 +++++++++++++++++++++++++++--- 2 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 crates/typst-syntax/README.md diff --git a/crates/typst-syntax/README.md b/crates/typst-syntax/README.md new file mode 100644 index 000000000..ced4096ef --- /dev/null +++ b/crates/typst-syntax/README.md @@ -0,0 +1,40 @@ +# typst-syntax + +Welcome to the Typst Syntax crate! This crate manages the syntactical structure +of Typst by holding some core abstractions like assigning source file ids, +parsing Typst syntax, creating an Abstract Syntax Tree (AST), initializing +source "spans" (for linking AST elements to their outputs in a document), and +syntax highlighting. + +Below are quick descriptions of the files you might be editing if you find +yourself here :) + +- `lexer.rs`: The lexical foundation of the parser, which converts a string of + characters into tokens. +- `parser.rs`: The main parser definition, preparing a Concrete Syntax Tree made + of nested vectors of `SyntaxNode`s. +- `reparser.rs`: The algorithm for reparsing the minimal required amount of + source text for efficient incremental compilation. +- `ast.rs`: The conversion layer between the Concrete Syntax Tree of the parser + and the Abstract Syntax Tree used for code evaluation. +- `node.rs` & `span.rs`: The underlying data structure for the Concrete Syntax + Tree and the definitions of source spans used for efficiently pointing to a + syntax node in things like diagnostics. +- `kind.rs` & `set.rs`: An enum with all syntactical tokens and nodes and + bit-set data structure for sets of `SyntaxKind`s. +- `highlight.rs`: Extracting of syntax highlighting information out of the + Concrete Syntax Tree (and outputting as HTML). +- `path.rs`, `file.rs`, `package.rs`: The system for interning project and + package paths as unique file IDs and resolving them in a virtual filesystem + (not actually for _opening_ files). 
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 8c783ffed..afa47257f 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -10,7 +10,7 @@ use crate::{
     ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode,
 };

-/// Parses a source file.
+/// Parses a source file as top-level markup.
 pub fn parse(text: &str) -> SyntaxNode {
     let _scope = typst_timing::TimingScope::new("parse");
     let mut p = Parser::new(text, 0, LexMode::Markup);
@@ -37,7 +37,7 @@ pub fn parse_math(text: &str) -> SyntaxNode {
     p.finish().into_iter().next().unwrap()
 }

-/// Parses the contents of a file or content block.
+/// Parses markup expressions until a stop condition is met.
 fn markup(
     p: &mut Parser,
     mut at_start: bool,
@@ -96,7 +96,7 @@ pub(super) fn reparse_markup(
     (p.balanced && p.current_start() == range.end).then(|| p.finish())
 }

-/// Parses a single markup expression: This includes markup elements like
+/// Parses a single markup expression. This includes markup elements like
 /// spaces, text, and headings, and embedded code expressions.
 fn markup_expr(p: &mut Parser, at_start: &mut bool) {
     match p.current() {
@@ -414,6 +414,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
     }
 }

+/// Try to parse delimiters based on the current token's unicode math class.
 fn maybe_delimited(p: &mut Parser) -> bool {
     let open = math_class(p.current_text()) == Some(MathClass::Opening);
     if open {
@@ -422,6 +423,7 @@ fn maybe_delimited(p: &mut Parser) -> bool {
     open
 }

+/// Parse matched delimiters in math: `[x + y]`.
 fn math_delimited(p: &mut Parser) {
     let m = p.marker();
     p.eat();
@@ -444,6 +446,8 @@ fn math_delimited(p: &mut Parser) {
     p.wrap(m, SyntaxKind::Math);
 }

+/// Remove one set of parentheses (if any) from a previously parsed expression
+/// by converting to non-expression SyntaxKinds.
 fn math_unparen(p: &mut Parser, m: Marker) {
     let Some(node) = p.nodes.get_mut(m.0) else { return };
     if node.kind() != SyntaxKind::MathDelimited {
@@ -460,6 +464,10 @@ fn math_unparen(p: &mut Parser, m: Marker) {
     node.convert_to_kind(SyntaxKind::Math);
 }

+/// The unicode math class of a string. Only returns `Some` if `text` has
+/// exactly one unicode character or is a math shorthand string (currently just
+/// `[|`, `||`, `|]`) and then only returns `Some` if there is a math class
+/// defined for that character.
 fn math_class(text: &str) -> Option<MathClass> {
     match text {
         "[|" => return Some(MathClass::Opening),
@@ -475,6 +483,7 @@ fn math_class(text: &str) -> Option<MathClass> {
         .and_then(unicode_math_class::class)
 }

+/// Precedence and wrapper kinds for the binary math operators.
 fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> {
     match kind {
         SyntaxKind::Underscore => {
@@ -490,6 +499,7 @@ fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usiz
     }
 }

+/// Parse an argument list in math: `(a, b; c, d; size: #50%)`.
fn math_args(p: &mut Parser) {
     let m = p.marker();
     p.convert(SyntaxKind::LeftParen);
@@ -629,7 +639,7 @@ fn code_expr(p: &mut Parser) {
     code_expr_prec(p, false, 0)
 }

-/// Parses a code expression embedded in markup or math.
+/// Parses an atomic code expression embedded in markup or math.
 fn embedded_code_expr(p: &mut Parser) {
     p.enter_newline_mode(NewlineMode::Stop);
     p.enter(LexMode::Code);
@@ -1130,6 +1140,21 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
         seen: HashSet::new(),
     };

+    // An edge case with parens is whether we can interpret a leading spread
+    // expression as a dictionary, e.g. if we want `(..dict1, ..dict2)` to join
+    // the two dicts.
+    //
+    // The issue is that we decide on the type of the parenthesized expression
+    // here in the parser by the `SyntaxKind` we wrap with, instead of in eval
+    // based on the type of the spread item.
+    //
+    // The current fix is that we allow a leading colon to force the
+    // parenthesized value into a dict:
+    // - `(..arr1, ..arr2)` is wrapped as an `Array`.
+    // - `(: ..dict1, ..dict2)` is wrapped as a `Dict`.
+    //
+    // This does allow some unexpected expressions, such as `(: key: val)`, but
+    // it's currently intentional.
     if p.eat_if(SyntaxKind::Colon) {
         state.kind = Some(SyntaxKind::Dict);
         state.maybe_just_parens = false;
@@ -1165,8 +1190,13 @@
 /// State for array/dictionary parsing.
 struct GroupState {
     count: usize,
+    /// Whether this is just a single expression in parens: `(a)`. Single
+    /// element arrays require an explicit comma: `(a,)`, unless we're
+    /// spreading: `(..a)`.
     maybe_just_parens: bool,
+    /// The `SyntaxKind` to wrap as (if we've figured it out yet).
     kind: Option<SyntaxKind>,
+    /// Store named arguments so we can give an error if they're repeated.
     seen: HashSet<EcoString>,
 }
@@ -1484,32 +1514,90 @@ fn pattern_leaf<'s>(
     }
 }

-/// Manages parsing of a stream of tokens.
+/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s.
+///
+/// The implementation presents an interface that investigates a `current` token
+/// and can take one of the following actions:
+///
+/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf
+///    node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the
+///    lexer.
+/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into
+///    an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`].
+/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when
+///    something expected is missing or something unexpected is found.
+///
+/// Overall the parser produces a nested tree of SyntaxNodes as a "_Concrete_
+/// Syntax Tree." The raw Concrete Syntax Tree should contain the entire source
+/// text, and is used as-is for e.g. syntax highlighting and IDE features. In
+/// `ast.rs` the CST is interpreted as a lazy view over an "_Abstract_ Syntax
+/// Tree." The AST module skips over irrelevant tokens -- whitespace, comments,
+/// code parens, commas in function args, etc. -- as it iterates through the
+/// tree.
+///
+/// ### Modes
+///
+/// The parser manages the transitions between the three modes of Typst through
+/// stacks of [lexer modes](`LexMode`) and [newline modes](`NewlineMode`).
+///
+/// The lexer modes map to the three Typst modes and are stored in the lexer,
+/// changing which `SyntaxKind`s it will generate. The mode also affects how the
+/// parser treats trivia tokens (comments and whitespace).
In Markup, trivia is
+/// handled manually to deal with list indentation and must be explicitly eaten.
+/// In Code and Math, trivia is managed internally and is implicitly eaten by
+/// pushing onto the end of the `nodes` vector until a non-trivia kind is found.
+///
+/// The newline mode is used in Code to determine whether a newline should end
+/// the current expression. If so, the parser temporarily changes the current
+/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode
+/// the original `SyntaxKind` is restored.
 struct Parser<'s> {
+    /// The source text shared with the lexer.
     text: &'s str,
+    /// A lexer over the source text with multiple modes. Defines the boundaries
+    /// of tokens and determines their [`SyntaxKind`].
     lexer: Lexer<'s>,
+    /// The index into `text` of the end of the previous token.
     prev_end: usize,
+    /// The index into `text` of the start of our current token (the end is
+    /// stored as the lexer's cursor).
     current_start: usize,
+    /// The [`SyntaxKind`] of the current token.
     current: SyntaxKind,
+    /// Whether the parser has the expected set of open/close delimiters. This
+    /// only ever transitions from `true` to `false`.
     balanced: bool,
+    /// Nodes representing the concrete syntax tree of previously parsed text.
+    /// In Code and Math, includes previously parsed trivia, but not `current`.
     nodes: Vec<SyntaxNode>,
+    /// Stack of lexer modes to be pushed/popped. The current mode is implicitly
+    /// stored in the lexer.
     modes: Vec<LexMode>,
+    /// Stack of newline modes to be pushed/popped. The current mode is the tail
+    /// of the vector.
     newline_modes: Vec<NewlineMode>,
+    /// Parser checkpoints for a given text index. Used for efficient parser
+    /// backtracking similar to packrat parsing. See comments above in
+    /// [`expr_with_paren`].
     memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
+    /// The stored parse results at each checkpoint.
     memo_arena: Vec<SyntaxNode>,
 }

-/// How to proceed with parsing when seeing a newline.
+/// How to proceed with parsing when at a newline in Code.
 #[derive(Clone)]
 enum NewlineMode {
-    /// Stop always.
+    /// Stop at any newline.
     Stop,
-    /// Proceed if there is no continuation with `else` or `.`
+    /// Continue only if there is no continuation with `else` or `.`.
     Contextual,
-    /// Just proceed like with normal whitespace.
+    /// Continue at newlines.
     Continue,
 }

+/// A marker representing a node's position in the parser. Mainly used for
+/// wrapping, but can also index into the parser to access the node, like
+/// `p[m]`.
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 struct Marker(usize);

@@ -1523,6 +1611,7 @@
 struct Checkpoint<'s> {
     lexer: Lexer<'s>,
     prev_end: usize,
     current_start: usize,
     current: SyntaxKind,
     nodes: usize,
 }

 impl<'s> Parser<'s> {
+    /// Create a new parser starting from the given text offset and lexer mode.
     fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
         let mut lexer = Lexer::new(text, mode);
         lexer.jump(offset);
@@ -1542,52 +1631,68 @@
     }

+    /// Consume the parser, yielding the full vector of parsed SyntaxNodes.
     fn finish(self) -> Vec<SyntaxNode> {
         self.nodes
     }

+    /// The offset into `text` of the previous token's end.
     fn prev_end(&self) -> usize {
         self.prev_end
     }

+    /// Similar to a `peek()` function: returns the `kind` of the next token to
+    /// be eaten.
     fn current(&self) -> SyntaxKind {
         self.current
     }

+    /// The offset into `text` of the current token's start.
     fn current_start(&self) -> usize {
         self.current_start
     }

+    /// The offset into `text` of the current token's end.
     fn current_end(&self) -> usize {
         self.lexer.cursor()
     }

+    /// The current token's text.
fn current_text(&self) -> &'s str {
         &self.text[self.current_start..self.current_end()]
     }

+    /// Whether the current token is a given [`SyntaxKind`].
     fn at(&self, kind: SyntaxKind) -> bool {
         self.current == kind
     }

+    /// Whether the current token is contained in a [`SyntaxSet`].
     fn at_set(&self, set: SyntaxSet) -> bool {
         set.contains(self.current)
     }

+    /// Whether we're at the end of the token stream.
+    ///
+    /// Note: This might be a fake end due to the newline mode.
     fn end(&self) -> bool {
         self.at(SyntaxKind::End)
     }

+    /// If we're at the given `kind` with no preceding trivia tokens.
     fn directly_at(&self, kind: SyntaxKind) -> bool {
         self.current == kind && self.prev_end == self.current_start
     }

+    /// Eat the current token by saving it to the `nodes` vector, then move
+    /// the lexer forward to prepare a new token.
     fn eat(&mut self) {
         self.save();
         self.lex();
         self.skip();
     }

+    /// Eat the current node and return a reference for in-place mutation.
     #[track_caller]
     fn eat_and_get(&mut self) -> &mut SyntaxNode {
         let offset = self.nodes.len();
@@ -1597,9 +1702,9 @@
         &mut self.nodes[offset]
     }

-    /// Eats if at `kind`.
+    /// Eat the token if at `kind`. Returns `true` if eaten.
     ///
-    /// Note: In math and code mode, this will ignore trivia in front of the
+    /// Note: In Math and Code, this will ignore trivia in front of the
     /// `kind`. To forbid skipping trivia, consider using `eat_if_direct`.
     fn eat_if(&mut self, kind: SyntaxKind) -> bool {
         let at = self.at(kind);
         if at {
@@ -1609,7 +1714,8 @@
         at
     }

-    /// Eats only if currently at the start of `kind`.
+    /// Eat the token only if at `kind` with no preceding trivia. Returns `true`
+    /// if eaten.
     fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool {
         let at = self.directly_at(kind);
         if at {
@@ -1618,30 +1724,39 @@
         at
     }

+    /// Assert that we are at the given [`SyntaxKind`] and eat it. This should
+    /// be used when moving between functions that expect to start with a
+    /// specific token.
     #[track_caller]
     fn assert(&mut self, kind: SyntaxKind) {
         assert_eq!(self.current, kind);
         self.eat();
     }

+    /// Convert the current token's [`SyntaxKind`] and eat it.
     fn convert(&mut self, kind: SyntaxKind) {
         self.current = kind;
         self.eat();
     }

+    /// Whether the current token is a newline, only used in Markup.
     fn newline(&mut self) -> bool {
         self.lexer.newline()
     }

+    /// The number of characters until the most recent newline in `text`.
     fn column(&self, at: usize) -> usize {
         self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
     }

+    /// A marker that will point to the current token in the parser once it's
+    /// been eaten.
     fn marker(&self) -> Marker {
         Marker(self.nodes.len())
     }

-    /// Get a marker after the last non-trivia node.
+    /// A marker that will point to the first trivia before this token in the
+    /// parser (or the token itself if no trivia precede it).
     fn before_trivia(&self) -> Marker {
         let mut i = self.nodes.len();
         if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
@@ -1658,6 +1773,7 @@
         m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
     }

+    /// Iterate over the non-trivia tokens following the marker.
     #[track_caller]
     fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> {
         self.nodes[m.0..]
             .iter_mut()
             .filter(|child| !child.kind().is_error() && !child.kind().is_trivia())
     }

+    /// Wrap the nodes from a marker up to (but excluding) the current token in
+    /// a new [inner node](`SyntaxNode::inner`) of the given kind.
This is an + /// easy interface for creating nested syntax nodes _after_ having parsed + /// their children. fn wrap(&mut self, from: Marker, kind: SyntaxKind) { self.wrap_within(from, self.before_trivia(), kind); } + /// Wrap including any trailing trivia nodes. fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { self.wrap_within(from, Marker(self.nodes.len()), kind) } @@ -1681,11 +1802,14 @@ impl<'s> Parser<'s> { self.nodes.insert(from, SyntaxNode::inner(kind, children)); } + /// Enter a new [`LexMode`] that will affect subsequent tokens (does not + /// modify the current token). fn enter(&mut self, mode: LexMode) { self.modes.push(self.lexer.mode()); self.lexer.set_mode(mode); } + /// Exit the current [`LexMode`], possibly re-lexing the current token. fn exit(&mut self) { let mode = self.modes.pop().unwrap(); if mode != self.lexer.mode() { @@ -1697,10 +1821,13 @@ impl<'s> Parser<'s> { } } + /// Enter a new [`NewlineMode`] that will affect subsequent tokens (does not + /// modify the current token). fn enter_newline_mode(&mut self, stop: NewlineMode) { self.newline_modes.push(stop); } + /// Exit the current [`NewlineMode`], possibly re-lexing the current token. fn exit_newline_mode(&mut self) { self.unskip(); self.newline_modes.pop(); @@ -1709,6 +1836,7 @@ impl<'s> Parser<'s> { self.skip(); } + /// Save a checkpoint of the parser state. fn checkpoint(&self) -> Checkpoint<'s> { Checkpoint { lexer: self.lexer.clone(), @@ -1719,6 +1847,7 @@ impl<'s> Parser<'s> { } } + /// Reset the parser from a checkpoint. fn restore(&mut self, checkpoint: Checkpoint<'s>) { self.lexer = checkpoint.lexer; self.prev_end = checkpoint.prev_end; @@ -1727,6 +1856,7 @@ impl<'s> Parser<'s> { self.nodes.truncate(checkpoint.nodes); } + /// Move past trivia nodes in Code/Math. fn skip(&mut self) { if self.lexer.mode() != LexMode::Markup { while self.current.is_trivia() { @@ -1736,6 +1866,8 @@ impl<'s> Parser<'s> { } } + /// Move the parser back to the start of this token or its leading trivia + /// (in Code/Math). fn unskip(&mut self) { if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) { @@ -1747,6 +1879,7 @@ impl<'s> Parser<'s> { } } + /// Save the current token to the `nodes` vector as an Inner or Error node. fn save(&mut self) { let text = self.current_text(); if self.at(SyntaxKind::Error) { @@ -1761,21 +1894,24 @@ impl<'s> Parser<'s> { } } + /// Find the kind of the next non-trivia token in the lexer. fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { let next = lexer.next(); - // Loop is terminable, because SyntaxKind::End is not a trivia. + // Loop is terminable, because `SyntaxKind::End` is not a trivia. if !next.is_trivia() { break next; } } } + /// Move the lexer forward and prepare the current token. In Code, this + /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. fn lex(&mut self) { self.current_start = self.lexer.cursor(); self.current = self.lexer.next(); - // Special cases to handle newlines in code mode. + // Special cases to handle newlines in Code. if self.lexer.mode() == LexMode::Code && self.lexer.newline() && match self.newline_modes.last() { @@ -1794,7 +1930,7 @@ impl<'s> Parser<'s> { } impl<'s> Parser<'s> { - /// Consume the given syntax `kind` or produce an error. + /// Consume the given `kind` or produce an error. 
fn expect(&mut self, kind: SyntaxKind) -> bool { let at = self.at(kind); if at { @@ -1833,7 +1969,7 @@ impl<'s> Parser<'s> { self.nodes.insert(m.0, error); } - /// Produce a hint. + /// Add a hint to a trailing error. fn hint(&mut self, hint: &str) { let m = self.before_trivia(); if let Some(error) = self.nodes.get_mut(m.0 - 1) { From 54eadb65a9a9133b64a3ace7605f3f2852a69373 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sat, 26 Oct 2024 04:03:54 -0400 Subject: [PATCH 04/18] 4. Rename convert to convert_and_eat --- crates/typst-syntax/src/parser.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index afa47257f..50277fab9 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -133,7 +133,7 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::ListMarker | SyntaxKind::EnumMarker | SyntaxKind::TermMarker - | SyntaxKind::Colon => p.convert(SyntaxKind::Text), + | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text), _ => { p.unexpected(); @@ -287,8 +287,8 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) && is_ident(&p.text[start..end]) } { - p.convert(SyntaxKind::Dot); - p.convert(SyntaxKind::Ident); + p.convert_and_eat(SyntaxKind::Dot); + p.convert_and_eat(SyntaxKind::Ident); p.wrap(m, SyntaxKind::FieldAccess); } if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" @@ -502,7 +502,7 @@ fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usiz /// Parse an argument list in math: `(a, b; c, d; size: #50%)`. fn math_args(p: &mut Parser) { let m = p.marker(); - p.convert(SyntaxKind::LeftParen); + p.convert_and_eat(SyntaxKind::LeftParen); let mut namable = true; let mut named = None; @@ -515,8 +515,8 @@ fn math_args(p: &mut Parser) { && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) && p.text[p.current_end()..].starts_with(':') { - p.convert(SyntaxKind::Ident); - p.convert(SyntaxKind::Colon); + p.convert_and_eat(SyntaxKind::Ident); + p.convert_and_eat(SyntaxKind::Colon); named = Some(arg); arg = p.marker(); array = p.marker(); @@ -527,7 +527,7 @@ fn math_args(p: &mut Parser) { ";" => { maybe_wrap_in_math(p, arg, named); p.wrap(array, SyntaxKind::Array); - p.convert(SyntaxKind::Semicolon); + p.convert_and_eat(SyntaxKind::Semicolon); array = p.marker(); arg = p.marker(); namable = true; @@ -537,7 +537,7 @@ fn math_args(p: &mut Parser) { } "," => { maybe_wrap_in_math(p, arg, named); - p.convert(SyntaxKind::Comma); + p.convert_and_eat(SyntaxKind::Comma); arg = p.marker(); namable = true; if named.is_some() { @@ -570,7 +570,7 @@ fn math_args(p: &mut Parser) { } if p.at(SyntaxKind::Text) && p.current_text() == ")" { - p.convert(SyntaxKind::RightParen); + p.convert_and_eat(SyntaxKind::RightParen); } else { p.expected("closing paren"); p.balanced = false; @@ -1734,7 +1734,7 @@ impl<'s> Parser<'s> { } /// Convert the current token's [`SyntaxKind`] and eat it. - fn convert(&mut self, kind: SyntaxKind) { + fn convert_and_eat(&mut self, kind: SyntaxKind) { self.current = kind; self.eat(); } From 16cc7eb472c91470ae91f78ea67943b34be203f8 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 21:54:43 -0400 Subject: [PATCH 05/18] 5. 
Refactor parser memoization to localize functionality --- crates/typst-syntax/src/parser.rs | 153 ++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 51 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 50277fab9..2a7e4611c 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -1057,29 +1057,25 @@ fn return_stmt(p: &mut Parser) { /// An expression that starts with a parenthesis. fn expr_with_paren(p: &mut Parser, atomic: bool) { - // If we've seen this position before and have a memoized result, just use - // it. See below for more explanation about this memoization. - let start = p.current_start(); - if let Some((range, end_point)) = p.memo.get(&start).cloned() { - // Restore the end point first, so that it doesn't truncate our freshly - // pushed nodes. If the current length of `p.nodes` doesn't match what - // we had in the memoized run, this might otherwise happen. - p.restore(end_point); - p.nodes.extend(p.memo_arena[range].iter().cloned()); + if atomic { + // Atomic expressions aren't modified by operators that follow them, so + // our first guess of array/dict will be correct. + parenthesized_or_array_or_dict(p); return; } - let m = p.marker(); - let checkpoint = p.checkpoint(); + // If we've seen this position before and have a memoized result, restore it + // and return. Otherwise, get a key to this position and a checkpoint to + // restart from in case we make a wrong prediction. + let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return }; + // The node length from when we restored. + let prev_len = checkpoint.node_len; // When we reach a '(', we can't be sure what it is. First, we attempt to // parse as a simple parenthesized expression, array, or dictionary as // these are the most likely things. We can handle all of those in a single // pass. let kind = parenthesized_or_array_or_dict(p); - if atomic { - return; - } // If, however, '=>' or '=' follows, we must backtrack and reparse as either // a parameter list or a destructuring. To be able to do that, we created a @@ -1100,6 +1096,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { // case running time of O(2n). if p.at(SyntaxKind::Arrow) { p.restore(checkpoint); + let m = p.marker(); params(p); if !p.expect(SyntaxKind::Arrow) { return; @@ -1108,6 +1105,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { p.wrap(m, SyntaxKind::Closure); } else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized { p.restore(checkpoint); + let m = p.marker(); destructuring_or_parenthesized(p, true, &mut HashSet::new()); if !p.expect(SyntaxKind::Eq) { return; @@ -1119,9 +1117,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) { } // Memoize result if we backtracked. - let offset = p.memo_arena.len(); - p.memo_arena.extend(p.nodes[m.0..].iter().cloned()); - p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint())); + p.memoize_parsed_nodes(memo_key, prev_len); } /// Parses either @@ -1456,6 +1452,9 @@ fn destructuring_item<'s>( // Parse a normal positional pattern or a destructuring key. let was_at_pat = p.at_set(set::PATTERN); + + // We must use a full checkpoint here (can't just clone the lexer) because + // there may be trivia between the identifier and the colon we need to skip. let checkpoint = p.checkpoint(); if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) { p.restore(checkpoint); @@ -1579,9 +1578,7 @@ struct Parser<'s> { /// Parser checkpoints for a given text index. 
Used for efficient parser
     /// backtracking similar to packrat parsing. See comments above in
     /// [`expr_with_paren`].
-    memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
-    /// The stored parse results at each checkpoint.
-    memo_arena: Vec<SyntaxNode>,
+    memo: MemoArena<'s>,
 }

 /// How to proceed with parsing when at a newline in Code.
@@ -1601,15 +1598,6 @@
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 struct Marker(usize);

-#[derive(Clone)]
-struct Checkpoint<'s> {
-    lexer: Lexer<'s>,
-    prev_end: usize,
-    current_start: usize,
-    current: SyntaxKind,
-    nodes: usize,
-}
-
 impl<'s> Parser<'s> {
     /// Create a new parser starting from the given text offset and lexer mode.
     fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
         let mut lexer = Lexer::new(text, mode);
         lexer.jump(offset);
@@ -1626,8 +1614,7 @@
             nodes: vec![],
             modes: vec![],
             newline_modes: vec![],
-            memo: HashMap::new(),
-            memo_arena: vec![],
+            memo: Default::default(),
         }
     }
@@ -1836,26 +1823,6 @@
         self.skip();
     }

-    /// Save a checkpoint of the parser state.
-    fn checkpoint(&self) -> Checkpoint<'s> {
-        Checkpoint {
-            lexer: self.lexer.clone(),
-            prev_end: self.prev_end,
-            current_start: self.current_start,
-            current: self.current,
-            nodes: self.nodes.len(),
-        }
-    }
-
-    /// Reset the parser from a checkpoint.
-    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
-        self.lexer = checkpoint.lexer;
-        self.prev_end = checkpoint.prev_end;
-        self.current_start = checkpoint.current_start;
-        self.current = checkpoint.current;
-        self.nodes.truncate(checkpoint.nodes);
-    }
-
     /// Move past trivia nodes in Code/Math.
     fn skip(&mut self) {
         if self.lexer.mode() != LexMode::Markup {
@@ -1929,6 +1896,90 @@
     }
 }

+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena<'s> {
+    /// A single arena of previously parsed nodes (to reduce allocations).
+    /// Memoized ranges refer to unique sections of the arena.
+    arena: Vec<SyntaxNode>,
+    /// A map from the parser's current position to a range of previously parsed
+    /// nodes in the arena and a checkpoint of the parser's state. These allow
+    /// us to reset the parser to avoid parsing the same location again.
+    memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
+
+/// A checkpoint of the parser which can fully restore it to a previous state.
+#[derive(Clone)]
+struct Checkpoint<'s> {
+    lexer: Lexer<'s>,
+    prev_end: usize,
+    current_start: usize,
+    current: SyntaxKind,
+    node_len: usize,
+}
+
+impl<'s> Parser<'s> {
+    /// Store the already parsed nodes and the parser state into the memo map by
+    /// extending the arena and storing the extended range and a checkpoint.
+    fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+        let memo_start = self.memo.arena.len();
+        self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
+        let arena_range = memo_start..self.memo.arena.len();
+        self.memo.memo_map.insert(key, (arena_range, self.checkpoint()));
+    }
+
+    /// Try to load a memoized result, return `None` if we did or `Some` (with a
+    /// checkpoint and a key for the memo map) if we didn't.
+    fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> {
+        // We use the starting index of the current token as our key.
+        let key: MemoKey = self.current_start();
+        match self.memo.memo_map.get(&key).cloned() {
+            Some((range, checkpoint)) => {
+                self.nodes.extend_from_slice(&self.memo.arena[range]);
+                // It's important that we don't truncate the nodes vector since
+                // it may have grown or shrunk (due to other memoization or
+                // error reporting) since we made this checkpoint.
+                self.restore_partial(checkpoint);
+                None
+            }
+            None => Some((key, self.checkpoint())),
+        }
+    }
+
+    /// Restore the parser to the state at a checkpoint.
+    fn restore(&mut self, checkpoint: Checkpoint<'s>) {
+        self.nodes.truncate(checkpoint.node_len);
+        self.restore_partial(checkpoint);
+    }
+
+    /// Restore parts of the checkpoint excluding the nodes vector.
+    fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) {
+        self.lexer = checkpoint.lexer;
+        self.prev_end = checkpoint.prev_end;
+        self.current_start = checkpoint.current_start;
+        self.current = checkpoint.current;
+    }
+
+    /// Save a checkpoint of the parser state.
+    fn checkpoint(&self) -> Checkpoint<'s> {
+        Checkpoint {
+            lexer: self.lexer.clone(),
+            prev_end: self.prev_end,
+            current_start: self.current_start,
+            current: self.current,
+            node_len: self.nodes.len(),
+        }
+    }
+}
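To make the motivation concrete, a hypothetical stress test (not part of this series) for the case the memo arena exists to handle: every `(` may turn out to begin a closure parameter list, so without memoization each backtrack would reparse everything that follows:

```rust
#[test]
fn deeply_nested_parens_terminate_quickly() {
    // Each `(` is first parsed as a parenthesized expression/array/dict and
    // only reinterpreted on seeing `=>` or `=`. The memo map keeps this
    // near-linear instead of exponential in the nesting depth.
    let depth = 512;
    let src = format!("#{}1{}", "(".repeat(depth), ")".repeat(depth));
    let _root = typst_syntax::parse(&src);
}
```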
From 01186779cd92a7bad6ebff9154a85c6ab86cf7cb Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski
Date: Mon, 21 Oct 2024 21:24:44 -0400
Subject: [PATCH 06/18] 6. Reduce size of memoization map state

---
 crates/typst-syntax/src/parser.rs | 57 ++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 2a7e4611c..19e8adbbb 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -1578,7 +1578,7 @@ struct Parser<'s> {
     /// Parser checkpoints for a given text index. Used for efficient parser
     /// backtracking similar to packrat parsing. See comments above in
     /// [`expr_with_paren`].
-    memo: MemoArena<'s>,
+    memo: MemoArena,
 }

 /// How to proceed with parsing when at a newline in Code.
@@ -1901,14 +1901,14 @@
 /// This is the same idea as packrat parsing, but we use it only in the limited
 /// case of parenthesized structures. See [`expr_with_paren`] for more.
 #[derive(Default)]
-struct MemoArena<'s> {
+struct MemoArena {
     /// A single arena of previously parsed nodes (to reduce allocations).
     /// Memoized ranges refer to unique sections of the arena.
     arena: Vec<SyntaxNode>,
     /// A map from the parser's current position to a range of previously parsed
     /// nodes in the arena and a checkpoint of the parser's state. These allow
     /// us to reset the parser to avoid parsing the same location again.
-    memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+    memo_map: HashMap<MemoKey, (Range<usize>, PartialState)>,
 }

 /// A type alias for the memo key so it doesn't get confused with other usizes.
 ///
 /// The memo is keyed by the index into `text` of the current token's start.
 type MemoKey = usize;

 /// A checkpoint of the parser which can fully restore it to a previous state.
struct Checkpoint {
    node_len: usize,
    state: PartialState,
}

/// State needed to restore the parser's current token and the lexer (but not
/// the nodes vector).
#[derive(Clone)] -struct Checkpoint<'s> { - lexer: Lexer<'s>, +struct PartialState { + cursor: usize, + lex_mode: LexMode, prev_end: usize, current_start: usize, current: SyntaxKind, - node_len: usize, } impl<'s> Parser<'s> { /// Store the already parsed nodes and the parser state into the memo map by /// extending the arena and storing the extended range and a checkpoint. fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) { + let Checkpoint { state, node_len } = self.checkpoint(); let memo_start = self.memo.arena.len(); - self.memo.arena.extend_from_slice(&self.nodes[prev_len..]); + self.memo.arena.extend_from_slice(&self.nodes[prev_len..node_len]); let arena_range = memo_start..self.memo.arena.len(); - self.memo.memo_map.insert(key, (arena_range, self.checkpoint())); + self.memo.memo_map.insert(key, (arena_range, state)); } /// Try to load a memoized result, return `None` if we did or `Some` (with a /// checkpoint and a key for the memo map) if we didn't. - fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> { + fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint)> { // We use the starting index of the current token as our key. let key: MemoKey = self.current_start(); match self.memo.memo_map.get(&key).cloned() { - Some((range, checkpoint)) => { + Some((range, state)) => { self.nodes.extend_from_slice(&self.memo.arena[range]); // It's important that we don't truncate the nodes vector since // it may have grown or shrunk (due to other memoization or // error reporting) since we made this checkpoint. - self.restore_partial(checkpoint); + self.restore_partial(state); None } None => Some((key, self.checkpoint())), @@ -1955,28 +1963,31 @@ impl<'s> Parser<'s> { } /// Restore the parser to the state at a checkpoint. - fn restore(&mut self, checkpoint: Checkpoint<'s>) { + fn restore(&mut self, checkpoint: Checkpoint) { self.nodes.truncate(checkpoint.node_len); - self.restore_partial(checkpoint); + self.restore_partial(checkpoint.state); } /// Restore parts of the checkpoint excluding the nodes vector. - fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) { - self.lexer = checkpoint.lexer; - self.prev_end = checkpoint.prev_end; - self.current_start = checkpoint.current_start; - self.current = checkpoint.current; + fn restore_partial(&mut self, state: PartialState) { + self.lexer.jump(state.cursor); + self.lexer.set_mode(state.lex_mode); + self.prev_end = state.prev_end; + self.current_start = state.current_start; + self.current = state.current; } /// Save a checkpoint of the parser state. - fn checkpoint(&self) -> Checkpoint<'s> { - Checkpoint { - lexer: self.lexer.clone(), + fn checkpoint(&self) -> Checkpoint { + let node_len = self.nodes.len(); + let state = PartialState { + cursor: self.lexer.cursor(), + lex_mode: self.lexer.mode(), prev_end: self.prev_end, current_start: self.current_start, current: self.current, - node_len: self.nodes.len(), - } + }; + Checkpoint { node_len, state } } } From 1cecae0333efcdfcfcca8e4e97ef590297808c2e Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 07/18] 7. 
Return SyntaxNodes from the Lexer

---
 crates/typst-syntax/src/lexer.rs  | 44 ++++++++++++++++++-------------
 crates/typst-syntax/src/parser.rs | 37 +++++++++++++-------------
 2 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 721225c6e..cdd4121c9 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -4,12 +4,12 @@ use unicode_script::{Script, UnicodeScript};
 use unicode_segmentation::UnicodeSegmentation;
 use unscanny::Scanner;

-use crate::{SyntaxError, SyntaxKind};
+use crate::{SyntaxError, SyntaxKind, SyntaxNode};

-/// Splits up a string of source code into tokens.
+/// An iterator over a source code string which returns tokens.
 #[derive(Clone)]
 pub(super) struct Lexer<'s> {
-    /// The underlying scanner.
+    /// The scanner: contains the underlying string and location as a "cursor".
     s: Scanner<'s>,
     /// The mode the lexer is in. This determines which kinds of tokens it
     /// produces.
     mode: LexMode,
     /// Whether the last token contained a newline.
     newline: bool,
@@ -73,11 +73,6 @@ impl<'s> Lexer<'s> {
     pub fn newline(&self) -> bool {
         self.newline
     }
-
-    /// Take out the last error, if any.
-    pub fn take_error(&mut self) -> Option<SyntaxError> {
-        self.error.take()
-    }
 }

 impl Lexer<'_> {
@@ -97,21 +92,24 @@ impl Lexer<'_> {

 /// Shared methods with all [`LexMode`].
 impl Lexer<'_> {
-    /// Proceed to the next token and return its [`SyntaxKind`]. Note the
-    /// token could be a [trivia](SyntaxKind::is_trivia).
-    pub fn next(&mut self) -> SyntaxKind {
+    /// Return the next token in our text. Returns both the [`SyntaxNode`]
+    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind.
+    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
+        debug_assert!(self.error.is_none());
+        let start = self.s.cursor();
         if self.mode == LexMode::Raw {
-            let Some((kind, end)) = self.raw.pop() else {
-                return SyntaxKind::End;
+            let kind = if let Some((kind, end)) = self.raw.pop() {
+                self.s.jump(end);
+                kind
+            } else {
+                SyntaxKind::End
             };
-            self.s.jump(end);
-            return kind;
+            let node = SyntaxNode::leaf(kind, self.s.from(start));
+            return (kind, node);
         }

         self.newline = false;
-        self.error = None;
-        let start = self.s.cursor();
-        match self.s.eat() {
+        let kind = match self.s.eat() {
             Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
             Some('/') if self.s.eat_if('/') => self.line_comment(),
             Some('/') if self.s.eat_if('*') => self.block_comment(),
@@ -132,13 +130,21 @@
             Some(c) => match self.mode {
                 LexMode::Markup => self.markup(start, c),
                 LexMode::Math => self.math(start, c),
                 LexMode::Code => self.code(start, c),
                 LexMode::Raw => unreachable!(),
             },

             None => SyntaxKind::End,
-        }
+        };
+
+        let text = self.s.from(start);
+        let node = match self.error.take() {
+            Some(error) => SyntaxNode::error(error, text),
+            None => SyntaxNode::leaf(kind, text),
+        };
+        (kind, node)
     }

     /// Eat whitespace characters greedily.
     fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
         let more = self.s.eat_while(|c| is_space(c, self.mode));
         let newlines = match c {
+            // Optimize eating a single space.
' ' if more.is_empty() => 0, _ => count_newlines(self.s.from(start)), }; diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 19e8adbbb..b69486411 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -185,7 +185,7 @@ fn heading(p: &mut Parser) { whitespace_line(p); markup(p, false, usize::MAX, |p| { p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label) + && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label) }); p.wrap(m, SyntaxKind::Heading); } @@ -282,7 +282,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { let mut copy = p.lexer.clone(); let start = copy.cursor(); - let next = copy.next(); + let next = copy.next().0; let end = copy.cursor(); matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) && is_ident(&p.text[start..end]) @@ -686,8 +686,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { continue; } - let at_field_or_method = - p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident; + let at_field_or_method = p.directly_at(SyntaxKind::Dot) + && p.lexer.clone().next().0 == SyntaxKind::Ident; if atomic && !at_field_or_method { break; @@ -947,9 +947,8 @@ fn for_loop(p: &mut Parser) { let mut seen = HashSet::new(); pattern(p, false, &mut seen, None); - let m2 = p.marker(); - if p.eat_if(SyntaxKind::Comma) { - let node = &mut p[m2]; + if p.at(SyntaxKind::Comma) { + let node = p.eat_and_get(); node.unexpected(); node.hint("destructuring patterns must be wrapped in parentheses"); if p.at_set(set::PATTERN) { @@ -1563,6 +1562,9 @@ struct Parser<'s> { current_start: usize, /// The [`SyntaxKind`] of the current token. current: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + current_node: SyntaxNode, /// Whether the parser has the expected set of open/close delimiters. This /// only ever transitions from `true` to `false`. balanced: bool, @@ -1603,13 +1605,14 @@ impl<'s> Parser<'s> { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let current = lexer.next(); + let (current, current_node) = lexer.next(); Self { lexer, text, prev_end: offset, current_start: offset, current, + current_node, balanced: true, nodes: vec![], modes: vec![], @@ -1722,7 +1725,8 @@ impl<'s> Parser<'s> { /// Convert the current token's [`SyntaxKind`] and eat it. fn convert_and_eat(&mut self, kind: SyntaxKind) { - self.current = kind; + // Only need to replace the node here. + self.current_node.convert_to_kind(kind); self.eat(); } @@ -1848,13 +1852,7 @@ impl<'s> Parser<'s> { /// Save the current token to the `nodes` vector as an Inner or Error node. fn save(&mut self) { - let text = self.current_text(); - if self.at(SyntaxKind::Error) { - let error = self.lexer.take_error().unwrap(); - self.nodes.push(SyntaxNode::error(error, text)); - } else { - self.nodes.push(SyntaxNode::leaf(self.current, text)); - } + self.nodes.push(self.current_node.clone()); if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { self.prev_end = self.current_end(); @@ -1864,7 +1862,7 @@ impl<'s> Parser<'s> { /// Find the kind of the next non-trivia token in the lexer. 
fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { loop { - let next = lexer.next(); + let next = lexer.next().0; // Loop is terminable, because `SyntaxKind::End` is not a trivia. if !next.is_trivia() { break next; @@ -1876,7 +1874,7 @@ impl<'s> Parser<'s> { /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. fn lex(&mut self) { self.current_start = self.lexer.cursor(); - self.current = self.lexer.next(); + (self.current, self.current_node) = self.lexer.next(); // Special cases to handle newlines in Code. if self.lexer.mode() == LexMode::Code @@ -1931,6 +1929,7 @@ struct PartialState { prev_end: usize, current_start: usize, current: SyntaxKind, + current_node: SyntaxNode, } impl<'s> Parser<'s> { @@ -1975,6 +1974,7 @@ impl<'s> Parser<'s> { self.prev_end = state.prev_end; self.current_start = state.current_start; self.current = state.current; + self.current_node = state.current_node; } /// Save a checkpoint of the parser state. @@ -1986,6 +1986,7 @@ impl<'s> Parser<'s> { prev_end: self.prev_end, current_start: self.current_start, current: self.current, + current_node: self.current_node.clone(), }; Checkpoint { node_len, state } } From 09975d113385067302a4abbc1f5cf905e78915ad Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 20:30:33 -0400 Subject: [PATCH 08/18] 8. Create Raw nodes entirely within the lexer --- crates/typst-syntax/src/lexer.rs | 120 ++++++++++++++---------------- crates/typst-syntax/src/parser.rs | 22 +----- crates/typst-syntax/src/set.rs | 2 +- 3 files changed, 59 insertions(+), 85 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index cdd4121c9..d2173f505 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -16,8 +16,6 @@ pub(super) struct Lexer<'s> { mode: LexMode, /// Whether the last token contained a newline. newline: bool, - /// The state held by raw line lexing. - raw: Vec<(SyntaxKind, usize)>, /// An error for the last token. error: Option, } @@ -31,8 +29,6 @@ pub(super) enum LexMode { Math, /// Keywords, literals and operators. Code, - /// The contents of a raw block. - Raw, } impl<'s> Lexer<'s> { @@ -44,7 +40,6 @@ impl<'s> Lexer<'s> { mode, newline: false, error: None, - raw: Vec::new(), } } @@ -97,16 +92,6 @@ impl Lexer<'_> { pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) { debug_assert!(self.error.is_none()); let start = self.s.cursor(); - if self.mode == LexMode::Raw { - let kind = if let Some((kind, end)) = self.raw.pop() { - self.s.jump(end); - kind - } else { - SyntaxKind::End - }; - let node = SyntaxNode::leaf(kind, self.s.from(start)); - return (kind, node); - } self.newline = false; let kind = match self.s.eat() { @@ -121,12 +106,11 @@ impl Lexer<'_> { ); kind } - + Some('`') if self.mode != LexMode::Math => return self.raw(), Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), LexMode::Math => self.math(start, c), LexMode::Code => self.code(start, c), - LexMode::Raw => unreachable!(), }, None => SyntaxKind::End, @@ -193,7 +177,6 @@ impl Lexer<'_> { fn markup(&mut self, start: usize, c: char) -> SyntaxKind { match c { '\\' => self.backslash(), - '`' => self.raw(), 'h' if self.s.eat_if("ttp://") => self.link(), 'h' if self.s.eat_if("ttps://") => self.link(), '<' if self.s.at(is_id_continue) => self.label(), @@ -258,9 +241,10 @@ impl Lexer<'_> { } } - fn raw(&mut self) -> SyntaxKind { + /// Lex an entire raw segment at once. 
This is a convenience to avoid going + /// to and from the parser for each raw section. + fn raw(&mut self) -> (SyntaxKind, SyntaxNode) { let start = self.s.cursor() - 1; - self.raw.clear(); // Determine number of opening backticks. let mut backticks = 1; @@ -270,9 +254,11 @@ impl Lexer<'_> { // Special case for ``. if backticks == 2 { - self.push_raw(SyntaxKind::RawDelim); - self.s.jump(start + 1); - return SyntaxKind::RawDelim; + let nodes = vec![ + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + SyntaxNode::leaf(SyntaxKind::RawDelim, "`"), + ]; + return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)); } // Find end of raw text. @@ -281,43 +267,55 @@ impl Lexer<'_> { match self.s.eat() { Some('`') => found += 1, Some(_) => found = 0, - None => break, + None => { + let msg = SyntaxError::new("unclosed raw text"); + let error = SyntaxNode::error(msg, self.s.from(start)); + return (SyntaxKind::Error, error); + } } } - - if found != backticks { - return self.error("unclosed raw text"); - } - let end = self.s.cursor(); - if backticks >= 3 { - self.blocky_raw(start, end, backticks); - } else { - self.inline_raw(start, end, backticks); - } - // Closing delimiter. - self.push_raw(SyntaxKind::RawDelim); + let mut nodes = Vec::with_capacity(3); // Will have at least 3. - // The saved tokens will be removed in reverse. - self.raw.reverse(); + // A closure for pushing a node onto our raw vector. Assumes the caller + // will move the scanner to the next location at each step. + let mut prev_start = start; + let mut push_raw = |kind, s: &Scanner| { + nodes.push(SyntaxNode::leaf(kind, s.from(prev_start))); + prev_start = s.cursor(); + }; // Opening delimiter. self.s.jump(start + backticks); - SyntaxKind::RawDelim + push_raw(SyntaxKind::RawDelim, &self.s); + + if backticks >= 3 { + self.blocky_raw(end - backticks, &mut push_raw); + } else { + self.inline_raw(end - backticks, &mut push_raw); + } + + // Closing delimiter. + self.s.jump(end); + push_raw(SyntaxKind::RawDelim, &self.s); + + (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)) } - fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) { + fn blocky_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { // Language tag. - self.s.jump(start + backticks); if self.s.eat_if(is_id_start) { self.s.eat_while(is_id_continue); - self.push_raw(SyntaxKind::RawLang); + push_raw(SyntaxKind::RawLang, &self.s); } // Determine inner content between backticks. self.s.eat_if(' '); - let inner = self.s.to(end - backticks); + let inner = self.s.to(inner_end); // Determine dedent level. let mut lines = split_newlines(inner); @@ -363,41 +361,32 @@ impl Lexer<'_> { let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum(); self.s.eat_newline(); self.s.advance(offset); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); self.s.advance(line.len() - offset); - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); } // Add final trimmed. 
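Counting the opening backticks up front is what selects among the three raw shapes: exactly two backticks is the special-cased empty literal, one gives inline raw, and three or more give blocky raw with an optional language tag. The counting step as a freestanding sketch (illustrative only):

    // Number of backticks opening a raw segment, given text starting at the
    // first backtick: "`code`" gives 1, "```rust ...```" gives 3.
    fn opening_backticks(text: &str) -> usize {
        text.chars().take_while(|&c| c == '`').count()
    }
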
- if self.s.cursor() < end - backticks { - self.s.jump(end - backticks); - self.push_raw(SyntaxKind::RawTrimmed); + if self.s.cursor() < inner_end { + self.s.jump(inner_end); + push_raw(SyntaxKind::RawTrimmed, &self.s); } - self.s.jump(end); } - fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) { - self.s.jump(start + backticks); - - while self.s.cursor() < end - backticks { + fn inline_raw(&mut self, inner_end: usize, mut push_raw: F) + where + F: FnMut(SyntaxKind, &Scanner), + { + while self.s.cursor() < inner_end { if self.s.at(is_newline) { - self.push_raw(SyntaxKind::Text); + push_raw(SyntaxKind::Text, &self.s); self.s.eat_newline(); - self.push_raw(SyntaxKind::RawTrimmed); + push_raw(SyntaxKind::RawTrimmed, &self.s); continue; } self.s.eat(); } - self.push_raw(SyntaxKind::Text); - - self.s.jump(end); - } - - /// Push the current cursor that marks the end of a raw segment of - /// the given `kind`. - fn push_raw(&mut self, kind: SyntaxKind) { - let end = self.s.cursor(); - self.raw.push((kind, end)); + push_raw(SyntaxKind::Text, &self.s); } fn link(&mut self) -> SyntaxKind { @@ -605,7 +594,6 @@ impl Lexer<'_> { impl Lexer<'_> { fn code(&mut self, start: usize, c: char) -> SyntaxKind { match c { - '`' => self.raw(), '<' if self.s.at(is_id_continue) => self.label(), '0'..='9' => self.number(start, c), '.' if self.s.at(char::is_ascii_digit) => self.number(start, c), diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index b69486411..6fd0878df 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -116,10 +116,11 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::Link | SyntaxKind::Label => p.eat(), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. + SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), - SyntaxKind::RawDelim => raw(p), SyntaxKind::HeadingMarker if *at_start => heading(p), SyntaxKind::ListMarker if *at_start => list_item(p), SyntaxKind::EnumMarker if *at_start => enum_item(p), @@ -162,22 +163,6 @@ fn emph(p: &mut Parser) { p.wrap(m, SyntaxKind::Emph); } -/// Parses raw text with optional syntax highlighting: `` `...` ``. -fn raw(p: &mut Parser) { - let m = p.marker(); - p.enter(LexMode::Raw); - p.assert(SyntaxKind::RawDelim); - - // Eats until the closing delimiter. - while !p.end() && !p.at(SyntaxKind::RawDelim) { - p.eat(); - } - - p.expect(SyntaxKind::RawDelim); - p.exit(); - p.wrap(m, SyntaxKind::Raw); -} - /// Parses a section heading: `= Introduction`. fn heading(p: &mut Parser) { let m = p.marker(); @@ -767,7 +752,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::LeftBrace => code_block(p), SyntaxKind::LeftBracket => content_block(p), SyntaxKind::LeftParen => expr_with_paren(p, atomic), - SyntaxKind::RawDelim => raw(p), SyntaxKind::Dollar => equation(p), SyntaxKind::Let => let_binding(p), SyntaxKind::Set => set_rule(p), @@ -782,6 +766,8 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::Continue => continue_stmt(p), SyntaxKind::Return => return_stmt(p), + SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer. 
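The trimming above depends on a dedent level: the smallest leading-whitespace count among the lines that matter. A condensed model of that computation (a sketch; the real code additionally always counts the line holding the closing backticks):

    // How much leading whitespace to strip from each line of blocky raw text.
    fn dedent_level(lines: &[&str]) -> usize {
        lines
            .iter()
            .skip(1) // The first line starts right after the opening delimiter.
            .filter(|line| !line.chars().all(char::is_whitespace))
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0)
    }
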
+ SyntaxKind::None | SyntaxKind::Auto | SyntaxKind::Int diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs index eaee7ef28..f3f1ba240 100644 --- a/crates/typst-syntax/src/set.rs +++ b/crates/typst-syntax/src/set.rs @@ -104,7 +104,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = syntax_set!( Numeric, Str, Label, - RawDelim, + Raw, ); /// Syntax kinds that are unary operators. From 88d86714a1e8c2f9ef8b77d4bcf7d44fa4e4dd26 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Mon, 21 Oct 2024 22:18:23 -0400 Subject: [PATCH 09/18] 9. Parse math field access in the lexer --- crates/typst-syntax/src/lexer.rs | 41 ++++++++++++++++++++++++++++--- crates/typst-syntax/src/parser.rs | 20 +++------------ crates/typst-syntax/src/set.rs | 1 + tests/suite/math/symbols.typ | 29 ++++++++++++++++++++++ 4 files changed, 71 insertions(+), 20 deletions(-) create mode 100644 tests/suite/math/symbols.typ diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index d2173f505..4a43c15ff 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -109,7 +109,10 @@ impl Lexer<'_> { Some('`') if self.mode != LexMode::Math => return self.raw(), Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), - LexMode::Math => self.math(start, c), + LexMode::Math => match self.math(start, c) { + (kind, None) => kind, + (kind, Some(node)) => return (kind, node), + }, LexMode::Code => self.code(start, c), }, @@ -507,8 +510,8 @@ impl Lexer<'_> { /// Math. impl Lexer<'_> { - fn math(&mut self, start: usize, c: char) -> SyntaxKind { - match c { + fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option) { + let kind = match c { '\\' => self.backslash(), '"' => self.string(), @@ -561,11 +564,41 @@ impl Lexer<'_> { // Identifiers. c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { self.s.eat_while(is_math_id_continue); - SyntaxKind::MathIdent + let (kind, node) = self.math_ident_or_field(start); + return (kind, Some(node)); } // Other math atoms. _ => self.math_text(start, c), + }; + (kind, None) + } + + /// Parse a single `MathIdent` or an entire `FieldAccess`. + fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) { + let mut kind = SyntaxKind::MathIdent; + let mut node = SyntaxNode::leaf(kind, self.s.from(start)); + while let Some(ident) = self.maybe_dot_ident() { + kind = SyntaxKind::FieldAccess; + let field_children = vec![ + node, + SyntaxNode::leaf(SyntaxKind::Dot, '.'), + SyntaxNode::leaf(SyntaxKind::Ident, ident), + ]; + node = SyntaxNode::inner(kind, field_children); + } + (kind, node) + } + + /// If at a dot and a math identifier, eat and return the identifier. + fn maybe_dot_ident(&mut self) -> Option<&str> { + if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') { + let ident_start = self.s.cursor(); + self.s.eat(); + self.s.eat_while(is_math_id_continue); + Some(self.s.from(ident_start)) + } else { + None } } diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 6fd0878df..be065ca60 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -6,9 +6,7 @@ use ecow::{eco_format, EcoString}; use unicode_math_class::MathClass; use crate::set::{syntax_set, SyntaxSet}; -use crate::{ - ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode, -}; +use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; /// Parses a source file as top-level markup. 
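The `math_ident_or_field` loop above builds the `FieldAccess` chain bottom-up inside the lexer, so `$a.b.c$` reaches the parser as one left-nested token. The resulting shape is easy to inspect from outside (a sketch assuming a dependency on typst-syntax and its public `parse` function):

    // Debug-print the tree: one FieldAccess node wrapping `a.b` nested inside
    // another wrapping `.c`, rather than loose Dot/Ident tokens.
    fn main() {
        println!("{:#?}", typst_syntax::parse("$a.b.c$"));
    }
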
pub fn parse(text: &str) -> SyntaxNode { @@ -261,21 +259,11 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { let mut continuable = false; match p.current() { SyntaxKind::Hash => embedded_code_expr(p), - SyntaxKind::MathIdent => { + // The lexer manages creating full FieldAccess nodes if needed. + SyntaxKind::MathIdent | SyntaxKind::FieldAccess => { continuable = true; p.eat(); - while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && { - let mut copy = p.lexer.clone(); - let start = copy.cursor(); - let next = copy.next().0; - let end = copy.cursor(); - matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text) - && is_ident(&p.text[start..end]) - } { - p.convert_and_eat(SyntaxKind::Dot); - p.convert_and_eat(SyntaxKind::Ident); - p.wrap(m, SyntaxKind::FieldAccess); - } + // Parse a function call for an identifier or field access. if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" { math_args(p); diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs index f3f1ba240..014aaf2f7 100644 --- a/crates/typst-syntax/src/set.rs +++ b/crates/typst-syntax/src/set.rs @@ -58,6 +58,7 @@ pub const STMT: SyntaxSet = syntax_set!(Let, Set, Show, Import, Include, Return) pub const MATH_EXPR: SyntaxSet = syntax_set!( Hash, MathIdent, + FieldAccess, Text, MathShorthand, Linebreak, diff --git a/tests/suite/math/symbols.typ b/tests/suite/math/symbols.typ new file mode 100644 index 000000000..65a483162 --- /dev/null +++ b/tests/suite/math/symbols.typ @@ -0,0 +1,29 @@ +// Test math symbol edge cases. + +--- math-symbol-basic --- +#let sym = symbol("s", ("basic", "s")) +#test($sym.basic$, $#"s"$) + +--- math-symbol-underscore --- +#let sym = symbol("s", ("test_underscore", "s")) +// Error: 6-10 unknown symbol modifier +$sym.test_underscore$ + +--- math-symbol-dash --- +#let sym = symbol("s", ("test-dash", "s")) +// Error: 6-10 unknown symbol modifier +$sym.test-dash$ + +--- math-symbol-double --- +#let sym = symbol("s", ("test.basic", "s")) +#test($sym.test.basic$, $#"s"$) + +--- math-symbol-double-underscore --- +#let sym = symbol("s", ("one.test_underscore", "s")) +// Error: 10-14 unknown symbol modifier +$sym.one.test_underscore$ + +--- math-symbol-double-dash --- +#let sym = symbol("s", ("one.test-dash", "s")) +// Error: 10-14 unknown symbol modifier +$sym.one.test-dash$ From 2ae1e1627f09ce8dfe76dd3e4b1b70fc95943f97 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 20:30:33 -0400 Subject: [PATCH 10/18] 10. Change parser modes using closures instead of manual stacks --- crates/typst-syntax/src/parser.rs | 323 +++++++++++++++--------------- 1 file changed, 158 insertions(+), 165 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index be065ca60..44a388c56 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -226,11 +226,11 @@ fn whitespace_line(p: &mut Parser) { /// Parses a mathematical equation: `$x$`, `$ x^2 $`. 
fn equation(p: &mut Parser) { let m = p.marker(); - p.enter(LexMode::Math); - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); - p.exit(); + p.with_mode(LexMode::Math, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + }); p.wrap(m, SyntaxKind::Equation); } @@ -586,10 +586,11 @@ fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { /// Parses a sequence of code expressions. fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { - p.enter_newline_mode(NewlineMode::Contextual); - - let at_expr = p.at_set(set::CODE_EXPR); - if at_expr { + p.with_nl_mode(AtNewline::Contextual, |p| { + if !p.at_set(set::CODE_EXPR) { + p.unexpected(); + return; + } code_expr(p); if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { p.expected("semicolon or line break"); @@ -598,12 +599,7 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { p.hint("try wrapping your code in a markup block (`[ ]`)"); } } - } - - p.exit_newline_mode(); - if !at_expr && !p.end() { - p.unexpected(); - } + }); } } @@ -614,29 +610,28 @@ fn code_expr(p: &mut Parser) { /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { - p.enter_newline_mode(NewlineMode::Stop); - p.enter(LexMode::Code); - p.assert(SyntaxKind::Hash); - p.unskip(); + p.with_mode(LexMode::Code, |p| { + p.with_nl_mode(AtNewline::Stop, |p| { + p.assert(SyntaxKind::Hash); + p.unskip(); - let stmt = p.at_set(set::STMT); - let at = p.at_set(set::ATOMIC_CODE_EXPR); - code_expr_prec(p, true, 0); + let stmt = p.at_set(set::STMT); + let at = p.at_set(set::ATOMIC_CODE_EXPR); + code_expr_prec(p, true, 0); - // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.current().is_trivia() && !p.end() { - p.unexpected(); - } + // Consume error for things like `#12p` or `#"abc\"`.# + if !at && !p.current().is_trivia() && !p.end() { + p.unexpected(); + } - let semi = - (stmt || p.directly_at(SyntaxKind::Semicolon)) && p.eat_if(SyntaxKind::Semicolon); + let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) + && p.eat_if(SyntaxKind::Semicolon); - if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { - p.expected("semicolon or line break"); - } - - p.exit(); - p.exit_newline_mode(); + if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } + }); + }); } /// Parses a code expression with at least the given precedence. @@ -790,24 +785,24 @@ pub(super) fn reparse_block(text: &str, range: Range) -> Option SyntaxKind { - let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); - let mut state = GroupState { count: 0, maybe_just_parens: true, @@ -1124,27 +1115,29 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind { // // This does allow some unexpected expressions, such as `(: key: val)`, but // it's currently intentional. 
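`with_mode` and `with_nl_mode` replace the pushed/popped mode stacks: the saved mode lives in a local and the call stack does the bookkeeping, so a mode cannot be left un-restored when a parsing function returns early out of the closure. The core pattern in miniature (a generic sketch, not the parser's actual signatures):

    // Run `f` with `slot` temporarily set to `value`, restoring it afterward.
    // The parser's versions additionally re-lex the current token when the
    // mode change is observable.
    fn with_scoped<T: Copy, R>(slot: &mut T, value: T, f: impl FnOnce(&mut T) -> R) -> R {
        let previous = *slot;
        *slot = value;
        let result = f(slot);
        *slot = previous;
        result
    }
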
- if p.eat_if(SyntaxKind::Colon) { - state.kind = Some(SyntaxKind::Dict); - state.maybe_just_parens = false; - } - - while !p.current().is_terminator() { - if !p.at_set(set::ARRAY_OR_DICT_ITEM) { - p.unexpected(); - continue; + let m = p.marker(); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); + if p.eat_if(SyntaxKind::Colon) { + state.kind = Some(SyntaxKind::Dict); } - array_or_dict_item(p, &mut state); - state.count += 1; + while !p.current().is_terminator() { + if !p.at_set(set::ARRAY_OR_DICT_ITEM) { + p.unexpected(); + continue; + } - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - state.maybe_just_parens = false; + array_or_dict_item(p, &mut state); + state.count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + state.maybe_just_parens = false; + } } - } - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); let kind = if state.maybe_just_parens && state.count == 1 { SyntaxKind::Parenthesized @@ -1230,25 +1223,25 @@ fn args(p: &mut Parser) { let m = p.marker(); if p.at(SyntaxKind::LeftParen) { let m2 = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - while !p.current().is_terminator() { - if !p.at_set(set::ARG) { - p.unexpected(); - continue; + let mut seen = HashSet::new(); + while !p.current().is_terminator() { + if !p.at_set(set::ARG) { + p.unexpected(); + continue; + } + + arg(p, &mut seen); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - arg(p, &mut seen); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m2, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m2, SyntaxKind::RightParen); + }); } while p.directly_at(SyntaxKind::LeftBracket) { @@ -1293,27 +1286,27 @@ fn arg<'s>(p: &mut Parser<'s>, seen: &mut HashSet<&'s str>) { /// Parses a closure's parameters: `(x, y)`. 
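The `GroupState` above resolves the `(...)` ambiguity only once the group closes: a single item with no trailing comma stays `Parenthesized`, a leading colon or a named item forces `Dict`, and everything else defaults to `Array`. So `(a)` is parenthesized, `(a,)` and `()` are arrays, and `(:)` and `(a: 1)` are dicts. The final decision as a condensed, hypothetical standalone function:

    // Mirror of the kind resolution at the end of the function above.
    fn group_kind(count: usize, maybe_just_parens: bool, kind: Option<SyntaxKind>) -> SyntaxKind {
        if maybe_just_parens && count == 1 {
            SyntaxKind::Parenthesized
        } else {
            kind.unwrap_or(SyntaxKind::Array)
        }
    }
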
fn params(p: &mut Parser) { let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - let mut seen = HashSet::new(); - let mut sink = false; + let mut seen = HashSet::new(); + let mut sink = false; - while !p.current().is_terminator() { - if !p.at_set(set::PARAM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::PARAM) { + p.unexpected(); + continue; + } + + param(p, &mut seen, &mut sink); + + if !p.current().is_terminator() { + p.expect(SyntaxKind::Comma); + } } - param(p, &mut seen, &mut sink); - - if !p.current().is_terminator() { - p.expect(SyntaxKind::Comma); - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); p.wrap(m, SyntaxKind::Params); } @@ -1374,25 +1367,25 @@ fn destructuring_or_parenthesized<'s>( let mut maybe_just_parens = true; let m = p.marker(); - p.enter_newline_mode(NewlineMode::Continue); - p.assert(SyntaxKind::LeftParen); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftParen); - while !p.current().is_terminator() { - if !p.at_set(set::DESTRUCTURING_ITEM) { - p.unexpected(); - continue; + while !p.current().is_terminator() { + if !p.at_set(set::DESTRUCTURING_ITEM) { + p.unexpected(); + continue; + } + + destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); + count += 1; + + if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { + maybe_just_parens = false; + } } - destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink); - count += 1; - - if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) { - maybe_just_parens = false; - } - } - - p.expect_closing_delimiter(m, SyntaxKind::RightParen); - p.exit_newline_mode(); + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + }); if maybe_just_parens && count == 1 && !sink { p.wrap(m, SyntaxKind::Parenthesized); @@ -1510,7 +1503,7 @@ fn pattern_leaf<'s>( /// ### Modes /// /// The parser manages the transitions between the three modes of Typst through -/// stacks of [lexer modes](`LexMode`) and [newline modes](`NewlineMode`). +/// [lexer modes](`LexMode`) and [newline modes](`AtNewline`). /// /// The lexer modes map to the three Typst modes and are stored in the lexer, /// changing which`SyntaxKind`s it will generate. The mode also affects how the @@ -1527,8 +1520,11 @@ struct Parser<'s> { /// The source text shared with the lexer. text: &'s str, /// A lexer over the source text with multiple modes. Defines the boundaries - /// of tokens and determines their [`SyntaxKind`]. + /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`] + /// defining our current Typst mode. lexer: Lexer<'s>, + /// The newline mode: whether to insert a temporary end at newlines in Code. + nl_mode: AtNewline, /// The index into `text` of the end of the previous token. prev_end: usize, /// The index into `text` of the start of our current token (the end is @@ -1545,12 +1541,6 @@ struct Parser<'s> { /// Nodes representing the concrete syntax tree of previously parsed text. /// In Code and Math, includes previously parsed trivia, but not `current`. nodes: Vec, - /// Stack of lexer modes to be pushed/popped. The current mode is implicitly - /// stored in the lexer. - modes: Vec, - /// Stack of newline modes to be pushed/popped. The current mode is the tail - /// of the vector. 
- newline_modes: Vec, /// Parser checkpoints for a given text index. Used for efficient parser /// backtracking similar to packrat parsing. See comments above in /// [`expr_with_paren`]. @@ -1558,14 +1548,28 @@ struct Parser<'s> { } /// How to proceed with parsing when at a newline in Code. -#[derive(Clone)] -enum NewlineMode { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AtNewline { + /// Continue at newlines. + Continue, /// Stop at any newline. Stop, /// Continue only if there is no continuation with `else` or `.`. Contextual, - /// Continue at newlines. - Continue, +} + +impl AtNewline { + /// Whether to stop at a newline or continue based on the current context. + fn stop(self, kind: impl FnOnce() -> SyntaxKind) -> bool { + match self { + AtNewline::Continue => false, + AtNewline::Stop => true, + AtNewline::Contextual => match kind() { + SyntaxKind::Else | SyntaxKind::Dot => false, + _ => true, + }, + } + } } /// A marker representing a node's position in the parser. Mainly used for @@ -1581,16 +1585,15 @@ impl<'s> Parser<'s> { lexer.jump(offset); let (current, current_node) = lexer.next(); Self { - lexer, text, + lexer, + nl_mode: AtNewline::Continue, prev_end: offset, current_start: offset, current, current_node, balanced: true, nodes: vec![], - modes: vec![], - newline_modes: vec![], memo: Default::default(), } } @@ -1767,35 +1770,33 @@ impl<'s> Parser<'s> { self.nodes.insert(from, SyntaxNode::inner(kind, children)); } - /// Enter a new [`LexMode`] that will affect subsequent tokens (does not - /// modify the current token). - fn enter(&mut self, mode: LexMode) { - self.modes.push(self.lexer.mode()); + /// Parse within the [`LexMode`] for subsequent tokens (does not change the + /// current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn with_mode(&mut self, mode: LexMode, func: impl FnOnce(&mut Parser<'s>)) { + let previous = self.lexer.mode(); self.lexer.set_mode(mode); - } - - /// Exit the current [`LexMode`], possibly re-lexing the current token. - fn exit(&mut self) { - let mode = self.modes.pop().unwrap(); - if mode != self.lexer.mode() { + func(self); + if mode != previous { self.unskip(); - self.lexer.set_mode(mode); + self.lexer.set_mode(previous); self.lexer.jump(self.current_start); self.lex(); self.skip(); } } - /// Enter a new [`NewlineMode`] that will affect subsequent tokens (does not - /// modify the current token). - fn enter_newline_mode(&mut self, stop: NewlineMode) { - self.newline_modes.push(stop); - } - - /// Exit the current [`NewlineMode`], possibly re-lexing the current token. - fn exit_newline_mode(&mut self) { + /// Parse within the [`AtNewline`] mode for subsequent tokens (does not + /// change the current token). This may re-lex the final token on exit. + /// + /// This function effectively repurposes the call stack as a stack of modes. + fn with_nl_mode(&mut self, mode: AtNewline, func: impl FnOnce(&mut Parser<'s>)) { + let previous = self.nl_mode; + self.nl_mode = mode; + func(self); self.unskip(); - self.newline_modes.pop(); + self.nl_mode = previous; self.lexer.jump(self.prev_end); self.lex(); self.skip(); @@ -1853,15 +1854,7 @@ impl<'s> Parser<'s> { // Special cases to handle newlines in Code. 
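In terms of behavior at a newline in code: `Continue` never halts, `Stop` always halts, and `Contextual` halts unless the next token is `else` or `.`, which keeps an `if c { .. }` and a next-line `else { .. }` together as one expression. A hypothetical in-crate test of `stop` as defined at this point in the series (it still takes a closure here; a later patch passes the kind directly):

    #[test]
    fn at_newline_stop() {
        assert!(!AtNewline::Continue.stop(|| SyntaxKind::Ident));
        assert!(AtNewline::Stop.stop(|| SyntaxKind::Ident));
        assert!(!AtNewline::Contextual.stop(|| SyntaxKind::Else));
        assert!(AtNewline::Contextual.stop(|| SyntaxKind::Ident));
    }
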
if self.lexer.mode() == LexMode::Code && self.lexer.newline() - && match self.newline_modes.last() { - Some(NewlineMode::Continue) => false, - Some(NewlineMode::Contextual) => !matches!( - Self::next_non_trivia(&mut self.lexer.clone()), - SyntaxKind::Else | SyntaxKind::Dot - ), - Some(NewlineMode::Stop) => true, - None => false, - } + && self.nl_mode.stop(|| Self::next_non_trivia(&mut self.lexer.clone())) { self.current = SyntaxKind::End; } From c466080fb2c3fc8bca895fc3ead0b0a7deb9b80d Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 11/18] 11. Add Parser::finish_into --- crates/typst-syntax/src/parser.rs | 47 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 44a388c56..34c65820d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -12,37 +12,45 @@ use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, Synta pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup(&mut p, true, 0, |_| false); - p.finish().into_iter().next().unwrap() + markup_exprs(&mut p, true, 0, |_| false); + p.finish_into(SyntaxKind::Markup) } /// Parses top-level code. pub fn parse_code(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse code"); let mut p = Parser::new(text, 0, LexMode::Code); - let m = p.marker(); - p.skip(); code_exprs(&mut p, |_| false); - p.wrap_all(m, SyntaxKind::Code); - p.finish().into_iter().next().unwrap() + p.finish_into(SyntaxKind::Code) } /// Parses top-level math. pub fn parse_math(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse math"); let mut p = Parser::new(text, 0, LexMode::Math); - math(&mut p, |_| false); - p.finish().into_iter().next().unwrap() + math_exprs(&mut p, |_| false); + p.finish_into(SyntaxKind::Math) } /// Parses markup expressions until a stop condition is met. fn markup( + p: &mut Parser, + at_start: bool, + min_indent: usize, + stop: impl FnMut(&Parser) -> bool, +) { + let m = p.marker(); + markup_exprs(p, at_start, min_indent, stop); + p.wrap(m, SyntaxKind::Markup); +} + +/// Parses a sequence of markup expressions. +fn markup_exprs( p: &mut Parser, mut at_start: bool, min_indent: usize, mut stop: impl FnMut(&Parser) -> bool, ) { - let m = p.marker(); let mut nesting: usize = 0; while !p.end() { match p.current() { @@ -63,7 +71,6 @@ fn markup( markup_expr(p, &mut at_start); } - p.wrap(m, SyntaxKind::Markup); } /// Reparses a subsection of markup incrementally. @@ -235,8 +242,14 @@ fn equation(p: &mut Parser) { } /// Parses the contents of a mathematical equation: `x^2 + 1`. -fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { +fn math(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { let m = p.marker(); + math_exprs(p, stop); + p.wrap(m, SyntaxKind::Math); +} + +/// Parses a sequence of math expressions. +fn math_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { if p.at_set(set::MATH_EXPR) { math_expr(p); @@ -244,7 +257,6 @@ fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { p.unexpected(); } } - p.wrap(m, SyntaxKind::Math); } /// Parses a single math expression: This includes math elements like @@ -1603,6 +1615,12 @@ impl<'s> Parser<'s> { self.nodes } + /// Consume the parser, generating a single top-level node. 
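With `finish_into`, the three entry points share one shape: run the matching expression loop, then wrap everything in a single root node. That contract is visible from the public API (a sketch assuming a dependency on typst-syntax):

    // Each entry point yields a root node of the corresponding kind.
    fn main() {
        use typst_syntax::{parse, parse_code, parse_math, SyntaxKind};
        assert_eq!(parse("hello").kind(), SyntaxKind::Markup);
        assert_eq!(parse_code("let x = 1").kind(), SyntaxKind::Code);
        assert_eq!(parse_math("x^2").kind(), SyntaxKind::Math);
    }
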
+ fn finish_into(self, kind: SyntaxKind) -> SyntaxNode { + assert!(self.at(SyntaxKind::End)); + SyntaxNode::inner(kind, self.finish()) + } + /// The offset into `text` of the previous token's end. fn prev_end(&self) -> usize { self.prev_end @@ -1757,11 +1775,6 @@ impl<'s> Parser<'s> { self.wrap_within(from, self.before_trivia(), kind); } - /// Wrap including any trailing trivia nodes. - fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, Marker(self.nodes.len()), kind) - } - fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { let len = self.nodes.len(); let to = to.0.min(len); From 91b384ad7b83fd7098d2a90306982b12affe1ca5 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 11:57:27 -0400 Subject: [PATCH 12/18] 12. Add the Token type and replace lex/skip/save methods --- crates/typst-syntax/src/parser.rs | 244 ++++++++++++++---------------- 1 file changed, 112 insertions(+), 132 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 34c65820d..405e3e5c5 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -325,11 +325,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { _ => p.expected("expression"), } - if continuable - && min_prec < 3 - && p.prev_end() == p.current_start() - && maybe_delimited(p) - { + if continuable && min_prec < 3 && !p.had_trivia() && maybe_delimited(p) { p.wrap(m, SyntaxKind::Math); } @@ -581,6 +577,8 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { // Convert 0 exprs into a blank math element (so empty arguments are allowed). // Convert 2+ exprs into a math element (so they become a joined sequence). p.wrap_within(arg, p.marker(), SyntaxKind::Math); + // We need to update `n_trivia` since we no longer have any. + p.token.n_trivia = 0; // TODO: Maybe create a `flush_trivia()` method? } if let Some(m) = named { @@ -625,14 +623,17 @@ fn embedded_code_expr(p: &mut Parser) { p.with_mode(LexMode::Code, |p| { p.with_nl_mode(AtNewline::Stop, |p| { p.assert(SyntaxKind::Hash); - p.unskip(); + if p.had_trivia() { + p.expected("expression"); + return; + } let stmt = p.at_set(set::STMT); let at = p.at_set(set::ATOMIC_CODE_EXPR); code_expr_prec(p, true, 0); // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.current().is_trivia() && !p.end() { + if !at && !p.end() { p.unexpected(); } @@ -1493,14 +1494,15 @@ fn pattern_leaf<'s>( /// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s. /// -/// The implementation presents an interface that investigates a `current` token -/// and can take one of the following actions: +/// The implementation presents an interface that investigates a current `token` +/// with a [`SyntaxKind`] and can take one of the following actions: /// -/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf -/// node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the +/// 1. Eat a token: push `token` onto the `nodes` vector as a [leaf +/// node](`SyntaxNode::leaf`) and prepare a new `token` by calling into the /// lexer. -/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into -/// an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`]. +/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `token` and any +/// attached trivia) into an [inner node](`SyntaxNode::inner`) of a specific +/// `SyntaxKind`. /// 3. 
Produce or convert nodes into an [error node](`SyntaxNode::error`) when /// something expected is missing or something unexpected is found. /// @@ -1525,9 +1527,9 @@ fn pattern_leaf<'s>( /// pushing onto the end of the `nodes` vector until a non-trivia kind is found. /// /// The newline mode is used in Code to determine whether a newline should end -/// the current expression. If so, the parser temporarily changes the current -/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode -/// the original `SyntaxKind` is restored. +/// the current expression. If so, the parser temporarily changes `token`'s kind +/// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original +/// `SyntaxKind` is restored. struct Parser<'s> { /// The source text shared with the lexer. text: &'s str, @@ -1537,21 +1539,16 @@ struct Parser<'s> { lexer: Lexer<'s>, /// The newline mode: whether to insert a temporary end at newlines in Code. nl_mode: AtNewline, - /// The index into `text` of the end of the previous token. - prev_end: usize, - /// The index into `text` of the start of our current token (the end is - /// stored as the lexer's cursor). - current_start: usize, - /// The [`SyntaxKind`] of the current token. - current: SyntaxKind, - /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed - /// onto the end of `nodes`. - current_node: SyntaxNode, + /// The current token under inspection, not yet present in `nodes`. This + /// acts like a single item of lookahead for the parser. + /// + /// When wrapping, this is _not_ included in the wrapped nodes. + token: Token, /// Whether the parser has the expected set of open/close delimiters. This /// only ever transitions from `true` to `false`. balanced: bool, /// Nodes representing the concrete syntax tree of previously parsed text. - /// In Code and Math, includes previously parsed trivia, but not `current`. + /// In Code and Math, includes previously parsed trivia, but not `token`. nodes: Vec, /// Parser checkpoints for a given text index. Used for efficient parser /// backtracking similar to packrat parsing. See comments above in @@ -1559,6 +1556,26 @@ struct Parser<'s> { memo: MemoArena, } +/// A single token returned from the lexer with a cached [`SyntaxKind`] and a +/// record of preceding trivia. +#[derive(Debug, Clone)] +struct Token { + /// The [`SyntaxKind`] of the current token. + kind: SyntaxKind, + /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed + /// onto the end of `nodes`. + node: SyntaxNode, + /// The number of preceding trivia before this token. + n_trivia: usize, + /// Whether this token's preceding trivia contained a newline. + had_newline: bool, + /// The index into `text` of the start of our current token (the end is + /// stored as the lexer's cursor). + start: usize, + /// The index into `text` of the end of the previous token. + prev_end: usize, +} + /// How to proceed with parsing when at a newline in Code. #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum AtNewline { @@ -1572,11 +1589,12 @@ enum AtNewline { impl AtNewline { /// Whether to stop at a newline or continue based on the current context. 
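The `Token` struct makes the lookahead invariant explicit: in Code and Math, trivia are pushed onto `nodes` eagerly while being counted, so wrapping can always rewind past the trivia attached to the still-uneaten token. The bookkeeping in toy form (hypothetical types, just to show the arithmetic):

    struct MiniParser {
        nodes: Vec<&'static str>, // stand-in for Vec<SyntaxNode>
        n_trivia: usize,          // trivia already pushed for the current token
    }

    impl MiniParser {
        // Mirror of `before_trivia`: where wrapping must stop so that trivia
        // belonging to the unconsumed token stay outside the new inner node.
        fn before_trivia(&self) -> usize {
            self.nodes.len() - self.n_trivia
        }
    }

    fn main() {
        let p = MiniParser { nodes: vec!["Ident", "Space", "Comment"], n_trivia: 2 };
        assert_eq!(p.before_trivia(), 1);
    }
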
- fn stop(self, kind: impl FnOnce() -> SyntaxKind) -> bool { + fn stop(self, kind: SyntaxKind) -> bool { + #[allow(clippy::match_like_matches_macro)] match self { AtNewline::Continue => false, AtNewline::Stop => true, - AtNewline::Contextual => match kind() { + AtNewline::Contextual => match kind { SyntaxKind::Else | SyntaxKind::Dot => false, _ => true, }, @@ -1595,17 +1613,16 @@ impl<'s> Parser<'s> { fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { let mut lexer = Lexer::new(text, mode); lexer.jump(offset); - let (current, current_node) = lexer.next(); + let nl_mode = AtNewline::Continue; + let mut nodes = vec![]; + let token = Self::lex(&mut nodes, &mut lexer, nl_mode); Self { text, lexer, - nl_mode: AtNewline::Continue, - prev_end: offset, - current_start: offset, - current, - current_node, + nl_mode, + token, balanced: true, - nodes: vec![], + nodes, memo: Default::default(), } } @@ -1623,18 +1640,18 @@ impl<'s> Parser<'s> { /// The offset into `text` of the previous token's end. fn prev_end(&self) -> usize { - self.prev_end + self.token.prev_end } /// Similar to a `peek()` function: returns the `kind` of the next token to /// be eaten. fn current(&self) -> SyntaxKind { - self.current + self.token.kind } /// The offset into `text` of the current token's start. fn current_start(&self) -> usize { - self.current_start + self.token.start } /// The offset into `text` of the current token's end. @@ -1644,17 +1661,17 @@ impl<'s> Parser<'s> { /// The current token's text. fn current_text(&self) -> &'s str { - &self.text[self.current_start..self.current_end()] + &self.text[self.token.start..self.current_end()] } /// Whether the current token is a given [`SyntaxKind`]. fn at(&self, kind: SyntaxKind) -> bool { - self.current == kind + self.token.kind == kind } /// Whether the current token is contained in a [`SyntaxSet`]. fn at_set(&self, set: SyntaxSet) -> bool { - set.contains(self.current) + set.contains(self.token.kind) } /// Whether we're at the end of the token stream. @@ -1666,24 +1683,21 @@ impl<'s> Parser<'s> { /// If we're at the given `kind` with no preceding trivia tokens. fn directly_at(&self, kind: SyntaxKind) -> bool { - self.current == kind && self.prev_end == self.current_start + self.token.kind == kind && !self.had_trivia() } /// Eat the current token by saving it to the `nodes` vector, then move /// the lexer forward to prepare a new token. fn eat(&mut self) { - self.save(); - self.lex(); - self.skip(); + self.nodes.push(std::mem::take(&mut self.token.node)); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } /// Eat the current node and return a reference for in-place mutation. #[track_caller] fn eat_and_get(&mut self) -> &mut SyntaxNode { let offset = self.nodes.len(); - self.save(); - self.lex(); - self.skip(); + self.eat(); &mut self.nodes[offset] } @@ -1714,20 +1728,25 @@ impl<'s> Parser<'s> { /// specific token. #[track_caller] fn assert(&mut self, kind: SyntaxKind) { - assert_eq!(self.current, kind); + assert_eq!(self.token.kind, kind); self.eat(); } /// Convert the current token's [`SyntaxKind`] and eat it. fn convert_and_eat(&mut self, kind: SyntaxKind) { // Only need to replace the node here. - self.current_node.convert_to_kind(kind); + self.token.node.convert_to_kind(kind); self.eat(); } /// Whether the current token is a newline, only used in Markup. - fn newline(&mut self) -> bool { - self.lexer.newline() + fn newline(&self) -> bool { + self.token.had_newline + } + + /// Whether `token` had any trivia before it in Code/Math. 
+ fn had_trivia(&self) -> bool { + self.token.n_trivia > 0 } /// The number of characters until the most recent newline in `text`. @@ -1744,13 +1763,7 @@ impl<'s> Parser<'s> { /// A marker that will point to first trivia before this token in the /// parser (or the token itself if no trivia precede it). fn before_trivia(&self) -> Marker { - let mut i = self.nodes.len(); - if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { - while i > 0 && self.nodes[i - 1].kind().is_trivia() { - i -= 1; - } - } - Marker(i) + Marker(self.nodes.len() - self.token.n_trivia) } /// Whether the last non-trivia node is an error. @@ -1792,11 +1805,10 @@ impl<'s> Parser<'s> { self.lexer.set_mode(mode); func(self); if mode != previous { - self.unskip(); self.lexer.set_mode(previous); - self.lexer.jump(self.current_start); - self.lex(); - self.skip(); + self.lexer.jump(self.token.prev_end); + self.nodes.truncate(self.nodes.len() - self.token.n_trivia); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } } @@ -1808,69 +1820,46 @@ impl<'s> Parser<'s> { let previous = self.nl_mode; self.nl_mode = mode; func(self); - self.unskip(); self.nl_mode = previous; - self.lexer.jump(self.prev_end); - self.lex(); - self.skip(); - } - - /// Move past trivia nodes in Code/Math. - fn skip(&mut self) { - if self.lexer.mode() != LexMode::Markup { - while self.current.is_trivia() { - self.save(); - self.lex(); - } - } - } - - /// Move the parser back to the start of this token or its leading trivia - /// (in Code/Math). - fn unskip(&mut self) { - if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { - while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) { - self.nodes.pop(); - } - - self.lexer.jump(self.prev_end); - self.lex(); - } - } - - /// Save the current token to the `nodes` vector as an Inner or Error node. - fn save(&mut self) { - self.nodes.push(self.current_node.clone()); - - if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { - self.prev_end = self.current_end(); - } - } - - /// Find the kind of the next non-trivia token in the lexer. - fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind { - loop { - let next = lexer.next().0; - // Loop is terminable, because `SyntaxKind::End` is not a trivia. - if !next.is_trivia() { - break next; + if mode != previous && self.token.had_newline { + let actual_kind = self.token.node.kind(); + if self.nl_mode.stop(actual_kind) { + self.token.kind = SyntaxKind::End; + } else { + self.token.kind = actual_kind; } } } /// Move the lexer forward and prepare the current token. In Code, this /// might insert a temporary [`SyntaxKind::End`] based on our newline mode. - fn lex(&mut self) { - self.current_start = self.lexer.cursor(); - (self.current, self.current_node) = self.lexer.next(); + /// + /// This is not a method on `self` because we need a valid token before we + /// can initialize the parser. + fn lex(nodes: &mut Vec, lexer: &mut Lexer, nl_mode: AtNewline) -> Token { + let prev_end = lexer.cursor(); + let mut start = prev_end; + let (mut kind, mut node) = lexer.next(); + let mut n_trivia = 0; + let mut had_newline = lexer.newline(); - // Special cases to handle newlines in Code. 
- if self.lexer.mode() == LexMode::Code - && self.lexer.newline() - && self.nl_mode.stop(|| Self::next_non_trivia(&mut self.lexer.clone())) - { - self.current = SyntaxKind::End; + if lexer.mode() != LexMode::Markup { + while kind.is_trivia() { + n_trivia += 1; + nodes.push(node); + start = lexer.cursor(); + (kind, node) = lexer.next(); + had_newline |= lexer.newline(); + } + if lexer.mode() == LexMode::Code && had_newline { + // Insert a temporary ['SyntaxKind::End'] to halt the parser. + // The actual `SyntaxKind` will be restored from `node` later. + if nl_mode.stop(kind) { + kind = SyntaxKind::End; + } + } } + Token { kind, node, n_trivia, had_newline, start, prev_end } } } @@ -1906,10 +1895,7 @@ struct Checkpoint { struct PartialState { cursor: usize, lex_mode: LexMode, - prev_end: usize, - current_start: usize, - current: SyntaxKind, - current_node: SyntaxNode, + token: Token, } impl<'s> Parser<'s> { @@ -1951,10 +1937,7 @@ impl<'s> Parser<'s> { fn restore_partial(&mut self, state: PartialState) { self.lexer.jump(state.cursor); self.lexer.set_mode(state.lex_mode); - self.prev_end = state.prev_end; - self.current_start = state.current_start; - self.current = state.current; - self.current_node = state.current_node; + self.token = state.token; } /// Save a checkpoint of the parser state. @@ -1963,10 +1946,7 @@ impl<'s> Parser<'s> { let state = PartialState { cursor: self.lexer.cursor(), lex_mode: self.lexer.mode(), - prev_end: self.prev_end, - current_start: self.current_start, - current: self.current, - current_node: self.current_node.clone(), + token: self.token.clone(), }; Checkpoint { node_len, state } } @@ -1978,7 +1958,7 @@ impl<'s> Parser<'s> { let at = self.at(kind); if at { self.eat(); - } else if kind == SyntaxKind::Ident && self.current.is_keyword() { + } else if kind == SyntaxKind::Ident && self.token.kind.is_keyword() { self.trim_errors(); self.eat_and_get().expected(kind.name()); } else { @@ -2024,7 +2004,7 @@ impl<'s> Parser<'s> { /// unexpected. fn unexpected(&mut self) { self.trim_errors(); - self.balanced &= !self.current.is_grouping(); + self.balanced &= !self.token.kind.is_grouping(); self.eat_and_get().unexpected(); } From 97229d24e44505b373632a51e29b4b844c6c6ee6 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sun, 27 Oct 2024 11:17:23 -0400 Subject: [PATCH 13/18] 13. Reorder functions to avoid jumping around when reading code. No actual changes. --- crates/typst-syntax/src/parser.rs | 224 ++++++++++++++++-------------- 1 file changed, 116 insertions(+), 108 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 405e3e5c5..8a1c8f76d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -395,6 +395,22 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { } } +/// Precedence and wrapper kinds for the binary math operators. +fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { + match kind { + SyntaxKind::Underscore => { + Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) + } + SyntaxKind::Hat => { + Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) + } + SyntaxKind::Slash => { + Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) + } + _ => None, + } +} + /// Try to parse delimiters based on the current token's unicode math class. 
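Delimiter matching in math hinges on the Unicode math class of one-character tokens, via the `unicode-math-class` crate already imported here. A few concrete classifications (a hypothetical test; the exact variant names are assumed from that crate's `MathClass` enum):

    #[test]
    fn math_classes() {
        use unicode_math_class::{class, MathClass};
        assert_eq!(class('('), Some(MathClass::Opening));
        assert_eq!(class(')'), Some(MathClass::Closing));
        assert_eq!(class('x'), Some(MathClass::Alphabetic));
    }
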
fn maybe_delimited(p: &mut Parser) -> bool { let open = math_class(p.current_text()) == Some(MathClass::Opening); @@ -464,22 +480,6 @@ fn math_class(text: &str) -> Option { .and_then(unicode_math_class::class) } -/// Precedence and wrapper kinds for the binary math operators. -fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { - match kind { - SyntaxKind::Underscore => { - Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) - } - SyntaxKind::Hat => { - Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) - } - SyntaxKind::Slash => { - Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1)) - } - _ => None, - } -} - /// Parse an argument list in math: `(a, b; c, d; size: #50%)`. fn math_args(p: &mut Parser) { let m = p.marker(); @@ -613,11 +613,6 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { } } -/// Parses a single code expression. -fn code_expr(p: &mut Parser) { - code_expr_prec(p, false, 0) -} - /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { p.with_mode(LexMode::Code, |p| { @@ -647,6 +642,11 @@ fn embedded_code_expr(p: &mut Parser) { }); } +/// Parses a single code expression. +fn code_expr(p: &mut Parser) { + code_expr_prec(p, false, 0) +} + /// Parses a code expression with at least the given precedence. fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) { let m = p.marker(); @@ -777,15 +777,6 @@ fn code_primary(p: &mut Parser, atomic: bool) { } } -/// Parses a content or code block. -fn block(p: &mut Parser) { - match p.current() { - SyntaxKind::LeftBracket => content_block(p), - SyntaxKind::LeftBrace => code_block(p), - _ => p.expected("block"), - } -} - /// Reparses a full content or code block. pub(super) fn reparse_block(text: &str, range: Range) -> Option { let mut p = Parser::new(text, range.start, LexMode::Code); @@ -795,6 +786,15 @@ pub(super) fn reparse_block(text: &str, range: Range) -> Option content_block(p), + SyntaxKind::LeftBrace => code_block(p), + _ => p.expected("block"), + } +} + /// Parses a code block: `{ let x = 1; x + 2 }`. fn code_block(p: &mut Parser) { let m = p.marker(); @@ -1608,6 +1608,22 @@ impl AtNewline { #[derive(Debug, Copy, Clone, Eq, PartialEq)] struct Marker(usize); +// Index into the parser with markers. +impl Index for Parser<'_> { + type Output = SyntaxNode; + + fn index(&self, m: Marker) -> &Self::Output { + &self.nodes[m.0] + } +} + +impl IndexMut for Parser<'_> { + fn index_mut(&mut self, m: Marker) -> &mut Self::Output { + &mut self.nodes[m.0] + } +} + +/// Creating/Consuming the parser and getting info about the current token. impl<'s> Parser<'s> { /// Create a new parser starting from the given text offset and lexer mode. fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { @@ -1638,32 +1654,12 @@ impl<'s> Parser<'s> { SyntaxNode::inner(kind, self.finish()) } - /// The offset into `text` of the previous token's end. - fn prev_end(&self) -> usize { - self.token.prev_end - } - /// Similar to a `peek()` function: returns the `kind` of the next token to /// be eaten. fn current(&self) -> SyntaxKind { self.token.kind } - /// The offset into `text` of the current token's start. - fn current_start(&self) -> usize { - self.token.start - } - - /// The offset into `text` of the current token's end. - fn current_end(&self) -> usize { - self.lexer.cursor() - } - - /// The current token's text. 
- fn current_text(&self) -> &'s str { - &self.text[self.token.start..self.current_end()] - } - /// Whether the current token is a given [`SyntaxKind`]. fn at(&self, kind: SyntaxKind) -> bool { self.token.kind == kind @@ -1686,11 +1682,62 @@ impl<'s> Parser<'s> { self.token.kind == kind && !self.had_trivia() } - /// Eat the current token by saving it to the `nodes` vector, then move - /// the lexer forward to prepare a new token. - fn eat(&mut self) { - self.nodes.push(std::mem::take(&mut self.token.node)); - self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); + /// Whether `token` had any trivia before it in Code/Math. + fn had_trivia(&self) -> bool { + self.token.n_trivia > 0 + } + + /// Whether the current token is a newline, only used in Markup. + fn newline(&self) -> bool { + self.token.had_newline + } + + /// The number of characters until the most recent newline in `text`. + fn column(&self, at: usize) -> usize { + self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count() + } + + /// The current token's text. + fn current_text(&self) -> &'s str { + &self.text[self.token.start..self.current_end()] + } + + /// The offset into `text` of the current token's start. + fn current_start(&self) -> usize { + self.token.start + } + + /// The offset into `text` of the current token's end. + fn current_end(&self) -> usize { + self.lexer.cursor() + } + + /// The offset into `text` of the previous token's end. + fn prev_end(&self) -> usize { + self.token.prev_end + } +} + +// The main parsing interface for generating tokens and eating/modifying nodes. +impl<'s> Parser<'s> { + /// A marker that will point to the current token in the parser once it's + /// been eaten. + fn marker(&self) -> Marker { + Marker(self.nodes.len()) + } + + /// A marker that will point to first trivia before this token in the + /// parser (or the token itself if no trivia precede it). + fn before_trivia(&self) -> Marker { + Marker(self.nodes.len() - self.token.n_trivia) + } + + /// Iterate over the non-trivia tokens following the marker. + #[track_caller] + fn post_process(&mut self, m: Marker) -> impl Iterator { + self.nodes[m.0..] + .iter_mut() + .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) } /// Eat the current node and return a reference for in-place mutation. @@ -1739,45 +1786,11 @@ impl<'s> Parser<'s> { self.eat(); } - /// Whether the current token is a newline, only used in Markup. - fn newline(&self) -> bool { - self.token.had_newline - } - - /// Whether `token` had any trivia before it in Code/Math. - fn had_trivia(&self) -> bool { - self.token.n_trivia > 0 - } - - /// The number of characters until the most recent newline in `text`. - fn column(&self, at: usize) -> usize { - self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count() - } - - /// A marker that will point to the current token in the parser once it's - /// been eaten. - fn marker(&self) -> Marker { - Marker(self.nodes.len()) - } - - /// A marker that will point to first trivia before this token in the - /// parser (or the token itself if no trivia precede it). - fn before_trivia(&self) -> Marker { - Marker(self.nodes.len() - self.token.n_trivia) - } - - /// Whether the last non-trivia node is an error. - fn after_error(&mut self) -> bool { - let m = self.before_trivia(); - m.0 > 0 && self.nodes[m.0 - 1].kind().is_error() - } - - /// Iterate over the non-trivia tokens following the marker. - #[track_caller] - fn post_process(&mut self, m: Marker) -> impl Iterator { - self.nodes[m.0..] 
- .iter_mut() - .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) + /// Eat the current token by saving it to the `nodes` vector, then move + /// the lexer forward to prepare a new token. + fn eat(&mut self) { + self.nodes.push(std::mem::take(&mut self.token.node)); + self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } /// Wrap the nodes from a marker up to (but excluding) the current token in @@ -1898,6 +1911,7 @@ struct PartialState { token: Token, } +/// The Memoization interface. impl<'s> Parser<'s> { /// Store the already parsed nodes and the parser state into the memo map by /// extending the arena and storing the extended range and a checkpoint. @@ -1952,6 +1966,8 @@ impl<'s> Parser<'s> { } } +/// Functions for eating expected or unexpected tokens and generating errors if +/// we don't get what we expect. impl<'s> Parser<'s> { /// Consume the given `kind` or produce an error. fn expect(&mut self, kind: SyntaxKind) -> bool { @@ -1984,6 +2000,12 @@ impl<'s> Parser<'s> { } } + /// Whether the last non-trivia node is an error. + fn after_error(&mut self) -> bool { + let m = self.before_trivia(); + m.0 > 0 && self.nodes[m.0 - 1].kind().is_error() + } + /// Produce an error that the given `thing` was expected at the position /// of the marker `m`. fn expected_at(&mut self, m: Marker, thing: &str) { @@ -2021,17 +2043,3 @@ impl<'s> Parser<'s> { self.nodes.drain(start..end); } } - -impl Index for Parser<'_> { - type Output = SyntaxNode; - - fn index(&self, m: Marker) -> &Self::Output { - &self.nodes[m.0] - } -} - -impl IndexMut for Parser<'_> { - fn index_mut(&mut self, m: Marker) -> &mut Self::Output { - &mut self.nodes[m.0] - } -} From 4ce0b069f6478163eed2d2fd1860905bd47a5f46 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 10 Oct 2024 17:51:05 -0400 Subject: [PATCH 14/18] 14. Update 'maybe_wrap_in_math' to remove 'wrap_within' and 'post_process'! --- crates/typst-syntax/src/parser.rs | 70 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 8a1c8f76d..67d34b239 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -490,6 +490,8 @@ fn math_args(p: &mut Parser) { let mut has_arrays = false; let mut array = p.marker(); let mut arg = p.marker(); + // The number of math expressions per argument. 
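The per-argument expression count introduced below drives the 2D argument syntax: commas separate cells, semicolons close an `Array` row, and any count other than one is wrapped in (or, when empty, replaced by) a blank `Math` node. The resulting shape can be checked from the public API (a sketch):

    // Args contains two Array rows of two cells each; an omitted cell, as in
    // `mat(1, ;)`, still produces an empty Math node in its row.
    fn main() {
        println!("{:#?}", typst_syntax::parse("$ mat(1, 2; 3, 4) $"));
    }
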
+ let mut count = 0; while !p.end() && !p.at(SyntaxKind::Dollar) { if namable @@ -506,20 +508,22 @@ fn math_args(p: &mut Parser) { match p.current_text() { ")" => break, ";" => { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); p.wrap(array, SyntaxKind::Array); p.convert_and_eat(SyntaxKind::Semicolon); array = p.marker(); arg = p.marker(); + count = 0; namable = true; named = None; has_arrays = true; continue; } "," => { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); p.convert_and_eat(SyntaxKind::Comma); arg = p.marker(); + count = 0; namable = true; if named.is_some() { array = p.marker(); @@ -532,6 +536,7 @@ fn math_args(p: &mut Parser) { if p.at_set(set::MATH_EXPR) { math_expr(p); + count += 1; } else { p.unexpected(); } @@ -540,7 +545,7 @@ fn math_args(p: &mut Parser) { } if arg != p.marker() { - maybe_wrap_in_math(p, arg, named); + maybe_wrap_in_math(p, arg, count, named); if named.is_some() { array = p.marker(); } @@ -560,25 +565,26 @@ fn math_args(p: &mut Parser) { p.wrap(m, SyntaxKind::Args); } -/// Wrap math function arguments in a "Math" SyntaxKind to combine adjacent expressions -/// or create blank content. +/// Wrap math function arguments to join adjacent math content or create an +/// empty 'Math' node for when we have 0 args. /// -/// We don't wrap when `exprs == 1`, as there is only one expression, so the grouping -/// isn't needed, and this would change the type of the expression from potentially -/// non-content to content. -/// -/// Note that `exprs` might be 0 if we have whitespace or trivia before a comma i.e. -/// `mat(; ,)` or `sin(x, , , ,)`. This would create an empty Math element before that -/// trivia if we called `p.wrap()` -- breaking the expected AST for 2-d arguments -- so -/// we instead manually wrap to our current marker using `p.wrap_within()`. -fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option) { - let exprs = p.post_process(arg).filter(|node| node.is::()).count(); - if exprs != 1 { - // Convert 0 exprs into a blank math element (so empty arguments are allowed). - // Convert 2+ exprs into a math element (so they become a joined sequence). - p.wrap_within(arg, p.marker(), SyntaxKind::Math); - // We need to update `n_trivia` since we no longer have any. - p.token.n_trivia = 0; // TODO: Maybe create a `flush_trivia()` method? +/// We don't wrap when `count == 1`, since wrapping would change the type of the +/// expression from potentially non-content to content. Ex: `$ func(#12pt) $` +/// would change the type from size to content if wrapped. +fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option) { + if count == 0 { + // Flush trivia so that the new empty Math node will be wrapped _inside_ + // any `SyntaxKind::Array` elements created in `math_args`. + // (And if we don't follow by wrapping in an array, it has no effect.) + // The difference in node layout without this would look like: + // Expression: `$ mat( ;) $` + // - Correct: [ .., Space(" "), Array[Math[], ], Semicolon(";"), .. ] + // - Incorrect: [ .., Math[], Array[], Space(" "), Semicolon(";"), .. ] + p.flush_trivia(); + } + + if count != 1 { + p.wrap(arg, SyntaxKind::Math); } if let Some(m) = named { @@ -1732,14 +1738,6 @@ impl<'s> Parser<'s> { Marker(self.nodes.len() - self.token.n_trivia) } - /// Iterate over the non-trivia tokens following the marker. - #[track_caller] - fn post_process(&mut self, m: Marker) -> impl Iterator { - self.nodes[m.0..] 
- .iter_mut() - .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) - } - /// Eat the current node and return a reference for in-place mutation. #[track_caller] fn eat_and_get(&mut self) -> &mut SyntaxNode { @@ -1793,17 +1791,19 @@ impl<'s> Parser<'s> { self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode); } + /// Detach the parsed trivia nodes from this token (but not newline info) so + /// that subsequent wrapping will include the trivia. + fn flush_trivia(&mut self) { + self.token.n_trivia = 0; + self.token.prev_end = self.token.start; + } + /// Wrap the nodes from a marker up to (but excluding) the current token in /// a new [inner node](`SyntaxNode::inner`) of the given kind. This is an /// easy interface for creating nested syntax nodes _after_ having parsed /// their children. fn wrap(&mut self, from: Marker, kind: SyntaxKind) { - self.wrap_within(from, self.before_trivia(), kind); - } - - fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { - let len = self.nodes.len(); - let to = to.0.min(len); + let to = self.before_trivia().0; let from = from.0.min(to); let children = self.nodes.drain(from..to).collect(); self.nodes.insert(from, SyntaxNode::inner(kind, children)); From 26c61be1dc761306ea7f256b73344a22d843b622 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Tue, 22 Oct 2024 00:13:56 -0400 Subject: [PATCH 15/18] 15. Convert Markup mode to use newline modes (And break out Newline info into separate struct) --- crates/typst-syntax/src/lexer.rs | 5 + crates/typst-syntax/src/parser.rs | 293 ++++++++++++++++-------------- tests/suite/model/heading.typ | 17 +- 3 files changed, 166 insertions(+), 149 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 4a43c15ff..d09c6f842 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -68,6 +68,11 @@ impl<'s> Lexer<'s> { pub fn newline(&self) -> bool { self.newline } + + /// The number of characters until the most recent newline. + pub fn column(&self) -> usize { + self.s.before().chars().rev().take_while(|&c| !is_newline(c)).count() + } } impl Lexer<'_> { diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 67d34b239..6e59f45e6 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -6,13 +6,13 @@ use ecow::{eco_format, EcoString}; use unicode_math_class::MathClass; use crate::set::{syntax_set, SyntaxSet}; -use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; +use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; /// Parses a source file as top-level markup. 
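+/// (Parsing never fails: any syntax errors are embedded in the returned tree
+/// as error nodes.)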
pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup_exprs(&mut p, true, 0, |_| false); + markup_exprs(&mut p, true, |_| false); p.finish_into(SyntaxKind::Markup) } @@ -36,11 +36,14 @@ pub fn parse_math(text: &str) -> SyntaxNode { fn markup( p: &mut Parser, at_start: bool, - min_indent: usize, + wrap_trivia: bool, stop: impl FnMut(&Parser) -> bool, ) { - let m = p.marker(); - markup_exprs(p, at_start, min_indent, stop); + let m = if wrap_trivia { p.before_trivia() } else { p.marker() }; + markup_exprs(p, at_start, stop); + if wrap_trivia { + p.flush_trivia(); + } p.wrap(m, SyntaxKind::Markup); } @@ -48,9 +51,9 @@ fn markup( fn markup_exprs( p: &mut Parser, mut at_start: bool, - min_indent: usize, mut stop: impl FnMut(&Parser) -> bool, ) { + at_start |= p.had_newline(); let mut nesting: usize = 0; while !p.end() { match p.current() { @@ -59,17 +62,8 @@ fn markup_exprs( _ if stop(p) => break, _ => {} } - - if p.newline() { - at_start = true; - if min_indent > 0 && p.column(p.current_end()) < min_indent { - break; - } - p.eat(); - continue; - } - - markup_expr(p, &mut at_start); + markup_expr(p, at_start); + at_start = p.had_newline(); } } @@ -82,6 +76,7 @@ pub(super) fn reparse_markup( mut stop: impl FnMut(SyntaxKind) -> bool, ) -> Option> { let mut p = Parser::new(text, range.start, LexMode::Markup); + *at_start |= p.had_newline(); while !p.end() && p.current_start() < range.end { match p.current() { SyntaxKind::LeftBracket => *nesting += 1, @@ -89,30 +84,17 @@ pub(super) fn reparse_markup( _ if stop(p.current()) => break, _ => {} } - - if p.newline() { - *at_start = true; - p.eat(); - continue; - } - - markup_expr(&mut p, at_start); + markup_expr(&mut p, *at_start); + *at_start = p.had_newline(); } (p.balanced && p.current_start() == range.end).then(|| p.finish()) } -/// Parses a single markup expression. This includes markup elements like -/// spaces, text, and headings, and embedded code expressions. -fn markup_expr(p: &mut Parser, at_start: &mut bool) { +/// Parses a single markup expression. This includes markup elements like text, +/// headings, strong/emph, lists/enums, etc. This is also the entry point for +/// parsing math equations and embedded code expressions. 
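+///
+/// Note that headings, lists, enums, and terms are only recognized when
+/// `at_start` is true, i.e. when their marker is the first non-trivia token
+/// on its line; otherwise the marker is treated as plain text.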
+fn markup_expr(p: &mut Parser, at_start: bool) { match p.current() { - SyntaxKind::Space - | SyntaxKind::Parbreak - | SyntaxKind::LineComment - | SyntaxKind::BlockComment => { - p.eat(); - return; - } - SyntaxKind::Text | SyntaxKind::Linebreak | SyntaxKind::Escape @@ -126,10 +108,10 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), - SyntaxKind::HeadingMarker if *at_start => heading(p), - SyntaxKind::ListMarker if *at_start => list_item(p), - SyntaxKind::EnumMarker if *at_start => enum_item(p), - SyntaxKind::TermMarker if *at_start => term_item(p), + SyntaxKind::HeadingMarker if at_start => heading(p), + SyntaxKind::ListMarker if at_start => list_item(p), + SyntaxKind::EnumMarker if at_start => enum_item(p), + SyntaxKind::TermMarker if at_start => term_item(p), SyntaxKind::RefMarker => reference(p), SyntaxKind::Dollar => equation(p), @@ -141,76 +123,74 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::TermMarker | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text), - _ => { - p.unexpected(); - return; // Don't set `at_start` - } + _ => p.unexpected(), } - - *at_start = false; } /// Parses strong content: `*Strong*`. fn strong(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Star); - markup(p, false, 0, |p| p.at_set(syntax_set!(Star, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Star); - p.wrap(m, SyntaxKind::Strong); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Star); + markup(p, false, true, |p| p.at_set(syntax_set!(Star, RightBracket))); + p.expect_closing_delimiter(m, SyntaxKind::Star); + p.wrap(m, SyntaxKind::Strong); + }); } /// Parses emphasized content: `_Emphasized_`. fn emph(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::Underscore); - markup(p, false, 0, |p| p.at_set(syntax_set!(Underscore, Parbreak, RightBracket))); - p.expect_closing_delimiter(m, SyntaxKind::Underscore); - p.wrap(m, SyntaxKind::Emph); + p.with_nl_mode(AtNewline::StopParBreak, |p| { + let m = p.marker(); + p.assert(SyntaxKind::Underscore); + markup(p, false, true, |p| p.at_set(syntax_set!(Underscore, RightBracket))); + p.expect_closing_delimiter(m, SyntaxKind::Underscore); + p.wrap(m, SyntaxKind::Emph); + }); } /// Parses a section heading: `= Introduction`. fn heading(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::HeadingMarker); - whitespace_line(p); - markup(p, false, usize::MAX, |p| { - p.at_set(syntax_set!(Label, Space, RightBracket)) - && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label) + p.with_nl_mode(AtNewline::Stop, |p| { + let m = p.marker(); + p.assert(SyntaxKind::HeadingMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(Label, RightBracket))); + p.wrap(m, SyntaxKind::Heading); }); - p.wrap(m, SyntaxKind::Heading); } /// Parses an item in a bullet list: `- ...`. 
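+/// (Content on following lines only continues the item if its column is
+/// beyond the marker's, enforced via `AtNewline::RequireColumn` below.)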
fn list_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::ListMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::ListItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::ListMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(RightBracket))); + p.wrap(m, SyntaxKind::ListItem); + }); } /// Parses an item in an enumeration (numbered list): `+ ...` or `1. ...`. fn enum_item(p: &mut Parser) { - let m = p.marker(); - let min_indent = p.column(p.current_start()) + 1; - p.assert(SyntaxKind::EnumMarker); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::EnumItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.assert(SyntaxKind::EnumMarker); + markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::EnumItem); + }); } /// Parses an item in a term list: `/ Term: Details`. fn term_item(p: &mut Parser) { - let m = p.marker(); - p.assert(SyntaxKind::TermMarker); - let min_indent = p.column(p.prev_end()); - whitespace_line(p); - markup(p, false, usize::MAX, |p| p.at_set(syntax_set!(Colon, RightBracket))); - p.expect(SyntaxKind::Colon); - whitespace_line(p); - markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); - p.wrap(m, SyntaxKind::TermItem); + p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { + let m = p.marker(); + p.with_nl_mode(AtNewline::Stop, |p| { + p.assert(SyntaxKind::TermMarker); + markup(p, false, false, |p| p.at_set(syntax_set!(Colon, RightBracket))); + }); + p.expect(SyntaxKind::Colon); + markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::TermItem); + }); } /// Parses a reference: `@target`, `@target[..]`. @@ -223,20 +203,15 @@ fn reference(p: &mut Parser) { p.wrap(m, SyntaxKind::Ref); } -/// Consumes whitespace that does not contain a newline. -fn whitespace_line(p: &mut Parser) { - while !p.newline() && p.current().is_trivia() { - p.eat(); - } -} - /// Parses a mathematical equation: `$x$`, `$ x^2 $`. fn equation(p: &mut Parser) { let m = p.marker(); p.with_mode(LexMode::Math, |p| { - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + }); }); p.wrap(m, SyntaxKind::Equation); } @@ -602,7 +577,7 @@ fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { /// Parses a sequence of code expressions. 
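+/// (Expressions are separated by semicolons or line breaks.)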
fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { while !p.end() && !stop(p) { - p.with_nl_mode(AtNewline::Contextual, |p| { + p.with_nl_mode(AtNewline::ContextualContinue, |p| { if !p.at_set(set::CODE_EXPR) { p.unexpected(); return; @@ -818,9 +793,11 @@ fn code_block(p: &mut Parser) { fn content_block(p: &mut Parser) { let m = p.marker(); p.with_mode(LexMode::Markup, |p| { - p.assert(SyntaxKind::LeftBracket); - markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket)); - p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + p.with_nl_mode(AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBracket); + markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + }); }); p.wrap(m, SyntaxKind::ContentBlock); } @@ -1526,15 +1503,11 @@ fn pattern_leaf<'s>( /// [lexer modes](`LexMode`) and [newline modes](`AtNewline`). /// /// The lexer modes map to the three Typst modes and are stored in the lexer, -/// changing which`SyntaxKind`s it will generate. The mode also affects how the -/// parser treats trivia tokens (comments and whitespace). In Markup, trivia is -/// handled manually to deal with list indentation and must be explicitly eaten. -/// In Code and Math, trivia is managed internally and is implicitly eaten by -/// pushing onto the end of the `nodes` vector until a non-trivia kind is found. +/// changing which`SyntaxKind`s it will generate. /// -/// The newline mode is used in Code to determine whether a newline should end -/// the current expression. If so, the parser temporarily changes `token`'s kind -/// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original +/// The newline mode is used to determine whether a newline should end the +/// current expression. If so, the parser temporarily changes `token`'s kind to +/// a fake [`SyntaxKind::End`]. When the parser exits the mode the original /// `SyntaxKind` is restored. struct Parser<'s> { /// The source text shared with the lexer. @@ -1543,7 +1516,7 @@ struct Parser<'s> { /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`] /// defining our current Typst mode. lexer: Lexer<'s>, - /// The newline mode: whether to insert a temporary end at newlines in Code. + /// The newline mode: whether to insert a temporary end at newlines. nl_mode: AtNewline, /// The current token under inspection, not yet present in `nodes`. This /// acts like a single item of lookahead for the parser. @@ -1574,7 +1547,7 @@ struct Token { /// The number of preceding trivia before this token. n_trivia: usize, /// Whether this token's preceding trivia contained a newline. - had_newline: bool, + newline: Option, /// The index into `text` of the start of our current token (the end is /// stored as the lexer's cursor). start: usize, @@ -1582,28 +1555,52 @@ struct Token { prev_end: usize, } -/// How to proceed with parsing when at a newline in Code. +/// Information about a newline if present (currently only relevant in Markup). +#[derive(Debug, Clone, Copy)] +struct Newline { + /// The column of our token in its line. + /// + /// Note that this is actually the column of the first non-whitespace + /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) + /// because the block comment is the first non-space kind. + column: Option, + /// Whether any of our newlines were paragraph breaks. + parbreak: bool, +} + +/// How to proceed with parsing when at a newline. 
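+///
+/// For example, headings are parsed with `AtNewline::Stop` so they end at the
+/// line break, while strong and emph content use `AtNewline::StopParBreak` so
+/// that `*strong text*` may span single newlines but not paragraph breaks.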
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum AtNewline {
     /// Continue at newlines.
     Continue,
     /// Stop at any newline.
     Stop,
-    /// Continue only if there is no continuation with `else` or `.`.
-    Contextual,
+    /// Continue only if there is no continuation with `else` or `.` (Code only).
+    ContextualContinue,
+    /// Stop only at a parbreak, not normal newlines (Markup only).
+    StopParBreak,
+    /// Require that the token's column be strictly greater than the given
+    /// column (Markup only), i.e. stop at any token at or to the left of it.
+    /// If this is `usize::MAX`, acts like `Stop`.
+    RequireColumn(usize),
 }

 impl AtNewline {
     /// Whether to stop at a newline or continue based on the current context.
-    fn stop(self, kind: SyntaxKind) -> bool {
+    fn stop_at(self, Newline { column, parbreak }: Newline, kind: SyntaxKind) -> bool {
         #[allow(clippy::match_like_matches_macro)]
         match self {
             AtNewline::Continue => false,
             AtNewline::Stop => true,
-            AtNewline::Contextual => match kind {
+            AtNewline::ContextualContinue => match kind {
                 SyntaxKind::Else | SyntaxKind::Dot => false,
                 _ => true,
             },
+            AtNewline::StopParBreak => parbreak,
+            AtNewline::RequireColumn(min_col) => match column {
+                Some(column) => column <= min_col,
+                None => false, // Don't stop if we had no column.
+            },
         }
     }
 }
@@ -1688,19 +1685,24 @@ impl<'s> Parser<'s> {
         self.token.kind == kind && !self.had_trivia()
     }

-    /// Whether `token` had any trivia before it in Code/Math.
+    /// Whether `token` had any preceding trivia.
     fn had_trivia(&self) -> bool {
         self.token.n_trivia > 0
     }

-    /// Whether the current token is a newline, only used in Markup.
-    fn newline(&self) -> bool {
-        self.token.had_newline
+    /// Whether `token` had a newline among any of its preceding trivia.
+    fn had_newline(&self) -> bool {
+        self.token.newline.is_some()
     }

-    /// The number of characters until the most recent newline in `text`.
-    fn column(&self, at: usize) -> usize {
-        self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
+    /// The number of characters until the most recent newline from the current
+    /// token, or 0 if it did not follow a newline.
+    ///
+    /// Note that this is actually the column of the first non-whitespace
+    /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6)
+    /// because the block comment is the first non-space kind.
+    fn current_column(&self) -> usize {
+        self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
     }

     /// The current token's text.
@@ -1834,12 +1836,15 @@ impl<'s> Parser<'s> {
         self.nl_mode = mode;
         func(self);
         self.nl_mode = previous;
-        if mode != previous && self.token.had_newline {
-            let actual_kind = self.token.node.kind();
-            if self.nl_mode.stop(actual_kind) {
-                self.token.kind = SyntaxKind::End;
-            } else {
-                self.token.kind = actual_kind;
+        if let Some(newline) = self.token.newline {
+            if mode != previous {
+                // Restore our actual token's kind or insert a fake end.
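+                // (The token was lexed while the inner mode was active, so its
+                // kind must be re-evaluated under the restored mode.)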
+ let actual_kind = self.token.node.kind(); + if self.nl_mode.stop_at(newline, actual_kind) { + self.token.kind = SyntaxKind::End; + } else { + self.token.kind = actual_kind; + } } } } @@ -1854,25 +1859,31 @@ impl<'s> Parser<'s> { let mut start = prev_end; let (mut kind, mut node) = lexer.next(); let mut n_trivia = 0; - let mut had_newline = lexer.newline(); + let mut had_newline = false; + let mut newline = Newline { column: None, parbreak: false }; - if lexer.mode() != LexMode::Markup { - while kind.is_trivia() { - n_trivia += 1; - nodes.push(node); - start = lexer.cursor(); - (kind, node) = lexer.next(); - had_newline |= lexer.newline(); - } - if lexer.mode() == LexMode::Code && had_newline { - // Insert a temporary ['SyntaxKind::End'] to halt the parser. - // The actual `SyntaxKind` will be restored from `node` later. - if nl_mode.stop(kind) { - kind = SyntaxKind::End; + while kind.is_trivia() { + if lexer.newline() { + // Newlines are always trivia. + had_newline = true; + newline.parbreak |= kind == SyntaxKind::Parbreak; + if lexer.mode() == LexMode::Markup { + newline.column = Some(lexer.column()); } } + n_trivia += 1; + nodes.push(node); + start = lexer.cursor(); + (kind, node) = lexer.next(); } - Token { kind, node, n_trivia, had_newline, start, prev_end } + if had_newline && nl_mode.stop_at(newline, kind) { + // Insert a temporary `SyntaxKind::End` to halt the parser. + // The actual kind will be restored from `node` later. + kind = SyntaxKind::End; + } + + let newline = had_newline.then_some(newline); + Token { kind, node, n_trivia, newline, start, prev_end } } } diff --git a/tests/suite/model/heading.typ b/tests/suite/model/heading.typ index 884f203d2..d182724c8 100644 --- a/tests/suite/model/heading.typ +++ b/tests/suite/model/heading.typ @@ -38,7 +38,7 @@ multiline. --- heading-trailing-whitespace --- // Whether headings contain trailing whitespace with or without comments/labels. // Labels are special cased to immediately end headings in the parser, but also -// have unique whitespace behavior. +// #strike[have unique whitespace behavior] Now their behavior is consistent! #let join(..xs) = xs.pos().join() #let head(h) = heading(depth: 1, h) @@ -49,19 +49,20 @@ multiline. #test(head[h], [= h]) #test(head[h], [= h/**/]) -// Label behaves differently than normal trailing space and comment. -#test(head(join[h][ ]), [= h ]) -#test(head(join[h][ ]), [= h /**/]) +// #strike[Label behaves differently than normal trailing space and comment.] +// Now they behave the same! +#test(join(head[h])[ ], [= h ]) +#test(join(head[h])[ ], [= h /**/]) #test(join(head[h])[ ], [= h ]) // Combinations. -#test(head(join[h][ ][ ]), [= h /**/ ]) +#test(join(head[h])[ ][ ], [= h /**/ ]) #test(join(head[h])[ ][ ], [= h ]) -#test(head(join[h][ ]), [= h /**/]) +#test(join(head[h])[ ], [= h /**/]) #test(join(head[h])[ ], [= h/**/ ]) -// The first space attaches, but not the second -#test(join(head(join[h][ ]))[ ], [= h /**/ ]) +// #strike[The first space attaches, but not the second] Now neither attaches! +#test(join(head(join[h]))[ ][ ], [= h /**/ ]) --- heading-leading-whitespace --- // Test that leading whitespace and comments don't matter. From 86ce443806d166f3b75c2a792b0461aa35552dec Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Thu, 24 Oct 2024 22:03:35 -0400 Subject: [PATCH 16/18] 16. 
Compress with_mode and with_nl_mode to reduce rightward drift --- crates/typst-syntax/src/parser.rs | 77 +++++++++++++++---------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 6e59f45e6..761cea029 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -206,12 +206,10 @@ fn reference(p: &mut Parser) { /// Parses a mathematical equation: `$x$`, `$ x^2 $`. fn equation(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Math, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::Dollar); - math(p, |p| p.at(SyntaxKind::Dollar)); - p.expect_closing_delimiter(m, SyntaxKind::Dollar); - }); + p.enter_modes(LexMode::Math, AtNewline::Continue, |p| { + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); }); p.wrap(m, SyntaxKind::Equation); } @@ -596,30 +594,28 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { /// Parses an atomic code expression embedded in markup or math. fn embedded_code_expr(p: &mut Parser) { - p.with_mode(LexMode::Code, |p| { - p.with_nl_mode(AtNewline::Stop, |p| { - p.assert(SyntaxKind::Hash); - if p.had_trivia() { - p.expected("expression"); - return; - } + p.enter_modes(LexMode::Code, AtNewline::Stop, |p| { + p.assert(SyntaxKind::Hash); + if p.had_trivia() { + p.expected("expression"); + return; + } - let stmt = p.at_set(set::STMT); - let at = p.at_set(set::ATOMIC_CODE_EXPR); - code_expr_prec(p, true, 0); + let stmt = p.at_set(set::STMT); + let at = p.at_set(set::ATOMIC_CODE_EXPR); + code_expr_prec(p, true, 0); - // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.end() { - p.unexpected(); - } + // Consume error for things like `#12p` or `#"abc\"`.# + if !at && !p.end() { + p.unexpected(); + } - let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) - && p.eat_if(SyntaxKind::Semicolon); + let semi = (stmt || p.directly_at(SyntaxKind::Semicolon)) + && p.eat_if(SyntaxKind::Semicolon); - if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { - p.expected("semicolon or line break"); - } - }); + if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } }); } @@ -779,12 +775,10 @@ fn block(p: &mut Parser) { /// Parses a code block: `{ let x = 1; x + 2 }`. fn code_block(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Code, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::LeftBrace); - code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); - p.expect_closing_delimiter(m, SyntaxKind::RightBrace); - }); + p.enter_modes(LexMode::Code, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBrace); + code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); + p.expect_closing_delimiter(m, SyntaxKind::RightBrace); }); p.wrap(m, SyntaxKind::CodeBlock); } @@ -792,12 +786,10 @@ fn code_block(p: &mut Parser) { /// Parses a content block: `[*Hi* there!]`. 
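+/// (The body is parsed in Markup mode with `AtNewline::Continue`, so line
+/// breaks never end the block early.)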
fn content_block(p: &mut Parser) { let m = p.marker(); - p.with_mode(LexMode::Markup, |p| { - p.with_nl_mode(AtNewline::Continue, |p| { - p.assert(SyntaxKind::LeftBracket); - markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); - p.expect_closing_delimiter(m, SyntaxKind::RightBracket); - }); + p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| { + p.assert(SyntaxKind::LeftBracket); + markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); }); p.wrap(m, SyntaxKind::ContentBlock); } @@ -1815,10 +1807,15 @@ impl<'s> Parser<'s> { /// current token). This may re-lex the final token on exit. /// /// This function effectively repurposes the call stack as a stack of modes. - fn with_mode(&mut self, mode: LexMode, func: impl FnOnce(&mut Parser<'s>)) { + fn enter_modes( + &mut self, + mode: LexMode, + stop: AtNewline, + func: impl FnOnce(&mut Parser<'s>), + ) { let previous = self.lexer.mode(); self.lexer.set_mode(mode); - func(self); + self.with_nl_mode(stop, func); if mode != previous { self.lexer.set_mode(previous); self.lexer.jump(self.token.prev_end); From 9d9a1b1e33cdc379200c1d3881c34fe05c496894 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski Date: Sat, 26 Oct 2024 00:37:14 -0400 Subject: [PATCH 17/18] 17. Replace while loop closures and Parser::end() to use SyntaxSet --- crates/typst-syntax/src/parser.rs | 81 +++++++++++++--------------- crates/typst-syntax/src/reparser.rs | 8 +-- tests/ref/single-right-bracket.png | Bin 0 -> 118 bytes tests/suite/scripting/blocks.typ | 3 ++ 4 files changed, 42 insertions(+), 50 deletions(-) create mode 100644 tests/ref/single-right-bracket.png diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index 761cea029..b26cc0020 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -12,7 +12,7 @@ use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode}; pub fn parse(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse"); let mut p = Parser::new(text, 0, LexMode::Markup); - markup_exprs(&mut p, true, |_| false); + markup_exprs(&mut p, true, syntax_set!(End)); p.finish_into(SyntaxKind::Markup) } @@ -20,7 +20,7 @@ pub fn parse(text: &str) -> SyntaxNode { pub fn parse_code(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse code"); let mut p = Parser::new(text, 0, LexMode::Code); - code_exprs(&mut p, |_| false); + code_exprs(&mut p, syntax_set!(End)); p.finish_into(SyntaxKind::Code) } @@ -28,19 +28,14 @@ pub fn parse_code(text: &str) -> SyntaxNode { pub fn parse_math(text: &str) -> SyntaxNode { let _scope = typst_timing::TimingScope::new("parse math"); let mut p = Parser::new(text, 0, LexMode::Math); - math_exprs(&mut p, |_| false); + math_exprs(&mut p, syntax_set!(End)); p.finish_into(SyntaxKind::Math) } /// Parses markup expressions until a stop condition is met. -fn markup( - p: &mut Parser, - at_start: bool, - wrap_trivia: bool, - stop: impl FnMut(&Parser) -> bool, -) { +fn markup(p: &mut Parser, at_start: bool, wrap_trivia: bool, stop_set: SyntaxSet) { let m = if wrap_trivia { p.before_trivia() } else { p.marker() }; - markup_exprs(p, at_start, stop); + markup_exprs(p, at_start, stop_set); if wrap_trivia { p.flush_trivia(); } @@ -48,18 +43,15 @@ fn markup( } /// Parses a sequence of markup expressions. 
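+/// (Every `stop_set` must contain `SyntaxKind::End` so that parsing always
+/// halts at the end of input.)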
-fn markup_exprs( - p: &mut Parser, - mut at_start: bool, - mut stop: impl FnMut(&Parser) -> bool, -) { +fn markup_exprs(p: &mut Parser, mut at_start: bool, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); at_start |= p.had_newline(); let mut nesting: usize = 0; - while !p.end() { + loop { match p.current() { SyntaxKind::LeftBracket => nesting += 1, SyntaxKind::RightBracket if nesting > 0 => nesting -= 1, - _ if stop(p) => break, + _ if p.at_set(stop_set) => break, _ => {} } markup_expr(p, at_start); @@ -73,15 +65,16 @@ pub(super) fn reparse_markup( range: Range, at_start: &mut bool, nesting: &mut usize, - mut stop: impl FnMut(SyntaxKind) -> bool, + top_level: bool, ) -> Option> { let mut p = Parser::new(text, range.start, LexMode::Markup); *at_start |= p.had_newline(); - while !p.end() && p.current_start() < range.end { + while p.current_start() < range.end { match p.current() { SyntaxKind::LeftBracket => *nesting += 1, SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1, - _ if stop(p.current()) => break, + SyntaxKind::RightBracket if !top_level => break, + SyntaxKind::End => break, _ => {} } markup_expr(&mut p, *at_start); @@ -132,7 +125,7 @@ fn strong(p: &mut Parser) { p.with_nl_mode(AtNewline::StopParBreak, |p| { let m = p.marker(); p.assert(SyntaxKind::Star); - markup(p, false, true, |p| p.at_set(syntax_set!(Star, RightBracket))); + markup(p, false, true, syntax_set!(Star, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Star); p.wrap(m, SyntaxKind::Strong); }); @@ -143,7 +136,7 @@ fn emph(p: &mut Parser) { p.with_nl_mode(AtNewline::StopParBreak, |p| { let m = p.marker(); p.assert(SyntaxKind::Underscore); - markup(p, false, true, |p| p.at_set(syntax_set!(Underscore, RightBracket))); + markup(p, false, true, syntax_set!(Underscore, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Underscore); p.wrap(m, SyntaxKind::Emph); }); @@ -154,7 +147,7 @@ fn heading(p: &mut Parser) { p.with_nl_mode(AtNewline::Stop, |p| { let m = p.marker(); p.assert(SyntaxKind::HeadingMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(Label, RightBracket))); + markup(p, false, false, syntax_set!(Label, RightBracket, End)); p.wrap(m, SyntaxKind::Heading); }); } @@ -164,7 +157,7 @@ fn list_item(p: &mut Parser) { p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { let m = p.marker(); p.assert(SyntaxKind::ListMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(RightBracket))); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::ListItem); }); } @@ -174,7 +167,7 @@ fn enum_item(p: &mut Parser) { p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| { let m = p.marker(); p.assert(SyntaxKind::EnumMarker); - markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::EnumItem); }); } @@ -185,10 +178,10 @@ fn term_item(p: &mut Parser) { let m = p.marker(); p.with_nl_mode(AtNewline::Stop, |p| { p.assert(SyntaxKind::TermMarker); - markup(p, false, false, |p| p.at_set(syntax_set!(Colon, RightBracket))); + markup(p, false, false, syntax_set!(Colon, RightBracket, End)); }); p.expect(SyntaxKind::Colon); - markup(p, false, false, |p| p.at(SyntaxKind::RightBracket)); + markup(p, false, false, syntax_set!(RightBracket, End)); p.wrap(m, SyntaxKind::TermItem); }); } @@ -208,22 +201,23 @@ fn equation(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Math, AtNewline::Continue, |p| { p.assert(SyntaxKind::Dollar); 
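+        // `RightBracket` is in the stop set so an unclosed equation inside a
+        // content block stops at the block's closing bracket.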
- math(p, |p| p.at(SyntaxKind::Dollar)); + math(p, syntax_set!(Dollar, RightBracket, End)); p.expect_closing_delimiter(m, SyntaxKind::Dollar); }); p.wrap(m, SyntaxKind::Equation); } /// Parses the contents of a mathematical equation: `x^2 + 1`. -fn math(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { +fn math(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - math_exprs(p, stop); + math_exprs(p, stop_set); p.wrap(m, SyntaxKind::Math); } /// Parses a sequence of math expressions. -fn math_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { - while !p.end() && !stop(p) { +fn math_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { if p.at_set(set::MATH_EXPR) { math_expr(p); } else { @@ -398,7 +392,7 @@ fn math_delimited(p: &mut Parser) { let m = p.marker(); p.eat(); let m2 = p.marker(); - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if math_class(p.current_text()) == Some(MathClass::Closing) { p.wrap(m2, SyntaxKind::Math); p.eat(); @@ -466,7 +460,7 @@ fn math_args(p: &mut Parser) { // The number of math expressions per argument. let mut count = 0; - while !p.end() && !p.at(SyntaxKind::Dollar) { + while !p.at_set(syntax_set!(Dollar, End)) { if namable && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) && p.text[p.current_end()..].starts_with(':') @@ -566,22 +560,23 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option bool) { +fn code(p: &mut Parser, stop_set: SyntaxSet) { let m = p.marker(); - code_exprs(p, stop); + code_exprs(p, stop_set); p.wrap(m, SyntaxKind::Code); } /// Parses a sequence of code expressions. -fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { - while !p.end() && !stop(p) { +fn code_exprs(p: &mut Parser, stop_set: SyntaxSet) { + debug_assert!(stop_set.contains(SyntaxKind::End)); + while !p.at_set(stop_set) { p.with_nl_mode(AtNewline::ContextualContinue, |p| { if !p.at_set(set::CODE_EXPR) { p.unexpected(); return; } code_expr(p); - if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { + if !p.at_set(stop_set) && !p.eat_if(SyntaxKind::Semicolon) { p.expected("semicolon or line break"); if p.at(SyntaxKind::Label) { p.hint("labels can only be applied in markup mode"); @@ -596,7 +591,7 @@ fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { fn embedded_code_expr(p: &mut Parser) { p.enter_modes(LexMode::Code, AtNewline::Stop, |p| { p.assert(SyntaxKind::Hash); - if p.had_trivia() { + if p.had_trivia() || p.end() { p.expected("expression"); return; } @@ -606,7 +601,7 @@ fn embedded_code_expr(p: &mut Parser) { code_expr_prec(p, true, 0); // Consume error for things like `#12p` or `#"abc\"`.# - if !at && !p.end() { + if !at { p.unexpected(); } @@ -777,7 +772,7 @@ fn code_block(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Code, AtNewline::Continue, |p| { p.assert(SyntaxKind::LeftBrace); - code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen))); + code(p, syntax_set!(RightBrace, RightBracket, RightParen, End)); p.expect_closing_delimiter(m, SyntaxKind::RightBrace); }); p.wrap(m, SyntaxKind::CodeBlock); @@ -788,7 +783,7 @@ fn content_block(p: &mut Parser) { let m = p.marker(); p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| { p.assert(SyntaxKind::LeftBracket); - markup(p, true, true, |p| p.at(SyntaxKind::RightBracket)); + markup(p, true, true, syntax_set!(RightBracket, End)); p.expect_closing_delimiter(m, 
SyntaxKind::RightBracket); }); p.wrap(m, SyntaxKind::ContentBlock); diff --git a/crates/typst-syntax/src/reparser.rs b/crates/typst-syntax/src/reparser.rs index 7a9704906..c20d8314f 100644 --- a/crates/typst-syntax/src/reparser.rs +++ b/crates/typst-syntax/src/reparser.rs @@ -157,19 +157,13 @@ fn try_reparse( let new_range = shifted..shifted + new_len; let at_end = end == children.len(); - // Stop parsing early if this kind is encountered. - let stop_kind = match parent_kind { - Some(_) => SyntaxKind::RightBracket, - None => SyntaxKind::End, - }; - // Reparse! let reparsed = reparse_markup( text, new_range.clone(), &mut at_start, &mut nesting, - |kind| kind == stop_kind, + parent_kind.is_none(), ); if let Some(newborns) = reparsed { diff --git a/tests/ref/single-right-bracket.png b/tests/ref/single-right-bracket.png new file mode 100644 index 0000000000000000000000000000000000000000..9867424ddfa324301c82cc4dde8072d9dfaa899f GIT binary patch literal 118 zcmeAS@N?(olHy`uVBq!ia0vp^6+kS_0VEhE<%|3RQnsEhjv*Ddl7HAcG$dYm6xi*q zE9WORe@;PCL(QJexeYq|46;71IC}Wpqgv*ak0;k1UYz=M#nHuL{ZT$l3=9kQOnGR} Rb7?8aKu=dcmvv4FO#p!jD Date: Sun, 3 Nov 2024 20:35:21 -0500 Subject: [PATCH 18/18] 18. Restore list indent behavior --- crates/typst-syntax/src/lexer.rs | 8 +++--- crates/typst-syntax/src/parser.rs | 41 ++++++++++++---------------- tests/suite/model/list.typ | 45 +++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index d09c6f842..1314016fa 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -69,9 +69,11 @@ impl<'s> Lexer<'s> { self.newline } - /// The number of characters until the most recent newline. - pub fn column(&self) -> usize { - self.s.before().chars().rev().take_while(|&c| !is_newline(c)).count() + /// The number of characters until the most recent newline from an index. + pub fn column(&self, index: usize) -> usize { + let mut s = self.s; // Make a new temporary scanner (cheap). + s.jump(index); + s.before().chars().rev().take_while(|&c| !is_newline(c)).count() } } diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index b26cc0020..5fc621d6d 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -1545,11 +1545,7 @@ struct Token { /// Information about a newline if present (currently only relevant in Markup). #[derive(Debug, Clone, Copy)] struct Newline { - /// The column of our token in its line. - /// - /// Note that this is actually the column of the first non-whitespace - /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) - /// because the block comment is the first non-space kind. + /// The column of the start of our token in its line. column: Option, /// Whether any of our newlines were paragraph breaks. parbreak: bool, @@ -1684,10 +1680,6 @@ impl<'s> Parser<'s> { /// The number of characters until the most recent newline from the current /// token, or 0 if it did not follow a newline. - /// - /// Note that this is actually the column of the first non-whitespace - /// `SyntaxKind` in the line, so `\n /**/- list` has column 2 (not 6) - /// because the block comment is the first non-space kind. 
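+    ///
+    /// (This is now the raw column of the token itself: every character
+    /// between the last newline and the token's start is counted, including
+    /// comment text.)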
     fn current_column(&self) -> usize {
         self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
     }
@@ -1852,29 +1844,30 @@ impl<'s> Parser<'s> {
         let (mut kind, mut node) = lexer.next();
         let mut n_trivia = 0;
         let mut had_newline = false;
-        let mut newline = Newline { column: None, parbreak: false };
+        let mut parbreak = false;

         while kind.is_trivia() {
-            if lexer.newline() {
-                // Newlines are always trivia.
-                had_newline = true;
-                newline.parbreak |= kind == SyntaxKind::Parbreak;
-                if lexer.mode() == LexMode::Markup {
-                    newline.column = Some(lexer.column());
-                }
-            }
+            had_newline |= lexer.newline(); // Newlines are always trivia.
+            parbreak |= kind == SyntaxKind::Parbreak;
             n_trivia += 1;
             nodes.push(node);
             start = lexer.cursor();
             (kind, node) = lexer.next();
         }
-        if had_newline && nl_mode.stop_at(newline, kind) {
-            // Insert a temporary `SyntaxKind::End` to halt the parser.
-            // The actual kind will be restored from `node` later.
-            kind = SyntaxKind::End;
-        }

-        let newline = had_newline.then_some(newline);
+        let newline = if had_newline {
+            let column = (lexer.mode() == LexMode::Markup).then(|| lexer.column(start));
+            let newline = Newline { column, parbreak };
+            if nl_mode.stop_at(newline, kind) {
+                // Insert a temporary `SyntaxKind::End` to halt the parser.
+                // The actual kind will be restored from `node` later.
+                kind = SyntaxKind::End;
+            }
+            Some(newline)
+        } else {
+            None
+        };
+
         Token { kind, node, n_trivia, newline, start, prev_end }
     }
 }
diff --git a/tests/suite/model/list.typ b/tests/suite/model/list.typ
index 46f4621f5..c3c123de1 100644
--- a/tests/suite/model/list.typ
+++ b/tests/suite/model/list.typ
@@ -34,6 +34,51 @@ _Shopping list_
 - C
 - D

+--- list-indent-trivia-nesting ---
+// Test indent nesting behavior with odd trivia (comments and spaces).
+
+#let indented = [
+- a
+  /**/- b
+/**/  - c
+  /*spanning
+  multiple
+  lines */ - d
+  - e
+/**/  - f
+/**/ - g
+]
+// Current behavior is that a list item's column counts every character before
+// its marker in the line (comments included), so the block comments here shift
+// the columns these lists start at.
+
+#let item = list.item
+#let manual = {
+  [ ]
+  item({
+    [a]
+    [ ]
+    item[b]
+    [ ]; [ ]
+    item({
+      [c]
+      [ ]; [ ]
+      item[d]
+    })
+    [ ]
+    item({
+      [e]
+      [ ]; [ ]
+      item[f]
+      [ ]; [ ]
+      item[g]
+    })
+  })
+  [ ]
+}
+
+#test(indented, manual)
+
 --- list-tabs ---
 // This works because tabs are used consistently.
 - A with 1 tab