15. Convert Markup mode to use newline modes

(And break out Newline info into separate struct)
2025-07-03 10:42:52 +08:00 · 2024-10-22 00:13:56 -04:00 · 2024-10-22 00:13:56 -04:00 · 26c61be1dc
commit 26c61be1dc
parent 4ce0b069f6
3 changed files with 166 additions and 149 deletions
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@ -68,6 +68,11 @@ impl<'s> Lexer<'s> {
    pub fn newline(&self) -> bool {
        self.newline
    }
    /// The number of characters until the most recent newline.
    ///
    /// Scans `self.s.before()` (presumably the text preceding the scanner's
    /// cursor; confirm against the scanner's API) back to front and counts
    /// `char`s, not bytes, until `is_newline` matches. If no newline is found,
    /// every character of `before()` is counted.
    pub fn column(&self) -> usize {
        // Iterating in reverse means `take_while` stops at the newline that is
        // closest to the end of `before()`.
        self.s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
    }
 }
 impl Lexer<'_> {
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@ -6,13 +6,13 @@ use ecow::{eco_format, EcoString};
 use unicode_math_class::MathClass;
 use crate::set::{syntax_set, SyntaxSet};
-use crate::{ast, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode};
+use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode};
 /// Parses a source file as top-level markup.
 pub fn parse(text: &str) -> SyntaxNode {
    let _scope = typst_timing::TimingScope::new("parse");
    let mut p = Parser::new(text, 0, LexMode::Markup);
-    markup_exprs(&mut p, true, 0, |_| false);
+    markup_exprs(&mut p, true, |_| false);
    p.finish_into(SyntaxKind::Markup)
 }
@ -36,11 +36,14 @@ pub fn parse_math(text: &str) -> SyntaxNode {
 fn markup(
    p: &mut Parser,
    at_start: bool,
-    min_indent: usize,
+    wrap_trivia: bool,
    stop: impl FnMut(&Parser) -> bool,
 ) {
-    let m = p.marker();
+    let m = if wrap_trivia { p.before_trivia() } else { p.marker() };
-    markup_exprs(p, at_start, min_indent, stop);
+    markup_exprs(p, at_start, stop);
    if wrap_trivia {
        p.flush_trivia();
    }
    p.wrap(m, SyntaxKind::Markup);
 }
@ -48,9 +51,9 @@ fn markup(
 fn markup_exprs(
    p: &mut Parser,
    mut at_start: bool,
    min_indent: usize,
    mut stop: impl FnMut(&Parser) -> bool,
 ) {
    at_start |= p.had_newline();
    let mut nesting: usize = 0;
    while !p.end() {
        match p.current() {
@ -59,17 +62,8 @@ fn markup_exprs(
            _ if stop(p) => break,
            _ => {}
        }
-
+        markup_expr(p, at_start);
-        if p.newline() {
+        at_start = p.had_newline();
            at_start = true;
            if min_indent > 0 && p.column(p.current_end()) < min_indent {
                break;
            }
            p.eat();
            continue;
        }
        markup_expr(p, &mut at_start);
    }
 }
@ -82,6 +76,7 @@ pub(super) fn reparse_markup(
    mut stop: impl FnMut(SyntaxKind) -> bool,
 ) -> Option<Vec<SyntaxNode>> {
    let mut p = Parser::new(text, range.start, LexMode::Markup);
    *at_start |= p.had_newline();
    while !p.end() && p.current_start() < range.end {
        match p.current() {
            SyntaxKind::LeftBracket => *nesting += 1,
@ -89,30 +84,17 @@ pub(super) fn reparse_markup(
            _ if stop(p.current()) => break,
            _ => {}
        }
-
+        markup_expr(&mut p, *at_start);
-        if p.newline() {
+        *at_start = p.had_newline();
            *at_start = true;
            p.eat();
            continue;
        }
        markup_expr(&mut p, at_start);
    }
    (p.balanced && p.current_start() == range.end).then(|| p.finish())
 }
-/// Parses a single markup expression. This includes markup elements like
+/// Parses a single markup expression. This includes markup elements like text,
-/// spaces, text, and headings, and embedded code expressions.
+/// headings, strong/emph, lists/enums, etc. This is also the entry point for
-fn markup_expr(p: &mut Parser, at_start: &mut bool) {
+/// parsing math equations and embedded code expressions.
 fn markup_expr(p: &mut Parser, at_start: bool) {
    match p.current() {
        SyntaxKind::Space
        | SyntaxKind::Parbreak
        | SyntaxKind::LineComment
        | SyntaxKind::BlockComment => {
            p.eat();
            return;
        }
        SyntaxKind::Text
        | SyntaxKind::Linebreak
        | SyntaxKind::Escape
@ -126,10 +108,10 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
        SyntaxKind::Hash => embedded_code_expr(p),
        SyntaxKind::Star => strong(p),
        SyntaxKind::Underscore => emph(p),
-        SyntaxKind::HeadingMarker if *at_start => heading(p),
+        SyntaxKind::HeadingMarker if at_start => heading(p),
-        SyntaxKind::ListMarker if *at_start => list_item(p),
+        SyntaxKind::ListMarker if at_start => list_item(p),
-        SyntaxKind::EnumMarker if *at_start => enum_item(p),
+        SyntaxKind::EnumMarker if at_start => enum_item(p),
-        SyntaxKind::TermMarker if *at_start => term_item(p),
+        SyntaxKind::TermMarker if at_start => term_item(p),
        SyntaxKind::RefMarker => reference(p),
        SyntaxKind::Dollar => equation(p),
@ -141,76 +123,74 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
        | SyntaxKind::TermMarker
        | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text),
-        _ => {
+        _ => p.unexpected(),
            p.unexpected();
            return; // Don't set `at_start`
        }
    }
    *at_start = false;
 }
 /// Parses strong content: `*Strong*`.
 fn strong(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::StopParBreak, |p| {
-    p.assert(SyntaxKind::Star);
+        let m = p.marker();
-    markup(p, false, 0, |p| p.at_set(syntax_set!(Star, Parbreak, RightBracket)));
+        p.assert(SyntaxKind::Star);
-    p.expect_closing_delimiter(m, SyntaxKind::Star);
+        markup(p, false, true, |p| p.at_set(syntax_set!(Star, RightBracket)));
-    p.wrap(m, SyntaxKind::Strong);
+        p.expect_closing_delimiter(m, SyntaxKind::Star);
        p.wrap(m, SyntaxKind::Strong);
    });
 }
 /// Parses emphasized content: `_Emphasized_`.
 fn emph(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::StopParBreak, |p| {
-    p.assert(SyntaxKind::Underscore);
+        let m = p.marker();
-    markup(p, false, 0, |p| p.at_set(syntax_set!(Underscore, Parbreak, RightBracket)));
+        p.assert(SyntaxKind::Underscore);
-    p.expect_closing_delimiter(m, SyntaxKind::Underscore);
+        markup(p, false, true, |p| p.at_set(syntax_set!(Underscore, RightBracket)));
-    p.wrap(m, SyntaxKind::Emph);
+        p.expect_closing_delimiter(m, SyntaxKind::Underscore);
        p.wrap(m, SyntaxKind::Emph);
    });
 }
 /// Parses a section heading: `= Introduction`.
 fn heading(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::Stop, |p| {
-    p.assert(SyntaxKind::HeadingMarker);
+        let m = p.marker();
-    whitespace_line(p);
+        p.assert(SyntaxKind::HeadingMarker);
-    markup(p, false, usize::MAX, |p| {
+        markup(p, false, false, |p| p.at_set(syntax_set!(Label, RightBracket)));
-        p.at_set(syntax_set!(Label, Space, RightBracket))
+        p.wrap(m, SyntaxKind::Heading);
            && (!p.at(SyntaxKind::Space) || p.lexer.clone().next().0 == SyntaxKind::Label)
    });
    p.wrap(m, SyntaxKind::Heading);
 }
 /// Parses an item in a bullet list: `- ...`.
 fn list_item(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
-    let min_indent = p.column(p.current_start()) + 1;
+        let m = p.marker();
-    p.assert(SyntaxKind::ListMarker);
+        p.assert(SyntaxKind::ListMarker);
-    whitespace_line(p);
+        markup(p, false, false, |p| p.at_set(syntax_set!(RightBracket)));
-    markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
+        p.wrap(m, SyntaxKind::ListItem);
-    p.wrap(m, SyntaxKind::ListItem);
+    });
 }
 /// Parses an item in an enumeration (numbered list): `+ ...` or `1. ...`.
 fn enum_item(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
-    let min_indent = p.column(p.current_start()) + 1;
+        let m = p.marker();
-    p.assert(SyntaxKind::EnumMarker);
+        p.assert(SyntaxKind::EnumMarker);
-    whitespace_line(p);
+        markup(p, false, false, |p| p.at(SyntaxKind::RightBracket));
-    markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
+        p.wrap(m, SyntaxKind::EnumItem);
-    p.wrap(m, SyntaxKind::EnumItem);
+    });
 }
 /// Parses an item in a term list: `/ Term: Details`.
 fn term_item(p: &mut Parser) {
-    let m = p.marker();
+    p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
-    p.assert(SyntaxKind::TermMarker);
+        let m = p.marker();
-    let min_indent = p.column(p.prev_end());
+        p.with_nl_mode(AtNewline::Stop, |p| {
-    whitespace_line(p);
+            p.assert(SyntaxKind::TermMarker);
-    markup(p, false, usize::MAX, |p| p.at_set(syntax_set!(Colon, RightBracket)));
+            markup(p, false, false, |p| p.at_set(syntax_set!(Colon, RightBracket)));
-    p.expect(SyntaxKind::Colon);
+        });
-    whitespace_line(p);
+        p.expect(SyntaxKind::Colon);
-    markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
+        markup(p, false, false, |p| p.at(SyntaxKind::RightBracket));
-    p.wrap(m, SyntaxKind::TermItem);
+        p.wrap(m, SyntaxKind::TermItem);
    });
 }
 /// Parses a reference: `@target`, `@target[..]`.
@ -223,20 +203,15 @@ fn reference(p: &mut Parser) {
    p.wrap(m, SyntaxKind::Ref);
 }
 /// Consumes whitespace that does not contain a newline.
 ///
 /// More precisely, eats tokens one at a time for as long as the current
 /// token is trivia (`is_trivia()`) and `p.newline()` is false, so it stops
 /// at the first newline or the first non-trivia token, whichever comes first.
 fn whitespace_line(p: &mut Parser) {
    // `newline()` is checked first, so a trivia token that is flagged as a
    // newline is not eaten here.
    while !p.newline() && p.current().is_trivia() {
        p.eat();
    }
 }
 /// Parses a mathematical equation: `$x$`, `$ x^2 $`.
 fn equation(p: &mut Parser) {
    let m = p.marker();
    p.with_mode(LexMode::Math, |p| {
-        p.assert(SyntaxKind::Dollar);
+        p.with_nl_mode(AtNewline::Continue, |p| {
-        math(p, |p| p.at(SyntaxKind::Dollar));
+            p.assert(SyntaxKind::Dollar);
-        p.expect_closing_delimiter(m, SyntaxKind::Dollar);
+            math(p, |p| p.at(SyntaxKind::Dollar));
            p.expect_closing_delimiter(m, SyntaxKind::Dollar);
        });
    });
    p.wrap(m, SyntaxKind::Equation);
 }
@ -602,7 +577,7 @@ fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) {
 /// Parses a sequence of code expressions.
 fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) {
    while !p.end() && !stop(p) {
-        p.with_nl_mode(AtNewline::Contextual, |p| {
+        p.with_nl_mode(AtNewline::ContextualContinue, |p| {
            if !p.at_set(set::CODE_EXPR) {
                p.unexpected();
                return;
@ -818,9 +793,11 @@ fn code_block(p: &mut Parser) {
 fn content_block(p: &mut Parser) {
    let m = p.marker();
    p.with_mode(LexMode::Markup, |p| {
-        p.assert(SyntaxKind::LeftBracket);
+        p.with_nl_mode(AtNewline::Continue, |p| {
-        markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket));
+            p.assert(SyntaxKind::LeftBracket);
-        p.expect_closing_delimiter(m, SyntaxKind::RightBracket);
+            markup(p, true, true, |p| p.at(SyntaxKind::RightBracket));
            p.expect_closing_delimiter(m, SyntaxKind::RightBracket);
        });
    });
    p.wrap(m, SyntaxKind::ContentBlock);
 }
@ -1526,15 +1503,11 @@ fn pattern_leaf<'s>(
 /// [lexer modes](`LexMode`) and [newline modes](`AtNewline`).
 ///
 /// The lexer modes map to the three Typst modes and are stored in the lexer,
-/// changing which`SyntaxKind`s it will generate. The mode also affects how the
+/// changing which `SyntaxKind`s it will generate.
 /// parser treats trivia tokens (comments and whitespace). In Markup, trivia is
 /// handled manually to deal with list indentation and must be explicitly eaten.
 /// In Code and Math, trivia is managed internally and is implicitly eaten by
 /// pushing onto the end of the `nodes` vector until a non-trivia kind is found.
 ///
-/// The newline mode is used in Code to determine whether a newline should end
+/// The newline mode is used to determine whether a newline should end the
-/// the current expression. If so, the parser temporarily changes `token`'s kind
+/// current expression. If so, the parser temporarily changes `token`'s kind to
-/// to a fake [`SyntaxKind::End`]. When the parser exits the mode the original
+/// a fake [`SyntaxKind::End`]. When the parser exits the mode the original
 /// `SyntaxKind` is restored.
 struct Parser<'s> {
    /// The source text shared with the lexer.
@ -1543,7 +1516,7 @@ struct Parser<'s> {
    /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`]
    /// defining our current Typst mode.
    lexer: Lexer<'s>,
-    /// The newline mode: whether to insert a temporary end at newlines in Code.
+    /// The newline mode: whether to insert a temporary end at newlines.
    nl_mode: AtNewline,
    /// The current token under inspection, not yet present in `nodes`. This
    /// acts like a single item of lookahead for the parser.
@ -1574,7 +1547,7 @@ struct Token {
    /// The number of preceding trivia before this token.
    n_trivia: usize,
    /// Whether this token's preceding trivia contained a newline.
-    had_newline: bool,
+    newline: Option<Newline>,
    /// The index into `text` of the start of our current token (the end is
    /// stored as the lexer's cursor).
    start: usize,
@ -1582,28 +1555,52 @@ struct Token {
    prev_end: usize,
 }
-/// How to proceed with parsing when at a newline in Code.
+/// Information about a newline if present (currently only relevant in Markup).
 #[derive(Debug, Clone, Copy)]
 struct Newline {
    /// The column of our token in its line.
    ///
    /// Note that this is actually the column of the first non-whitespace
    /// `SyntaxKind` in the line, so `\n  /**/- list` has column 2 (not 6)
    /// because the block comment is the first non-space kind.
    ///
    /// This is `None` when no column was recorded for the newline; the lexing
    /// code only records one while the lexer is in `LexMode::Markup`.
    column: Option<usize>,
    /// Whether any of our newlines were paragraph breaks.
    ///
    /// Set when a newline-bearing trivia token has kind
    /// `SyntaxKind::Parbreak`, and stays set once true.
    parbreak: bool,
 }
 /// How to proceed with parsing when at a newline.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum AtNewline {
    /// Continue at newlines.
    Continue,
    /// Stop at any newline.
    Stop,
-    /// Continue only if there is no continuation with `else` or `.`.
+    /// Continue only if there is no continuation with `else` or `.` (Code only).
-    Contextual,
+    ContextualContinue,
    /// Stop only at a parbreak, not normal newlines (Markup only).
    StopParBreak,
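    /// Require that the token's column be strictly greater than a column
    /// (Markup only): a newline stops when its column is less than or equal
    /// to this value, and never stops when it has no column. If this is `0`,
    /// only a newline at column 0 stops; if this is `usize::MAX`, acts like
    /// `Stop` for any newline that has a column.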
    RequireColumn(usize),
 }
 impl AtNewline {
    /// Whether to stop at a newline or continue based on the current context.
-    fn stop(self, kind: SyntaxKind) -> bool {
+    fn stop_at(self, Newline { column, parbreak }: Newline, kind: SyntaxKind) -> bool {
        #[allow(clippy::match_like_matches_macro)]
        match self {
            AtNewline::Continue => false,
            AtNewline::Stop => true,
-            AtNewline::Contextual => match kind {
+            AtNewline::ContextualContinue => match kind {
                SyntaxKind::Else | SyntaxKind::Dot => false,
                _ => true,
            },
            AtNewline::StopParBreak => parbreak,
            AtNewline::RequireColumn(min_col) => match column {
                Some(column) => column <= min_col,
                None => false, // Don't stop if we had no column.
            },
        }
    }
 }
@ -1688,19 +1685,24 @@ impl<'s> Parser<'s> {
        self.token.kind == kind && !self.had_trivia()
    }
-    /// Whether `token` had any trivia before it in Code/Math.
+    /// Whether `token` had any preceding trivia.
    fn had_trivia(&self) -> bool {
        // `n_trivia` is the number of trivia tokens that preceded this token.
        self.token.n_trivia > 0
    }
-    /// Whether the current token is a newline, only used in Markup.
+    /// Whether `token` had a newline among any of its preceding trivia.
-    fn newline(&self) -> bool {
+    fn had_newline(&self) -> bool {
-        self.token.had_newline
+        self.token.newline.is_some()
    }
-    /// The number of characters until the most recent newline in `text`.
+    /// The number of characters until the most recent newline from the current
-    fn column(&self, at: usize) -> usize {
+    /// token, or 0 if it did not follow a newline.
-        self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
+    ///
    /// Note that this is actually the column of the first non-whitespace
    /// `SyntaxKind` in the line, so `\n  /**/- list` has column 2 (not 6)
    /// because the block comment is the first non-space kind.
    fn current_column(&self) -> usize {
        // Falls back to 0 in two cases: the token had no preceding newline
        // (`newline` is `None`), or the newline recorded no column
        // (`newline.column` is `None`).
        self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
    }
    /// The current token's text.
@ -1834,12 +1836,15 @@ impl<'s> Parser<'s> {
        self.nl_mode = mode;
        func(self);
        self.nl_mode = previous;
-        if mode != previous && self.token.had_newline {
+        if let Some(newline) = self.token.newline {
-            let actual_kind = self.token.node.kind();
+            if mode != previous {
-            if self.nl_mode.stop(actual_kind) {
+                // Restore our actual token's kind or insert a fake end.
-                self.token.kind = SyntaxKind::End;
+                let actual_kind = self.token.node.kind();
-            } else {
+                if self.nl_mode.stop_at(newline, actual_kind) {
-                self.token.kind = actual_kind;
+                    self.token.kind = SyntaxKind::End;
                } else {
                    self.token.kind = actual_kind;
                }
            }
        }
    }
@ -1854,25 +1859,31 @@ impl<'s> Parser<'s> {
        let mut start = prev_end;
        let (mut kind, mut node) = lexer.next();
        let mut n_trivia = 0;
-        let mut had_newline = lexer.newline();
+        let mut had_newline = false;
        let mut newline = Newline { column: None, parbreak: false };
-        if lexer.mode() != LexMode::Markup {
+        while kind.is_trivia() {
-            while kind.is_trivia() {
+            if lexer.newline() {
-                n_trivia += 1;
+                // Newlines are always trivia.
-                nodes.push(node);
+                had_newline = true;
-                start = lexer.cursor();
+                newline.parbreak |= kind == SyntaxKind::Parbreak;
-                (kind, node) = lexer.next();
+                if lexer.mode() == LexMode::Markup {
-                had_newline |= lexer.newline();
+                    newline.column = Some(lexer.column());
            }
            if lexer.mode() == LexMode::Code && had_newline {
                // Insert a temporary ['SyntaxKind::End'] to halt the parser.
                // The actual `SyntaxKind` will be restored from `node` later.
                if nl_mode.stop(kind) {
                    kind = SyntaxKind::End;
                }
            }
            n_trivia += 1;
            nodes.push(node);
            start = lexer.cursor();
            (kind, node) = lexer.next();
        }
-        Token { kind, node, n_trivia, had_newline, start, prev_end }
+        if had_newline && nl_mode.stop_at(newline, kind) {
            // Insert a temporary `SyntaxKind::End` to halt the parser.
            // The actual kind will be restored from `node` later.
            kind = SyntaxKind::End;
        }
        let newline = had_newline.then_some(newline);
        Token { kind, node, n_trivia, newline, start, prev_end }
    }
 }
--- a/tests/suite/model/heading.typ
+++ b/tests/suite/model/heading.typ
@ -38,7 +38,7 @@ multiline.
 --- heading-trailing-whitespace ---
 // Whether headings contain trailing whitespace with or without comments/labels.
 // Labels are special cased to immediately end headings in the parser, but also
-// have unique whitespace behavior.
+// #strike[have unique whitespace behavior] Now their behavior is consistent!
 #let join(..xs) = xs.pos().join()
 #let head(h) = heading(depth: 1, h)
@ -49,19 +49,20 @@ multiline.
 #test(head[h], [= h<a>])
 #test(head[h], [= h/**/<b>])
-// Label behaves differently than normal trailing space and comment.
+// #strike[Label behaves differently than normal trailing space and comment.]
-#test(head(join[h][ ]), [= h  ])
+// Now they behave the same!
-#test(head(join[h][ ]), [= h  /**/])
+#test(join(head[h])[ ], [= h  ])
 #test(join(head[h])[ ], [= h  /**/])
 #test(join(head[h])[ ], [= h  <c>])
 // Combinations.
-#test(head(join[h][ ][ ]), [= h  /**/  ])
+#test(join(head[h])[ ][ ], [= h  /**/  ])
 #test(join(head[h])[ ][ ], [= h  <d>  ])
-#test(head(join[h][ ]), [= h  /**/<e>])
+#test(join(head[h])[ ], [= h  /**/<e>])
 #test(join(head[h])[ ], [= h/**/  <f>])
-// The first space attaches, but not the second
+// #strike[The first space attaches, but not the second] Now neither attaches!
-#test(join(head(join[h][ ]))[ ], [= h  /**/  <g>])
+#test(join(head(join[h]))[ ][ ], [= h  /**/  <g>])
 --- heading-leading-whitespace ---
 // Test that leading whitespace and comments don't matter.