Fix incremental parsing bugs

2025-07-04 03:02:53 +08:00 · 2022-01-31 13:26:40 +01:00 · 2022-01-31 13:26:40 +01:00 · 8f37189d6f
commit 8f37189d6f
parent b1a82ae22d
6 changed files with 95 additions and 62 deletions
--- a/src/layout/incremental.rs
+++ b/src/layout/incremental.rs
@ -408,7 +408,7 @@ mod tests {
    }

    #[test]
-    fn test_incremental_temperature() {
+    fn test_layout_incremental_temperature() {
        let mut cache = LayoutCache::new(EvictionPolicy::None, 20);
        let regions = zero_regions();
        cache.policy = EvictionPolicy::None;
@ -447,7 +447,7 @@ mod tests {
    }

    #[test]
-    fn test_incremental_properties() {
+    fn test_layout_incremental_properties() {
        let mut cache = LayoutCache::new(EvictionPolicy::None, 20);
        cache.policy = EvictionPolicy::None;
        cache.insert(0, FramesEntry::new(empty_frames(), 1));
--- a/src/parse/incremental.rs
+++ b/src/parse/incremental.rs
@ -116,7 +116,7 @@ impl Reparser<'_> {
            // This is because in Markup mode, we want to examine all nodes
            // touching a replacement but in code we want to atomically replace.
            if child_span.contains(&self.replace_range.start)
-                || (mode == TokenMode::Markup
+                || (child_mode == TokenMode::Markup
                    && self.replace_range.start == child_span.end)
            {
                first = Some((i, offset));
@ -139,12 +139,12 @@ impl Reparser<'_> {
            // neighbor!
            if child_span.contains(&self.replace_range.end)
                || self.replace_range.end == child_span.end
-                    && (mode != TokenMode::Markup || i + 1 == original_count)
+                    && (child_mode != TokenMode::Markup || i + 1 == original_count)
            {
                outermost &= i + 1 == original_count;
                last = Some((i, offset + child.len()));
                break;
-            } else if mode != TokenMode::Markup
+            } else if child_mode != TokenMode::Markup
                || !child.kind().succession_rule().safe_in_markup()
            {
                break;
@ -404,10 +404,10 @@ impl NodeKind {
        let mode = self.mode().unwrap_or(parent_mode);
        match self.succession_rule() {
            SuccessionRule::Unsafe | SuccessionRule::UnsafeLayer => None,
-            SuccessionRule::AtomicPrimary if mode == TokenMode::Code => {
-                Some(parse_atomic)
-            }
-            SuccessionRule::AtomicPrimary => Some(parse_atomic_markup),
+            SuccessionRule::AtomicPrimary => match mode {
+                TokenMode::Code => Some(parse_atomic),
+                TokenMode::Markup => Some(parse_atomic_markup),
+            },
            SuccessionRule::SameKind(x) if x == None || x == Some(mode) => match self {
                NodeKind::Markup(_) => Some(parse_markup),
                NodeKind::Template => Some(parse_template),
@ -601,28 +601,29 @@ impl SuccessionRule {
 }

 #[cfg(test)]
+#[rustfmt::skip]
 mod tests {
    use super::*;
    use crate::parse::parse;
+    use crate::parse::tests::check;
    use crate::source::SourceFile;

-    #[test]
-    #[rustfmt::skip]
-    fn test_incremental_parse() {
-        #[track_caller]
-        fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) {
-            let mut source = SourceFile::detached(prev);
-            let range = source.edit(range, with);
-            assert_eq!(range, goal);
-            assert_eq!(parse(source.src()), *source.root());
-        }
+    #[track_caller]
+    fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) {
+        let mut source = SourceFile::detached(prev);
+        let range = source.edit(range, with);
+        check(source.src(), source.root(), &parse(source.src()));
+        assert_eq!(range, goal);
+    }

-        // Test simple replacements.
+    #[test]
+    fn test_parse_incremental_simple_replacements() {
        test("hello world", 6 .. 11, "walkers", 5 .. 13);
        test("some content", 0..12, "", 0..0);
        test("", 0..0, "do it", 0..5);
        test("a d e", 1 .. 3, " b c d", 0 .. 8);
        test("a #f() e", 1 .. 6, " b c d", 0 .. 8);
+        test("{a}", 1 .. 2, "b", 1 .. 2);
        test("{(0, 1, 2)}", 5 .. 6, "11pt", 5 .. 9);
        test("= A heading", 3 .. 3, "n evocative", 2 .. 22);
        test("your thing", 5 .. 5, "a", 4 .. 11);
@ -641,8 +642,12 @@ mod tests {
        test("understanding `code` is complicated", 15 .. 15, "C ", 0 .. 37);
        test("{ let x = g() }", 10 .. 12, "f(54", 2 .. 15);
        test("a #let rect with (fill: eastern)\nb", 16 .. 31, " (stroke: conifer", 2 .. 34);
+        test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 20);
+        test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18);
+    }

-        // Test the whitespace invariants.
+    #[test]
+    fn test_parse_incremental_whitespace_invariants() {
        test("hello \\ world", 7 .. 8, "a ", 6 .. 14);
        test("hello \\ world", 7 .. 8, " a", 6 .. 14);
        test("x = y", 1 .. 1, " + y", 0 .. 6);
@ -652,8 +657,10 @@ mod tests {
        test("#let x = (1, 2 + ; Five\r\n\r", 19..22, "2.", 18..22);
        test("hey #myfriend", 4 .. 4, "\\", 0 .. 14);
        test("hey  #myfriend", 4 .. 4, "\\", 3 .. 6);
+    }

-        // Test type invariants.
+    #[test]
+    fn test_parse_incremental_type_invariants() {
        test("a #for x in array {x}", 18 .. 21, "[#x]", 2 .. 22);
        test("a #let x = 1 {5}", 3 .. 6, "if", 0 .. 15);
        test("a {let x = 1 {5}} b", 3 .. 6, "if", 2 .. 16);
@ -664,9 +671,11 @@ mod tests {
        test("a{\nf()\n//g(a)\n}b", 7 .. 9, "", 1 .. 13);
        test("a #while x {\n g(x) \n}  b", 11 .. 11, "//", 0 .. 26);
        test("{(1, 2)}", 1 .. 1, "while ", 0 .. 14);
-        test("a b c", 1 .. 1, "{[}", 0 .. 8);
+        test("a b c", 1 .. 1, "{[}", 0 .. 5);
+    }

-        // Test unclosed things.
+    #[test]
+    fn test_parse_incremental_wrongly_or_unclosed_things() {
        test(r#"{"hi"}"#, 4 .. 5, "c", 0 .. 6);
        test(r"this \u{abcd}", 8 .. 9, "", 5 .. 12);
        test(r"this \u{abcd} that", 12 .. 13, "", 0 .. 17);
@ -675,9 +684,10 @@ mod tests {
        test("a b c", 1 .. 1, " /* letters", 0 .. 16);
        test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters */", 1 .. 35);
        test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters", 0 .. 38);
-
-        // Test raw tokens.
-        test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 20);
-        test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18);
+        test("~~~~", 2 .. 2, "[]", 1 .. 5);
+        test("a[]b", 2 .. 2, "{", 1 .. 4);
+        test("[hello]", 2 .. 3, "]", 0 .. 7);
+        test("{a}", 1 .. 2, "b", 1 .. 2);
+        test("{ a; b; c }", 5 .. 6, "[}]", 0 .. 13);
    }
 }
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@ -68,7 +68,7 @@ pub fn parse_atomic(
 ) -> Option<(Vec<Green>, bool)> {
    let mut p = Parser::with_prefix(prefix, src, TokenMode::Code);
    primary(&mut p, true).ok()?;
-    p.consume_unterminated()
+    p.consume_open_ended()
 }

 /// Parse an atomic primary. Returns `Some` if all of the input was consumed.
@ -80,7 +80,7 @@ pub fn parse_atomic_markup(
 ) -> Option<(Vec<Green>, bool)> {
    let mut p = Parser::with_prefix(prefix, src, TokenMode::Markup);
    markup_expr(&mut p);
-    p.consume_unterminated()
+    p.consume_open_ended()
 }

 /// Parse a template literal. Returns `Some` if all of the input was consumed.
@ -919,3 +919,21 @@ fn comment(p: &mut Parser) -> ParseResult {
        _ => Err(ParseError),
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::fmt::Debug;
+
+    #[track_caller]
+    pub fn check<T>(src: &str, found: T, expected: T)
+    where
+        T: Debug + PartialEq,
+    {
+        if found != expected {
+            println!("source:   {src:?}");
+            println!("expected: {expected:#?}");
+            println!("found:    {found:#?}");
+            panic!("test failed");
+        }
+    }
+}
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@ -8,6 +8,8 @@ use crate::util::EcoString;

 /// A convenient token-based parser.
 pub struct Parser<'s> {
+    /// Offsets the indentation on the first line of the source.
+    column_offset: usize,
    /// An iterator over the source tokens.
    tokens: Tokens<'s>,
    /// Whether we are at the end of the file or of a group.
@ -22,11 +24,10 @@ pub struct Parser<'s> {
    groups: Vec<GroupEntry>,
    /// The children of the currently built node.
    children: Vec<Green>,
-    /// Is `Some` if there is an unterminated group at the last position where
-    /// groups were terminated.
-    last_unterminated: Option<usize>,
-    /// Offsets the indentation on the first line of the source.
-    column_offset: usize,
+    /// Whether the last group was not correctly terminated.
+    unterminated_group: bool,
+    /// Whether a group terminator was found that did not close a group.
+    stray_terminator: bool,
 }

 impl<'s> Parser<'s> {
@ -35,6 +36,7 @@ impl<'s> Parser<'s> {
        let mut tokens = Tokens::new(src, mode);
        let current = tokens.next();
        Self {
+            column_offset: 0,
            tokens,
            eof: current.is_none(),
            current,
@ -42,8 +44,8 @@ impl<'s> Parser<'s> {
            current_start: 0,
            groups: vec![],
            children: vec![],
-            last_unterminated: None,
-            column_offset: 0,
+            unterminated_group: false,
+            stray_terminator: false,
        }
    }

@ -70,7 +72,7 @@ impl<'s> Parser<'s> {

    /// End the parsing process and return multiple children and whether the
    /// last token was terminated, even if there remains stuff in the string.
-    pub fn consume_unterminated(self) -> Option<(Vec<Green>, bool)> {
+    pub fn consume_open_ended(self) -> Option<(Vec<Green>, bool)> {
        self.terminated().then(|| (self.children, self.tokens.terminated()))
    }

@ -120,6 +122,13 @@ impl<'s> Parser<'s> {

    /// Consume the current token and also trailing trivia.
    pub fn eat(&mut self) {
+        self.stray_terminator |= match self.current {
+            Some(NodeKind::RightParen) => !self.inside(Group::Paren),
+            Some(NodeKind::RightBracket) => !self.inside(Group::Bracket),
+            Some(NodeKind::RightBrace) => !self.inside(Group::Brace),
+            _ => false,
+        };
+
        self.prev_end = self.tokens.index();
        self.bump();

@ -259,13 +268,14 @@ impl<'s> Parser<'s> {
    /// This panics if no group was started.
    #[track_caller]
    pub fn end_group(&mut self) {
+        // If another group closes after a group with a missing terminator,
+        // that group's scope of influence ends here and it no longer taints
+        // the rest of the reparse.
+        self.unterminated_group = false;
+
        let group_mode = self.tokens.mode();
        let group = self.groups.pop().expect("no started group");
        self.tokens.set_mode(group.prev_mode);
-        self.repeek();
-        if self.last_unterminated != Some(self.prev_end()) {
-            self.last_unterminated = None;
-        }

        let mut rescan = self.tokens.mode() != group_mode;

@ -280,12 +290,16 @@ impl<'s> Parser<'s> {
            Group::Imports => None,
        } {
            if self.current.as_ref() == Some(&end) {
-                // Bump the delimeter and return. No need to rescan in this case.
+                // Bump the delimiter and return. No need to rescan in this
+                // case. Also, we know that the delimiter is not stray even
+                // though we already removed the group.
+                let s = self.stray_terminator;
                self.eat();
+                self.stray_terminator = s;
                rescan = false;
            } else if required {
                self.push_error(format_eco!("expected {}", end));
-                self.last_unterminated = Some(self.prev_end());
+                self.unterminated_group = true;
            }
        }

@ -299,13 +313,14 @@ impl<'s> Parser<'s> {
            self.prev_end = self.tokens.index();
            self.current_start = self.tokens.index();
            self.current = self.tokens.next();
-            self.repeek();
        }
+
+        self.repeek();
    }

    /// Checks if all groups were correctly terminated.
-    pub fn terminated(&self) -> bool {
-        self.groups.is_empty() && self.last_unterminated.is_none()
+    fn terminated(&self) -> bool {
+        self.groups.is_empty() && !self.unterminated_group && !self.stray_terminator
    }

    /// Low-level bump that consumes exactly one token without special trivia
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@ -568,9 +568,8 @@ fn keyword(ident: &str) -> Option<NodeKind> {
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod tests {
-    use std::fmt::Debug;
-
    use super::*;
+    use crate::parse::tests::check;

    use ErrorPos::*;
    use NodeKind::*;
@ -687,19 +686,6 @@ mod tests {
        }};
    }

-    #[track_caller]
-    fn check<T>(src: &str, found: T, expected: T)
-    where
-        T: Debug + PartialEq,
-    {
-        if found != expected {
-            println!("source:   {src:?}");
-            println!("expected: {expected:#?}");
-            println!("found:    {found:#?}");
-            panic!("test failed");
-        }
-    }
-
    #[test]
    fn test_tokenize_brackets() {
        // Test in markup.
--- a/tests/typeset.rs
+++ b/tests/typeset.rs
@ -392,6 +392,10 @@ fn print_error(source: &SourceFile, line: usize, error: &Error) {
 fn test_reparse(src: &str, i: usize, rng: &mut LinearShift) -> bool {
    let supplements = [
        "[",
+        "]",
+        "{",
+        "}",
+        "(",
        ")",
        "#rect()",
        "a word",