From ef4fc040b279104f6c95a5ea2f9a9d10fb0e9019 Mon Sep 17 00:00:00 2001
From: Ian Wrzesinski <133046678+wrzian@users.noreply.github.com>
Date: Wed, 11 Dec 2024 06:31:04 -0500
Subject: [PATCH] Improve raw trimming (#5541)

---
 crates/typst-syntax/src/lexer.rs | 109 +++++++++++++++++++++++--------
 tests/ref/raw-empty-lines.png    | Bin 0 -> 92 bytes
 tests/suite/text/raw.typ         |  55 +++++++++++++++-
 3 files changed, 137 insertions(+), 27 deletions(-)
 create mode 100644 tests/ref/raw-empty-lines.png
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 358c25b20..b0cb5c464 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -251,8 +251,9 @@ impl Lexer<'_> {
         }
     }
 
-    /// Lex an entire raw segment at once. This is a convenience to avoid going
-    /// to and from the parser for each raw section.
+    /// We parse entire raw segments in the lexer as a convenience to avoid
+    /// going to and from the parser for each raw section. See comments in
+    /// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details.
     fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
         let start = self.s.cursor() - 1;
 
@@ -313,6 +314,35 @@ impl Lexer<'_> {
         (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
     }
 
+    /// Raw blocks parse a language tag, have smart behavior for trimming
+    /// whitespace in the start/end lines, and trim common leading whitespace
+    /// from all other lines as the "dedent". The exact behavior is described
+    /// below.
+    ///
+    /// ### The initial line:
+    /// - A valid Typst identifier immediately following the opening delimiter
+    ///   is parsed as the language tag.
+    /// - We check the rest of the line and if all characters are whitespace,
+    ///   trim it. Otherwise we trim a single leading space if present.
+    ///   - If more trimmed characters follow on future lines, they will be
+    ///     merged into the same trimmed element.
+    /// - If we didn't trim the entire line, the rest is kept as text.
+    ///
+    /// ### Inner lines:
+    /// - We determine the "dedent" by iterating over the lines. The dedent is
+    ///   the minimum number of leading whitespace characters (not bytes) before
+    ///   each line that has any non-whitespace characters.
+    ///   - The opening delimiter's line does not contribute to the dedent, but
+    ///     the closing delimiter's line does (even if that line is entirely
+    ///     whitespace up to the delimiter).
+    /// - We then trim the newline and dedent characters of each line, and add a
+    ///   (potentially empty) text element of all remaining characters.
+    ///
+    /// ### The final line:
+    /// - If the last line is entirely whitespace, it is trimmed.
+    /// - Otherwise its text is kept like an inner line. However, if the last
+    ///   non-whitespace character of the final line is a backtick, then one
+    ///   ascii space (if present) is trimmed from the end.
     fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
     where
         F: FnMut(SyntaxKind, &Scanner),
@@ -323,12 +353,10 @@ impl Lexer<'_> {
             push_raw(SyntaxKind::RawLang, &self.s);
         }
 
-        // Determine inner content between backticks.
-        self.s.eat_if(' ');
-        let inner = self.s.to(inner_end);
+        // The rest of the function operates on the lines between the backticks.
+        let mut lines = split_newlines(self.s.to(inner_end));
 
         // Determine dedent level.
-        let mut lines = split_newlines(inner);
         let dedent = lines
             .iter()
             .skip(1)
@@ -339,35 +367,61 @@ impl Lexer<'_> {
             .min()
             .unwrap_or(0);
 
-        // Trim single space in last line if text ends with a backtick. The last
-        // line is the one directly before the closing backticks and if it is
-        // just whitespace, it will be completely trimmed below.
-        if inner.trim_end().ends_with('`') {
-            if let Some(last) = lines.last_mut() {
+        // Trim whitespace from the last line. Will be added as a `RawTrimmed`
+        // kind by the check for `self.s.cursor() != inner_end` below.
+        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
+            lines.pop();
+        } else if let Some(last) = lines.last_mut() {
+            // If last line ends in a backtick, try to trim a single space. This
+            // check must happen before we add the first line since the last and
+            // first lines might be the same.
+            if last.trim_end().ends_with('`') {
                 *last = last.strip_suffix(' ').unwrap_or(last);
             }
         }
 
-        let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
-        let starts_whitespace = lines.first().is_some_and(is_whitespace);
-        let ends_whitespace = lines.last().is_some_and(is_whitespace);
-
         let mut lines = lines.into_iter();
-        let mut skipped = false;
 
-        // Trim whitespace + newline at start.
-        if starts_whitespace {
-            self.s.advance(lines.next().unwrap().len());
-            skipped = true;
-        }
-        // Trim whitespace + newline at end.
-        if ends_whitespace {
-            lines.next_back();
+        // Handle the first line: trim if all whitespace, or trim a single space
+        // at the start. Note that the first line does not affect the dedent
+        // value.
+        if let Some(first_line) = lines.next() {
+            if first_line.chars().all(char::is_whitespace) {
+                self.s.advance(first_line.len());
+                // This is the only spot we advance the scanner, but don't
+                // immediately call `push_raw`. But the rest of the function
+                // ensures we will always add this text to a `RawTrimmed` later.
+                debug_assert!(self.s.cursor() != inner_end);
+                // A proof by cases follows:
+                // # First case: The loop runs
+                // If the loop runs, there must be a newline following, so
+                // `cursor != inner_end`. And if the loop runs, the first thing
+                // it does is add a trimmed element.
+                // # Second case: The final if-statement runs.
+                // To _not_ reach the loop from here, we must have only one or
+                // two lines:
+                // 1. If one line, we cannot be here, because the first and last
+                //    lines are the same, so this line will have been removed by
+                //    the check for the last line being all whitespace.
+                // 2. If two lines, the loop will run unless the last is fully
+                //    whitespace, but if it is, it will have been popped, then
+                //    the final if-statement will run because the text removed
+                //    by the last line must include at least a newline, so
+                //    `cursor != inner_end` here.
+            } else {
+                let line_end = self.s.cursor() + first_line.len();
+                if self.s.eat_if(' ') {
+                    // Trim a single space after the lang tag on the first line.
+                    push_raw(SyntaxKind::RawTrimmed, &self.s);
+                }
+                // We know here that the rest of the line is non-empty.
+                self.s.jump(line_end);
+                push_raw(SyntaxKind::Text, &self.s);
+            }
         }
 
         // Add lines.
-        for (i, line) in lines.enumerate() {
-            let dedent = if i == 0 && !skipped { 0 } else { dedent };
+        for line in lines {
             let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
             self.s.eat_newline();
             self.s.advance(offset);
@@ -383,6 +437,9 @@ impl Lexer<'_> {
         }
     }
 
+    /// Inline raw text is split on lines with non-newlines as `Text` kinds and
+    /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
+    /// non-newline whitespace is kept.
     fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
     where
         F: FnMut(SyntaxKind, &Scanner),
diff --git a/tests/ref/raw-empty-lines.png b/tests/ref/raw-empty-lines.png
new file mode 100644
index 0000000000000000000000000000000000000000..dcf0d926142a1089d82c8dedb3803e8686c522e8
GIT binary patch
literal 92
zcmeAS@N?(olHy`uVBq!ia0vp^6+mpr1SA;hUTARxDP>O=$B>F!$v^tV4SKvDPx!Z$
oi#@_#%N7WIqhDT})M=B+V1NDiu_LRjOF-H@UHx3vIVCg!02g{2TmS$7

literal 0
HcmV?d00001

diff --git a/tests/suite/text/raw.typ b/tests/suite/text/raw.typ
index fa9e630fa..1ba216302 100644
--- a/tests/suite/text/raw.typ
+++ b/tests/suite/text/raw.typ
@@ -282,10 +282,40 @@ int main() {
 
 --- raw-blocky ---
 // Test various raw parsing edge cases.
+
 #let empty = (
   name: "empty",
   input: ``,
   text: "",
+  block: false,
+)
+
+#let empty-spaces = (
+  name: "empty-spaces",
+  input: ```   ```,
+  text: "",
+  block: false,
+)
+
+#let empty-newlines = (
+  name: "empty-newlines",
+  input: ```
+
+
+```,
+  text: "\n",
+  block: true,
+)
+
+#let newlines-backtick = (
+  name: "newlines-backtick",
+  input: ```
+
+`
+
+```,
+  text: "\n`\n",
+  block: true,
 )
 
 #let backtick = (
@@ -423,8 +453,18 @@ test
   block: true,
 )
 
+#let extra-first-line-ws = (
+  name: "extra-first-line-ws",
+  input: eval("```   \n```"),
+  text: "",
+  block: true,
+)
+
 #let cases = (
   empty,
+  empty-spaces,
+  empty-newlines,
+  newlines-backtick,
   backtick,
   lang-backtick,
   lang-space,
@@ -438,10 +478,11 @@ test
   blocky-dedent-lastline2,
   blocky-tab,
   blocky-tab-dedent,
+  extra-first-line-ws,
 )
 
 #for c in cases {
-  let block = c.at("block", default: false)
+  let block = c.block
   assert.eq(c.text, c.input.text, message: "in point " + c.name + ", expect " + repr(c.text) + ", got " + repr(c.input.text) + "")
   assert.eq(block, c.input.block, message: "in point " + c.name + ", expect " + repr(block) + ", got " + repr(c.input.block) + "")
 }
@@ -556,6 +597,18 @@ print(y)
 --- issue-3601-empty-raw ---
 // Test that empty raw block with `typ` language doesn't cause a crash.
 ```typ
+```
+
+--- raw-empty-lines ---
+// Test raw with multiple empty lines.
+
+#show raw: block.with(width: 100%, fill: gray)
+
+```
+
+
+
+
 ```
 
 --- issue-3841-tabs-in-raw-type-code ---