From ef4fc040b279104f6c95a5ea2f9a9d10fb0e9019 Mon Sep 17 00:00:00 2001 From: Ian Wrzesinski <133046678+wrzian@users.noreply.github.com> Date: Wed, 11 Dec 2024 06:31:04 -0500 Subject: [PATCH] Improve raw trimming (#5541) --- crates/typst-syntax/src/lexer.rs | 109 +++++++++++++++++++++++-------- tests/ref/raw-empty-lines.png | Bin 0 -> 92 bytes tests/suite/text/raw.typ | 55 +++++++++++++++- 3 files changed, 137 insertions(+), 27 deletions(-) create mode 100644 tests/ref/raw-empty-lines.png diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 358c25b20..b0cb5c464 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -251,8 +251,9 @@ impl Lexer<'_> { } } - /// Lex an entire raw segment at once. This is a convenience to avoid going - /// to and from the parser for each raw section. + /// We parse entire raw segments in the lexer as a convenience to avoid + /// going to and from the parser for each raw section. See comments in + /// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details. fn raw(&mut self) -> (SyntaxKind, SyntaxNode) { let start = self.s.cursor() - 1; @@ -313,6 +314,35 @@ impl Lexer<'_> { (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes)) } + /// Raw blocks parse a language tag, have smart behavior for trimming + /// whitespace in the start/end lines, and trim common leading whitespace + /// from all other lines as the "dedent". The exact behavior is described + /// below. + /// + /// ### The initial line: + /// - A valid Typst identifier immediately following the opening delimiter + /// is parsed as the language tag. + /// - We check the rest of the line and if all characters are whitespace, + /// trim it. Otherwise we trim a single leading space if present. + /// - If more trimmed characters follow on future lines, they will be + /// merged into the same trimmed element. + /// - If we didn't trim the entire line, the rest is kept as text. + /// + /// ### Inner lines: + /// - We determine the "dedent" by iterating over the lines. The dedent is + /// the minimum number of leading whitespace characters (not bytes) before + /// each line that has any non-whitespace characters. + /// - The opening delimiter's line does not contribute to the dedent, but + /// the closing delimiter's line does (even if that line is entirely + /// whitespace up to the delimiter). + /// - We then trim the newline and dedent characters of each line, and add a + /// (potentially empty) text element of all remaining characters. + /// + /// ### The final line: + /// - If the last line is entirely whitespace, it is trimmed. + /// - Otherwise its text is kept like an inner line. However, if the last + /// non-whitespace character of the final line is a backtick, then one + /// ascii space (if present) is trimmed from the end. fn blocky_raw(&mut self, inner_end: usize, mut push_raw: F) where F: FnMut(SyntaxKind, &Scanner), @@ -323,12 +353,10 @@ impl Lexer<'_> { push_raw(SyntaxKind::RawLang, &self.s); } - // Determine inner content between backticks. - self.s.eat_if(' '); - let inner = self.s.to(inner_end); + // The rest of the function operates on the lines between the backticks. + let mut lines = split_newlines(self.s.to(inner_end)); // Determine dedent level. - let mut lines = split_newlines(inner); let dedent = lines .iter() .skip(1) @@ -339,35 +367,61 @@ impl Lexer<'_> { .min() .unwrap_or(0); - // Trim single space in last line if text ends with a backtick. The last - // line is the one directly before the closing backticks and if it is - // just whitespace, it will be completely trimmed below. - if inner.trim_end().ends_with('`') { - if let Some(last) = lines.last_mut() { + // Trim whitespace from the last line. Will be added as a `RawTrimmed` + // kind by the check for `self.s.cursor() != inner_end` below. + if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) { + lines.pop(); + } else if let Some(last) = lines.last_mut() { + // If last line ends in a backtick, try to trim a single space. This + // check must happen before we add the first line since the last and + // first lines might be the same. + if last.trim_end().ends_with('`') { *last = last.strip_suffix(' ').unwrap_or(last); } } - let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace); - let starts_whitespace = lines.first().is_some_and(is_whitespace); - let ends_whitespace = lines.last().is_some_and(is_whitespace); - let mut lines = lines.into_iter(); - let mut skipped = false; - // Trim whitespace + newline at start. - if starts_whitespace { - self.s.advance(lines.next().unwrap().len()); - skipped = true; - } - // Trim whitespace + newline at end. - if ends_whitespace { - lines.next_back(); + // Handle the first line: trim if all whitespace, or trim a single space + // at the start. Note that the first line does not affect the dedent + // value. + if let Some(first_line) = lines.next() { + if first_line.chars().all(char::is_whitespace) { + self.s.advance(first_line.len()); + // This is the only spot we advance the scanner, but don't + // immediately call `push_raw`. But the rest of the function + // ensures we will always add this text to a `RawTrimmed` later. + debug_assert!(self.s.cursor() != inner_end); + // A proof by cases follows: + // # First case: The loop runs + // If the loop runs, there must be a newline following, so + // `cursor != inner_end`. And if the loop runs, the first thing + // it does is add a trimmed element. + // # Second case: The final if-statement runs. + // To _not_ reach the loop from here, we must have only one or + // two lines: + // 1. If one line, we cannot be here, because the first and last + // lines are the same, so this line will have been removed by + // the check for the last line being all whitespace. + // 2. If two lines, the loop will run unless the last is fully + // whitespace, but if it is, it will have been popped, then + // the final if-statement will run because the text removed + // by the last line must include at least a newline, so + // `cursor != inner_end` here. + } else { + let line_end = self.s.cursor() + first_line.len(); + if self.s.eat_if(' ') { + // Trim a single space after the lang tag on the first line. + push_raw(SyntaxKind::RawTrimmed, &self.s); + } + // We know here that the rest of the line is non-empty. + self.s.jump(line_end); + push_raw(SyntaxKind::Text, &self.s); + } } // Add lines. - for (i, line) in lines.enumerate() { - let dedent = if i == 0 && !skipped { 0 } else { dedent }; + for line in lines { let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum(); self.s.eat_newline(); self.s.advance(offset); @@ -383,6 +437,9 @@ impl Lexer<'_> { } } + /// Inline raw text is split on lines with non-newlines as `Text` kinds and + /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all + /// non-newline whitespace is kept. fn inline_raw(&mut self, inner_end: usize, mut push_raw: F) where F: FnMut(SyntaxKind, &Scanner), diff --git a/tests/ref/raw-empty-lines.png b/tests/ref/raw-empty-lines.png new file mode 100644 index 0000000000000000000000000000000000000000..dcf0d926142a1089d82c8dedb3803e8686c522e8 GIT binary patch literal 92 zcmeAS@N?(olHy`uVBq!ia0vp^6+mpr1SA;hUTARxDP>O=$B>F!$v^tV4SKvDPx!Z$ oi#@_#%N7WIqhDT})M=B+V1NDiu_LRjOF-H@UHx3vIVCg!02g{2TmS$7 literal 0 HcmV?d00001 diff --git a/tests/suite/text/raw.typ b/tests/suite/text/raw.typ index fa9e630fa..1ba216302 100644 --- a/tests/suite/text/raw.typ +++ b/tests/suite/text/raw.typ @@ -282,10 +282,40 @@ int main() { --- raw-blocky --- // Test various raw parsing edge cases. + #let empty = ( name: "empty", input: ``, text: "", + block: false, +) + +#let empty-spaces = ( + name: "empty-spaces", + input: ``` ```, + text: "", + block: false, +) + +#let empty-newlines = ( + name: "empty-newlines", + input: ``` + + +```, + text: "\n", + block: true, +) + +#let newlines-backtick = ( + name: "newlines-backtick", + input: ``` + +` + +```, + text: "\n`\n", + block: true, ) #let backtick = ( @@ -423,8 +453,18 @@ test block: true, ) +#let extra-first-line-ws = ( + name: "extra-first-line-ws", + input: eval("``` \n```"), + text: "", + block: true, +) + #let cases = ( empty, + empty-spaces, + empty-newlines, + newlines-backtick, backtick, lang-backtick, lang-space, @@ -438,10 +478,11 @@ test blocky-dedent-lastline2, blocky-tab, blocky-tab-dedent, + extra-first-line-ws, ) #for c in cases { - let block = c.at("block", default: false) + let block = c.block assert.eq(c.text, c.input.text, message: "in point " + c.name + ", expect " + repr(c.text) + ", got " + repr(c.input.text) + "") assert.eq(block, c.input.block, message: "in point " + c.name + ", expect " + repr(block) + ", got " + repr(c.input.block) + "") } @@ -556,6 +597,18 @@ print(y) --- issue-3601-empty-raw --- // Test that empty raw block with `typ` language doesn't cause a crash. ```typ +``` + +--- raw-empty-lines --- +// Test raw with multiple empty lines. + +#show raw: block.with(width: 100%, fill: gray) + +``` + + + + ``` --- issue-3841-tabs-in-raw-type-code ---