mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Improve raw trimming (#5541)
This commit is contained in:
parent
a5ade167dd
commit
ef4fc040b2
@ -251,8 +251,9 @@ impl Lexer<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Lex an entire raw segment at once. This is a convenience to avoid going
|
||||
/// to and from the parser for each raw section.
|
||||
/// We parse entire raw segments in the lexer as a convenience to avoid
|
||||
/// going to and from the parser for each raw section. See comments in
|
||||
/// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details.
|
||||
fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
|
||||
let start = self.s.cursor() - 1;
|
||||
|
||||
@ -313,6 +314,35 @@ impl Lexer<'_> {
|
||||
(SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
|
||||
}
|
||||
|
||||
/// Raw blocks parse a language tag, have smart behavior for trimming
|
||||
/// whitespace in the start/end lines, and trim common leading whitespace
|
||||
/// from all other lines as the "dedent". The exact behavior is described
|
||||
/// below.
|
||||
///
|
||||
/// ### The initial line:
|
||||
/// - A valid Typst identifier immediately following the opening delimiter
|
||||
/// is parsed as the language tag.
|
||||
/// - We check the rest of the line and if all characters are whitespace,
|
||||
/// trim it. Otherwise we trim a single leading space if present.
|
||||
/// - If more trimmed characters follow on future lines, they will be
|
||||
/// merged into the same trimmed element.
|
||||
/// - If we didn't trim the entire line, the rest is kept as text.
|
||||
///
|
||||
/// ### Inner lines:
|
||||
/// - We determine the "dedent" by iterating over the lines. The dedent is
|
||||
/// the minimum number of leading whitespace characters (not bytes) before
|
||||
/// each line that has any non-whitespace characters.
|
||||
/// - The opening delimiter's line does not contribute to the dedent, but
|
||||
/// the closing delimiter's line does (even if that line is entirely
|
||||
/// whitespace up to the delimiter).
|
||||
/// - We then trim the newline and dedent characters of each line, and add a
|
||||
/// (potentially empty) text element of all remaining characters.
|
||||
///
|
||||
/// ### The final line:
|
||||
/// - If the last line is entirely whitespace, it is trimmed.
|
||||
/// - Otherwise its text is kept like an inner line. However, if the last
|
||||
/// non-whitespace character of the final line is a backtick, then one
|
||||
/// ascii space (if present) is trimmed from the end.
|
||||
fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
|
||||
where
|
||||
F: FnMut(SyntaxKind, &Scanner),
|
||||
@ -323,12 +353,10 @@ impl Lexer<'_> {
|
||||
push_raw(SyntaxKind::RawLang, &self.s);
|
||||
}
|
||||
|
||||
// Determine inner content between backticks.
|
||||
self.s.eat_if(' ');
|
||||
let inner = self.s.to(inner_end);
|
||||
// The rest of the function operates on the lines between the backticks.
|
||||
let mut lines = split_newlines(self.s.to(inner_end));
|
||||
|
||||
// Determine dedent level.
|
||||
let mut lines = split_newlines(inner);
|
||||
let dedent = lines
|
||||
.iter()
|
||||
.skip(1)
|
||||
@ -339,35 +367,61 @@ impl Lexer<'_> {
|
||||
.min()
|
||||
.unwrap_or(0);
|
||||
|
||||
// Trim single space in last line if text ends with a backtick. The last
|
||||
// line is the one directly before the closing backticks and if it is
|
||||
// just whitespace, it will be completely trimmed below.
|
||||
if inner.trim_end().ends_with('`') {
|
||||
if let Some(last) = lines.last_mut() {
|
||||
// Trim whitespace from the last line. Will be added as a `RawTrimmed`
|
||||
// kind by the check for `self.s.cursor() != inner_end` below.
|
||||
if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
|
||||
lines.pop();
|
||||
} else if let Some(last) = lines.last_mut() {
|
||||
// If last line ends in a backtick, try to trim a single space. This
|
||||
// check must happen before we add the first line since the last and
|
||||
// first lines might be the same.
|
||||
if last.trim_end().ends_with('`') {
|
||||
*last = last.strip_suffix(' ').unwrap_or(last);
|
||||
}
|
||||
}
|
||||
|
||||
let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
|
||||
let starts_whitespace = lines.first().is_some_and(is_whitespace);
|
||||
let ends_whitespace = lines.last().is_some_and(is_whitespace);
|
||||
|
||||
let mut lines = lines.into_iter();
|
||||
let mut skipped = false;
|
||||
|
||||
// Trim whitespace + newline at start.
|
||||
if starts_whitespace {
|
||||
self.s.advance(lines.next().unwrap().len());
|
||||
skipped = true;
|
||||
}
|
||||
// Trim whitespace + newline at end.
|
||||
if ends_whitespace {
|
||||
lines.next_back();
|
||||
// Handle the first line: trim if all whitespace, or trim a single space
|
||||
// at the start. Note that the first line does not affect the dedent
|
||||
// value.
|
||||
if let Some(first_line) = lines.next() {
|
||||
if first_line.chars().all(char::is_whitespace) {
|
||||
self.s.advance(first_line.len());
|
||||
// This is the only spot we advance the scanner, but don't
|
||||
// immediately call `push_raw`. But the rest of the function
|
||||
// ensures we will always add this text to a `RawTrimmed` later.
|
||||
debug_assert!(self.s.cursor() != inner_end);
|
||||
// A proof by cases follows:
|
||||
// # First case: The loop runs
|
||||
// If the loop runs, there must be a newline following, so
|
||||
// `cursor != inner_end`. And if the loop runs, the first thing
|
||||
// it does is add a trimmed element.
|
||||
// # Second case: The final if-statement runs.
|
||||
// To _not_ reach the loop from here, we must have only one or
|
||||
// two lines:
|
||||
// 1. If one line, we cannot be here, because the first and last
|
||||
// lines are the same, so this line will have been removed by
|
||||
// the check for the last line being all whitespace.
|
||||
// 2. If two lines, the loop will run unless the last is fully
|
||||
// whitespace, but if it is, it will have been popped, then
|
||||
// the final if-statement will run because the text removed
|
||||
// by the last line must include at least a newline, so
|
||||
// `cursor != inner_end` here.
|
||||
} else {
|
||||
let line_end = self.s.cursor() + first_line.len();
|
||||
if self.s.eat_if(' ') {
|
||||
// Trim a single space after the lang tag on the first line.
|
||||
push_raw(SyntaxKind::RawTrimmed, &self.s);
|
||||
}
|
||||
// We know here that the rest of the line is non-empty.
|
||||
self.s.jump(line_end);
|
||||
push_raw(SyntaxKind::Text, &self.s);
|
||||
}
|
||||
}
|
||||
|
||||
// Add lines.
|
||||
for (i, line) in lines.enumerate() {
|
||||
let dedent = if i == 0 && !skipped { 0 } else { dedent };
|
||||
for line in lines {
|
||||
let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
|
||||
self.s.eat_newline();
|
||||
self.s.advance(offset);
|
||||
@ -383,6 +437,9 @@ impl Lexer<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Inline raw text is split on lines with non-newlines as `Text` kinds and
|
||||
/// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
|
||||
/// non-newline whitespace is kept.
|
||||
fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
|
||||
where
|
||||
F: FnMut(SyntaxKind, &Scanner),
|
||||
|
BIN
tests/ref/raw-empty-lines.png
Normal file
BIN
tests/ref/raw-empty-lines.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 92 B |
@ -282,10 +282,40 @@ int main() {
|
||||
|
||||
--- raw-blocky ---
|
||||
// Test various raw parsing edge cases.
|
||||
|
||||
#let empty = (
|
||||
name: "empty",
|
||||
input: ``,
|
||||
text: "",
|
||||
block: false,
|
||||
)
|
||||
|
||||
#let empty-spaces = (
|
||||
name: "empty-spaces",
|
||||
input: ``` ```,
|
||||
text: "",
|
||||
block: false,
|
||||
)
|
||||
|
||||
#let empty-newlines = (
|
||||
name: "empty-newlines",
|
||||
input: ```
|
||||
|
||||
|
||||
```,
|
||||
text: "\n",
|
||||
block: true,
|
||||
)
|
||||
|
||||
#let newlines-backtick = (
|
||||
name: "newlines-backtick",
|
||||
input: ```
|
||||
|
||||
`
|
||||
|
||||
```,
|
||||
text: "\n`\n",
|
||||
block: true,
|
||||
)
|
||||
|
||||
#let backtick = (
|
||||
@ -423,8 +453,18 @@ test
|
||||
block: true,
|
||||
)
|
||||
|
||||
#let extra-first-line-ws = (
|
||||
name: "extra-first-line-ws",
|
||||
input: eval("``` \n```"),
|
||||
text: "",
|
||||
block: true,
|
||||
)
|
||||
|
||||
#let cases = (
|
||||
empty,
|
||||
empty-spaces,
|
||||
empty-newlines,
|
||||
newlines-backtick,
|
||||
backtick,
|
||||
lang-backtick,
|
||||
lang-space,
|
||||
@ -438,10 +478,11 @@ test
|
||||
blocky-dedent-lastline2,
|
||||
blocky-tab,
|
||||
blocky-tab-dedent,
|
||||
extra-first-line-ws,
|
||||
)
|
||||
|
||||
#for c in cases {
|
||||
let block = c.at("block", default: false)
|
||||
let block = c.block
|
||||
assert.eq(c.text, c.input.text, message: "in point " + c.name + ", expect " + repr(c.text) + ", got " + repr(c.input.text) + "")
|
||||
assert.eq(block, c.input.block, message: "in point " + c.name + ", expect " + repr(block) + ", got " + repr(c.input.block) + "")
|
||||
}
|
||||
@ -556,6 +597,18 @@ print(y)
|
||||
--- issue-3601-empty-raw ---
|
||||
// Test that empty raw block with `typ` language doesn't cause a crash.
|
||||
```typ
|
||||
```
|
||||
|
||||
--- raw-empty-lines ---
|
||||
// Test raw with multiple empty lines.
|
||||
|
||||
#show raw: block.with(width: 100%, fill: gray)
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
--- issue-3841-tabs-in-raw-type-code ---
|
||||
|
Loading…
x
Reference in New Issue
Block a user