From 23f1c86b847642170ffdbb191ef6a2d0984f8928 Mon Sep 17 00:00:00 2001 From: Tobias Schmitz Date: Mon, 19 May 2025 19:17:20 +0200 Subject: [PATCH] feat: try to report line and column in files that contain invalid utf-8 --- Cargo.lock | 1 + Cargo.toml | 1 + crates/typst-library/Cargo.toml | 1 + crates/typst-library/src/diag.rs | 44 ++++++++++++++++++++++--- crates/typst-library/src/loading/mod.rs | 2 +- 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64276b901..23052a870 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3112,6 +3112,7 @@ dependencies = [ "unicode-segmentation", "unscanny", "usvg", + "utf8_iter", "wasmi", "xmlwriter", ] diff --git a/Cargo.toml b/Cargo.toml index b4890e3c1..b548245fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,6 +135,7 @@ unicode-segmentation = "1" unscanny = "0.1" ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] } usvg = { version = "0.45", default-features = false, features = ["text"] } +utf8_iter = "1.0.4" walkdir = "2" wasmi = "0.40.0" web-sys = "0.3" diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml index b210637a8..f4b219882 100644 --- a/crates/typst-library/Cargo.toml +++ b/crates/typst-library/Cargo.toml @@ -66,6 +66,7 @@ unicode-normalization = { workspace = true } unicode-segmentation = { workspace = true } unscanny = { workspace = true } usvg = { workspace = true } +utf8_iter = { workspace = true } wasmi = { workspace = true } xmlwriter = { workspace = true } diff --git a/crates/typst-library/src/diag.rs b/crates/typst-library/src/diag.rs index 1d8018afd..6372c1c69 100644 --- a/crates/typst-library/src/diag.rs +++ b/crates/typst-library/src/diag.rs @@ -10,6 +10,7 @@ use comemo::Tracked; use ecow::{eco_vec, EcoVec}; use typst_syntax::package::{PackageSpec, PackageVersion}; use typst_syntax::{Lines, Span, Spanned, SyntaxError}; +use utf8_iter::ErrorReportingUtf8Chars; use crate::engine::Engine; use 
crate::loading::{LoadSource, Loaded}; @@ -577,12 +578,12 @@ impl Loaded { msg: impl std::fmt::Display, error: impl std::fmt::Display, ) -> EcoVec { + let pos = pos.into(); let lines = Lines::from_bytes(&self.bytes); match (self.source.v, lines) { // Only report an error in an external file, // if it is human readable (valid utf-8). (LoadSource::Path(file_id), Ok(lines)) => { - let pos = pos.into(); if let Some(range) = pos.range(&lines) { let span = Span::from_range(file_id, range); return eco_vec!(error!(span, "{msg} ({error})")); @@ -600,20 +601,28 @@ impl Loaded { }; eco_vec![error] } - _ => self.err_in_bytes(pos, msg, error), + (_, Ok(lines)) => { + let error = if let Some(pair) = pos.line_col(&lines) { + let (line, col) = pair.numbers(); + error!(self.source.span, "{msg} ({error} at {line}:{col})") + } else { + error!(self.source.span, "{msg} ({error})") + }; + eco_vec![error] + } + _ => self.err_in_invalid_text(pos, msg, error), } } /// Report an error, possibly in an external file. - pub fn err_in_bytes( + pub fn err_in_invalid_text( &self, pos: impl Into, msg: impl std::fmt::Display, error: impl std::fmt::Display, ) -> EcoVec { let pos = pos.into(); - let result = Lines::from_bytes(&self.bytes).ok().and_then(|l| pos.line_col(&l)); - let error = if let Some(pair) = result { + let error = if let Some(pair) = pos.try_line_col(&self.bytes) { let (line, col) = pair.numbers(); error!(self.source.span, "{msg} ({error} at {line}:{col})") } else { @@ -671,6 +680,17 @@ impl ReportPos { ReportPos::None => None, } } + + /// Either get the line/column pair, or try to compute it from possibly + /// invalid utf-8 data. + fn try_line_col(&self, bytes: &[u8]) -> Option { + match self { + &ReportPos::Full(_, pair) => Some(pair), + ReportPos::Range(range) => LineCol::try_from_byte_pos(range.start, bytes), + &ReportPos::LineCol(pair) => Some(pair), + ReportPos::None => None, + } + } } /// A line/column pair. 
@@ -696,6 +716,20 @@ impl LineCol {
         }
     }
 
+    /// Try to compute a line/column pair from possibly invalid utf-8 data.
+    /// Returns `None` if `pos` lies beyond the end of `bytes`.
+    pub fn try_from_byte_pos(pos: usize, bytes: &[u8]) -> Option<Self> {
+        // Only the bytes before `pos` matter; the checked slice avoids a panic.
+        let prefix = bytes.get(..pos)?;
+        // Count the newlines and track where the last line starts. Without a
+        // newline, `pos` is on the first line, which starts at byte 0.
+        let (line, line_start) = memchr::memchr_iter(b'\n', prefix)
+            .fold((0, 0), |(line, _), i| (line + 1, i + 1));
+        // Each item yielded for the rest of the line counts as one column.
+        let col = ErrorReportingUtf8Chars::new(&prefix[line_start..]).count();
+        Some(LineCol::zero_based(line, col))
+    }
+
     /// Returns the 0-based line/column indices.
     pub fn indices(&self) -> (usize, usize) {
         (self.line, self.col)
diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs
index 53d5e9290..eb3726e7d 100644
--- a/crates/typst-library/src/loading/mod.rs
+++ b/crates/typst-library/src/loading/mod.rs
@@ -145,7 +145,7 @@ impl Loaded {
             let start = err.valid_up_to();
             let end = start + err.error_len().unwrap_or(0);
             // always report this error in the source file.
-            self.err_in_bytes(
+            self.err_in_invalid_text(
                 start..end,
                 "failed to convert to string",
                 FileError::from(err),