feat: try to report line and column in files that contain invalid utf-8

This commit is contained in:
Tobias Schmitz 2025-05-19 19:17:20 +02:00
parent e5d8f02554
commit 23f1c86b84
No known key found for this signature in database
5 changed files with 43 additions and 6 deletions

1
Cargo.lock generated
View File

@ -3112,6 +3112,7 @@ dependencies = [
"unicode-segmentation",
"unscanny",
"usvg",
"utf8_iter",
"wasmi",
"xmlwriter",
]

View File

@ -135,6 +135,7 @@ unicode-segmentation = "1"
unscanny = "0.1"
ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] }
usvg = { version = "0.45", default-features = false, features = ["text"] }
utf8_iter = "1.0.4"
walkdir = "2"
wasmi = "0.40.0"
web-sys = "0.3"

View File

@ -66,6 +66,7 @@ unicode-normalization = { workspace = true }
unicode-segmentation = { workspace = true }
unscanny = { workspace = true }
usvg = { workspace = true }
utf8_iter = { workspace = true }
wasmi = { workspace = true }
xmlwriter = { workspace = true }

View File

@ -10,6 +10,7 @@ use comemo::Tracked;
use ecow::{eco_vec, EcoVec};
use typst_syntax::package::{PackageSpec, PackageVersion};
use typst_syntax::{Lines, Span, Spanned, SyntaxError};
use utf8_iter::ErrorReportingUtf8Chars;
use crate::engine::Engine;
use crate::loading::{LoadSource, Loaded};
@ -577,12 +578,12 @@ impl Loaded {
msg: impl std::fmt::Display,
error: impl std::fmt::Display,
) -> EcoVec<SourceDiagnostic> {
let pos = pos.into();
let lines = Lines::from_bytes(&self.bytes);
match (self.source.v, lines) {
// Only report an error in an external file,
// if it is human readable (valid utf-8).
(LoadSource::Path(file_id), Ok(lines)) => {
let pos = pos.into();
if let Some(range) = pos.range(&lines) {
let span = Span::from_range(file_id, range);
return eco_vec!(error!(span, "{msg} ({error})"));
@ -600,20 +601,28 @@ impl Loaded {
};
eco_vec![error]
}
_ => self.err_in_bytes(pos, msg, error),
(_, Ok(lines)) => {
let error = if let Some(pair) = pos.line_col(&lines) {
let (line, col) = pair.numbers();
error!(self.source.span, "{msg} ({error} at {line}:{col})")
} else {
error!(self.source.span, "{msg} ({error})")
};
eco_vec![error]
}
_ => self.err_in_invalid_text(pos, msg, error),
}
}
/// Report an error, possibly in an external file.
pub fn err_in_bytes(
pub fn err_in_invalid_text(
&self,
pos: impl Into<ReportPos>,
msg: impl std::fmt::Display,
error: impl std::fmt::Display,
) -> EcoVec<SourceDiagnostic> {
let pos = pos.into();
let result = Lines::from_bytes(&self.bytes).ok().and_then(|l| pos.line_col(&l));
let error = if let Some(pair) = result {
let error = if let Some(pair) = pos.try_line_col(&self.bytes) {
let (line, col) = pair.numbers();
error!(self.source.span, "{msg} ({error} at {line}:{col})")
} else {
@ -671,6 +680,17 @@ impl ReportPos {
ReportPos::None => None,
}
}
/// Returns the stored line/column pair if this position carries one;
/// otherwise tries to derive it from the start of the byte range, using
/// `bytes`, which may contain invalid UTF-8.
fn try_line_col(&self, bytes: &[u8]) -> Option<LineCol> {
    match self {
        // A pair is already at hand; no need to look at the bytes.
        ReportPos::Full(_, pair) | ReportPos::LineCol(pair) => Some(*pair),
        // Only a byte range is known: compute the pair for its start offset.
        ReportPos::Range(range) => LineCol::try_from_byte_pos(range.start, bytes),
        ReportPos::None => None,
    }
}
}
/// A line/column pair.
@ -696,6 +716,20 @@ impl LineCol {
}
}
/// Try to compute a line/column pair from possibly invalid UTF-8 data.
///
/// `pos` is a byte offset into `bytes`. Only the bytes before `pos` are
/// inspected: the line is the number of `\n` bytes among them, and the
/// column is the number of items `ErrorReportingUtf8Chars` yields between
/// the start of that line and `pos`.
///
/// Returns `None` if `pos` lies beyond the end of `bytes`.
pub fn try_from_byte_pos(pos: usize, bytes: &[u8]) -> Option<Self> {
    // Checked slicing: an out-of-range offset yields `None` instead of a panic.
    let prefix = bytes.get(..pos)?;

    // Every newline before `pos` starts a new line.
    let line = memchr::memchr_iter(b'\n', prefix).count();

    // The current line begins right after the last newline, or at the very
    // start of the data while `pos` is still on the first line. (Falling back
    // to the end of the prefix here would always report column 0.)
    let line_start = memchr::memrchr(b'\n', prefix).map_or(0, |i| i + 1);

    let col = ErrorReportingUtf8Chars::new(&prefix[line_start..]).count();
    Some(LineCol::zero_based(line, col))
}
/// Returns the 0-based line/column indices.
pub fn indices(&self) -> (usize, usize) {
(self.line, self.col)

View File

@ -145,7 +145,7 @@ impl Loaded {
let start = err.valid_up_to();
let end = start + err.error_len().unwrap_or(0);
// always report this error in the source file.
self.err_in_bytes(
self.err_in_invalid_text(
start..end,
"failed to convert to string",
FileError::from(err),