diff --git a/Cargo.lock b/Cargo.lock index a9b3756a6..2778ec489 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3112,6 +3112,7 @@ dependencies = [ "unicode-segmentation", "unscanny", "usvg", + "utf8_iter", "wasmi", "xmlwriter", ] diff --git a/Cargo.toml b/Cargo.toml index b4890e3c1..b548245fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,6 +135,7 @@ unicode-segmentation = "1" unscanny = "0.1" ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] } usvg = { version = "0.45", default-features = false, features = ["text"] } +utf8_iter = "1.0.4" walkdir = "2" wasmi = "0.40.0" web-sys = "0.3" diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml index b210637a8..f4b219882 100644 --- a/crates/typst-library/Cargo.toml +++ b/crates/typst-library/Cargo.toml @@ -66,6 +66,7 @@ unicode-normalization = { workspace = true } unicode-segmentation = { workspace = true } unscanny = { workspace = true } usvg = { workspace = true } +utf8_iter = { workspace = true } wasmi = { workspace = true } xmlwriter = { workspace = true } diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs index 580a27294..c8151ab75 100644 --- a/crates/typst-library/src/loading/mod.rs +++ b/crates/typst-library/src/loading/mod.rs @@ -18,6 +18,7 @@ mod yaml_; use comemo::Tracked; use ecow::{eco_vec, EcoString, EcoVec}; use typst_syntax::{FileId, Span, Spanned}; +use utf8_iter::ErrorReportingUtf8Chars; pub use self::cbor_::*; pub use self::csv_::*; @@ -260,8 +261,6 @@ impl LineCol { } } - // TODO: this function should only return None if the position is out of - // bounds not if there is invalid utf-8 pub fn from_byte_pos(pos: usize, bytes: &[u8]) -> Option { let bytes = &bytes[..pos]; let mut line = 0; @@ -270,10 +269,9 @@ impl LineCol { .last() .map(|i| i + 1) .unwrap_or(bytes.len()); - // TODO: streaming-utf8 decoding ignore invalid characters - // might neeed to update error reporting too (use utf8_iter) - let str = std::str::from_utf8(&bytes[line_start..]).ok()?; - let col = str.chars().count(); + + // Try to compute a column even if the string isn't valid utf-8. + let col = ErrorReportingUtf8Chars::new(&bytes[line_start..]).count(); Some(LineCol::zero_based(line, col)) } @@ -319,18 +317,15 @@ impl LineCol { } } -// TODO: this function should only return None if the position is out of -// bounds not if there is invalid utf-8 fn col_offset(line_offset: usize, col: usize, bytes: &[u8]) -> Option { let line = &bytes[line_offset..]; // TODO: streaming-utf8 decoding ignore invalid characters // might neeed to update error reporting too (use utf8_iter) - - // validate the whole line, so it can be displayed - let len = memchr::memchr(b'\n', line).unwrap_or(line.len()); - let str = std::str::from_utf8(&line[..len]).ok()?; if let Some(idx) = col.checked_sub(1) { - str.char_indices().nth(idx).map(|(i, c)| i + c.len_utf8()) + // Try to compute position even if the string isn't valid utf-8. + let mut iter = ErrorReportingUtf8Chars::new(line); + _ = iter.nth(idx)?; + Some(line.len() - iter.as_slice().len()) } else { Some(0) }