experiment with utf8_iter

This commit is contained in:
Tobias Schmitz 2025-05-19 11:14:27 +02:00
parent a79120b668
commit 3879d0826a
No known key found for this signature in database
4 changed files with 11 additions and 13 deletions

1
Cargo.lock generated
View File

@ -3112,6 +3112,7 @@ dependencies = [
"unicode-segmentation",
"unscanny",
"usvg",
"utf8_iter",
"wasmi",
"xmlwriter",
]

View File

@ -135,6 +135,7 @@ unicode-segmentation = "1"
unscanny = "0.1"
ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] }
usvg = { version = "0.45", default-features = false, features = ["text"] }
utf8_iter = "1.0.4"
walkdir = "2"
wasmi = "0.40.0"
web-sys = "0.3"

View File

@ -66,6 +66,7 @@ unicode-normalization = { workspace = true }
unicode-segmentation = { workspace = true }
unscanny = { workspace = true }
usvg = { workspace = true }
utf8_iter = { workspace = true }
wasmi = { workspace = true }
xmlwriter = { workspace = true }

View File

@ -18,6 +18,7 @@ mod yaml_;
use comemo::Tracked;
use ecow::{eco_vec, EcoString, EcoVec};
use typst_syntax::{FileId, Span, Spanned};
use utf8_iter::ErrorReportingUtf8Chars;
pub use self::cbor_::*;
pub use self::csv_::*;
@ -260,8 +261,6 @@ impl LineCol {
}
}
// TODO: this function should only return None if the position is out of
// bounds, not if there is invalid UTF-8
pub fn from_byte_pos(pos: usize, bytes: &[u8]) -> Option<Self> {
let bytes = &bytes[..pos];
let mut line = 0;
@ -270,10 +269,9 @@ impl LineCol {
.last()
.map(|i| i + 1)
.unwrap_or(bytes.len());
// TODO: use streaming UTF-8 decoding that ignores invalid characters;
// might need to update error reporting too (use utf8_iter)
let str = std::str::from_utf8(&bytes[line_start..]).ok()?;
let col = str.chars().count();
// Try to compute a column even if the string isn't valid utf-8.
let col = ErrorReportingUtf8Chars::new(&bytes[line_start..]).count();
Some(LineCol::zero_based(line, col))
}
@ -319,18 +317,15 @@ impl LineCol {
}
}
// TODO: this function should only return None if the position is out of
// bounds, not if there is invalid UTF-8
fn col_offset(line_offset: usize, col: usize, bytes: &[u8]) -> Option<usize> {
let line = &bytes[line_offset..];
// TODO: use streaming UTF-8 decoding that ignores invalid characters;
// might need to update error reporting too (use utf8_iter)
// validate the whole line, so it can be displayed
let len = memchr::memchr(b'\n', line).unwrap_or(line.len());
let str = std::str::from_utf8(&line[..len]).ok()?;
if let Some(idx) = col.checked_sub(1) {
str.char_indices().nth(idx).map(|(i, c)| i + c.len_utf8())
// Try to compute position even if the string isn't valid utf-8.
let mut iter = ErrorReportingUtf8Chars::new(line);
_ = iter.nth(idx)?;
Some(line.len() - iter.as_slice().len())
} else {
Some(0)
}