From a18ca3481da17a4de1cc7f9890f0c61efb480655 Mon Sep 17 00:00:00 2001 From: Tobias Schmitz Date: Tue, 10 Jun 2025 14:46:27 +0200 Subject: [PATCH] Report errors in external files (#6308) Co-authored-by: Laurenz --- Cargo.lock | 3 + Cargo.toml | 1 + crates/typst-cli/src/compile.rs | 4 +- crates/typst-cli/src/timings.rs | 2 +- crates/typst-cli/src/world.rs | 23 +- crates/typst-layout/Cargo.toml | 1 + crates/typst-layout/src/image.rs | 14 +- crates/typst-library/Cargo.toml | 1 + crates/typst-library/src/diag.rs | 303 ++++++++++++- crates/typst-library/src/foundations/bytes.rs | 11 + .../typst-library/src/foundations/plugin.rs | 4 +- crates/typst-library/src/loading/cbor.rs | 4 +- crates/typst-library/src/loading/csv.rs | 38 +- crates/typst-library/src/loading/json.rs | 13 +- crates/typst-library/src/loading/mod.rs | 49 ++- crates/typst-library/src/loading/read.rs | 15 +- crates/typst-library/src/loading/toml.rs | 28 +- crates/typst-library/src/loading/xml.rs | 11 +- crates/typst-library/src/loading/yaml.rs | 23 +- .../typst-library/src/model/bibliography.rs | 119 +++--- crates/typst-library/src/text/raw.rs | 81 ++-- .../typst-library/src/visualize/image/mod.rs | 20 +- .../typst-library/src/visualize/image/svg.rs | 26 +- crates/typst-syntax/Cargo.toml | 1 + crates/typst-syntax/src/lib.rs | 2 + crates/typst-syntax/src/lines.rs | 402 ++++++++++++++++++ crates/typst-syntax/src/source.rs | 326 +------------- tests/src/collect.rs | 98 ++++- tests/src/run.rs | 74 ++-- tests/src/world.rs | 21 +- tests/suite/loading/csv.typ | 4 +- tests/suite/loading/json.typ | 2 +- tests/suite/loading/read.typ | 2 +- tests/suite/loading/toml.typ | 2 +- tests/suite/loading/xml.typ | 2 +- tests/suite/loading/yaml.typ | 2 +- tests/suite/scripting/import.typ | 1 + tests/suite/visualize/image.typ | 4 +- 38 files changed, 1165 insertions(+), 572 deletions(-) create mode 100644 crates/typst-syntax/src/lines.rs diff --git a/Cargo.lock b/Cargo.lock index a9b3756a6..b699d2450 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3039,6 +3039,7 @@ dependencies = [ "icu_provider_blob", "icu_segmenter", "kurbo", + "memchr", "rustybuzz", "smallvec", "ttf-parser", @@ -3112,6 +3113,7 @@ dependencies = [ "unicode-segmentation", "unscanny", "usvg", + "utf8_iter", "wasmi", "xmlwriter", ] @@ -3200,6 +3202,7 @@ dependencies = [ name = "typst-syntax" version = "0.13.1" dependencies = [ + "comemo", "ecow", "serde", "toml", diff --git a/Cargo.toml b/Cargo.toml index b4890e3c1..b548245fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,6 +135,7 @@ unicode-segmentation = "1" unscanny = "0.1" ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] } usvg = { version = "0.45", default-features = false, features = ["text"] } +utf8_iter = "1.0.4" walkdir = "2" wasmi = "0.40.0" web-sys = "0.3" diff --git a/crates/typst-cli/src/compile.rs b/crates/typst-cli/src/compile.rs index 4edb4c323..207bb7d09 100644 --- a/crates/typst-cli/src/compile.rs +++ b/crates/typst-cli/src/compile.rs @@ -16,7 +16,7 @@ use typst::diag::{ use typst::foundations::{Datetime, Smart}; use typst::html::HtmlDocument; use typst::layout::{Frame, Page, PageRanges, PagedDocument}; -use typst::syntax::{FileId, Source, Span}; +use typst::syntax::{FileId, Lines, Span}; use typst::WorldExt; use typst_pdf::{PdfOptions, PdfStandards, Timestamp}; @@ -696,7 +696,7 @@ fn label(world: &SystemWorld, span: Span) -> Option> { impl<'a> codespan_reporting::files::Files<'a> for SystemWorld { type FileId = FileId; type Name = String; - type Source = Source; + type Source = Lines; fn name(&'a self, id: FileId) -> CodespanResult { let vpath = id.vpath(); diff --git a/crates/typst-cli/src/timings.rs b/crates/typst-cli/src/timings.rs index 9f017dc12..3d10bbc67 100644 --- a/crates/typst-cli/src/timings.rs +++ b/crates/typst-cli/src/timings.rs @@ -85,6 +85,6 @@ fn resolve_span(world: &SystemWorld, span: Span) -> Option<(String, u32)> { let id = span.id()?; let source = world.source(id).ok()?; let range = source.range(span)?; - let line = source.byte_to_line(range.start)?; + let line = source.lines().byte_to_line(range.start)?; Some((format!("{id:?}"), line as u32 + 1)) } diff --git a/crates/typst-cli/src/world.rs b/crates/typst-cli/src/world.rs index 2da03d4d5..f63d34b63 100644 --- a/crates/typst-cli/src/world.rs +++ b/crates/typst-cli/src/world.rs @@ -9,7 +9,7 @@ use ecow::{eco_format, EcoString}; use parking_lot::Mutex; use typst::diag::{FileError, FileResult}; use typst::foundations::{Bytes, Datetime, Dict, IntoValue}; -use typst::syntax::{FileId, Source, VirtualPath}; +use typst::syntax::{FileId, Lines, Source, VirtualPath}; use typst::text::{Font, FontBook}; use typst::utils::LazyHash; use typst::{Library, World}; @@ -181,10 +181,20 @@ impl SystemWorld { } } - /// Lookup a source file by id. + /// Lookup line metadata for a file by id. #[track_caller] - pub fn lookup(&self, id: FileId) -> Source { - self.source(id).expect("file id does not point to any source file") + pub fn lookup(&self, id: FileId) -> Lines { + self.slot(id, |slot| { + if let Some(source) = slot.source.get() { + let source = source.as_ref().expect("file is not valid"); + source.lines() + } else if let Some(bytes) = slot.file.get() { + let bytes = bytes.as_ref().expect("file is not valid"); + Lines::try_from(bytes).expect("file is not valid utf-8") + } else { + panic!("file id does not point to any source file"); + } + }) } } @@ -339,6 +349,11 @@ impl SlotCell { self.accessed = false; } + /// Gets the contents of the cell. + fn get(&self) -> Option<&FileResult> { + self.data.as_ref() + } + /// Gets the contents of the cell or initialize them. fn get_or_init( &mut self, diff --git a/crates/typst-layout/Cargo.toml b/crates/typst-layout/Cargo.toml index 438e09e43..cc355a3db 100644 --- a/crates/typst-layout/Cargo.toml +++ b/crates/typst-layout/Cargo.toml @@ -30,6 +30,7 @@ icu_provider_adapters = { workspace = true } icu_provider_blob = { workspace = true } icu_segmenter = { workspace = true } kurbo = { workspace = true } +memchr = { workspace = true } rustybuzz = { workspace = true } smallvec = { workspace = true } ttf-parser = { workspace = true } diff --git a/crates/typst-layout/src/image.rs b/crates/typst-layout/src/image.rs index 8136a25a3..a8f4a0c81 100644 --- a/crates/typst-layout/src/image.rs +++ b/crates/typst-layout/src/image.rs @@ -1,6 +1,6 @@ use std::ffi::OsStr; -use typst_library::diag::{warning, At, SourceResult, StrResult}; +use typst_library::diag::{warning, At, LoadedWithin, SourceResult, StrResult}; use typst_library::engine::Engine; use typst_library::foundations::{Bytes, Derived, Packed, Smart, StyleChain}; use typst_library::introspection::Locator; @@ -27,17 +27,17 @@ pub fn layout_image( // Take the format that was explicitly defined, or parse the extension, // or try to detect the format. - let Derived { source, derived: data } = &elem.source; + let Derived { source, derived: loaded } = &elem.source; let format = match elem.format(styles) { Smart::Custom(v) => v, - Smart::Auto => determine_format(source, data).at(span)?, + Smart::Auto => determine_format(source, &loaded.data).at(span)?, }; // Warn the user if the image contains a foreign object. Not perfect // because the svg could also be encoded, but that's an edge case. if format == ImageFormat::Vector(VectorFormat::Svg) { let has_foreign_object = - data.as_str().is_ok_and(|s| s.contains(" ImageKind::Raster( RasterImage::new( - data.clone(), + loaded.data.clone(), format, elem.icc(styles).as_ref().map(|icc| icc.derived.clone()), ) @@ -61,11 +61,11 @@ pub fn layout_image( ), ImageFormat::Vector(VectorFormat::Svg) => ImageKind::Svg( SvgImage::with_fonts( - data.clone(), + loaded.data.clone(), engine.world, &families(styles).map(|f| f.as_str()).collect::>(), ) - .at(span)?, + .within(loaded)?, ), }; diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml index b210637a8..f4b219882 100644 --- a/crates/typst-library/Cargo.toml +++ b/crates/typst-library/Cargo.toml @@ -66,6 +66,7 @@ unicode-normalization = { workspace = true } unicode-segmentation = { workspace = true } unscanny = { workspace = true } usvg = { workspace = true } +utf8_iter = { workspace = true } wasmi = { workspace = true } xmlwriter = { workspace = true } diff --git a/crates/typst-library/src/diag.rs b/crates/typst-library/src/diag.rs index 49cbd02c6..41b92ed65 100644 --- a/crates/typst-library/src/diag.rs +++ b/crates/typst-library/src/diag.rs @@ -1,17 +1,20 @@ //! Diagnostics. -use std::fmt::{self, Display, Formatter}; +use std::fmt::{self, Display, Formatter, Write as _}; use std::io; use std::path::{Path, PathBuf}; use std::str::Utf8Error; use std::string::FromUtf8Error; +use az::SaturatingAs; use comemo::Tracked; use ecow::{eco_vec, EcoVec}; use typst_syntax::package::{PackageSpec, PackageVersion}; -use typst_syntax::{Span, Spanned, SyntaxError}; +use typst_syntax::{Lines, Span, Spanned, SyntaxError}; +use utf8_iter::ErrorReportingUtf8Chars; use crate::engine::Engine; +use crate::loading::{LoadSource, Loaded}; use crate::{World, WorldExt}; /// Early-return with a [`StrResult`] or [`SourceResult`]. @@ -148,7 +151,7 @@ pub struct Warned { pub warnings: EcoVec, } -/// An error or warning in a source file. +/// An error or warning in a source or text file. /// /// The contained spans will only be detached if any of the input source files /// were detached. @@ -568,31 +571,287 @@ impl From for EcoString { } } +/// A result type with a data-loading-related error. +pub type LoadResult = Result; + +/// A call site independent error that occurred during data loading. This avoids +/// polluting the memoization with [`Span`]s and [`FileId`]s from source files. +/// Can be turned into a [`SourceDiagnostic`] using the [`LoadedWithin::within`] +/// method available on [`LoadResult`]. +/// +/// [`FileId`]: typst_syntax::FileId +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct LoadError { + /// The position in the file at which the error occured. + pos: ReportPos, + /// Must contain a message formatted like this: `"failed to do thing (cause)"`. + message: EcoString, +} + +impl LoadError { + /// Creates a new error from a position in a file, a base message + /// (e.g. `failed to parse JSON`) and a concrete error (e.g. `invalid + /// number`) + pub fn new( + pos: impl Into, + message: impl std::fmt::Display, + error: impl std::fmt::Display, + ) -> Self { + Self { + pos: pos.into(), + message: eco_format!("{message} ({error})"), + } + } +} + +impl From for LoadError { + fn from(err: Utf8Error) -> Self { + let start = err.valid_up_to(); + let end = start + err.error_len().unwrap_or(0); + LoadError::new( + start..end, + "failed to convert to string", + "file is not valid utf-8", + ) + } +} + +/// Convert a [`LoadResult`] to a [`SourceResult`] by adding the [`Loaded`] +/// context. +pub trait LoadedWithin { + /// Report an error, possibly in an external file. + fn within(self, loaded: &Loaded) -> SourceResult; +} + +impl LoadedWithin for Result +where + E: Into, +{ + fn within(self, loaded: &Loaded) -> SourceResult { + self.map_err(|err| { + let LoadError { pos, message } = err.into(); + load_err_in_text(loaded, pos, message) + }) + } +} + +/// Report an error, possibly in an external file. This will delegate to +/// [`load_err_in_invalid_text`] if the data isn't valid utf-8. +fn load_err_in_text( + loaded: &Loaded, + pos: impl Into, + mut message: EcoString, +) -> EcoVec { + let pos = pos.into(); + // This also does utf-8 validation. Only report an error in an external + // file if it is human readable (valid utf-8), otherwise fall back to + // `load_err_in_invalid_text`. + let lines = Lines::try_from(&loaded.data); + match (loaded.source.v, lines) { + (LoadSource::Path(file_id), Ok(lines)) => { + if let Some(range) = pos.range(&lines) { + let span = Span::from_range(file_id, range); + return eco_vec![SourceDiagnostic::error(span, message)]; + } + + // Either `ReportPos::None` was provided, or resolving the range + // from the line/column failed. If present report the possibly + // wrong line/column in the error message anyway. + let span = Span::from_range(file_id, 0..loaded.data.len()); + if let Some(pair) = pos.line_col(&lines) { + message.pop(); + let (line, col) = pair.numbers(); + write!(&mut message, " at {line}:{col})").ok(); + } + eco_vec![SourceDiagnostic::error(span, message)] + } + (LoadSource::Bytes, Ok(lines)) => { + if let Some(pair) = pos.line_col(&lines) { + message.pop(); + let (line, col) = pair.numbers(); + write!(&mut message, " at {line}:{col})").ok(); + } + eco_vec![SourceDiagnostic::error(loaded.source.span, message)] + } + _ => load_err_in_invalid_text(loaded, pos, message), + } +} + +/// Report an error (possibly from an external file) that isn't valid utf-8. +fn load_err_in_invalid_text( + loaded: &Loaded, + pos: impl Into, + mut message: EcoString, +) -> EcoVec { + let line_col = pos.into().try_line_col(&loaded.data).map(|p| p.numbers()); + match (loaded.source.v, line_col) { + (LoadSource::Path(file), _) => { + message.pop(); + if let Some(package) = file.package() { + write!( + &mut message, + " in {package}{}", + file.vpath().as_rooted_path().display() + ) + .ok(); + } else { + write!(&mut message, " in {}", file.vpath().as_rootless_path().display()) + .ok(); + }; + if let Some((line, col)) = line_col { + write!(&mut message, ":{line}:{col}").ok(); + } + message.push(')'); + } + (LoadSource::Bytes, Some((line, col))) => { + message.pop(); + write!(&mut message, " at {line}:{col})").ok(); + } + (LoadSource::Bytes, None) => (), + } + eco_vec![SourceDiagnostic::error(loaded.source.span, message)] +} + +/// A position at which an error was reported. +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub enum ReportPos { + /// Contains a range, and a line/column pair. + Full(std::ops::Range, LineCol), + /// Contains a range. + Range(std::ops::Range), + /// Contains a line/column pair. + LineCol(LineCol), + #[default] + None, +} + +impl From> for ReportPos { + fn from(value: std::ops::Range) -> Self { + Self::Range(value.start.saturating_as()..value.end.saturating_as()) + } +} + +impl From for ReportPos { + fn from(value: LineCol) -> Self { + Self::LineCol(value) + } +} + +impl ReportPos { + /// Creates a position from a pre-existing range and line-column pair. + pub fn full(range: std::ops::Range, pair: LineCol) -> Self { + let range = range.start.saturating_as()..range.end.saturating_as(); + Self::Full(range, pair) + } + + /// Tries to determine the byte range for this position. + fn range(&self, lines: &Lines) -> Option> { + match self { + ReportPos::Full(range, _) => Some(range.start as usize..range.end as usize), + ReportPos::Range(range) => Some(range.start as usize..range.end as usize), + &ReportPos::LineCol(pair) => { + let i = + lines.line_column_to_byte(pair.line as usize, pair.col as usize)?; + Some(i..i) + } + ReportPos::None => None, + } + } + + /// Tries to determine the line/column for this position. + fn line_col(&self, lines: &Lines) -> Option { + match self { + &ReportPos::Full(_, pair) => Some(pair), + ReportPos::Range(range) => { + let (line, col) = lines.byte_to_line_column(range.start as usize)?; + Some(LineCol::zero_based(line, col)) + } + &ReportPos::LineCol(pair) => Some(pair), + ReportPos::None => None, + } + } + + /// Either gets the line/column pair, or tries to compute it from possibly + /// invalid utf-8 data. + fn try_line_col(&self, bytes: &[u8]) -> Option { + match self { + &ReportPos::Full(_, pair) => Some(pair), + ReportPos::Range(range) => { + LineCol::try_from_byte_pos(range.start as usize, bytes) + } + &ReportPos::LineCol(pair) => Some(pair), + ReportPos::None => None, + } + } +} + +/// A line/column pair. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct LineCol { + /// The 0-based line. + line: u32, + /// The 0-based column. + col: u32, +} + +impl LineCol { + /// Constructs the line/column pair from 0-based indices. + pub fn zero_based(line: usize, col: usize) -> Self { + Self { + line: line.saturating_as(), + col: col.saturating_as(), + } + } + + /// Constructs the line/column pair from 1-based numbers. + pub fn one_based(line: usize, col: usize) -> Self { + Self::zero_based(line.saturating_sub(1), col.saturating_sub(1)) + } + + /// Try to compute a line/column pair from possibly invalid utf-8 data. + pub fn try_from_byte_pos(pos: usize, bytes: &[u8]) -> Option { + let bytes = &bytes[..pos]; + let mut line = 0; + #[allow(clippy::double_ended_iterator_last)] + let line_start = memchr::memchr_iter(b'\n', bytes) + .inspect(|_| line += 1) + .last() + .map(|i| i + 1) + .unwrap_or(bytes.len()); + + let col = ErrorReportingUtf8Chars::new(&bytes[line_start..]).count(); + Some(LineCol::zero_based(line, col)) + } + + /// Returns the 0-based line/column indices. + pub fn indices(&self) -> (usize, usize) { + (self.line as usize, self.col as usize) + } + + /// Returns the 1-based line/column numbers. + pub fn numbers(&self) -> (usize, usize) { + (self.line as usize + 1, self.col as usize + 1) + } +} + /// Format a user-facing error message for an XML-like file format. -pub fn format_xml_like_error(format: &str, error: roxmltree::Error) -> EcoString { - match error { - roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => { - eco_format!( - "failed to parse {format} (found closing tag '{actual}' \ - instead of '{expected}' in line {})", - pos.row - ) +pub fn format_xml_like_error(format: &str, error: roxmltree::Error) -> LoadError { + let pos = LineCol::one_based(error.pos().row as usize, error.pos().col as usize); + let message = match error { + roxmltree::Error::UnexpectedCloseTag(expected, actual, _) => { + eco_format!("failed to parse {format} (found closing tag '{actual}' instead of '{expected}')") } - roxmltree::Error::UnknownEntityReference(entity, pos) => { - eco_format!( - "failed to parse {format} (unknown entity '{entity}' in line {})", - pos.row - ) + roxmltree::Error::UnknownEntityReference(entity, _) => { + eco_format!("failed to parse {format} (unknown entity '{entity}')") } - roxmltree::Error::DuplicatedAttribute(attr, pos) => { - eco_format!( - "failed to parse {format} (duplicate attribute '{attr}' in line {})", - pos.row - ) + roxmltree::Error::DuplicatedAttribute(attr, _) => { + eco_format!("failed to parse {format} (duplicate attribute '{attr}')") } roxmltree::Error::NoRootNode => { eco_format!("failed to parse {format} (missing root node)") } err => eco_format!("failed to parse {format} ({err})"), - } + }; + + LoadError { pos: pos.into(), message } } diff --git a/crates/typst-library/src/foundations/bytes.rs b/crates/typst-library/src/foundations/bytes.rs index d633c99ad..180dcdad5 100644 --- a/crates/typst-library/src/foundations/bytes.rs +++ b/crates/typst-library/src/foundations/bytes.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use ecow::{eco_format, EcoString}; use serde::{Serialize, Serializer}; +use typst_syntax::Lines; use typst_utils::LazyHash; use crate::diag::{bail, StrResult}; @@ -286,6 +287,16 @@ impl Serialize for Bytes { } } +impl TryFrom<&Bytes> for Lines { + type Error = Utf8Error; + + #[comemo::memoize] + fn try_from(value: &Bytes) -> Result, Utf8Error> { + let text = value.as_str()?; + Ok(Lines::new(text.to_string())) + } +} + /// Any type that can back a byte buffer. trait Bytelike: Send + Sync { fn as_bytes(&self) -> &[u8]; diff --git a/crates/typst-library/src/foundations/plugin.rs b/crates/typst-library/src/foundations/plugin.rs index 31f8cd732..a04443bf4 100644 --- a/crates/typst-library/src/foundations/plugin.rs +++ b/crates/typst-library/src/foundations/plugin.rs @@ -151,8 +151,8 @@ pub fn plugin( /// A [path]($syntax/#paths) to a WebAssembly file or raw WebAssembly bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - Plugin::module(data).at(source.span) + let loaded = source.load(engine.world)?; + Plugin::module(loaded.data).at(source.span) } #[scope] diff --git a/crates/typst-library/src/loading/cbor.rs b/crates/typst-library/src/loading/cbor.rs index aa14c5c77..d95f73844 100644 --- a/crates/typst-library/src/loading/cbor.rs +++ b/crates/typst-library/src/loading/cbor.rs @@ -23,8 +23,8 @@ pub fn cbor( /// A [path]($syntax/#paths) to a CBOR file or raw CBOR bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - ciborium::from_reader(data.as_slice()) + let loaded = source.load(engine.world)?; + ciborium::from_reader(loaded.data.as_slice()) .map_err(|err| eco_format!("failed to parse CBOR ({err})")) .at(source.span) } diff --git a/crates/typst-library/src/loading/csv.rs b/crates/typst-library/src/loading/csv.rs index 6afb5baeb..d5b54a06c 100644 --- a/crates/typst-library/src/loading/csv.rs +++ b/crates/typst-library/src/loading/csv.rs @@ -1,7 +1,7 @@ -use ecow::{eco_format, EcoString}; +use az::SaturatingAs; use typst_syntax::Spanned; -use crate::diag::{bail, At, SourceResult}; +use crate::diag::{bail, LineCol, LoadError, LoadedWithin, ReportPos, SourceResult}; use crate::engine::Engine; use crate::foundations::{cast, func, scope, Array, Dict, IntoValue, Type, Value}; use crate::loading::{DataSource, Load, Readable}; @@ -44,7 +44,7 @@ pub fn csv( #[default(RowType::Array)] row_type: RowType, ) -> SourceResult { - let data = source.load(engine.world)?; + let loaded = source.load(engine.world)?; let mut builder = ::csv::ReaderBuilder::new(); let has_headers = row_type == RowType::Dict; @@ -53,7 +53,7 @@ pub fn csv( // Counting lines from 1 by default. let mut line_offset: usize = 1; - let mut reader = builder.from_reader(data.as_slice()); + let mut reader = builder.from_reader(loaded.data.as_slice()); let mut headers: Option<::csv::StringRecord> = None; if has_headers { @@ -62,9 +62,9 @@ pub fn csv( headers = Some( reader .headers() + .cloned() .map_err(|err| format_csv_error(err, 1)) - .at(source.span)? - .clone(), + .within(&loaded)?, ); } @@ -74,7 +74,7 @@ pub fn csv( // incorrect with `has_headers` set to `false`. See issue: // https://github.com/BurntSushi/rust-csv/issues/184 let line = line + line_offset; - let row = result.map_err(|err| format_csv_error(err, line)).at(source.span)?; + let row = result.map_err(|err| format_csv_error(err, line)).within(&loaded)?; let item = if let Some(headers) = &headers { let mut dict = Dict::new(); for (field, value) in headers.iter().zip(&row) { @@ -164,15 +164,23 @@ cast! { } /// Format the user-facing CSV error message. -fn format_csv_error(err: ::csv::Error, line: usize) -> EcoString { +fn format_csv_error(err: ::csv::Error, line: usize) -> LoadError { + let msg = "failed to parse CSV"; + let pos = (err.kind().position()) + .map(|pos| { + let start = pos.byte().saturating_as(); + ReportPos::from(start..start) + }) + .unwrap_or(LineCol::one_based(line, 1).into()); match err.kind() { - ::csv::ErrorKind::Utf8 { .. } => "file is not valid utf-8".into(), - ::csv::ErrorKind::UnequalLengths { expected_len, len, .. } => { - eco_format!( - "failed to parse CSV (found {len} instead of \ - {expected_len} fields in line {line})" - ) + ::csv::ErrorKind::Utf8 { .. } => { + LoadError::new(pos, msg, "file is not valid utf-8") } - _ => eco_format!("failed to parse CSV ({err})"), + ::csv::ErrorKind::UnequalLengths { expected_len, len, .. } => { + let err = + format!("found {len} instead of {expected_len} fields in line {line}"); + LoadError::new(pos, msg, err) + } + _ => LoadError::new(pos, "failed to parse CSV", err), } } diff --git a/crates/typst-library/src/loading/json.rs b/crates/typst-library/src/loading/json.rs index aa908cca4..7d0732ba0 100644 --- a/crates/typst-library/src/loading/json.rs +++ b/crates/typst-library/src/loading/json.rs @@ -1,7 +1,7 @@ use ecow::eco_format; use typst_syntax::Spanned; -use crate::diag::{At, SourceResult}; +use crate::diag::{At, LineCol, LoadError, LoadedWithin, SourceResult}; use crate::engine::Engine; use crate::foundations::{func, scope, Str, Value}; use crate::loading::{DataSource, Load, Readable}; @@ -54,10 +54,13 @@ pub fn json( /// A [path]($syntax/#paths) to a JSON file or raw JSON bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - serde_json::from_slice(data.as_slice()) - .map_err(|err| eco_format!("failed to parse JSON ({err})")) - .at(source.span) + let loaded = source.load(engine.world)?; + serde_json::from_slice(loaded.data.as_slice()) + .map_err(|err| { + let pos = LineCol::one_based(err.line(), err.column()); + LoadError::new(pos, "failed to parse JSON", err) + }) + .within(&loaded) } #[scope] diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs index c57e02888..67f4be834 100644 --- a/crates/typst-library/src/loading/mod.rs +++ b/crates/typst-library/src/loading/mod.rs @@ -17,7 +17,7 @@ mod yaml_; use comemo::Tracked; use ecow::EcoString; -use typst_syntax::Spanned; +use typst_syntax::{FileId, Spanned}; pub use self::cbor_::*; pub use self::csv_::*; @@ -74,39 +74,44 @@ pub trait Load { } impl Load for Spanned { - type Output = Bytes; + type Output = Loaded; - fn load(&self, world: Tracked) -> SourceResult { + fn load(&self, world: Tracked) -> SourceResult { self.as_ref().load(world) } } impl Load for Spanned<&DataSource> { - type Output = Bytes; + type Output = Loaded; - fn load(&self, world: Tracked) -> SourceResult { + fn load(&self, world: Tracked) -> SourceResult { match &self.v { DataSource::Path(path) => { let file_id = self.span.resolve_path(path).at(self.span)?; - world.file(file_id).at(self.span) + let data = world.file(file_id).at(self.span)?; + let source = Spanned::new(LoadSource::Path(file_id), self.span); + Ok(Loaded::new(source, data)) + } + DataSource::Bytes(data) => { + let source = Spanned::new(LoadSource::Bytes, self.span); + Ok(Loaded::new(source, data.clone())) } - DataSource::Bytes(bytes) => Ok(bytes.clone()), } } } impl Load for Spanned> { - type Output = Vec; + type Output = Vec; - fn load(&self, world: Tracked) -> SourceResult> { + fn load(&self, world: Tracked) -> SourceResult { self.as_ref().load(world) } } impl Load for Spanned<&OneOrMultiple> { - type Output = Vec; + type Output = Vec; - fn load(&self, world: Tracked) -> SourceResult> { + fn load(&self, world: Tracked) -> SourceResult { self.v .0 .iter() @@ -115,6 +120,28 @@ impl Load for Spanned<&OneOrMultiple> { } } +/// Data loaded from a [`DataSource`]. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct Loaded { + /// Details about where `data` was loaded from. + pub source: Spanned, + /// The loaded data. + pub data: Bytes, +} + +impl Loaded { + pub fn new(source: Spanned, bytes: Bytes) -> Self { + Self { source, data: bytes } + } +} + +/// A loaded [`DataSource`]. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum LoadSource { + Path(FileId), + Bytes, +} + /// A value that can be read from a file. #[derive(Debug, Clone, PartialEq, Hash)] pub enum Readable { diff --git a/crates/typst-library/src/loading/read.rs b/crates/typst-library/src/loading/read.rs index 32dadc799..91e6e4366 100644 --- a/crates/typst-library/src/loading/read.rs +++ b/crates/typst-library/src/loading/read.rs @@ -1,11 +1,10 @@ use ecow::EcoString; use typst_syntax::Spanned; -use crate::diag::{At, FileError, SourceResult}; +use crate::diag::{LoadedWithin, SourceResult}; use crate::engine::Engine; use crate::foundations::{func, Cast}; -use crate::loading::Readable; -use crate::World; +use crate::loading::{DataSource, Load, Readable}; /// Reads plain text or data from a file. /// @@ -36,14 +35,10 @@ pub fn read( #[default(Some(Encoding::Utf8))] encoding: Option, ) -> SourceResult { - let Spanned { v: path, span } = path; - let id = span.resolve_path(&path).at(span)?; - let data = engine.world.file(id).at(span)?; + let loaded = path.map(DataSource::Path).load(engine.world)?; Ok(match encoding { - None => Readable::Bytes(data), - Some(Encoding::Utf8) => { - Readable::Str(data.to_str().map_err(FileError::from).at(span)?) - } + None => Readable::Bytes(loaded.data), + Some(Encoding::Utf8) => Readable::Str(loaded.data.to_str().within(&loaded)?), }) } diff --git a/crates/typst-library/src/loading/toml.rs b/crates/typst-library/src/loading/toml.rs index f04b2e746..a4252feca 100644 --- a/crates/typst-library/src/loading/toml.rs +++ b/crates/typst-library/src/loading/toml.rs @@ -1,7 +1,7 @@ -use ecow::{eco_format, EcoString}; -use typst_syntax::{is_newline, Spanned}; +use ecow::eco_format; +use typst_syntax::Spanned; -use crate::diag::{At, FileError, SourceResult}; +use crate::diag::{At, LoadError, LoadedWithin, ReportPos, SourceResult}; use crate::engine::Engine; use crate::foundations::{func, scope, Str, Value}; use crate::loading::{DataSource, Load, Readable}; @@ -32,11 +32,9 @@ pub fn toml( /// A [path]($syntax/#paths) to a TOML file or raw TOML bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - let raw = data.as_str().map_err(FileError::from).at(source.span)?; - ::toml::from_str(raw) - .map_err(|err| format_toml_error(err, raw)) - .at(source.span) + let loaded = source.load(engine.world)?; + let raw = loaded.data.as_str().within(&loaded)?; + ::toml::from_str(raw).map_err(format_toml_error).within(&loaded) } #[scope] @@ -71,15 +69,7 @@ impl toml { } /// Format the user-facing TOML error message. -fn format_toml_error(error: ::toml::de::Error, raw: &str) -> EcoString { - if let Some(head) = error.span().and_then(|range| raw.get(..range.start)) { - let line = head.lines().count(); - let column = 1 + head.chars().rev().take_while(|&c| !is_newline(c)).count(); - eco_format!( - "failed to parse TOML ({} at line {line} column {column})", - error.message(), - ) - } else { - eco_format!("failed to parse TOML ({})", error.message()) - } +fn format_toml_error(error: ::toml::de::Error) -> LoadError { + let pos = error.span().map(ReportPos::from).unwrap_or_default(); + LoadError::new(pos, "failed to parse TOML", error.message()) } diff --git a/crates/typst-library/src/loading/xml.rs b/crates/typst-library/src/loading/xml.rs index e76c4e9cf..0023c5df5 100644 --- a/crates/typst-library/src/loading/xml.rs +++ b/crates/typst-library/src/loading/xml.rs @@ -1,8 +1,7 @@ -use ecow::EcoString; use roxmltree::ParsingOptions; use typst_syntax::Spanned; -use crate::diag::{format_xml_like_error, At, FileError, SourceResult}; +use crate::diag::{format_xml_like_error, LoadError, LoadedWithin, SourceResult}; use crate::engine::Engine; use crate::foundations::{dict, func, scope, Array, Dict, IntoValue, Str, Value}; use crate::loading::{DataSource, Load, Readable}; @@ -61,14 +60,14 @@ pub fn xml( /// A [path]($syntax/#paths) to an XML file or raw XML bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - let text = data.as_str().map_err(FileError::from).at(source.span)?; + let loaded = source.load(engine.world)?; + let text = loaded.data.as_str().within(&loaded)?; let document = roxmltree::Document::parse_with_options( text, ParsingOptions { allow_dtd: true, ..Default::default() }, ) .map_err(format_xml_error) - .at(source.span)?; + .within(&loaded)?; Ok(convert_xml(document.root())) } @@ -111,6 +110,6 @@ fn convert_xml(node: roxmltree::Node) -> Value { } /// Format the user-facing XML error message. -fn format_xml_error(error: roxmltree::Error) -> EcoString { +fn format_xml_error(error: roxmltree::Error) -> LoadError { format_xml_like_error("XML", error) } diff --git a/crates/typst-library/src/loading/yaml.rs b/crates/typst-library/src/loading/yaml.rs index 3f48113e8..0edf1f901 100644 --- a/crates/typst-library/src/loading/yaml.rs +++ b/crates/typst-library/src/loading/yaml.rs @@ -1,7 +1,7 @@ use ecow::eco_format; use typst_syntax::Spanned; -use crate::diag::{At, SourceResult}; +use crate::diag::{At, LineCol, LoadError, LoadedWithin, ReportPos, SourceResult}; use crate::engine::Engine; use crate::foundations::{func, scope, Str, Value}; use crate::loading::{DataSource, Load, Readable}; @@ -44,10 +44,10 @@ pub fn yaml( /// A [path]($syntax/#paths) to a YAML file or raw YAML bytes. source: Spanned, ) -> SourceResult { - let data = source.load(engine.world)?; - serde_yaml::from_slice(data.as_slice()) - .map_err(|err| eco_format!("failed to parse YAML ({err})")) - .at(source.span) + let loaded = source.load(engine.world)?; + serde_yaml::from_slice(loaded.data.as_slice()) + .map_err(format_yaml_error) + .within(&loaded) } #[scope] @@ -76,3 +76,16 @@ impl yaml { .at(span) } } + +/// Format the user-facing YAML error message. +pub fn format_yaml_error(error: serde_yaml::Error) -> LoadError { + let pos = error + .location() + .map(|loc| { + let line_col = LineCol::one_based(loc.line(), loc.column()); + let range = loc.index()..loc.index(); + ReportPos::full(range, line_col) + }) + .unwrap_or_default(); + LoadError::new(pos, "failed to parse YAML", error) +} diff --git a/crates/typst-library/src/model/bibliography.rs b/crates/typst-library/src/model/bibliography.rs index 51e3b03b0..114356575 100644 --- a/crates/typst-library/src/model/bibliography.rs +++ b/crates/typst-library/src/model/bibliography.rs @@ -19,7 +19,10 @@ use smallvec::{smallvec, SmallVec}; use typst_syntax::{Span, Spanned}; use typst_utils::{Get, ManuallyHash, NonZeroExt, PicoStr}; -use crate::diag::{bail, error, At, FileError, HintedStrResult, SourceResult, StrResult}; +use crate::diag::{ + bail, error, At, HintedStrResult, LoadError, LoadResult, LoadedWithin, ReportPos, + SourceResult, StrResult, +}; use crate::engine::{Engine, Sink}; use crate::foundations::{ elem, Bytes, CastInfo, Content, Derived, FromValue, IntoValue, Label, NativeElement, @@ -31,7 +34,7 @@ use crate::layout::{ BlockBody, BlockElem, Em, GridCell, GridChild, GridElem, GridItem, HElem, PadElem, Sides, Sizing, TrackSizings, }; -use crate::loading::{DataSource, Load}; +use crate::loading::{format_yaml_error, DataSource, Load, LoadSource, Loaded}; use crate::model::{ CitationForm, CiteGroup, Destination, FootnoteElem, HeadingElem, LinkElem, ParElem, Url, @@ -294,24 +297,21 @@ impl Bibliography { world: Tracked, sources: Spanned>, ) -> SourceResult, Self>> { - let data = sources.load(world)?; - let bibliography = Self::decode(&sources.v, &data).at(sources.span)?; + let loaded = sources.load(world)?; + let bibliography = Self::decode(&loaded)?; Ok(Derived::new(sources.v, bibliography)) } /// Decode a bibliography from loaded data sources. #[comemo::memoize] #[typst_macros::time(name = "load bibliography")] - fn decode( - sources: &OneOrMultiple, - data: &[Bytes], - ) -> StrResult { + fn decode(data: &[Loaded]) -> SourceResult { let mut map = IndexMap::new(); let mut duplicates = Vec::::new(); // We might have multiple bib/yaml files - for (source, data) in sources.0.iter().zip(data) { - let library = decode_library(source, data)?; + for d in data.iter() { + let library = decode_library(d)?; for entry in library { match map.entry(Label::new(PicoStr::intern(entry.key()))) { indexmap::map::Entry::Vacant(vacant) => { @@ -325,7 +325,11 @@ impl Bibliography { } if !duplicates.is_empty() { - bail!("duplicate bibliography keys: {}", duplicates.join(", ")); + // TODO: Store spans of entries for duplicate key error messages. + // Requires hayagriva entries to store their location, which should + // be fine, since they are 1kb anyway. + let span = data.first().unwrap().source.span; + bail!(span, "duplicate bibliography keys: {}", duplicates.join(", ")); } Ok(Bibliography(Arc::new(ManuallyHash::new(map, typst_utils::hash128(data))))) @@ -351,36 +355,47 @@ impl Debug for Bibliography { } /// Decode on library from one data source. -fn decode_library(source: &DataSource, data: &Bytes) -> StrResult { - let src = data.as_str().map_err(FileError::from)?; +fn decode_library(loaded: &Loaded) -> SourceResult { + let data = loaded.data.as_str().within(loaded)?; - if let DataSource::Path(path) = source { + if let LoadSource::Path(file_id) = loaded.source.v { // If we got a path, use the extension to determine whether it is // YAML or BibLaTeX. - let ext = Path::new(path.as_str()) + let ext = file_id + .vpath() + .as_rooted_path() .extension() .and_then(OsStr::to_str) .unwrap_or_default(); match ext.to_lowercase().as_str() { - "yml" | "yaml" => hayagriva::io::from_yaml_str(src) - .map_err(|err| eco_format!("failed to parse YAML ({err})")), - "bib" => hayagriva::io::from_biblatex_str(src) - .map_err(|errors| format_biblatex_error(src, Some(path), errors)), - _ => bail!("unknown bibliography format (must be .yml/.yaml or .bib)"), + "yml" | "yaml" => hayagriva::io::from_yaml_str(data) + .map_err(format_yaml_error) + .within(loaded), + "bib" => hayagriva::io::from_biblatex_str(data) + .map_err(format_biblatex_error) + .within(loaded), + _ => bail!( + loaded.source.span, + "unknown bibliography format (must be .yml/.yaml or .bib)" + ), } } else { // If we just got bytes, we need to guess. If it can be decoded as // hayagriva YAML, we'll use that. - let haya_err = match hayagriva::io::from_yaml_str(src) { + let haya_err = match hayagriva::io::from_yaml_str(data) { Ok(library) => return Ok(library), Err(err) => err, }; // If it can be decoded as BibLaTeX, we use that isntead. - let bib_errs = match hayagriva::io::from_biblatex_str(src) { - Ok(library) => return Ok(library), - Err(err) => err, + let bib_errs = match hayagriva::io::from_biblatex_str(data) { + // If the file is almost valid yaml, but contains no `@` character + // it will be successfully parsed as an empty BibLaTeX library, + // since BibLaTeX does support arbitrary text outside of entries. + Ok(library) if !library.is_empty() => return Ok(library), + Ok(_) => None, + Err(err) => Some(err), }; // If neither decoded correctly, check whether `:` or `{` appears @@ -388,7 +403,7 @@ fn decode_library(source: &DataSource, data: &Bytes) -> StrResult { // and emit the more appropriate error. let mut yaml = 0; let mut biblatex = 0; - for c in src.chars() { + for c in data.chars() { match c { ':' => yaml += 1, '{' => biblatex += 1, @@ -396,37 +411,33 @@ fn decode_library(source: &DataSource, data: &Bytes) -> StrResult { } } - if yaml > biblatex { - bail!("failed to parse YAML ({haya_err})") - } else { - Err(format_biblatex_error(src, None, bib_errs)) + match bib_errs { + Some(bib_errs) if biblatex >= yaml => { + Err(format_biblatex_error(bib_errs)).within(loaded) + } + _ => Err(format_yaml_error(haya_err)).within(loaded), } } } /// Format a BibLaTeX loading error. -fn format_biblatex_error( - src: &str, - path: Option<&str>, - errors: Vec, -) -> EcoString { - let Some(error) = errors.first() else { - return match path { - Some(path) => eco_format!("failed to parse BibLaTeX file ({path})"), - None => eco_format!("failed to parse BibLaTeX"), - }; +fn format_biblatex_error(errors: Vec) -> LoadError { + // TODO: return multiple errors? + let Some(error) = errors.into_iter().next() else { + // TODO: can this even happen, should we just unwrap? + return LoadError::new( + ReportPos::None, + "failed to parse BibLaTeX", + "something went wrong", + ); }; - let (span, msg) = match error { - BibLaTeXError::Parse(error) => (&error.span, error.kind.to_string()), - BibLaTeXError::Type(error) => (&error.span, error.kind.to_string()), + let (range, msg) = match error { + BibLaTeXError::Parse(error) => (error.span, error.kind.to_string()), + BibLaTeXError::Type(error) => (error.span, error.kind.to_string()), }; - let line = src.get(..span.start).unwrap_or_default().lines().count(); - match path { - Some(path) => eco_format!("failed to parse BibLaTeX file ({path}:{line}: {msg})"), - None => eco_format!("failed to parse BibLaTeX ({line}: {msg})"), - } + LoadError::new(range, "failed to parse BibLaTeX", msg) } /// A loaded CSL style. @@ -442,8 +453,8 @@ impl CslStyle { let style = match &source { CslSource::Named(style) => Self::from_archived(*style), CslSource::Normal(source) => { - let data = Spanned::new(source, span).load(world)?; - Self::from_data(data).at(span)? + let loaded = Spanned::new(source, span).load(world)?; + Self::from_data(&loaded.data).within(&loaded)? } }; Ok(Derived::new(source, style)) @@ -464,16 +475,18 @@ impl CslStyle { /// Load a CSL style from file contents. #[comemo::memoize] - pub fn from_data(data: Bytes) -> StrResult { - let text = data.as_str().map_err(FileError::from)?; + pub fn from_data(bytes: &Bytes) -> LoadResult { + let text = bytes.as_str()?; citationberg::IndependentStyle::from_xml(text) .map(|style| { Self(Arc::new(ManuallyHash::new( style, - typst_utils::hash128(&(TypeId::of::(), data)), + typst_utils::hash128(&(TypeId::of::(), bytes)), ))) }) - .map_err(|err| eco_format!("failed to load CSL style ({err})")) + .map_err(|err| { + LoadError::new(ReportPos::None, "failed to load CSL style", err) + }) } /// Get the underlying independent style. diff --git a/crates/typst-library/src/text/raw.rs b/crates/typst-library/src/text/raw.rs index d5c07424d..f2485e16b 100644 --- a/crates/typst-library/src/text/raw.rs +++ b/crates/typst-library/src/text/raw.rs @@ -3,15 +3,17 @@ use std::ops::Range; use std::sync::{Arc, LazyLock}; use comemo::Tracked; -use ecow::{eco_format, EcoString, EcoVec}; -use syntect::highlighting as synt; -use syntect::parsing::{SyntaxDefinition, SyntaxSet, SyntaxSetBuilder}; +use ecow::{EcoString, EcoVec}; +use syntect::highlighting::{self as synt}; +use syntect::parsing::{ParseSyntaxError, SyntaxDefinition, SyntaxSet, SyntaxSetBuilder}; use typst_syntax::{split_newlines, LinkedNode, Span, Spanned}; use typst_utils::ManuallyHash; use unicode_segmentation::UnicodeSegmentation; use super::Lang; -use crate::diag::{At, FileError, SourceResult, StrResult}; +use crate::diag::{ + LineCol, LoadError, LoadResult, LoadedWithin, ReportPos, SourceResult, +}; use crate::engine::Engine; use crate::foundations::{ cast, elem, scope, Bytes, Content, Derived, NativeElement, OneOrMultiple, Packed, @@ -539,40 +541,29 @@ impl RawSyntax { world: Tracked, sources: Spanned>, ) -> SourceResult, Vec>> { - let data = sources.load(world)?; - let list = sources - .v - .0 + let loaded = sources.load(world)?; + let list = loaded .iter() - .zip(&data) - .map(|(source, data)| Self::decode(source, data)) - .collect::>() - .at(sources.span)?; + .map(|data| Self::decode(&data.data).within(data)) + .collect::>()?; Ok(Derived::new(sources.v, list)) } /// Decode a syntax from a loaded source. #[comemo::memoize] #[typst_macros::time(name = "load syntaxes")] - fn decode(source: &DataSource, data: &Bytes) -> StrResult { - let src = data.as_str().map_err(FileError::from)?; - let syntax = SyntaxDefinition::load_from_str(src, false, None).map_err( - |err| match source { - DataSource::Path(path) => { - eco_format!("failed to parse syntax file `{path}` ({err})") - } - DataSource::Bytes(_) => { - eco_format!("failed to parse syntax ({err})") - } - }, - )?; + fn decode(bytes: &Bytes) -> LoadResult { + let str = bytes.as_str()?; + + let syntax = SyntaxDefinition::load_from_str(str, false, None) + .map_err(format_syntax_error)?; let mut builder = SyntaxSetBuilder::new(); builder.add(syntax); Ok(RawSyntax(Arc::new(ManuallyHash::new( builder.build(), - typst_utils::hash128(data), + typst_utils::hash128(bytes), )))) } @@ -582,6 +573,24 @@ impl RawSyntax { } } +fn format_syntax_error(error: ParseSyntaxError) -> LoadError { + let pos = syntax_error_pos(&error); + LoadError::new(pos, "failed to parse syntax", error) +} + +fn syntax_error_pos(error: &ParseSyntaxError) -> ReportPos { + match error { + ParseSyntaxError::InvalidYaml(scan_error) => { + let m = scan_error.marker(); + ReportPos::full( + m.index()..m.index(), + LineCol::one_based(m.line(), m.col() + 1), + ) + } + _ => ReportPos::None, + } +} + /// A loaded syntect theme. #[derive(Debug, Clone, PartialEq, Hash)] pub struct RawTheme(Arc>); @@ -592,18 +601,18 @@ impl RawTheme { world: Tracked, source: Spanned, ) -> SourceResult> { - let data = source.load(world)?; - let theme = Self::decode(&data).at(source.span)?; + let loaded = source.load(world)?; + let theme = Self::decode(&loaded.data).within(&loaded)?; Ok(Derived::new(source.v, theme)) } /// Decode a theme from bytes. #[comemo::memoize] - fn decode(data: &Bytes) -> StrResult { - let mut cursor = std::io::Cursor::new(data.as_slice()); - let theme = synt::ThemeSet::load_from_reader(&mut cursor) - .map_err(|err| eco_format!("failed to parse theme ({err})"))?; - Ok(RawTheme(Arc::new(ManuallyHash::new(theme, typst_utils::hash128(data))))) + fn decode(bytes: &Bytes) -> LoadResult { + let mut cursor = std::io::Cursor::new(bytes.as_slice()); + let theme = + synt::ThemeSet::load_from_reader(&mut cursor).map_err(format_theme_error)?; + Ok(RawTheme(Arc::new(ManuallyHash::new(theme, typst_utils::hash128(bytes))))) } /// Get the underlying syntect theme. @@ -612,6 +621,14 @@ impl RawTheme { } } +fn format_theme_error(error: syntect::LoadingError) -> LoadError { + let pos = match &error { + syntect::LoadingError::ParseSyntax(err, _) => syntax_error_pos(err), + _ => ReportPos::None, + }; + LoadError::new(pos, "failed to parse theme", error) +} + /// A highlighted line of raw text. /// /// This is a helper element that is synthesized by [`raw`] elements. diff --git a/crates/typst-library/src/visualize/image/mod.rs b/crates/typst-library/src/visualize/image/mod.rs index f9e345e70..f5109798b 100644 --- a/crates/typst-library/src/visualize/image/mod.rs +++ b/crates/typst-library/src/visualize/image/mod.rs @@ -22,7 +22,7 @@ use crate::foundations::{ Smart, StyleChain, }; use crate::layout::{BlockElem, Length, Rel, Sizing}; -use crate::loading::{DataSource, Load, Readable}; +use crate::loading::{DataSource, Load, LoadSource, Loaded, Readable}; use crate::model::Figurable; use crate::text::LocalName; @@ -65,10 +65,10 @@ pub struct ImageElem { #[required] #[parse( let source = args.expect::>("source")?; - let data = source.load(engine.world)?; - Derived::new(source.v, data) + let loaded = source.load(engine.world)?; + Derived::new(source.v, loaded) )] - pub source: Derived, + pub source: Derived, /// The image's format. /// @@ -154,8 +154,8 @@ pub struct ImageElem { /// to `{auto}`, Typst will try to extract an ICC profile from the image. #[parse(match args.named::>>("icc")? { Some(Spanned { v: Smart::Custom(source), span }) => Some(Smart::Custom({ - let data = Spanned::new(&source, span).load(engine.world)?; - Derived::new(source, data) + let loaded = Spanned::new(&source, span).load(engine.world)?; + Derived::new(source, loaded.data) })), Some(Spanned { v: Smart::Auto, .. }) => Some(Smart::Auto), None => None, @@ -173,7 +173,7 @@ impl ImageElem { pub fn decode( span: Span, /// The data to decode as an image. Can be a string for SVGs. - data: Readable, + data: Spanned, /// The image's format. Detected automatically by default. #[named] format: Option>, @@ -193,8 +193,10 @@ impl ImageElem { #[named] scaling: Option>, ) -> StrResult { - let bytes = data.into_bytes(); - let source = Derived::new(DataSource::Bytes(bytes.clone()), bytes); + let bytes = data.v.into_bytes(); + let loaded = + Loaded::new(Spanned::new(LoadSource::Bytes, data.span), bytes.clone()); + let source = Derived::new(DataSource::Bytes(bytes), loaded); let mut elem = ImageElem::new(source); if let Some(format) = format { elem.push_format(format); diff --git a/crates/typst-library/src/visualize/image/svg.rs b/crates/typst-library/src/visualize/image/svg.rs index 9bf1ead0d..1a3f6d474 100644 --- a/crates/typst-library/src/visualize/image/svg.rs +++ b/crates/typst-library/src/visualize/image/svg.rs @@ -3,10 +3,9 @@ use std::hash::{Hash, Hasher}; use std::sync::{Arc, Mutex}; use comemo::Tracked; -use ecow::EcoString; use siphasher::sip128::{Hasher128, SipHasher13}; -use crate::diag::{format_xml_like_error, StrResult}; +use crate::diag::{format_xml_like_error, LoadError, LoadResult, ReportPos}; use crate::foundations::Bytes; use crate::layout::Axes; use crate::text::{ @@ -30,7 +29,7 @@ impl SvgImage { /// Decode an SVG image without fonts. #[comemo::memoize] #[typst_macros::time(name = "load svg")] - pub fn new(data: Bytes) -> StrResult { + pub fn new(data: Bytes) -> LoadResult { let tree = usvg::Tree::from_data(&data, &base_options()).map_err(format_usvg_error)?; Ok(Self(Arc::new(Repr { data, size: tree_size(&tree), font_hash: 0, tree }))) @@ -43,7 +42,7 @@ impl SvgImage { data: Bytes, world: Tracked, families: &[&str], - ) -> StrResult { + ) -> LoadResult { let book = world.book(); let resolver = Mutex::new(FontResolver::new(world, book, families)); let tree = usvg::Tree::from_data( @@ -125,16 +124,15 @@ fn tree_size(tree: &usvg::Tree) -> Axes { } /// Format the user-facing SVG decoding error message. -fn format_usvg_error(error: usvg::Error) -> EcoString { - match error { - usvg::Error::NotAnUtf8Str => "file is not valid utf-8".into(), - usvg::Error::MalformedGZip => "file is not compressed correctly".into(), - usvg::Error::ElementsLimitReached => "file is too large".into(), - usvg::Error::InvalidSize => { - "failed to parse SVG (width, height, or viewbox is invalid)".into() - } - usvg::Error::ParsingFailed(error) => format_xml_like_error("SVG", error), - } +fn format_usvg_error(error: usvg::Error) -> LoadError { + let error = match error { + usvg::Error::NotAnUtf8Str => "file is not valid utf-8", + usvg::Error::MalformedGZip => "file is not compressed correctly", + usvg::Error::ElementsLimitReached => "file is too large", + usvg::Error::InvalidSize => "width, height, or viewbox is invalid", + usvg::Error::ParsingFailed(error) => return format_xml_like_error("SVG", error), + }; + LoadError::new(ReportPos::None, "failed to parse SVG", error) } /// Provides Typst's fonts to usvg. diff --git a/crates/typst-syntax/Cargo.toml b/crates/typst-syntax/Cargo.toml index 263595bd4..c20f6a087 100644 --- a/crates/typst-syntax/Cargo.toml +++ b/crates/typst-syntax/Cargo.toml @@ -15,6 +15,7 @@ readme = { workspace = true } [dependencies] typst-timing = { workspace = true } typst-utils = { workspace = true } +comemo = { workspace = true } ecow = { workspace = true } serde = { workspace = true } toml = { workspace = true } diff --git a/crates/typst-syntax/src/lib.rs b/crates/typst-syntax/src/lib.rs index 5e7b710fc..1249f88e9 100644 --- a/crates/typst-syntax/src/lib.rs +++ b/crates/typst-syntax/src/lib.rs @@ -7,6 +7,7 @@ mod file; mod highlight; mod kind; mod lexer; +mod lines; mod node; mod parser; mod path; @@ -22,6 +23,7 @@ pub use self::lexer::{ is_id_continue, is_id_start, is_ident, is_newline, is_valid_label_literal_id, link_prefix, split_newlines, }; +pub use self::lines::Lines; pub use self::node::{LinkedChildren, LinkedNode, Side, SyntaxError, SyntaxNode}; pub use self::parser::{parse, parse_code, parse_math}; pub use self::path::VirtualPath; diff --git a/crates/typst-syntax/src/lines.rs b/crates/typst-syntax/src/lines.rs new file mode 100644 index 000000000..fa1e77563 --- /dev/null +++ b/crates/typst-syntax/src/lines.rs @@ -0,0 +1,402 @@ +use std::hash::{Hash, Hasher}; +use std::iter::zip; +use std::ops::Range; +use std::sync::Arc; + +use crate::is_newline; + +/// A text buffer and metadata about lines. +#[derive(Clone)] +pub struct Lines(Arc>); + +#[derive(Clone)] +struct Repr { + lines: Vec, + text: T, +} + +/// Metadata about a line. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct Line { + /// The UTF-8 byte offset where the line starts. + byte_idx: usize, + /// The UTF-16 codepoint offset where the line starts. + utf16_idx: usize, +} + +impl> Lines { + /// Create from the text buffer and compute the line metadata. + pub fn new(text: T) -> Self { + let lines = lines(text.as_ref()); + Lines(Arc::new(Repr { lines, text })) + } + + /// The text as a string slice. + pub fn text(&self) -> &str { + self.0.text.as_ref() + } + + /// Get the length of the file in UTF-8 encoded bytes. + pub fn len_bytes(&self) -> usize { + self.0.text.as_ref().len() + } + + /// Get the length of the file in UTF-16 code units. + pub fn len_utf16(&self) -> usize { + let last = self.0.lines.last().unwrap(); + last.utf16_idx + len_utf16(&self.text()[last.byte_idx..]) + } + + /// Get the length of the file in lines. + pub fn len_lines(&self) -> usize { + self.0.lines.len() + } + + /// Return the index of the UTF-16 code unit at the byte index. + pub fn byte_to_utf16(&self, byte_idx: usize) -> Option { + let line_idx = self.byte_to_line(byte_idx)?; + let line = self.0.lines.get(line_idx)?; + let head = self.text().get(line.byte_idx..byte_idx)?; + Some(line.utf16_idx + len_utf16(head)) + } + + /// Return the index of the line that contains the given byte index. + pub fn byte_to_line(&self, byte_idx: usize) -> Option { + (byte_idx <= self.text().len()).then(|| { + match self.0.lines.binary_search_by_key(&byte_idx, |line| line.byte_idx) { + Ok(i) => i, + Err(i) => i - 1, + } + }) + } + + /// Return the index of the column at the byte index. + /// + /// The column is defined as the number of characters in the line before the + /// byte index. + pub fn byte_to_column(&self, byte_idx: usize) -> Option { + let line = self.byte_to_line(byte_idx)?; + let start = self.line_to_byte(line)?; + let head = self.text().get(start..byte_idx)?; + Some(head.chars().count()) + } + + /// Return the index of the line and column at the byte index. + pub fn byte_to_line_column(&self, byte_idx: usize) -> Option<(usize, usize)> { + let line = self.byte_to_line(byte_idx)?; + let start = self.line_to_byte(line)?; + let head = self.text().get(start..byte_idx)?; + let col = head.chars().count(); + Some((line, col)) + } + + /// Return the byte index at the UTF-16 code unit. + pub fn utf16_to_byte(&self, utf16_idx: usize) -> Option { + let line = self.0.lines.get( + match self.0.lines.binary_search_by_key(&utf16_idx, |line| line.utf16_idx) { + Ok(i) => i, + Err(i) => i - 1, + }, + )?; + + let text = self.text(); + let mut k = line.utf16_idx; + for (i, c) in text[line.byte_idx..].char_indices() { + if k >= utf16_idx { + return Some(line.byte_idx + i); + } + k += c.len_utf16(); + } + + (k == utf16_idx).then_some(text.len()) + } + + /// Return the byte position at which the given line starts. + pub fn line_to_byte(&self, line_idx: usize) -> Option { + self.0.lines.get(line_idx).map(|line| line.byte_idx) + } + + /// Return the range which encloses the given line. + pub fn line_to_range(&self, line_idx: usize) -> Option> { + let start = self.line_to_byte(line_idx)?; + let end = self.line_to_byte(line_idx + 1).unwrap_or(self.text().len()); + Some(start..end) + } + + /// Return the byte index of the given (line, column) pair. + /// + /// The column defines the number of characters to go beyond the start of + /// the line. + pub fn line_column_to_byte( + &self, + line_idx: usize, + column_idx: usize, + ) -> Option { + let range = self.line_to_range(line_idx)?; + let line = self.text().get(range.clone())?; + let mut chars = line.chars(); + for _ in 0..column_idx { + chars.next(); + } + Some(range.start + (line.len() - chars.as_str().len())) + } +} + +impl Lines { + /// Fully replace the source text. + /// + /// This performs a naive (suffix/prefix-based) diff of the old and new text + /// to produce the smallest single edit that transforms old into new and + /// then calls [`edit`](Self::edit) with it. + /// + /// Returns whether any changes were made. + pub fn replace(&mut self, new: &str) -> bool { + let Some((prefix, suffix)) = self.replacement_range(new) else { + return false; + }; + + let old = self.text(); + let replace = prefix..old.len() - suffix; + let with = &new[prefix..new.len() - suffix]; + self.edit(replace, with); + + true + } + + /// Returns the common prefix and suffix lengths. + /// Returns [`None`] if the old and new strings are equal. + pub fn replacement_range(&self, new: &str) -> Option<(usize, usize)> { + let old = self.text(); + + let mut prefix = + zip(old.bytes(), new.bytes()).take_while(|(x, y)| x == y).count(); + + if prefix == old.len() && prefix == new.len() { + return None; + } + + while !old.is_char_boundary(prefix) || !new.is_char_boundary(prefix) { + prefix -= 1; + } + + let mut suffix = zip(old[prefix..].bytes().rev(), new[prefix..].bytes().rev()) + .take_while(|(x, y)| x == y) + .count(); + + while !old.is_char_boundary(old.len() - suffix) + || !new.is_char_boundary(new.len() - suffix) + { + suffix += 1; + } + + Some((prefix, suffix)) + } + + /// Edit the source file by replacing the given range. + /// + /// Returns the range in the new source that was ultimately reparsed. + /// + /// The method panics if the `replace` range is out of bounds. + #[track_caller] + pub fn edit(&mut self, replace: Range, with: &str) { + let start_byte = replace.start; + let start_utf16 = self.byte_to_utf16(start_byte).unwrap(); + let line = self.byte_to_line(start_byte).unwrap(); + + let inner = Arc::make_mut(&mut self.0); + + // Update the text itself. + inner.text.replace_range(replace.clone(), with); + + // Remove invalidated line starts. + inner.lines.truncate(line + 1); + + // Handle adjoining of \r and \n. + if inner.text[..start_byte].ends_with('\r') && with.starts_with('\n') { + inner.lines.pop(); + } + + // Recalculate the line starts after the edit. + inner.lines.extend(lines_from( + start_byte, + start_utf16, + &inner.text[start_byte..], + )); + } +} + +impl Hash for Lines { + fn hash(&self, state: &mut H) { + self.0.text.hash(state); + } +} + +impl> AsRef for Lines { + fn as_ref(&self) -> &str { + self.0.text.as_ref() + } +} + +/// Create a line vector. +fn lines(text: &str) -> Vec { + std::iter::once(Line { byte_idx: 0, utf16_idx: 0 }) + .chain(lines_from(0, 0, text)) + .collect() +} + +/// Compute a line iterator from an offset. +fn lines_from( + byte_offset: usize, + utf16_offset: usize, + text: &str, +) -> impl Iterator + '_ { + let mut s = unscanny::Scanner::new(text); + let mut utf16_idx = utf16_offset; + + std::iter::from_fn(move || { + s.eat_until(|c: char| { + utf16_idx += c.len_utf16(); + is_newline(c) + }); + + if s.done() { + return None; + } + + if s.eat() == Some('\r') && s.eat_if('\n') { + utf16_idx += 1; + } + + Some(Line { byte_idx: byte_offset + s.cursor(), utf16_idx }) + }) +} + +/// The number of code units this string would use if it was encoded in +/// UTF16. This runs in linear time. +fn len_utf16(string: &str) -> usize { + string.chars().map(char::len_utf16).sum() +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST: &str = "ä\tcde\nf💛g\r\nhi\rjkl"; + + #[test] + fn test_source_file_new() { + let lines = Lines::new(TEST); + assert_eq!( + lines.0.lines, + [ + Line { byte_idx: 0, utf16_idx: 0 }, + Line { byte_idx: 7, utf16_idx: 6 }, + Line { byte_idx: 15, utf16_idx: 12 }, + Line { byte_idx: 18, utf16_idx: 15 }, + ] + ); + } + + #[test] + fn test_source_file_pos_to_line() { + let lines = Lines::new(TEST); + assert_eq!(lines.byte_to_line(0), Some(0)); + assert_eq!(lines.byte_to_line(2), Some(0)); + assert_eq!(lines.byte_to_line(6), Some(0)); + assert_eq!(lines.byte_to_line(7), Some(1)); + assert_eq!(lines.byte_to_line(8), Some(1)); + assert_eq!(lines.byte_to_line(12), Some(1)); + assert_eq!(lines.byte_to_line(21), Some(3)); + assert_eq!(lines.byte_to_line(22), None); + } + + #[test] + fn test_source_file_pos_to_column() { + let lines = Lines::new(TEST); + assert_eq!(lines.byte_to_column(0), Some(0)); + assert_eq!(lines.byte_to_column(2), Some(1)); + assert_eq!(lines.byte_to_column(6), Some(5)); + assert_eq!(lines.byte_to_column(7), Some(0)); + assert_eq!(lines.byte_to_column(8), Some(1)); + assert_eq!(lines.byte_to_column(12), Some(2)); + } + + #[test] + fn test_source_file_utf16() { + #[track_caller] + fn roundtrip(lines: &Lines<&str>, byte_idx: usize, utf16_idx: usize) { + let middle = lines.byte_to_utf16(byte_idx).unwrap(); + let result = lines.utf16_to_byte(middle).unwrap(); + assert_eq!(middle, utf16_idx); + assert_eq!(result, byte_idx); + } + + let lines = Lines::new(TEST); + roundtrip(&lines, 0, 0); + roundtrip(&lines, 2, 1); + roundtrip(&lines, 3, 2); + roundtrip(&lines, 8, 7); + roundtrip(&lines, 12, 9); + roundtrip(&lines, 21, 18); + assert_eq!(lines.byte_to_utf16(22), None); + assert_eq!(lines.utf16_to_byte(19), None); + } + + #[test] + fn test_source_file_roundtrip() { + #[track_caller] + fn roundtrip(lines: &Lines<&str>, byte_idx: usize) { + let line = lines.byte_to_line(byte_idx).unwrap(); + let column = lines.byte_to_column(byte_idx).unwrap(); + let result = lines.line_column_to_byte(line, column).unwrap(); + assert_eq!(result, byte_idx); + } + + let lines = Lines::new(TEST); + roundtrip(&lines, 0); + roundtrip(&lines, 7); + roundtrip(&lines, 12); + roundtrip(&lines, 21); + } + + #[test] + fn test_source_file_edit() { + // This tests only the non-parser parts. The reparsing itself is + // tested separately. + #[track_caller] + fn test(prev: &str, range: Range, with: &str, after: &str) { + let reference = Lines::new(after); + + let mut edited = Lines::new(prev.to_string()); + edited.edit(range.clone(), with); + assert_eq!(edited.text(), reference.text()); + assert_eq!(edited.0.lines, reference.0.lines); + + let mut replaced = Lines::new(prev.to_string()); + replaced.replace(&{ + let mut s = prev.to_string(); + s.replace_range(range, with); + s + }); + assert_eq!(replaced.text(), reference.text()); + assert_eq!(replaced.0.lines, reference.0.lines); + } + + // Test inserting at the beginning. + test("abc\n", 0..0, "hi\n", "hi\nabc\n"); + test("\nabc", 0..0, "hi\r", "hi\r\nabc"); + + // Test editing in the middle. + test(TEST, 4..16, "❌", "ä\tc❌i\rjkl"); + + // Test appending. + test("abc\ndef", 7..7, "hi", "abc\ndefhi"); + test("abc\ndef\n", 8..8, "hi", "abc\ndef\nhi"); + + // Test appending with adjoining \r and \n. + test("abc\ndef\r", 8..8, "\nghi", "abc\ndef\r\nghi"); + + // Test removing everything. + test(TEST, 0..21, "", ""); + } +} diff --git a/crates/typst-syntax/src/source.rs b/crates/typst-syntax/src/source.rs index 6ff94c73f..514cb9a4a 100644 --- a/crates/typst-syntax/src/source.rs +++ b/crates/typst-syntax/src/source.rs @@ -2,14 +2,14 @@ use std::fmt::{self, Debug, Formatter}; use std::hash::{Hash, Hasher}; -use std::iter::zip; use std::ops::Range; use std::sync::Arc; use typst_utils::LazyHash; +use crate::lines::Lines; use crate::reparser::reparse; -use crate::{is_newline, parse, FileId, LinkedNode, Span, SyntaxNode, VirtualPath}; +use crate::{parse, FileId, LinkedNode, Span, SyntaxNode, VirtualPath}; /// A source file. /// @@ -24,9 +24,8 @@ pub struct Source(Arc); #[derive(Clone)] struct Repr { id: FileId, - text: LazyHash, root: LazyHash, - lines: Vec, + lines: LazyHash>, } impl Source { @@ -37,8 +36,7 @@ impl Source { root.numberize(id, Span::FULL).unwrap(); Self(Arc::new(Repr { id, - lines: lines(&text), - text: LazyHash::new(text), + lines: LazyHash::new(Lines::new(text)), root: LazyHash::new(root), })) } @@ -58,9 +56,14 @@ impl Source { self.0.id } + /// The whole source as a string slice. + pub fn lines(&self) -> Lines { + Lines::clone(&self.0.lines) + } + /// The whole source as a string slice. pub fn text(&self) -> &str { - &self.0.text + self.0.lines.text() } /// Slice out the part of the source code enclosed by the range. @@ -77,29 +80,12 @@ impl Source { /// Returns the range in the new source that was ultimately reparsed. pub fn replace(&mut self, new: &str) -> Range { let _scope = typst_timing::TimingScope::new("replace source"); - let old = self.text(); - let mut prefix = - zip(old.bytes(), new.bytes()).take_while(|(x, y)| x == y).count(); - - if prefix == old.len() && prefix == new.len() { + let Some((prefix, suffix)) = self.0.lines.replacement_range(new) else { return 0..0; - } - - while !old.is_char_boundary(prefix) || !new.is_char_boundary(prefix) { - prefix -= 1; - } - - let mut suffix = zip(old[prefix..].bytes().rev(), new[prefix..].bytes().rev()) - .take_while(|(x, y)| x == y) - .count(); - - while !old.is_char_boundary(old.len() - suffix) - || !new.is_char_boundary(new.len() - suffix) - { - suffix += 1; - } + }; + let old = self.text(); let replace = prefix..old.len() - suffix; let with = &new[prefix..new.len() - suffix]; self.edit(replace, with) @@ -112,48 +98,28 @@ impl Source { /// The method panics if the `replace` range is out of bounds. #[track_caller] pub fn edit(&mut self, replace: Range, with: &str) -> Range { - let start_byte = replace.start; - let start_utf16 = self.byte_to_utf16(start_byte).unwrap(); - let line = self.byte_to_line(start_byte).unwrap(); - let inner = Arc::make_mut(&mut self.0); - // Update the text itself. - inner.text.replace_range(replace.clone(), with); - - // Remove invalidated line starts. - inner.lines.truncate(line + 1); - - // Handle adjoining of \r and \n. - if inner.text[..start_byte].ends_with('\r') && with.starts_with('\n') { - inner.lines.pop(); - } - - // Recalculate the line starts after the edit. - inner.lines.extend(lines_from( - start_byte, - start_utf16, - &inner.text[start_byte..], - )); + // Update the text and lines. + inner.lines.edit(replace.clone(), with); // Incrementally reparse the replaced range. - reparse(&mut inner.root, &inner.text, replace, with.len()) + reparse(&mut inner.root, inner.lines.text(), replace, with.len()) } /// Get the length of the file in UTF-8 encoded bytes. pub fn len_bytes(&self) -> usize { - self.text().len() + self.0.lines.len_bytes() } /// Get the length of the file in UTF-16 code units. pub fn len_utf16(&self) -> usize { - let last = self.0.lines.last().unwrap(); - last.utf16_idx + len_utf16(&self.0.text[last.byte_idx..]) + self.0.lines.len_utf16() } /// Get the length of the file in lines. pub fn len_lines(&self) -> usize { - self.0.lines.len() + self.0.lines.len_lines() } /// Find the node with the given span. @@ -171,85 +137,6 @@ impl Source { pub fn range(&self, span: Span) -> Option> { Some(self.find(span)?.range()) } - - /// Return the index of the UTF-16 code unit at the byte index. - pub fn byte_to_utf16(&self, byte_idx: usize) -> Option { - let line_idx = self.byte_to_line(byte_idx)?; - let line = self.0.lines.get(line_idx)?; - let head = self.0.text.get(line.byte_idx..byte_idx)?; - Some(line.utf16_idx + len_utf16(head)) - } - - /// Return the index of the line that contains the given byte index. - pub fn byte_to_line(&self, byte_idx: usize) -> Option { - (byte_idx <= self.0.text.len()).then(|| { - match self.0.lines.binary_search_by_key(&byte_idx, |line| line.byte_idx) { - Ok(i) => i, - Err(i) => i - 1, - } - }) - } - - /// Return the index of the column at the byte index. - /// - /// The column is defined as the number of characters in the line before the - /// byte index. - pub fn byte_to_column(&self, byte_idx: usize) -> Option { - let line = self.byte_to_line(byte_idx)?; - let start = self.line_to_byte(line)?; - let head = self.get(start..byte_idx)?; - Some(head.chars().count()) - } - - /// Return the byte index at the UTF-16 code unit. - pub fn utf16_to_byte(&self, utf16_idx: usize) -> Option { - let line = self.0.lines.get( - match self.0.lines.binary_search_by_key(&utf16_idx, |line| line.utf16_idx) { - Ok(i) => i, - Err(i) => i - 1, - }, - )?; - - let mut k = line.utf16_idx; - for (i, c) in self.0.text[line.byte_idx..].char_indices() { - if k >= utf16_idx { - return Some(line.byte_idx + i); - } - k += c.len_utf16(); - } - - (k == utf16_idx).then_some(self.0.text.len()) - } - - /// Return the byte position at which the given line starts. - pub fn line_to_byte(&self, line_idx: usize) -> Option { - self.0.lines.get(line_idx).map(|line| line.byte_idx) - } - - /// Return the range which encloses the given line. - pub fn line_to_range(&self, line_idx: usize) -> Option> { - let start = self.line_to_byte(line_idx)?; - let end = self.line_to_byte(line_idx + 1).unwrap_or(self.0.text.len()); - Some(start..end) - } - - /// Return the byte index of the given (line, column) pair. - /// - /// The column defines the number of characters to go beyond the start of - /// the line. - pub fn line_column_to_byte( - &self, - line_idx: usize, - column_idx: usize, - ) -> Option { - let range = self.line_to_range(line_idx)?; - let line = self.get(range.clone())?; - let mut chars = line.chars(); - for _ in 0..column_idx { - chars.next(); - } - Some(range.start + (line.len() - chars.as_str().len())) - } } impl Debug for Source { @@ -261,7 +148,7 @@ impl Debug for Source { impl Hash for Source { fn hash(&self, state: &mut H) { self.0.id.hash(state); - self.0.text.hash(state); + self.0.lines.hash(state); self.0.root.hash(state); } } @@ -271,176 +158,3 @@ impl AsRef for Source { self.text() } } - -/// Metadata about a line. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -struct Line { - /// The UTF-8 byte offset where the line starts. - byte_idx: usize, - /// The UTF-16 codepoint offset where the line starts. - utf16_idx: usize, -} - -/// Create a line vector. -fn lines(text: &str) -> Vec { - std::iter::once(Line { byte_idx: 0, utf16_idx: 0 }) - .chain(lines_from(0, 0, text)) - .collect() -} - -/// Compute a line iterator from an offset. -fn lines_from( - byte_offset: usize, - utf16_offset: usize, - text: &str, -) -> impl Iterator + '_ { - let mut s = unscanny::Scanner::new(text); - let mut utf16_idx = utf16_offset; - - std::iter::from_fn(move || { - s.eat_until(|c: char| { - utf16_idx += c.len_utf16(); - is_newline(c) - }); - - if s.done() { - return None; - } - - if s.eat() == Some('\r') && s.eat_if('\n') { - utf16_idx += 1; - } - - Some(Line { byte_idx: byte_offset + s.cursor(), utf16_idx }) - }) -} - -/// The number of code units this string would use if it was encoded in -/// UTF16. This runs in linear time. -fn len_utf16(string: &str) -> usize { - string.chars().map(char::len_utf16).sum() -} - -#[cfg(test)] -mod tests { - use super::*; - - const TEST: &str = "ä\tcde\nf💛g\r\nhi\rjkl"; - - #[test] - fn test_source_file_new() { - let source = Source::detached(TEST); - assert_eq!( - source.0.lines, - [ - Line { byte_idx: 0, utf16_idx: 0 }, - Line { byte_idx: 7, utf16_idx: 6 }, - Line { byte_idx: 15, utf16_idx: 12 }, - Line { byte_idx: 18, utf16_idx: 15 }, - ] - ); - } - - #[test] - fn test_source_file_pos_to_line() { - let source = Source::detached(TEST); - assert_eq!(source.byte_to_line(0), Some(0)); - assert_eq!(source.byte_to_line(2), Some(0)); - assert_eq!(source.byte_to_line(6), Some(0)); - assert_eq!(source.byte_to_line(7), Some(1)); - assert_eq!(source.byte_to_line(8), Some(1)); - assert_eq!(source.byte_to_line(12), Some(1)); - assert_eq!(source.byte_to_line(21), Some(3)); - assert_eq!(source.byte_to_line(22), None); - } - - #[test] - fn test_source_file_pos_to_column() { - let source = Source::detached(TEST); - assert_eq!(source.byte_to_column(0), Some(0)); - assert_eq!(source.byte_to_column(2), Some(1)); - assert_eq!(source.byte_to_column(6), Some(5)); - assert_eq!(source.byte_to_column(7), Some(0)); - assert_eq!(source.byte_to_column(8), Some(1)); - assert_eq!(source.byte_to_column(12), Some(2)); - } - - #[test] - fn test_source_file_utf16() { - #[track_caller] - fn roundtrip(source: &Source, byte_idx: usize, utf16_idx: usize) { - let middle = source.byte_to_utf16(byte_idx).unwrap(); - let result = source.utf16_to_byte(middle).unwrap(); - assert_eq!(middle, utf16_idx); - assert_eq!(result, byte_idx); - } - - let source = Source::detached(TEST); - roundtrip(&source, 0, 0); - roundtrip(&source, 2, 1); - roundtrip(&source, 3, 2); - roundtrip(&source, 8, 7); - roundtrip(&source, 12, 9); - roundtrip(&source, 21, 18); - assert_eq!(source.byte_to_utf16(22), None); - assert_eq!(source.utf16_to_byte(19), None); - } - - #[test] - fn test_source_file_roundtrip() { - #[track_caller] - fn roundtrip(source: &Source, byte_idx: usize) { - let line = source.byte_to_line(byte_idx).unwrap(); - let column = source.byte_to_column(byte_idx).unwrap(); - let result = source.line_column_to_byte(line, column).unwrap(); - assert_eq!(result, byte_idx); - } - - let source = Source::detached(TEST); - roundtrip(&source, 0); - roundtrip(&source, 7); - roundtrip(&source, 12); - roundtrip(&source, 21); - } - - #[test] - fn test_source_file_edit() { - // This tests only the non-parser parts. The reparsing itself is - // tested separately. - #[track_caller] - fn test(prev: &str, range: Range, with: &str, after: &str) { - let reference = Source::detached(after); - - let mut edited = Source::detached(prev); - edited.edit(range.clone(), with); - assert_eq!(edited.text(), reference.text()); - assert_eq!(edited.0.lines, reference.0.lines); - - let mut replaced = Source::detached(prev); - replaced.replace(&{ - let mut s = prev.to_string(); - s.replace_range(range, with); - s - }); - assert_eq!(replaced.text(), reference.text()); - assert_eq!(replaced.0.lines, reference.0.lines); - } - - // Test inserting at the beginning. - test("abc\n", 0..0, "hi\n", "hi\nabc\n"); - test("\nabc", 0..0, "hi\r", "hi\r\nabc"); - - // Test editing in the middle. - test(TEST, 4..16, "❌", "ä\tc❌i\rjkl"); - - // Test appending. - test("abc\ndef", 7..7, "hi", "abc\ndefhi"); - test("abc\ndef\n", 8..8, "hi", "abc\ndef\nhi"); - - // Test appending with adjoining \r and \n. - test("abc\ndef\r", 8..8, "\nghi", "abc\ndef\r\nghi"); - - // Test removing everything. - test(TEST, 0..21, "", ""); - } -} diff --git a/tests/src/collect.rs b/tests/src/collect.rs index 84af04d2d..173488b01 100644 --- a/tests/src/collect.rs +++ b/tests/src/collect.rs @@ -7,7 +7,9 @@ use std::sync::LazyLock; use ecow::{eco_format, EcoString}; use typst_syntax::package::PackageVersion; -use typst_syntax::{is_id_continue, is_ident, is_newline, FileId, Source, VirtualPath}; +use typst_syntax::{ + is_id_continue, is_ident, is_newline, FileId, Lines, Source, VirtualPath, +}; use unscanny::Scanner; /// Collects all tests from all files. @@ -79,6 +81,8 @@ impl Display for FileSize { pub struct Note { pub pos: FilePos, pub kind: NoteKind, + /// The file [`Self::range`] belongs to. + pub file: FileId, pub range: Option>, pub message: String, } @@ -341,9 +345,28 @@ impl<'a> Parser<'a> { let kind: NoteKind = head.parse().ok()?; self.s.eat_if(' '); + let mut file = None; + if self.s.eat_if('"') { + let path = self.s.eat_until(|c| is_newline(c) || c == '"'); + if !self.s.eat_if('"') { + self.error("expected closing quote after file path"); + return None; + } + + let vpath = VirtualPath::new(path); + file = Some(FileId::new(None, vpath)); + + self.s.eat_if(' '); + } + let mut range = None; if self.s.at('-') || self.s.at(char::is_numeric) { - range = self.parse_range(source); + if let Some(file) = file { + range = self.parse_range_external(file); + } else { + range = self.parse_range(source); + } + if range.is_none() { self.error("range is malformed"); return None; @@ -359,11 +382,78 @@ impl<'a> Parser<'a> { Some(Note { pos: FilePos::new(self.path, self.line), kind, + file: file.unwrap_or(source.id()), range, message, }) } + #[cfg(not(feature = "default"))] + fn parse_range_external(&mut self, _file: FileId) -> Option> { + panic!("external file ranges are not expected when testing `typst_syntax`"); + } + + /// Parse a range in an external file, optionally abbreviated as just a position + /// if the range is empty. + #[cfg(feature = "default")] + fn parse_range_external(&mut self, file: FileId) -> Option> { + use typst::foundations::Bytes; + + use crate::world::{read, system_path}; + + let path = match system_path(file) { + Ok(path) => path, + Err(err) => { + self.error(err.to_string()); + return None; + } + }; + + let bytes = match read(&path) { + Ok(data) => Bytes::new(data), + Err(err) => { + self.error(err.to_string()); + return None; + } + }; + + let start = self.parse_line_col()?; + let lines = Lines::try_from(&bytes).expect( + "errors shouldn't be annotated for files \ + that aren't human readable (not valid utf-8)", + ); + let range = if self.s.eat_if('-') { + let (line, col) = start; + let start = lines.line_column_to_byte(line, col); + let (line, col) = self.parse_line_col()?; + let end = lines.line_column_to_byte(line, col); + Option::zip(start, end).map(|(a, b)| a..b) + } else { + let (line, col) = start; + lines.line_column_to_byte(line, col).map(|i| i..i) + }; + if range.is_none() { + self.error("range is out of bounds"); + } + range + } + + /// Parses absolute `line:column` indices in an external file. + fn parse_line_col(&mut self) -> Option<(usize, usize)> { + let line = self.parse_number()?; + if !self.s.eat_if(':') { + self.error("positions in external files always require both `:`"); + return None; + } + let col = self.parse_number()?; + if line < 0 || col < 0 { + self.error("line and column numbers must be positive"); + return None; + } + + Some(((line as usize).saturating_sub(1), (col as usize).saturating_sub(1))) + } + /// Parse a range, optionally abbreviated as just a position if the range /// is empty. fn parse_range(&mut self, source: &Source) -> Option> { @@ -389,13 +479,13 @@ impl<'a> Parser<'a> { let line_idx = (line_idx_in_test + comments).checked_add_signed(line_delta)?; let column_idx = if column < 0 { // Negative column index is from the back. - let range = source.line_to_range(line_idx)?; + let range = source.lines().line_to_range(line_idx)?; text[range].chars().count().saturating_add_signed(column) } else { usize::try_from(column).ok()?.checked_sub(1)? }; - source.line_column_to_byte(line_idx, column_idx) + source.lines().line_column_to_byte(line_idx, column_idx) } /// Parse a number. diff --git a/tests/src/run.rs b/tests/src/run.rs index 4d08362cf..a34e38db5 100644 --- a/tests/src/run.rs +++ b/tests/src/run.rs @@ -10,10 +10,11 @@ use typst::layout::{Abs, Frame, FrameItem, PagedDocument, Transform}; use typst::visualize::Color; use typst::{Document, WorldExt}; use typst_pdf::PdfOptions; +use typst_syntax::FileId; use crate::collect::{Attr, FileSize, NoteKind, Test}; use crate::logger::TestResult; -use crate::world::TestWorld; +use crate::world::{system_path, TestWorld}; /// Runs a single test. /// @@ -117,7 +118,7 @@ impl<'a> Runner<'a> { if seen { continue; } - let note_range = self.format_range(¬e.range); + let note_range = self.format_range(note.file, ¬e.range); if first { log!(self, "not emitted"); first = false; @@ -208,10 +209,6 @@ impl<'a> Runner<'a> { /// Compare a subset of notes with a given kind against diagnostics of /// that same kind. fn check_diagnostic(&mut self, kind: NoteKind, diag: &SourceDiagnostic) { - // Ignore diagnostics from other sources than the test file itself. - if diag.span.id().is_some_and(|id| id != self.test.source.id()) { - return; - } // TODO: remove this once HTML export is stable if diag.message == "html export is under active development and incomplete" { return; @@ -219,11 +216,11 @@ impl<'a> Runner<'a> { let message = diag.message.replace("\\", "/"); let range = self.world.range(diag.span); - self.validate_note(kind, range.clone(), &message); + self.validate_note(kind, diag.span.id(), range.clone(), &message); // Check hints. for hint in &diag.hints { - self.validate_note(NoteKind::Hint, range.clone(), hint); + self.validate_note(NoteKind::Hint, diag.span.id(), range.clone(), hint); } } @@ -235,15 +232,18 @@ impl<'a> Runner<'a> { fn validate_note( &mut self, kind: NoteKind, + file: Option, range: Option>, message: &str, ) { // Try to find perfect match. + let file = file.unwrap_or(self.test.source.id()); if let Some((i, _)) = self.test.notes.iter().enumerate().find(|&(i, note)| { !self.seen[i] && note.kind == kind && note.range == range && note.message == message + && note.file == file }) { self.seen[i] = true; return; @@ -257,7 +257,7 @@ impl<'a> Runner<'a> { && (note.range == range || note.message == message) }) else { // Not even a close match, diagnostic is not annotated. - let diag_range = self.format_range(&range); + let diag_range = self.format_range(file, &range); log!(into: self.not_annotated, " {kind}: {diag_range} {}", message); return; }; @@ -267,10 +267,10 @@ impl<'a> Runner<'a> { // Range is wrong. if range != note.range { - let note_range = self.format_range(¬e.range); - let note_text = self.text_for_range(¬e.range); - let diag_range = self.format_range(&range); - let diag_text = self.text_for_range(&range); + let note_range = self.format_range(note.file, ¬e.range); + let note_text = self.text_for_range(note.file, ¬e.range); + let diag_range = self.format_range(file, &range); + let diag_text = self.text_for_range(file, &range); log!(self, "mismatched range ({}):", note.pos); log!(self, " message | {}", note.message); log!(self, " annotated | {note_range:<9} | {note_text}"); @@ -286,39 +286,49 @@ impl<'a> Runner<'a> { } /// Display the text for a range. - fn text_for_range(&self, range: &Option>) -> String { + fn text_for_range(&self, file: FileId, range: &Option>) -> String { let Some(range) = range else { return "No text".into() }; if range.is_empty() { - "(empty)".into() - } else { - format!("`{}`", self.test.source.text()[range.clone()].replace('\n', "\\n")) + return "(empty)".into(); } + + let lines = self.world.lookup(file); + lines.text()[range.clone()].replace('\n', "\\n").replace('\r', "\\r") } /// Display a byte range as a line:column range. - fn format_range(&self, range: &Option>) -> String { + fn format_range(&self, file: FileId, range: &Option>) -> String { let Some(range) = range else { return "No range".into() }; + + let mut preamble = String::new(); + if file != self.test.source.id() { + preamble = format!("\"{}\" ", system_path(file).unwrap().display()); + } + if range.start == range.end { - self.format_pos(range.start) + format!("{preamble}{}", self.format_pos(file, range.start)) } else { - format!("{}-{}", self.format_pos(range.start,), self.format_pos(range.end,)) + format!( + "{preamble}{}-{}", + self.format_pos(file, range.start), + self.format_pos(file, range.end) + ) } } /// Display a position as a line:column pair. - fn format_pos(&self, pos: usize) -> String { - if let (Some(line_idx), Some(column_idx)) = - (self.test.source.byte_to_line(pos), self.test.source.byte_to_column(pos)) - { - let line = self.test.pos.line + line_idx; - let column = column_idx + 1; - if line == 1 { - format!("{column}") - } else { - format!("{line}:{column}") - } + fn format_pos(&self, file: FileId, pos: usize) -> String { + let lines = self.world.lookup(file); + + let res = lines.byte_to_line_column(pos).map(|(line, col)| (line + 1, col + 1)); + let Some((line, col)) = res else { + return "oob".into(); + }; + + if line == 1 { + format!("{col}") } else { - "oob".into() + format!("{line}:{col}") } } } diff --git a/tests/src/world.rs b/tests/src/world.rs index fe2bd45ea..bc3e690b2 100644 --- a/tests/src/world.rs +++ b/tests/src/world.rs @@ -20,6 +20,7 @@ use typst::text::{Font, FontBook, TextElem, TextSize}; use typst::utils::{singleton, LazyHash}; use typst::visualize::Color; use typst::{Feature, Library, World}; +use typst_syntax::Lines; /// A world that provides access to the tests environment. #[derive(Clone)] @@ -84,6 +85,22 @@ impl TestWorld { let mut map = self.base.slots.lock(); f(map.entry(id).or_insert_with(|| FileSlot::new(id))) } + + /// Lookup line metadata for a file by id. + #[track_caller] + pub fn lookup(&self, id: FileId) -> Lines { + self.slot(id, |slot| { + if let Some(source) = slot.source.get() { + let source = source.as_ref().expect("file is not valid"); + source.lines() + } else if let Some(bytes) = slot.file.get() { + let bytes = bytes.as_ref().expect("file is not valid"); + Lines::try_from(bytes).expect("file is not valid utf-8") + } else { + panic!("file id does not point to any source file"); + } + }) + } } /// Shared foundation of all test worlds. @@ -149,7 +166,7 @@ impl FileSlot { } /// The file system path for a file ID. -fn system_path(id: FileId) -> FileResult { +pub(crate) fn system_path(id: FileId) -> FileResult { let root: PathBuf = match id.package() { Some(spec) => format!("tests/packages/{}-{}", spec.name, spec.version).into(), None => PathBuf::new(), @@ -159,7 +176,7 @@ fn system_path(id: FileId) -> FileResult { } /// Read a file. -fn read(path: &Path) -> FileResult> { +pub(crate) fn read(path: &Path) -> FileResult> { // Resolve asset. if let Ok(suffix) = path.strip_prefix("assets/") { return typst_dev_assets::get(&suffix.to_string_lossy()) diff --git a/tests/suite/loading/csv.typ b/tests/suite/loading/csv.typ index 6f57ec458..046345bec 100644 --- a/tests/suite/loading/csv.typ +++ b/tests/suite/loading/csv.typ @@ -18,12 +18,12 @@ #csv("nope.csv") --- csv-invalid --- -// Error: 6-28 failed to parse CSV (found 3 instead of 2 fields in line 3) +// Error: "/assets/data/bad.csv" 3:1 failed to parse CSV (found 3 instead of 2 fields in line 3) #csv("/assets/data/bad.csv") --- csv-invalid-row-type-dict --- // Test error numbering with dictionary rows. -// Error: 6-28 failed to parse CSV (found 3 instead of 2 fields in line 3) +// Error: "/assets/data/bad.csv" 3:1 failed to parse CSV (found 3 instead of 2 fields in line 3) #csv("/assets/data/bad.csv", row-type: dictionary) --- csv-invalid-delimiter --- diff --git a/tests/suite/loading/json.typ b/tests/suite/loading/json.typ index c8df1ff6e..9e433992d 100644 --- a/tests/suite/loading/json.typ +++ b/tests/suite/loading/json.typ @@ -6,7 +6,7 @@ #test(data.at(2).weight, 150) --- json-invalid --- -// Error: 7-30 failed to parse JSON (expected value at line 3 column 14) +// Error: "/assets/data/bad.json" 3:14 failed to parse JSON (expected value at line 3 column 14) #json("/assets/data/bad.json") --- json-decode-deprecated --- diff --git a/tests/suite/loading/read.typ b/tests/suite/loading/read.typ index b5c9c0892..57bfc1d5c 100644 --- a/tests/suite/loading/read.typ +++ b/tests/suite/loading/read.typ @@ -8,5 +8,5 @@ #let data = read("/assets/text/missing.txt") --- read-invalid-utf-8 --- -// Error: 18-40 file is not valid utf-8 +// Error: 18-40 failed to convert to string (file is not valid utf-8 in assets/text/bad.txt:1:1) #let data = read("/assets/text/bad.txt") diff --git a/tests/suite/loading/toml.typ b/tests/suite/loading/toml.typ index a4318a015..9d65da452 100644 --- a/tests/suite/loading/toml.typ +++ b/tests/suite/loading/toml.typ @@ -37,7 +37,7 @@ )) --- toml-invalid --- -// Error: 7-30 failed to parse TOML (expected `.`, `=` at line 1 column 16) +// Error: "/assets/data/bad.toml" 1:16-2:1 failed to parse TOML (expected `.`, `=`) #toml("/assets/data/bad.toml") --- toml-decode-deprecated --- diff --git a/tests/suite/loading/xml.typ b/tests/suite/loading/xml.typ index 933f3c480..eed7db0ae 100644 --- a/tests/suite/loading/xml.typ +++ b/tests/suite/loading/xml.typ @@ -24,7 +24,7 @@ ),)) --- xml-invalid --- -// Error: 6-28 failed to parse XML (found closing tag 'data' instead of 'hello' in line 3) +// Error: "/assets/data/bad.xml" 3:0 failed to parse XML (found closing tag 'data' instead of 'hello') #xml("/assets/data/bad.xml") --- xml-decode-deprecated --- diff --git a/tests/suite/loading/yaml.typ b/tests/suite/loading/yaml.typ index a8089052c..ad171c6ef 100644 --- a/tests/suite/loading/yaml.typ +++ b/tests/suite/loading/yaml.typ @@ -13,7 +13,7 @@ #test(data.at("1"), "ok") --- yaml-invalid --- -// Error: 7-30 failed to parse YAML (did not find expected ',' or ']' at line 2 column 1, while parsing a flow sequence at line 1 column 18) +// Error: "/assets/data/bad.yaml" 2:1 failed to parse YAML (did not find expected ',' or ']' at line 2 column 1, while parsing a flow sequence at line 1 column 18) #yaml("/assets/data/bad.yaml") --- yaml-decode-deprecated --- diff --git a/tests/suite/scripting/import.typ b/tests/suite/scripting/import.typ index 49b66ee56..382e444cc 100644 --- a/tests/suite/scripting/import.typ +++ b/tests/suite/scripting/import.typ @@ -334,6 +334,7 @@ --- import-cyclic-in-other-file --- // Cyclic import in other file. +// Error: "tests/suite/scripting/modules/cycle2.typ" 2:9-2:21 cyclic import #import "./modules/cycle1.typ": * This is never reached. diff --git a/tests/suite/visualize/image.typ b/tests/suite/visualize/image.typ index 73c4feff8..45c70c4b8 100644 --- a/tests/suite/visualize/image.typ +++ b/tests/suite/visualize/image.typ @@ -167,7 +167,7 @@ A #box(image("/assets/images/tiger.jpg", height: 1cm, width: 80%)) B #image("/assets/plugins/hello.wasm") --- image-bad-svg --- -// Error: 2-33 failed to parse SVG (found closing tag 'g' instead of 'style' in line 4) +// Error: "/assets/images/bad.svg" 4:3 failed to parse SVG (found closing tag 'g' instead of 'style') #image("/assets/images/bad.svg") --- image-decode-svg --- @@ -176,7 +176,7 @@ A #box(image("/assets/images/tiger.jpg", height: 1cm, width: 80%)) B #image.decode(``.text, format: "svg") --- image-decode-bad-svg --- -// Error: 2-168 failed to parse SVG (missing root node) +// Error: 15-152 failed to parse SVG (missing root node at 1:1) // Warning: 8-14 `image.decode` is deprecated, directly pass bytes to `image` instead #image.decode(``.text, format: "svg")