From dacd6acd5e73d35c6e7a7a3b144f16ae70d03daa Mon Sep 17 00:00:00 2001 From: Laurenz Date: Wed, 8 Jan 2025 11:57:56 +0100 Subject: [PATCH] More flexible and efficient `Bytes` representation (#5670) --- crates/typst-cli/src/world.rs | 2 +- crates/typst-ide/src/tests.rs | 4 +- crates/typst-kit/src/fonts.rs | 8 +- crates/typst-layout/src/image.rs | 2 +- crates/typst-library/src/foundations/bytes.rs | 146 +++++++++++++----- crates/typst-library/src/foundations/float.rs | 12 +- crates/typst-library/src/foundations/int.rs | 5 +- .../typst-library/src/foundations/plugin.rs | 2 +- crates/typst-library/src/foundations/value.rs | 6 +- crates/typst-library/src/loading/cbor.rs | 2 +- crates/typst-library/src/loading/mod.rs | 24 ++- crates/typst-library/src/text/font/color.rs | 13 +- .../src/visualize/image/raster.rs | 2 +- crates/typst-svg/src/text.rs | 5 +- docs/src/html.rs | 2 +- docs/src/lib.rs | 2 +- tests/fuzz/src/compile.rs | 2 +- tests/src/world.rs | 6 +- 18 files changed, 160 insertions(+), 85 deletions(-) diff --git a/crates/typst-cli/src/world.rs b/crates/typst-cli/src/world.rs index af6cf228f..12e80d273 100644 --- a/crates/typst-cli/src/world.rs +++ b/crates/typst-cli/src/world.rs @@ -305,7 +305,7 @@ impl FileSlot { ) -> FileResult { self.file.get_or_init( || read(self.id, project_root, package_storage), - |data, _| Ok(data.into()), + |data, _| Ok(Bytes::new(data)), ) } } diff --git a/crates/typst-ide/src/tests.rs b/crates/typst-ide/src/tests.rs index f41808dac..6678ab841 100644 --- a/crates/typst-ide/src/tests.rs +++ b/crates/typst-ide/src/tests.rs @@ -55,7 +55,7 @@ impl TestWorld { pub fn with_asset_at(mut self, path: &str, filename: &str) -> Self { let id = FileId::new(None, VirtualPath::new(path)); let data = typst_dev_assets::get_by_name(filename).unwrap(); - let bytes = Bytes::from_static(data); + let bytes = Bytes::new(data); Arc::make_mut(&mut self.files).assets.insert(id, bytes); self } @@ -152,7 +152,7 @@ impl Default for TestBase { fn default() -> Self { let fonts: Vec<_> = typst_assets::fonts() .chain(typst_dev_assets::fonts()) - .flat_map(|data| Font::iter(Bytes::from_static(data))) + .flat_map(|data| Font::iter(Bytes::new(data))) .collect(); Self { diff --git a/crates/typst-kit/src/fonts.rs b/crates/typst-kit/src/fonts.rs index 83e13fd8f..c15d739ec 100644 --- a/crates/typst-kit/src/fonts.rs +++ b/crates/typst-kit/src/fonts.rs @@ -13,6 +13,7 @@ use std::path::{Path, PathBuf}; use std::sync::OnceLock; use fontdb::{Database, Source}; +use typst_library::foundations::Bytes; use typst_library::text::{Font, FontBook, FontInfo}; use typst_timing::TimingScope; @@ -52,9 +53,8 @@ impl FontSlot { .as_ref() .expect("`path` is not `None` if `font` is uninitialized"), ) - .ok()? - .into(); - Font::new(data, self.index) + .ok()?; + Font::new(Bytes::new(data), self.index) }) .clone() } @@ -196,7 +196,7 @@ impl FontSearcher { #[cfg(feature = "embed-fonts")] fn add_embedded(&mut self) { for data in typst_assets::fonts() { - let buffer = typst_library::foundations::Bytes::from_static(data); + let buffer = Bytes::new(data); for (i, font) in Font::iter(buffer).enumerate() { self.book.push(font.info().clone()); self.fonts.push(FontSlot { diff --git a/crates/typst-layout/src/image.rs b/crates/typst-layout/src/image.rs index 77e1d0838..59e2c0210 100644 --- a/crates/typst-layout/src/image.rs +++ b/crates/typst-layout/src/image.rs @@ -50,7 +50,7 @@ pub fn layout_image( // Construct the image itself. let image = Image::with_fonts( - data.clone().into(), + data.clone().into_bytes(), format, elem.alt(styles), engine.world, diff --git a/crates/typst-library/src/foundations/bytes.rs b/crates/typst-library/src/foundations/bytes.rs index 05fe4763a..20034d074 100644 --- a/crates/typst-library/src/foundations/bytes.rs +++ b/crates/typst-library/src/foundations/bytes.rs @@ -1,5 +1,6 @@ -use std::borrow::Cow; +use std::any::Any; use std::fmt::{self, Debug, Formatter}; +use std::hash::{Hash, Hasher}; use std::ops::{Add, AddAssign, Deref}; use std::sync::Arc; @@ -39,18 +40,44 @@ use crate::foundations::{cast, func, scope, ty, Array, Reflect, Repr, Str, Value /// #str(data.slice(1, 4)) /// ``` #[ty(scope, cast)] -#[derive(Clone, Hash, Eq, PartialEq)] -pub struct Bytes(Arc>>); +#[derive(Clone, Hash)] +#[allow(clippy::derived_hash_with_manual_eq)] +pub struct Bytes(Arc>); impl Bytes { - /// Create a buffer from a static byte slice. - pub fn from_static(slice: &'static [u8]) -> Self { - Self(Arc::new(LazyHash::new(Cow::Borrowed(slice)))) + /// Create `Bytes` from anything byte-like. + /// + /// The `data` type will directly back this bytes object. This means you can + /// e.g. pass `&'static [u8]` or `[u8; 8]` and no extra vector will be + /// allocated. + /// + /// If the type is `Vec` and the `Bytes` are unique (i.e. not cloned), + /// the vector will be reused when mutating to the `Bytes`. + /// + /// If your source type is a string, prefer [`Bytes::from_string`] to + /// directly use the UTF-8 encoded string data without any copying. + pub fn new(data: T) -> Self + where + T: AsRef<[u8]> + Send + Sync + 'static, + { + Self(Arc::new(LazyHash::new(data))) + } + + /// Create `Bytes` from anything string-like, implicitly viewing the UTF-8 + /// representation. + /// + /// The `data` type will directly back this bytes object. This means you can + /// e.g. pass `String` or `EcoString` without any copying. + pub fn from_string(data: T) -> Self + where + T: AsRef + Send + Sync + 'static, + { + Self(Arc::new(LazyHash::new(StrWrapper(data)))) } /// Return `true` if the length is 0. pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.as_slice().is_empty() } /// Return a view into the buffer. @@ -60,7 +87,7 @@ impl Bytes { /// Return a copy of the buffer as a vector. pub fn to_vec(&self) -> Vec { - self.0.to_vec() + self.as_slice().to_vec() } /// Resolve an index or throw an out of bounds error. @@ -72,12 +99,10 @@ impl Bytes { /// /// `index == len` is considered in bounds. fn locate_opt(&self, index: i64) -> Option { + let len = self.as_slice().len(); let wrapped = - if index >= 0 { Some(index) } else { (self.len() as i64).checked_add(index) }; - - wrapped - .and_then(|v| usize::try_from(v).ok()) - .filter(|&v| v <= self.0.len()) + if index >= 0 { Some(index) } else { (len as i64).checked_add(index) }; + wrapped.and_then(|v| usize::try_from(v).ok()).filter(|&v| v <= len) } } @@ -106,7 +131,7 @@ impl Bytes { /// The length in bytes. #[func(title = "Length")] pub fn len(&self) -> usize { - self.0.len() + self.as_slice().len() } /// Returns the byte at the specified index. Returns the default value if @@ -122,13 +147,13 @@ impl Bytes { default: Option, ) -> StrResult { self.locate_opt(index) - .and_then(|i| self.0.get(i).map(|&b| Value::Int(b.into()))) + .and_then(|i| self.as_slice().get(i).map(|&b| Value::Int(b.into()))) .or(default) .ok_or_else(|| out_of_bounds_no_default(index, self.len())) } - /// Extracts a subslice of the bytes. Fails with an error if the start or end - /// index is out of bounds. + /// Extracts a subslice of the bytes. Fails with an error if the start or + /// end index is out of bounds. #[func] pub fn slice( &self, @@ -148,9 +173,17 @@ impl Bytes { if end.is_none() { end = count.map(|c: i64| start + c); } + let start = self.locate(start)?; let end = self.locate(end.unwrap_or(self.len() as i64))?.max(start); - Ok(self.0[start..end].into()) + let slice = &self.as_slice()[start..end]; + + // We could hold a view into the original bytes here instead of + // making a copy, but it's unclear when that's worth it. Java + // originally did that for strings, but went back on it because a + // very small view into a very large buffer would be a sort of + // memory leak. + Ok(Bytes::new(slice.to_vec())) } } @@ -170,7 +203,15 @@ impl Deref for Bytes { type Target = [u8]; fn deref(&self) -> &Self::Target { - &self.0 + self.0.as_bytes() + } +} + +impl Eq for Bytes {} + +impl PartialEq for Bytes { + fn eq(&self, other: &Self) -> bool { + self.0.eq(&other.0) } } @@ -180,18 +221,6 @@ impl AsRef<[u8]> for Bytes { } } -impl From<&[u8]> for Bytes { - fn from(slice: &[u8]) -> Self { - Self(Arc::new(LazyHash::new(slice.to_vec().into()))) - } -} - -impl From> for Bytes { - fn from(vec: Vec) -> Self { - Self(Arc::new(LazyHash::new(vec.into()))) - } -} - impl Add for Bytes { type Output = Self; @@ -207,10 +236,12 @@ impl AddAssign for Bytes { // Nothing to do } else if self.is_empty() { *self = rhs; - } else if Arc::strong_count(&self.0) == 1 && matches!(**self.0, Cow::Owned(_)) { - Arc::make_mut(&mut self.0).to_mut().extend_from_slice(&rhs); + } else if let Some(vec) = Arc::get_mut(&mut self.0) + .and_then(|unique| unique.as_any_mut().downcast_mut::>()) + { + vec.extend_from_slice(&rhs); } else { - *self = Self::from([self.as_slice(), rhs.as_slice()].concat()); + *self = Self::new([self.as_slice(), rhs.as_slice()].concat()); } } } @@ -228,20 +259,61 @@ impl Serialize for Bytes { } } +/// Any type that can back a byte buffer. +trait Bytelike: Send + Sync { + fn as_bytes(&self) -> &[u8]; + fn as_any_mut(&mut self) -> &mut dyn Any; +} + +impl Bytelike for T +where + T: AsRef<[u8]> + Send + Sync + 'static, +{ + fn as_bytes(&self) -> &[u8] { + self.as_ref() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl Hash for dyn Bytelike { + fn hash(&self, state: &mut H) { + self.as_bytes().hash(state); + } +} + +/// Makes string-like objects usable with `Bytes`. +struct StrWrapper(T); + +impl Bytelike for StrWrapper +where + T: AsRef + Send + Sync + 'static, +{ + fn as_bytes(&self) -> &[u8] { + self.0.as_ref().as_bytes() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + /// A value that can be cast to bytes. pub struct ToBytes(Bytes); cast! { ToBytes, - v: Str => Self(v.as_bytes().into()), + v: Str => Self(Bytes::from_string(v)), v: Array => Self(v.iter() .map(|item| match item { Value::Int(byte @ 0..=255) => Ok(*byte as u8), Value::Int(_) => bail!("number must be between 0 and 255"), value => Err(::error(value)), }) - .collect::, _>>()? - .into() + .collect::, _>>() + .map(Bytes::new)? ), v: Bytes => Self(v), } diff --git a/crates/typst-library/src/foundations/float.rs b/crates/typst-library/src/foundations/float.rs index c3d4e0e73..fcc46b034 100644 --- a/crates/typst-library/src/foundations/float.rs +++ b/crates/typst-library/src/foundations/float.rs @@ -163,18 +163,14 @@ impl f64 { size: u32, ) -> StrResult { Ok(match size { - 8 => match endian { + 8 => Bytes::new(match endian { Endianness::Little => self.to_le_bytes(), Endianness::Big => self.to_be_bytes(), - } - .as_slice() - .into(), - 4 => match endian { + }), + 4 => Bytes::new(match endian { Endianness::Little => (self as f32).to_le_bytes(), Endianness::Big => (self as f32).to_be_bytes(), - } - .as_slice() - .into(), + }), _ => bail!("size must be either 4 or 8"), }) } diff --git a/crates/typst-library/src/foundations/int.rs b/crates/typst-library/src/foundations/int.rs index bddffada3..83a89bf8a 100644 --- a/crates/typst-library/src/foundations/int.rs +++ b/crates/typst-library/src/foundations/int.rs @@ -1,6 +1,7 @@ use std::num::{NonZeroI64, NonZeroIsize, NonZeroU64, NonZeroUsize, ParseIntError}; use ecow::{eco_format, EcoString}; +use smallvec::SmallVec; use crate::diag::{bail, StrResult}; use crate::foundations::{ @@ -322,7 +323,7 @@ impl i64 { Endianness::Little => self.to_le_bytes(), }; - let mut buf = vec![0u8; size]; + let mut buf = SmallVec::<[u8; 8]>::from_elem(0, size); match endian { Endianness::Big => { // Copy the bytes from the array to the buffer, starting from @@ -339,7 +340,7 @@ impl i64 { } } - Bytes::from(buf) + Bytes::new(buf) } } diff --git a/crates/typst-library/src/foundations/plugin.rs b/crates/typst-library/src/foundations/plugin.rs index f57257a45..a7c341d8c 100644 --- a/crates/typst-library/src/foundations/plugin.rs +++ b/crates/typst-library/src/foundations/plugin.rs @@ -293,7 +293,7 @@ impl Plugin { _ => bail!("plugin did not respect the protocol"), }; - Ok(output.into()) + Ok(Bytes::new(output)) } /// An iterator over all the function names defined by the plugin. diff --git a/crates/typst-library/src/foundations/value.rs b/crates/typst-library/src/foundations/value.rs index eb0d6eedc..efc480d3f 100644 --- a/crates/typst-library/src/foundations/value.rs +++ b/crates/typst-library/src/foundations/value.rs @@ -459,15 +459,15 @@ impl<'de> Visitor<'de> for ValueVisitor { } fn visit_bytes(self, v: &[u8]) -> Result { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v.to_vec()).into_value()) } fn visit_borrowed_bytes(self, v: &'de [u8]) -> Result { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v.to_vec()).into_value()) } fn visit_byte_buf(self, v: Vec) -> Result { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v).into_value()) } fn visit_none(self) -> Result { diff --git a/crates/typst-library/src/loading/cbor.rs b/crates/typst-library/src/loading/cbor.rs index 977059c3d..a03e5c998 100644 --- a/crates/typst-library/src/loading/cbor.rs +++ b/crates/typst-library/src/loading/cbor.rs @@ -55,7 +55,7 @@ impl cbor { let Spanned { v: value, span } = value; let mut res = Vec::new(); ciborium::into_writer(&value, &mut res) - .map(|_| res.into()) + .map(|_| Bytes::new(res)) .map_err(|err| eco_format!("failed to encode value as CBOR ({err})")) .at(span) } diff --git a/crates/typst-library/src/loading/mod.rs b/crates/typst-library/src/loading/mod.rs index ae74df864..120b3e3af 100644 --- a/crates/typst-library/src/loading/mod.rs +++ b/crates/typst-library/src/loading/mod.rs @@ -56,15 +56,22 @@ pub enum Readable { impl Readable { pub fn as_slice(&self) -> &[u8] { match self { - Readable::Bytes(v) => v, - Readable::Str(v) => v.as_bytes(), + Self::Bytes(v) => v, + Self::Str(v) => v.as_bytes(), } } pub fn as_str(&self) -> Option<&str> { match self { - Readable::Str(v) => Some(v.as_str()), - Readable::Bytes(v) => std::str::from_utf8(v).ok(), + Self::Str(v) => Some(v.as_str()), + Self::Bytes(v) => std::str::from_utf8(v).ok(), + } + } + + pub fn into_bytes(self) -> Bytes { + match self { + Self::Bytes(v) => v, + Self::Str(v) => Bytes::from_string(v), } } } @@ -78,12 +85,3 @@ cast! { v: Str => Self::Str(v), v: Bytes => Self::Bytes(v), } - -impl From for Bytes { - fn from(value: Readable) -> Self { - match value { - Readable::Bytes(v) => v, - Readable::Str(v) => v.as_bytes().into(), - } - } -} diff --git a/crates/typst-library/src/text/font/color.rs b/crates/typst-library/src/text/font/color.rs index 08f6fe0a3..e3183e885 100644 --- a/crates/typst-library/src/text/font/color.rs +++ b/crates/typst-library/src/text/font/color.rs @@ -7,6 +7,7 @@ use typst_syntax::Span; use usvg::tiny_skia_path; use xmlwriter::XmlWriter; +use crate::foundations::Bytes; use crate::layout::{Abs, Frame, FrameItem, Point, Size}; use crate::text::{Font, Glyph}; use crate::visualize::{FixedStroke, Geometry, Image, RasterFormat, VectorFormat}; @@ -101,8 +102,12 @@ fn draw_raster_glyph( upem: Abs, raster_image: ttf_parser::RasterGlyphImage, ) -> Option<()> { - let image = - Image::new(raster_image.data.into(), RasterFormat::Png.into(), None).ok()?; + let image = Image::new( + Bytes::new(raster_image.data.to_vec()), + RasterFormat::Png.into(), + None, + ) + .ok()?; // Apple Color emoji doesn't provide offset information (or at least // not in a way ttf-parser understands), so we artificially shift their @@ -175,7 +180,7 @@ fn draw_colr_glyph( let data = svg.end_document().into_bytes(); - let image = Image::new(data.into(), VectorFormat::Svg.into(), None).ok()?; + let image = Image::new(Bytes::new(data), VectorFormat::Svg.into(), None).ok()?; let y_shift = Abs::pt(upem.to_pt() - y_max); let position = Point::new(Abs::pt(x_min), y_shift); @@ -251,7 +256,7 @@ fn draw_svg_glyph( ); let image = - Image::new(wrapper_svg.into_bytes().into(), VectorFormat::Svg.into(), None) + Image::new(Bytes::new(wrapper_svg.into_bytes()), VectorFormat::Svg.into(), None) .ok()?; let position = Point::new(Abs::pt(left), Abs::pt(top) + upem); diff --git a/crates/typst-library/src/visualize/image/raster.rs b/crates/typst-library/src/visualize/image/raster.rs index 829826c75..098843a25 100644 --- a/crates/typst-library/src/visualize/image/raster.rs +++ b/crates/typst-library/src/visualize/image/raster.rs @@ -274,7 +274,7 @@ mod tests { #[track_caller] fn test(path: &str, format: RasterFormat, dpi: f64) { let data = typst_dev_assets::get(path).unwrap(); - let bytes = Bytes::from_static(data); + let bytes = Bytes::new(data); let image = RasterImage::new(bytes, format).unwrap(); assert_eq!(image.dpi().map(f64::round), Some(dpi)); } diff --git a/crates/typst-svg/src/text.rs b/crates/typst-svg/src/text.rs index 80de32089..fa471b2ae 100644 --- a/crates/typst-svg/src/text.rs +++ b/crates/typst-svg/src/text.rs @@ -3,6 +3,7 @@ use std::io::Read; use base64::Engine; use ecow::EcoString; use ttf_parser::GlyphId; +use typst_library::foundations::Bytes; use typst_library::layout::{Abs, Point, Ratio, Size, Transform}; use typst_library::text::{Font, TextItem}; use typst_library::visualize::{FillRule, Image, Paint, RasterFormat, RelativeTo}; @@ -243,7 +244,9 @@ fn convert_bitmap_glyph_to_image(font: &Font, id: GlyphId) -> Option<(Image, f64 if raster.format != ttf_parser::RasterImageFormat::PNG { return None; } - let image = Image::new(raster.data.into(), RasterFormat::Png.into(), None).ok()?; + let image = + Image::new(Bytes::new(raster.data.to_vec()), RasterFormat::Png.into(), None) + .ok()?; Some((image, raster.x as f64, raster.y as f64)) } diff --git a/docs/src/html.rs b/docs/src/html.rs index a1206032d..4eb3954c3 100644 --- a/docs/src/html.rs +++ b/docs/src/html.rs @@ -486,7 +486,7 @@ impl World for DocWorld { fn file(&self, id: FileId) -> FileResult { assert!(id.package().is_none()); - Ok(Bytes::from_static( + Ok(Bytes::new( typst_dev_assets::get_by_name( &id.vpath().as_rootless_path().to_string_lossy(), ) diff --git a/docs/src/lib.rs b/docs/src/lib.rs index e92799718..2751500e3 100644 --- a/docs/src/lib.rs +++ b/docs/src/lib.rs @@ -78,7 +78,7 @@ static LIBRARY: LazyLock> = LazyLock::new(|| { static FONTS: LazyLock<(LazyHash, Vec)> = LazyLock::new(|| { let fonts: Vec<_> = typst_assets::fonts() .chain(typst_dev_assets::fonts()) - .flat_map(|data| Font::iter(Bytes::from_static(data))) + .flat_map(|data| Font::iter(Bytes::new(data))) .collect(); let book = FontBook::from_fonts(&fonts); (LazyHash::new(book), fonts) diff --git a/tests/fuzz/src/compile.rs b/tests/fuzz/src/compile.rs index 37e21deb9..3dedfb737 100644 --- a/tests/fuzz/src/compile.rs +++ b/tests/fuzz/src/compile.rs @@ -19,7 +19,7 @@ struct FuzzWorld { impl FuzzWorld { fn new(text: &str) -> Self { let data = typst_assets::fonts().next().unwrap(); - let font = Font::new(Bytes::from_static(data), 0).unwrap(); + let font = Font::new(Bytes::new(data), 0).unwrap(); let book = FontBook::from_fonts([&font]); Self { library: LazyHash::new(Library::default()), diff --git a/tests/src/world.rs b/tests/src/world.rs index a08f1efa8..5c2678328 100644 --- a/tests/src/world.rs +++ b/tests/src/world.rs @@ -98,7 +98,7 @@ impl Default for TestBase { fn default() -> Self { let fonts: Vec<_> = typst_assets::fonts() .chain(typst_dev_assets::fonts()) - .flat_map(|data| Font::iter(Bytes::from_static(data))) + .flat_map(|data| Font::iter(Bytes::new(data))) .collect(); Self { @@ -140,8 +140,8 @@ impl FileSlot { self.file .get_or_init(|| { read(&system_path(self.id)?).map(|cow| match cow { - Cow::Owned(buf) => buf.into(), - Cow::Borrowed(buf) => Bytes::from_static(buf), + Cow::Owned(buf) => Bytes::new(buf), + Cow::Borrowed(buf) => Bytes::new(buf), }) }) .clone()