From 0343e038d373b9164679c4d7b2c5345c58c123b7 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Thu, 3 Oct 2024 15:00:58 +0200 Subject: [PATCH] Fix default ignorables (#5099) --- crates/typst-pdf/src/content.rs | 16 +++----- crates/typst/src/layout/inline/collect.rs | 3 +- crates/typst/src/layout/inline/linebreak.rs | 17 +------- crates/typst/src/layout/inline/mod.rs | 2 +- crates/typst/src/layout/inline/shaping.rs | 19 ++++++--- crates/typst/src/text/item.rs | 44 +++++++++------------ crates/typst/src/text/mod.rs | 18 +++++++++ 7 files changed, 62 insertions(+), 57 deletions(-) diff --git a/crates/typst-pdf/src/content.rs b/crates/typst-pdf/src/content.rs index aa20e55af..4f2baafb9 100644 --- a/crates/typst-pdf/src/content.rs +++ b/crates/typst-pdf/src/content.rs @@ -425,7 +425,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<() bail!( g.span.0, "the text {} could not be displayed with any font", - text.text[g.range()].repr() + TextItemView::full(text).glyph_text(g).repr(), ); } } @@ -441,7 +441,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<() || tables.svg.is_some() || tables.colr.is_some(); if !has_color_glyphs { - write_normal_text(ctx, pos, TextItemView::all_of(text))?; + write_normal_text(ctx, pos, TextItemView::full(text))?; return Ok(()); } @@ -449,9 +449,9 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<() text.glyphs.iter().filter(|g| is_color_glyph(&text.font, g)).count(); if color_glyph_count == text.glyphs.len() { - write_color_glyphs(ctx, pos, TextItemView::all_of(text))?; + write_color_glyphs(ctx, pos, TextItemView::full(text))?; } else if color_glyph_count == 0 { - write_normal_text(ctx, pos, TextItemView::all_of(text))?; + write_normal_text(ctx, pos, TextItemView::full(text))?; } else { // Otherwise we need to split it in smaller text runs let mut offset = 0; @@ -493,9 +493,7 @@ fn write_normal_text( let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default(); for g in text.glyphs() { - let t = text.text(); - let segment = &t[g.range()]; - glyph_set.entry(g.id).or_insert_with(|| segment.into()); + glyph_set.entry(g.id).or_insert_with(|| text.glyph_text(&g)); } let fill_transform = ctx.state.transforms(Size::zero(), pos); @@ -640,9 +638,7 @@ fn write_color_glyphs( ctx.content.show(Str(&[index])); - glyph_set - .entry(glyph.id) - .or_insert_with(|| text.text()[glyph.range()].into()); + glyph_set.entry(glyph.id).or_insert_with(|| text.glyph_text(&glyph)); } ctx.content.end_text(); diff --git a/crates/typst/src/layout/inline/collect.rs b/crates/typst/src/layout/inline/collect.rs index f1bb5869c..af14b1527 100644 --- a/crates/typst/src/layout/inline/collect.rs +++ b/crates/typst/src/layout/inline/collect.rs @@ -8,7 +8,8 @@ use crate::layout::{ }; use crate::syntax::Span; use crate::text::{ - LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes, SpaceElem, TextElem, + is_default_ignorable, LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes, + SpaceElem, TextElem, }; use crate::utils::Numeric; diff --git a/crates/typst/src/layout/inline/linebreak.rs b/crates/typst/src/layout/inline/linebreak.rs index b72507fdb..f5048aaec 100644 --- a/crates/typst/src/layout/inline/linebreak.rs +++ b/crates/typst/src/layout/inline/linebreak.rs @@ -2,7 +2,6 @@ use std::ops::{Add, Sub}; use az::SaturatingAs; use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed}; -use icu_properties::sets::CodePointSetData; use icu_properties::LineBreak; use icu_provider::AsDeserializingBufferProvider; use icu_provider_adapters::fork::ForkByKeyProvider; @@ -16,7 +15,7 @@ use crate::engine::Engine; use crate::layout::{Abs, Em}; use crate::model::Linebreaks; use crate::syntax::link_prefix; -use crate::text::{Lang, TextElem}; +use crate::text::{is_default_ignorable, Lang, TextElem}; /// The cost of a line or paragraph layout. type Cost = f64; @@ -58,12 +57,6 @@ static LINEBREAK_DATA: Lazy> = Lazy::new(|| { icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap() }); -/// The set of Unicode default ignorables. -static DEFAULT_IGNORABLE_DATA: Lazy = Lazy::new(|| { - icu_properties::sets::load_default_ignorable_code_point(&blob().as_deserializing()) - .unwrap() -}); - /// A line break opportunity. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum Breakpoint { @@ -80,8 +73,7 @@ impl Breakpoint { /// Trim a line before this breakpoint. pub fn trim(self, line: &str) -> &str { // Trim default ignorables. - let ignorable = DEFAULT_IGNORABLE_DATA.as_borrowed(); - let line = line.trim_end_matches(|c| ignorable.contains(c)); + let line = line.trim_end_matches(is_default_ignorable); match self { // Trim whitespace. @@ -986,8 +978,3 @@ where } } } - -/// Whether a codepoint is Unicode `Default_Ignorable`. -pub fn is_default_ignorable(c: char) -> bool { - DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c) -} diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs index 889a028d5..275cb3326 100644 --- a/crates/typst/src/layout/inline/mod.rs +++ b/crates/typst/src/layout/inline/mod.rs @@ -10,7 +10,7 @@ use comemo::{Track, Tracked, TrackedMut}; use self::collect::{collect, Item, Segment, SpanMapper}; use self::finalize::finalize; use self::line::{commit, line, Line}; -use self::linebreak::{is_default_ignorable, linebreak, Breakpoint}; +use self::linebreak::{linebreak, Breakpoint}; use self::prepare::{prepare, Preparation}; use self::shaping::{ cjk_punct_style, is_of_cj_script, shape_range, ShapedGlyph, ShapedText, diff --git a/crates/typst/src/layout/inline/shaping.rs b/crates/typst/src/layout/inline/shaping.rs index 4344192c1..9d83a9a02 100644 --- a/crates/typst/src/layout/inline/shaping.rs +++ b/crates/typst/src/layout/inline/shaping.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use az::SaturatingAs; use ecow::EcoString; -use rustybuzz::{ShapePlan, UnicodeBuffer}; +use rustybuzz::{BufferFlags, ShapePlan, UnicodeBuffer}; use ttf_parser::Tag; use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_script::{Script, UnicodeScript}; @@ -15,8 +15,8 @@ use crate::engine::Engine; use crate::foundations::{Smart, StyleChain}; use crate::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size}; use crate::text::{ - decorate, families, features, variant, Font, FontVariant, Glyph, Lang, Region, - TextElem, TextItem, + decorate, families, features, is_default_ignorable, variant, Font, FontVariant, + Glyph, Lang, Region, TextElem, TextItem, }; use crate::utils::SliceExt; use crate::World; @@ -725,8 +725,11 @@ fn shape_segment<'a>( text: &str, mut families: impl Iterator + Clone, ) { - // Fonts dont have newlines and tabs. - if text.chars().all(|c| c == '\n' || c == '\t') { + // Don't try shaping newlines, tabs, or default ignorables. + if text + .chars() + .all(|c| c == '\n' || c == '\t' || is_default_ignorable(c)) + { return; } @@ -774,6 +777,12 @@ fn shape_segment<'a>( }); buffer.guess_segment_properties(); + // By default, Harfbuzz will create zero-width space glyphs for default + // ignorables. This is probably useful for GUI apps that want noticable + // effects on the cursor for those, but for us it's not useful and hurts + // text extraction. + buffer.set_flags(BufferFlags::REMOVE_DEFAULT_IGNORABLES); + // Prepare the shape plan. This plan depends on direction, script, language, // and features, but is independent from the text and can thus be memoized. let plan = create_shape_plan( diff --git a/crates/typst/src/text/item.rs b/crates/typst/src/text/item.rs index 5de30a8dc..f6044b2f3 100644 --- a/crates/typst/src/text/item.rs +++ b/crates/typst/src/text/item.rs @@ -5,7 +5,7 @@ use ecow::EcoString; use crate::layout::{Abs, Em}; use crate::syntax::Span; -use crate::text::{Font, Lang, Region}; +use crate::text::{is_default_ignorable, Font, Lang, Region}; use crate::visualize::{FixedStroke, Paint}; /// A run of shaped text. @@ -78,7 +78,7 @@ pub struct TextItemView<'a> { impl<'a> TextItemView<'a> { /// Build a TextItemView for the whole contents of a TextItem. - pub fn all_of(text: &'a TextItem) -> Self { + pub fn full(text: &'a TextItem) -> Self { Self::from_glyph_range(text, 0..text.glyphs.len()) } @@ -87,28 +87,30 @@ impl<'a> TextItemView<'a> { TextItemView { item: text, glyph_range } } - /// Obtains a glyph in this slice, remapping the range that it represents in - /// the original text so that it is relative to the start of the slice - pub fn glyph_at(&self, index: usize) -> Glyph { - let g = &self.item.glyphs[self.glyph_range.start + index]; - let base = self.text_range().start as u16; - Glyph { - range: g.range.start - base..g.range.end - base, - ..*g - } - } - /// Returns an iterator over the glyphs of the slice. /// /// The range of text that each glyph represents is remapped to be relative /// to the start of the slice. pub fn glyphs(&self) -> impl Iterator + '_ { - (0..self.glyph_range.len()).map(|index| self.glyph_at(index)) + let first = self.item.glyphs[self.glyph_range.start].range(); + let last = self.item.glyphs[self.glyph_range.end - 1].range(); + let base = first.start.min(last.start) as u16; + (0..self.glyph_range.len()).map(move |index| { + let g = &self.item.glyphs[self.glyph_range.start + index]; + Glyph { + range: g.range.start - base..g.range.end - base, + ..*g + } + }) } - /// The plain text that this slice represents - pub fn text(&self) -> &str { - &self.item.text[self.text_range()] + /// The plain text for the given glyph. This is an approximation since + /// glyphs do not correspond 1-1 with codepoints. + pub fn glyph_text(&self, glyph: &Glyph) -> EcoString { + self.item.text[glyph.range()] + .chars() + .filter(|&c| !is_default_ignorable(c)) + .collect() } /// The total width of this text slice @@ -119,12 +121,4 @@ impl<'a> TextItemView<'a> { .sum::() .at(self.item.size) } - - /// The range of text in the original TextItem that this slice corresponds - /// to. - fn text_range(&self) -> Range { - let first = self.item.glyphs[self.glyph_range.start].range(); - let last = self.item.glyphs[self.glyph_range.end - 1].range(); - first.start.min(last.start)..first.end.max(last.end) - } } diff --git a/crates/typst/src/text/mod.rs b/crates/typst/src/text/mod.rs index fbd5b9685..2e53b11a1 100644 --- a/crates/typst/src/text/mod.rs +++ b/crates/typst/src/text/mod.rs @@ -31,6 +31,10 @@ pub use self::space::*; use std::fmt::{self, Debug, Formatter}; use ecow::{eco_format, EcoString}; +use icu_properties::sets::CodePointSetData; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_blob::BlobDataProvider; +use once_cell::sync::Lazy; use rustybuzz::Feature; use smallvec::SmallVec; use ttf_parser::{Rect, Tag}; @@ -1310,6 +1314,20 @@ cast! { }, } +/// Whether a codepoint is Unicode `Default_Ignorable`. +pub(crate) fn is_default_ignorable(c: char) -> bool { + /// The set of Unicode default ignorables. + static DEFAULT_IGNORABLE_DATA: Lazy = Lazy::new(|| { + icu_properties::sets::load_default_ignorable_code_point( + &BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU) + .unwrap() + .as_deserializing(), + ) + .unwrap() + }); + DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c) +} + /// Pushes `text` wrapped in LRE/RLE + PDF to `out`. pub(crate) fn isolate(text: Content, styles: StyleChain, out: &mut Vec) { out.push(TextElem::packed(match TextElem::dir_in(styles) {