Fix default ignorables (#5099)

This commit is contained in:
Laurenz 2024-10-03 15:00:58 +02:00 committed by GitHub
parent d86789c1f7
commit 0343e038d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 62 additions and 57 deletions

View File

@ -425,7 +425,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
bail!( bail!(
g.span.0, g.span.0,
"the text {} could not be displayed with any font", "the text {} could not be displayed with any font",
text.text[g.range()].repr() TextItemView::full(text).glyph_text(g).repr(),
); );
} }
} }
@ -441,7 +441,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
|| tables.svg.is_some() || tables.svg.is_some()
|| tables.colr.is_some(); || tables.colr.is_some();
if !has_color_glyphs { if !has_color_glyphs {
write_normal_text(ctx, pos, TextItemView::all_of(text))?; write_normal_text(ctx, pos, TextItemView::full(text))?;
return Ok(()); return Ok(());
} }
@ -449,9 +449,9 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
text.glyphs.iter().filter(|g| is_color_glyph(&text.font, g)).count(); text.glyphs.iter().filter(|g| is_color_glyph(&text.font, g)).count();
if color_glyph_count == text.glyphs.len() { if color_glyph_count == text.glyphs.len() {
write_color_glyphs(ctx, pos, TextItemView::all_of(text))?; write_color_glyphs(ctx, pos, TextItemView::full(text))?;
} else if color_glyph_count == 0 { } else if color_glyph_count == 0 {
write_normal_text(ctx, pos, TextItemView::all_of(text))?; write_normal_text(ctx, pos, TextItemView::full(text))?;
} else { } else {
// Otherwise we need to split it in smaller text runs // Otherwise we need to split it in smaller text runs
let mut offset = 0; let mut offset = 0;
@ -493,9 +493,7 @@ fn write_normal_text(
let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default(); let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default();
for g in text.glyphs() { for g in text.glyphs() {
let t = text.text(); glyph_set.entry(g.id).or_insert_with(|| text.glyph_text(&g));
let segment = &t[g.range()];
glyph_set.entry(g.id).or_insert_with(|| segment.into());
} }
let fill_transform = ctx.state.transforms(Size::zero(), pos); let fill_transform = ctx.state.transforms(Size::zero(), pos);
@ -640,9 +638,7 @@ fn write_color_glyphs(
ctx.content.show(Str(&[index])); ctx.content.show(Str(&[index]));
glyph_set glyph_set.entry(glyph.id).or_insert_with(|| text.glyph_text(&glyph));
.entry(glyph.id)
.or_insert_with(|| text.text()[glyph.range()].into());
} }
ctx.content.end_text(); ctx.content.end_text();

View File

@ -8,7 +8,8 @@ use crate::layout::{
}; };
use crate::syntax::Span; use crate::syntax::Span;
use crate::text::{ use crate::text::{
LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes, SpaceElem, TextElem, is_default_ignorable, LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes,
SpaceElem, TextElem,
}; };
use crate::utils::Numeric; use crate::utils::Numeric;

View File

@ -2,7 +2,6 @@ use std::ops::{Add, Sub};
use az::SaturatingAs; use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed}; use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak; use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider; use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider; use icu_provider_adapters::fork::ForkByKeyProvider;
@ -16,7 +15,7 @@ use crate::engine::Engine;
use crate::layout::{Abs, Em}; use crate::layout::{Abs, Em};
use crate::model::Linebreaks; use crate::model::Linebreaks;
use crate::syntax::link_prefix; use crate::syntax::link_prefix;
use crate::text::{Lang, TextElem}; use crate::text::{is_default_ignorable, Lang, TextElem};
/// The cost of a line or paragraph layout. /// The cost of a line or paragraph layout.
type Cost = f64; type Cost = f64;
@ -58,12 +57,6 @@ static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap() icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap()
}); });
/// The set of Unicode default ignorables.
static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
icu_properties::sets::load_default_ignorable_code_point(&blob().as_deserializing())
.unwrap()
});
/// A line break opportunity. /// A line break opportunity.
#[derive(Debug, Copy, Clone, Eq, PartialEq)] #[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Breakpoint { pub enum Breakpoint {
@ -80,8 +73,7 @@ impl Breakpoint {
/// Trim a line before this breakpoint. /// Trim a line before this breakpoint.
pub fn trim(self, line: &str) -> &str { pub fn trim(self, line: &str) -> &str {
// Trim default ignorables. // Trim default ignorables.
let ignorable = DEFAULT_IGNORABLE_DATA.as_borrowed(); let line = line.trim_end_matches(is_default_ignorable);
let line = line.trim_end_matches(|c| ignorable.contains(c));
match self { match self {
// Trim whitespace. // Trim whitespace.
@ -986,8 +978,3 @@ where
} }
} }
} }
/// Whether a codepoint is Unicode `Default_Ignorable`.
pub fn is_default_ignorable(c: char) -> bool {
DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
}

View File

@ -10,7 +10,7 @@ use comemo::{Track, Tracked, TrackedMut};
use self::collect::{collect, Item, Segment, SpanMapper}; use self::collect::{collect, Item, Segment, SpanMapper};
use self::finalize::finalize; use self::finalize::finalize;
use self::line::{commit, line, Line}; use self::line::{commit, line, Line};
use self::linebreak::{is_default_ignorable, linebreak, Breakpoint}; use self::linebreak::{linebreak, Breakpoint};
use self::prepare::{prepare, Preparation}; use self::prepare::{prepare, Preparation};
use self::shaping::{ use self::shaping::{
cjk_punct_style, is_of_cj_script, shape_range, ShapedGlyph, ShapedText, cjk_punct_style, is_of_cj_script, shape_range, ShapedGlyph, ShapedText,

View File

@ -5,7 +5,7 @@ use std::sync::Arc;
use az::SaturatingAs; use az::SaturatingAs;
use ecow::EcoString; use ecow::EcoString;
use rustybuzz::{ShapePlan, UnicodeBuffer}; use rustybuzz::{BufferFlags, ShapePlan, UnicodeBuffer};
use ttf_parser::Tag; use ttf_parser::Tag;
use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_bidi::{BidiInfo, Level as BidiLevel};
use unicode_script::{Script, UnicodeScript}; use unicode_script::{Script, UnicodeScript};
@ -15,8 +15,8 @@ use crate::engine::Engine;
use crate::foundations::{Smart, StyleChain}; use crate::foundations::{Smart, StyleChain};
use crate::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size}; use crate::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size};
use crate::text::{ use crate::text::{
decorate, families, features, variant, Font, FontVariant, Glyph, Lang, Region, decorate, families, features, is_default_ignorable, variant, Font, FontVariant,
TextElem, TextItem, Glyph, Lang, Region, TextElem, TextItem,
}; };
use crate::utils::SliceExt; use crate::utils::SliceExt;
use crate::World; use crate::World;
@ -725,8 +725,11 @@ fn shape_segment<'a>(
text: &str, text: &str,
mut families: impl Iterator<Item = &'a str> + Clone, mut families: impl Iterator<Item = &'a str> + Clone,
) { ) {
// Fonts dont have newlines and tabs. // Don't try shaping newlines, tabs, or default ignorables.
if text.chars().all(|c| c == '\n' || c == '\t') { if text
.chars()
.all(|c| c == '\n' || c == '\t' || is_default_ignorable(c))
{
return; return;
} }
@ -774,6 +777,12 @@ fn shape_segment<'a>(
}); });
buffer.guess_segment_properties(); buffer.guess_segment_properties();
// By default, HarfBuzz will create zero-width space glyphs for default
// ignorables. This is probably useful for GUI apps that want noticeable
// effects on the cursor for those, but for us it's not useful and hurts
// text extraction.
buffer.set_flags(BufferFlags::REMOVE_DEFAULT_IGNORABLES);
// Prepare the shape plan. This plan depends on direction, script, language, // Prepare the shape plan. This plan depends on direction, script, language,
// and features, but is independent from the text and can thus be memoized. // and features, but is independent from the text and can thus be memoized.
let plan = create_shape_plan( let plan = create_shape_plan(

View File

@ -5,7 +5,7 @@ use ecow::EcoString;
use crate::layout::{Abs, Em}; use crate::layout::{Abs, Em};
use crate::syntax::Span; use crate::syntax::Span;
use crate::text::{Font, Lang, Region}; use crate::text::{is_default_ignorable, Font, Lang, Region};
use crate::visualize::{FixedStroke, Paint}; use crate::visualize::{FixedStroke, Paint};
/// A run of shaped text. /// A run of shaped text.
@ -78,7 +78,7 @@ pub struct TextItemView<'a> {
impl<'a> TextItemView<'a> { impl<'a> TextItemView<'a> {
/// Build a TextItemView for the whole contents of a TextItem. /// Build a TextItemView for the whole contents of a TextItem.
pub fn all_of(text: &'a TextItem) -> Self { pub fn full(text: &'a TextItem) -> Self {
Self::from_glyph_range(text, 0..text.glyphs.len()) Self::from_glyph_range(text, 0..text.glyphs.len())
} }
@ -87,28 +87,30 @@ impl<'a> TextItemView<'a> {
TextItemView { item: text, glyph_range } TextItemView { item: text, glyph_range }
} }
/// Obtains a glyph in this slice, remapping the range that it represents in
/// the original text so that it is relative to the start of the slice
pub fn glyph_at(&self, index: usize) -> Glyph {
let g = &self.item.glyphs[self.glyph_range.start + index];
let base = self.text_range().start as u16;
Glyph {
range: g.range.start - base..g.range.end - base,
..*g
}
}
/// Returns an iterator over the glyphs of the slice. /// Returns an iterator over the glyphs of the slice.
/// ///
/// The range of text that each glyph represents is remapped to be relative /// The range of text that each glyph represents is remapped to be relative
/// to the start of the slice. /// to the start of the slice.
pub fn glyphs(&self) -> impl Iterator<Item = Glyph> + '_ { pub fn glyphs(&self) -> impl Iterator<Item = Glyph> + '_ {
(0..self.glyph_range.len()).map(|index| self.glyph_at(index)) let first = self.item.glyphs[self.glyph_range.start].range();
let last = self.item.glyphs[self.glyph_range.end - 1].range();
let base = first.start.min(last.start) as u16;
(0..self.glyph_range.len()).map(move |index| {
let g = &self.item.glyphs[self.glyph_range.start + index];
Glyph {
range: g.range.start - base..g.range.end - base,
..*g
}
})
} }
/// The plain text that this slice represents /// The plain text for the given glyph. This is an approximation since
pub fn text(&self) -> &str { /// glyphs do not correspond 1-1 with codepoints.
&self.item.text[self.text_range()] pub fn glyph_text(&self, glyph: &Glyph) -> EcoString {
self.item.text[glyph.range()]
.chars()
.filter(|&c| !is_default_ignorable(c))
.collect()
} }
/// The total width of this text slice /// The total width of this text slice
@ -119,12 +121,4 @@ impl<'a> TextItemView<'a> {
.sum::<Em>() .sum::<Em>()
.at(self.item.size) .at(self.item.size)
} }
/// The range of text in the original TextItem that this slice corresponds
/// to.
fn text_range(&self) -> Range<usize> {
let first = self.item.glyphs[self.glyph_range.start].range();
let last = self.item.glyphs[self.glyph_range.end - 1].range();
first.start.min(last.start)..first.end.max(last.end)
}
} }

View File

@ -31,6 +31,10 @@ pub use self::space::*;
use std::fmt::{self, Debug, Formatter}; use std::fmt::{self, Debug, Formatter};
use ecow::{eco_format, EcoString}; use ecow::{eco_format, EcoString};
use icu_properties::sets::CodePointSetData;
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_blob::BlobDataProvider;
use once_cell::sync::Lazy;
use rustybuzz::Feature; use rustybuzz::Feature;
use smallvec::SmallVec; use smallvec::SmallVec;
use ttf_parser::{Rect, Tag}; use ttf_parser::{Rect, Tag};
@ -1310,6 +1314,20 @@ cast! {
}, },
} }
/// Whether a codepoint is Unicode `Default_Ignorable`.
///
/// Returns `true` when `c` is contained in the default-ignorable code point
/// set that is loaded from the `typst_assets::icu::ICU` static data blob.
///
/// # Panics
///
/// Both the construction of the blob provider and the loading of the set are
/// unwrapped, so a failure of either one panics when the set is initialized.
pub(crate) fn is_default_ignorable(c: char) -> bool {
/// The set of Unicode default ignorables.
///
/// Held in a function-local `Lazy` static, so the loading closure below is
/// not part of every call; only the `contains` lookup is.
static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
icu_properties::sets::load_default_ignorable_code_point(
// The provider is built directly from the bundled ICU blob here.
&BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU)
.unwrap()
.as_deserializing(),
)
.unwrap()
});
// Borrow the loaded set and test membership of the given character.
DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
}
/// Pushes `text` wrapped in LRE/RLE + PDF to `out`. /// Pushes `text` wrapped in LRE/RLE + PDF to `out`.
pub(crate) fn isolate(text: Content, styles: StyleChain, out: &mut Vec<Content>) { pub(crate) fn isolate(text: Content, styles: StyleChain, out: &mut Vec<Content>) {
out.push(TextElem::packed(match TextElem::dir_in(styles) { out.push(TextElem::packed(match TextElem::dir_in(styles) {