Fix default ignorables (#5099)

This commit is contained in:
Laurenz 2024-10-03 15:00:58 +02:00 committed by GitHub
parent d86789c1f7
commit 0343e038d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 62 additions and 57 deletions

View File

@ -425,7 +425,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
bail!(
g.span.0,
"the text {} could not be displayed with any font",
text.text[g.range()].repr()
TextItemView::full(text).glyph_text(g).repr(),
);
}
}
@ -441,7 +441,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
|| tables.svg.is_some()
|| tables.colr.is_some();
if !has_color_glyphs {
write_normal_text(ctx, pos, TextItemView::all_of(text))?;
write_normal_text(ctx, pos, TextItemView::full(text))?;
return Ok(());
}
@ -449,9 +449,9 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
text.glyphs.iter().filter(|g| is_color_glyph(&text.font, g)).count();
if color_glyph_count == text.glyphs.len() {
write_color_glyphs(ctx, pos, TextItemView::all_of(text))?;
write_color_glyphs(ctx, pos, TextItemView::full(text))?;
} else if color_glyph_count == 0 {
write_normal_text(ctx, pos, TextItemView::all_of(text))?;
write_normal_text(ctx, pos, TextItemView::full(text))?;
} else {
// Otherwise we need to split it in smaller text runs
let mut offset = 0;
@ -493,9 +493,7 @@ fn write_normal_text(
let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default();
for g in text.glyphs() {
let t = text.text();
let segment = &t[g.range()];
glyph_set.entry(g.id).or_insert_with(|| segment.into());
glyph_set.entry(g.id).or_insert_with(|| text.glyph_text(&g));
}
let fill_transform = ctx.state.transforms(Size::zero(), pos);
@ -640,9 +638,7 @@ fn write_color_glyphs(
ctx.content.show(Str(&[index]));
glyph_set
.entry(glyph.id)
.or_insert_with(|| text.text()[glyph.range()].into());
glyph_set.entry(glyph.id).or_insert_with(|| text.glyph_text(&glyph));
}
ctx.content.end_text();

View File

@ -8,7 +8,8 @@ use crate::layout::{
};
use crate::syntax::Span;
use crate::text::{
LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes, SpaceElem, TextElem,
is_default_ignorable, LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes,
SpaceElem, TextElem,
};
use crate::utils::Numeric;

View File

@ -2,7 +2,6 @@ use std::ops::{Add, Sub};
use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
@ -16,7 +15,7 @@ use crate::engine::Engine;
use crate::layout::{Abs, Em};
use crate::model::Linebreaks;
use crate::syntax::link_prefix;
use crate::text::{Lang, TextElem};
use crate::text::{is_default_ignorable, Lang, TextElem};
/// The cost of a line or paragraph layout.
type Cost = f64;
@ -58,12 +57,6 @@ static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap()
});
/// The set of Unicode default ignorables.
static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
icu_properties::sets::load_default_ignorable_code_point(&blob().as_deserializing())
.unwrap()
});
/// A line break opportunity.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Breakpoint {
@ -80,8 +73,7 @@ impl Breakpoint {
/// Trim a line before this breakpoint.
pub fn trim(self, line: &str) -> &str {
// Trim default ignorables.
let ignorable = DEFAULT_IGNORABLE_DATA.as_borrowed();
let line = line.trim_end_matches(|c| ignorable.contains(c));
let line = line.trim_end_matches(is_default_ignorable);
match self {
// Trim whitespace.
@ -986,8 +978,3 @@ where
}
}
}
/// Whether a codepoint is Unicode `Default_Ignorable`.
pub fn is_default_ignorable(c: char) -> bool {
DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
}

View File

@ -10,7 +10,7 @@ use comemo::{Track, Tracked, TrackedMut};
use self::collect::{collect, Item, Segment, SpanMapper};
use self::finalize::finalize;
use self::line::{commit, line, Line};
use self::linebreak::{is_default_ignorable, linebreak, Breakpoint};
use self::linebreak::{linebreak, Breakpoint};
use self::prepare::{prepare, Preparation};
use self::shaping::{
cjk_punct_style, is_of_cj_script, shape_range, ShapedGlyph, ShapedText,

View File

@ -5,7 +5,7 @@ use std::sync::Arc;
use az::SaturatingAs;
use ecow::EcoString;
use rustybuzz::{ShapePlan, UnicodeBuffer};
use rustybuzz::{BufferFlags, ShapePlan, UnicodeBuffer};
use ttf_parser::Tag;
use unicode_bidi::{BidiInfo, Level as BidiLevel};
use unicode_script::{Script, UnicodeScript};
@ -15,8 +15,8 @@ use crate::engine::Engine;
use crate::foundations::{Smart, StyleChain};
use crate::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size};
use crate::text::{
decorate, families, features, variant, Font, FontVariant, Glyph, Lang, Region,
TextElem, TextItem,
decorate, families, features, is_default_ignorable, variant, Font, FontVariant,
Glyph, Lang, Region, TextElem, TextItem,
};
use crate::utils::SliceExt;
use crate::World;
@ -725,8 +725,11 @@ fn shape_segment<'a>(
text: &str,
mut families: impl Iterator<Item = &'a str> + Clone,
) {
// Fonts dont have newlines and tabs.
if text.chars().all(|c| c == '\n' || c == '\t') {
// Don't try shaping newlines, tabs, or default ignorables.
if text
.chars()
.all(|c| c == '\n' || c == '\t' || is_default_ignorable(c))
{
return;
}
@ -774,6 +777,12 @@ fn shape_segment<'a>(
});
buffer.guess_segment_properties();
// By default, Harfbuzz will create zero-width space glyphs for default
// ignorables. This is probably useful for GUI apps that want noticable
// effects on the cursor for those, but for us it's not useful and hurts
// text extraction.
buffer.set_flags(BufferFlags::REMOVE_DEFAULT_IGNORABLES);
// Prepare the shape plan. This plan depends on direction, script, language,
// and features, but is independent from the text and can thus be memoized.
let plan = create_shape_plan(

View File

@ -5,7 +5,7 @@ use ecow::EcoString;
use crate::layout::{Abs, Em};
use crate::syntax::Span;
use crate::text::{Font, Lang, Region};
use crate::text::{is_default_ignorable, Font, Lang, Region};
use crate::visualize::{FixedStroke, Paint};
/// A run of shaped text.
@ -78,7 +78,7 @@ pub struct TextItemView<'a> {
impl<'a> TextItemView<'a> {
/// Build a TextItemView for the whole contents of a TextItem.
pub fn all_of(text: &'a TextItem) -> Self {
pub fn full(text: &'a TextItem) -> Self {
Self::from_glyph_range(text, 0..text.glyphs.len())
}
@ -87,28 +87,30 @@ impl<'a> TextItemView<'a> {
TextItemView { item: text, glyph_range }
}
/// Obtains a glyph in this slice, remapping the range that it represents in
/// the original text so that it is relative to the start of the slice
pub fn glyph_at(&self, index: usize) -> Glyph {
let g = &self.item.glyphs[self.glyph_range.start + index];
let base = self.text_range().start as u16;
Glyph {
range: g.range.start - base..g.range.end - base,
..*g
}
}
/// Returns an iterator over the glyphs of the slice.
///
/// The range of text that each glyph represents is remapped to be relative
/// to the start of the slice.
pub fn glyphs(&self) -> impl Iterator<Item = Glyph> + '_ {
(0..self.glyph_range.len()).map(|index| self.glyph_at(index))
let first = self.item.glyphs[self.glyph_range.start].range();
let last = self.item.glyphs[self.glyph_range.end - 1].range();
let base = first.start.min(last.start) as u16;
(0..self.glyph_range.len()).map(move |index| {
let g = &self.item.glyphs[self.glyph_range.start + index];
Glyph {
range: g.range.start - base..g.range.end - base,
..*g
}
})
}
/// The plain text that this slice represents
pub fn text(&self) -> &str {
&self.item.text[self.text_range()]
/// The plain text for the given glyph. This is an approximation since
/// glyphs do not correspond 1-1 with codepoints.
pub fn glyph_text(&self, glyph: &Glyph) -> EcoString {
self.item.text[glyph.range()]
.chars()
.filter(|&c| !is_default_ignorable(c))
.collect()
}
/// The total width of this text slice
@ -119,12 +121,4 @@ impl<'a> TextItemView<'a> {
.sum::<Em>()
.at(self.item.size)
}
/// The range of text in the original TextItem that this slice corresponds
/// to.
fn text_range(&self) -> Range<usize> {
let first = self.item.glyphs[self.glyph_range.start].range();
let last = self.item.glyphs[self.glyph_range.end - 1].range();
first.start.min(last.start)..first.end.max(last.end)
}
}

View File

@ -31,6 +31,10 @@ pub use self::space::*;
use std::fmt::{self, Debug, Formatter};
use ecow::{eco_format, EcoString};
use icu_properties::sets::CodePointSetData;
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_blob::BlobDataProvider;
use once_cell::sync::Lazy;
use rustybuzz::Feature;
use smallvec::SmallVec;
use ttf_parser::{Rect, Tag};
@ -1310,6 +1314,20 @@ cast! {
},
}
/// Whether a codepoint is Unicode `Default_Ignorable`.
pub(crate) fn is_default_ignorable(c: char) -> bool {
/// The set of Unicode default ignorables.
static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
icu_properties::sets::load_default_ignorable_code_point(
&BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU)
.unwrap()
.as_deserializing(),
)
.unwrap()
});
DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
}
/// Pushes `text` wrapped in LRE/RLE + PDF to `out`.
pub(crate) fn isolate(text: Content, styles: StyleChain, out: &mut Vec<Content>) {
out.push(TextElem::packed(match TextElem::dir_in(styles) {