Better smartquotes (#4849)

This commit is contained in:
Laurenz 2024-08-28 10:21:21 +02:00 committed by GitHub
parent 4e4c5175e5
commit ef4482ce4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 97 additions and 114 deletions

View File

@ -16,8 +16,6 @@ use crate::utils::Numeric;
// paragraph's full text. // paragraph's full text.
const SPACING_REPLACE: &str = " "; // Space const SPACING_REPLACE: &str = " "; // Space
const OBJ_REPLACE: &str = "\u{FFFC}"; // Object Replacement Character const OBJ_REPLACE: &str = "\u{FFFC}"; // Object Replacement Character
const SPACING_REPLACE_CHAR: char = ' ';
const OBJ_REPLACE_CHAR: char = '\u{FFFC}';
// Unicode BiDi control characters. // Unicode BiDi control characters.
const LTR_EMBEDDING: &str = "\u{202A}"; const LTR_EMBEDDING: &str = "\u{202A}";
@ -125,8 +123,8 @@ pub fn collect<'a>(
consecutive: bool, consecutive: bool,
) -> SourceResult<(String, Vec<Segment<'a>>, SpanMapper)> { ) -> SourceResult<(String, Vec<Segment<'a>>, SpanMapper)> {
let mut collector = Collector::new(2 + children.len()); let mut collector = Collector::new(2 + children.len());
let mut iter = children.iter(styles).peekable();
let mut locator = locator.split(); let mut locator = locator.split();
let mut quoter = SmartQuoter::new();
let outer_dir = TextElem::dir_in(*styles); let outer_dir = TextElem::dir_in(*styles);
let first_line_indent = ParElem::first_line_indent_in(*styles); let first_line_indent = ParElem::first_line_indent_in(*styles);
@ -144,7 +142,7 @@ pub fn collect<'a>(
collector.spans.push(1, Span::detached()); collector.spans.push(1, Span::detached());
} }
while let Some((child, styles)) = iter.next() { for (child, styles) in children.iter(styles) {
let prev_len = collector.full.len(); let prev_len = collector.full.len();
if child.is::<SpaceElem>() { if child.is::<SpaceElem>() {
@ -191,32 +189,16 @@ pub fn collect<'a>(
} else if let Some(elem) = child.to_packed::<SmartQuoteElem>() { } else if let Some(elem) = child.to_packed::<SmartQuoteElem>() {
let double = elem.double(styles); let double = elem.double(styles);
if elem.enabled(styles) { if elem.enabled(styles) {
let quotes = SmartQuotes::new( let quotes = SmartQuotes::get(
elem.quotes(styles), elem.quotes(styles),
TextElem::lang_in(styles), TextElem::lang_in(styles),
TextElem::region_in(styles), TextElem::region_in(styles),
elem.alternative(styles), elem.alternative(styles),
); );
let peeked = iter.peek().and_then(|(child, _)| { let before =
if let Some(elem) = child.to_packed::<TextElem>() { collector.full.chars().rev().find(|&c| !is_default_ignorable(c));
elem.text().chars().find(|c| !is_default_ignorable(*c)) let quote = quoter.quote(before, &quotes, double);
} else if child.is::<SmartQuoteElem>() { collector.push_text(quote, styles);
Some('"')
} else if child.is::<SpaceElem>()
|| child.is::<HElem>()
|| child.is::<LinebreakElem>()
// This is a temporary hack. We should rather skip these
// and peek at the next child.
|| child.is::<TagElem>()
{
Some(SPACING_REPLACE_CHAR)
} else {
Some(OBJ_REPLACE_CHAR)
}
});
let quote = collector.quoter.quote(&quotes, double, peeked);
collector.push_quote(quote, styles);
} else { } else {
collector.push_text(if double { "\"" } else { "'" }, styles); collector.push_text(if double { "\"" } else { "'" }, styles);
} }
@ -261,7 +243,6 @@ struct Collector<'a> {
full: String, full: String,
segments: Vec<Segment<'a>>, segments: Vec<Segment<'a>>,
spans: SpanMapper, spans: SpanMapper,
quoter: SmartQuoter,
} }
impl<'a> Collector<'a> { impl<'a> Collector<'a> {
@ -270,13 +251,12 @@ impl<'a> Collector<'a> {
full: String::new(), full: String::new(),
segments: Vec::with_capacity(capacity), segments: Vec::with_capacity(capacity),
spans: SpanMapper::new(), spans: SpanMapper::new(),
quoter: SmartQuoter::new(),
} }
} }
fn push_text(&mut self, text: &str, styles: StyleChain<'a>) { fn push_text(&mut self, text: &str, styles: StyleChain<'a>) {
self.full.push_str(text); self.full.push_str(text);
self.push_segment(Segment::Text(text.len(), styles), false); self.push_segment(Segment::Text(text.len(), styles));
} }
fn build_text<F>(&mut self, styles: StyleChain<'a>, f: F) fn build_text<F>(&mut self, styles: StyleChain<'a>, f: F)
@ -286,24 +266,15 @@ impl<'a> Collector<'a> {
let prev = self.full.len(); let prev = self.full.len();
f(&mut self.full); f(&mut self.full);
let len = self.full.len() - prev; let len = self.full.len() - prev;
self.push_segment(Segment::Text(len, styles), false); self.push_segment(Segment::Text(len, styles));
}
fn push_quote(&mut self, quote: &str, styles: StyleChain<'a>) {
self.full.push_str(quote);
self.push_segment(Segment::Text(quote.len(), styles), true);
} }
fn push_item(&mut self, item: Item<'a>) { fn push_item(&mut self, item: Item<'a>) {
self.full.push_str(item.textual()); self.full.push_str(item.textual());
self.push_segment(Segment::Item(item), false); self.push_segment(Segment::Item(item));
}
fn push_segment(&mut self, segment: Segment<'a>, is_quote: bool) {
if let Some(last) = self.full.chars().rev().find(|c| !is_default_ignorable(*c)) {
self.quoter.last(last, is_quote);
} }
fn push_segment(&mut self, segment: Segment<'a>) {
if let (Some(Segment::Text(last_len, last_styles)), Segment::Text(len, styles)) = if let (Some(Segment::Text(last_len, last_styles)), Segment::Text(len, styles)) =
(self.segments.last_mut(), &segment) (self.segments.last_mut(), &segment)
{ {

View File

@ -159,7 +159,7 @@ impl Show for Packed<QuoteElem> {
let block = self.block(styles); let block = self.block(styles);
if self.quotes(styles) == Smart::Custom(true) || !block { if self.quotes(styles) == Smart::Custom(true) || !block {
let quotes = SmartQuotes::new( let quotes = SmartQuotes::get(
SmartQuoteElem::quotes_in(styles), SmartQuoteElem::quotes_in(styles),
TextElem::lang_in(styles), TextElem::lang_in(styles),
TextElem::region_in(styles), TextElem::region_in(styles),

View File

@ -97,68 +97,80 @@ impl PlainText for Packed<SmartQuoteElem> {
} }
} }
/// State machine for smart quote substitution. /// A smart quote substitutor with zero lookahead.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct SmartQuoter { pub struct SmartQuoter {
/// How many quotes have been opened. /// The number of quotes that have been opened.
quote_depth: usize, depth: u8,
/// Whether an opening quote might follow. /// Each bit indicates whether the quote at this nesting depth is a double.
expect_opening: bool, /// Maximum supported depth is thus 32.
/// Whether the last character was numeric. kinds: u32,
last_num: bool,
/// The previous type of quote character, if it was an opening quote.
prev_quote_type: Option<bool>,
} }
impl SmartQuoter { impl SmartQuoter {
/// Start quoting. /// Start quoting.
pub fn new() -> Self { pub fn new() -> Self {
Self { Self { depth: 0, kinds: 0 }
quote_depth: 0,
expect_opening: true,
last_num: false,
prev_quote_type: None,
}
} }
/// Process the last seen character. /// Determine which smart quote to substitute given this quoter's nesting
pub fn last(&mut self, c: char, is_quote: bool) { /// state and the character immediately preceding the quote.
self.expect_opening = is_exterior_to_quote(c) || is_opening_bracket(c);
self.last_num = c.is_numeric();
if !is_quote {
self.prev_quote_type = None;
}
}
/// Process and substitute a quote.
pub fn quote<'a>( pub fn quote<'a>(
&mut self, &mut self,
before: Option<char>,
quotes: &SmartQuotes<'a>, quotes: &SmartQuotes<'a>,
double: bool, double: bool,
peeked: Option<char>,
) -> &'a str { ) -> &'a str {
let peeked = peeked.unwrap_or(' '); let opened = self.top();
let mut expect_opening = self.expect_opening; let before = before.unwrap_or(' ');
if let Some(prev_double) = self.prev_quote_type.take() {
if double != prev_double { // If we are after a number and haven't most recently opened a quote of
expect_opening = true; // this kind, produce a prime. Otherwise, we prefer a closing quote.
if before.is_numeric() && opened != Some(double) {
return if double { "″" } else { "′" };
}
// If we have a single smart quote, didn't recently open a single
// quotation, and are after an alphabetic char, interpret this as an
// apostrophe.
if !double && opened != Some(false) && before.is_alphabetic() {
return "’";
}
// If the most recently opened quotation is of this kind and the
// previous char does not indicate a nested quotation, close it.
if opened == Some(double)
&& !before.is_whitespace()
&& !is_newline(before)
&& !is_opening_bracket(before)
{
self.pop();
return quotes.close(double);
}
// Otherwise, open a new quotation.
self.push(double);
quotes.open(double)
}
/// The top of our quotation stack. Returns `Some(double)` for the most
/// recently opened quote or `None` if we didn't open one.
fn top(&self) -> Option<bool> {
self.depth.checked_sub(1).map(|i| (self.kinds >> i) & 1 == 1)
}
/// Push onto the quotation stack.
fn push(&mut self, double: bool) {
if self.depth < 32 {
self.kinds |= (double as u32) << self.depth;
self.depth += 1;
} }
} }
if expect_opening { /// Pop from the quotation stack.
self.quote_depth += 1; fn pop(&mut self) {
self.prev_quote_type = Some(double); self.depth -= 1;
quotes.open(double) self.kinds &= (1 << self.depth) - 1;
} else if self.quote_depth > 0
&& (peeked.is_ascii_punctuation() || is_exterior_to_quote(peeked))
{
self.quote_depth -= 1;
quotes.close(double)
} else if self.last_num {
quotes.prime(double)
} else {
quotes.fallback(double)
}
} }
} }
@ -168,10 +180,7 @@ impl Default for SmartQuoter {
} }
} }
fn is_exterior_to_quote(c: char) -> bool { /// Whether the character is an opening bracket, parenthesis, or brace.
c.is_whitespace() || is_newline(c)
}
fn is_opening_bracket(c: char) -> bool { fn is_opening_bracket(c: char) -> bool {
matches!(c, '(' | '{' | '[') matches!(c, '(' | '{' | '[')
} }
@ -196,13 +205,13 @@ impl<'s> SmartQuotes<'s> {
/// region as an all-uppercase ISO 3166-alpha2 code. /// region as an all-uppercase ISO 3166-alpha2 code.
/// ///
/// Currently, the supported languages are: English, Czech, Danish, German, /// Currently, the supported languages are: English, Czech, Danish, German,
/// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin, Lithuanian, /// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin,
/// Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish, Swedish, French, /// Lithuanian, Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish,
/// Hungarian, Polish, Romanian, Japanese, Traditional Chinese, Russian, and /// Swedish, French, Hungarian, Polish, Romanian, Japanese, Traditional
/// Norwegian. /// Chinese, Russian, and Norwegian.
/// ///
/// For unknown languages, the English quotes are used as fallback. /// For unknown languages, the English quotes are used as fallback.
pub fn new( pub fn get(
quotes: &'s Smart<SmartQuoteDict>, quotes: &'s Smart<SmartQuoteDict>,
lang: Lang, lang: Lang,
region: Option<Region>, region: Option<Region>,
@ -281,24 +290,6 @@ impl<'s> SmartQuotes<'s> {
self.single_close self.single_close
} }
} }
/// Which character should be used as a prime.
pub fn prime(&self, double: bool) -> &'static str {
if double {
"″"
} else {
"′"
}
}
/// Which character should be used as a fallback quote.
pub fn fallback(&self, double: bool) -> &'static str {
if double {
"\""
} else {
"’"
}
}
} }
/// An opening and closing quote. /// An opening and closing quote.

Binary file not shown.

After

Width:  |  Height:  |  Size: 563 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 742 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

View File

@ -56,7 +56,28 @@ He said "I'm a big fella."
--- smartquote-escape --- --- smartquote-escape ---
// Test escape sequences. // Test escape sequences.
The 5\'11\" 'quick\' brown fox jumps over the \"lazy" dog\'s ear. The 5\'11\" 'quick\' brown fox jumps over the \"lazy' dog\'s ear.
--- smartquote-slash ---
// Test that smartquotes can open before non-whitespace if not nested.
"Hello"/"World" \
'"Hello"/"World"' \
""Hello"/"World""
--- smartquote-close-before-letter ---
// Test that smartquotes can close before alphabetic letters.
Straight "A"s and "B"s
--- smartquote-prime ---
// Test that primes result after numbers when possible.
A 2" nail. \
'A 2" nail.' \
"A 2" nail."
--- smartquote-bracket ---
// Test that brackets indicate an opening quote.
"a ["b"] c" \
"a b"c"d e"
--- smartquote-disable --- --- smartquote-disable ---
// Test turning smart quotes off. // Test turning smart quotes off.