Better smartquotes (#4849)

This commit is contained in:
Laurenz 2024-08-28 10:21:21 +02:00 committed by GitHub
parent 4e4c5175e5
commit ef4482ce4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 97 additions and 114 deletions

View File

@ -16,8 +16,6 @@ use crate::utils::Numeric;
// paragraph's full text.
const SPACING_REPLACE: &str = " "; // Space
const OBJ_REPLACE: &str = "\u{FFFC}"; // Object Replacement Character
const SPACING_REPLACE_CHAR: char = ' ';
const OBJ_REPLACE_CHAR: char = '\u{FFFC}';
// Unicode BiDi control characters.
const LTR_EMBEDDING: &str = "\u{202A}";
@ -125,8 +123,8 @@ pub fn collect<'a>(
consecutive: bool,
) -> SourceResult<(String, Vec<Segment<'a>>, SpanMapper)> {
let mut collector = Collector::new(2 + children.len());
let mut iter = children.iter(styles).peekable();
let mut locator = locator.split();
let mut quoter = SmartQuoter::new();
let outer_dir = TextElem::dir_in(*styles);
let first_line_indent = ParElem::first_line_indent_in(*styles);
@ -144,7 +142,7 @@ pub fn collect<'a>(
collector.spans.push(1, Span::detached());
}
while let Some((child, styles)) = iter.next() {
for (child, styles) in children.iter(styles) {
let prev_len = collector.full.len();
if child.is::<SpaceElem>() {
@ -191,32 +189,16 @@ pub fn collect<'a>(
} else if let Some(elem) = child.to_packed::<SmartQuoteElem>() {
let double = elem.double(styles);
if elem.enabled(styles) {
let quotes = SmartQuotes::new(
let quotes = SmartQuotes::get(
elem.quotes(styles),
TextElem::lang_in(styles),
TextElem::region_in(styles),
elem.alternative(styles),
);
let peeked = iter.peek().and_then(|(child, _)| {
if let Some(elem) = child.to_packed::<TextElem>() {
elem.text().chars().find(|c| !is_default_ignorable(*c))
} else if child.is::<SmartQuoteElem>() {
Some('"')
} else if child.is::<SpaceElem>()
|| child.is::<HElem>()
|| child.is::<LinebreakElem>()
// This is a temporary hack. We should rather skip these
// and peek at the next child.
|| child.is::<TagElem>()
{
Some(SPACING_REPLACE_CHAR)
} else {
Some(OBJ_REPLACE_CHAR)
}
});
let quote = collector.quoter.quote(&quotes, double, peeked);
collector.push_quote(quote, styles);
let before =
collector.full.chars().rev().find(|&c| !is_default_ignorable(c));
let quote = quoter.quote(before, &quotes, double);
collector.push_text(quote, styles);
} else {
collector.push_text(if double { "\"" } else { "'" }, styles);
}
@ -261,7 +243,6 @@ struct Collector<'a> {
full: String,
segments: Vec<Segment<'a>>,
spans: SpanMapper,
quoter: SmartQuoter,
}
impl<'a> Collector<'a> {
@ -270,13 +251,12 @@ impl<'a> Collector<'a> {
full: String::new(),
segments: Vec::with_capacity(capacity),
spans: SpanMapper::new(),
quoter: SmartQuoter::new(),
}
}
fn push_text(&mut self, text: &str, styles: StyleChain<'a>) {
self.full.push_str(text);
self.push_segment(Segment::Text(text.len(), styles), false);
self.push_segment(Segment::Text(text.len(), styles));
}
fn build_text<F>(&mut self, styles: StyleChain<'a>, f: F)
@ -286,24 +266,15 @@ impl<'a> Collector<'a> {
let prev = self.full.len();
f(&mut self.full);
let len = self.full.len() - prev;
self.push_segment(Segment::Text(len, styles), false);
}
fn push_quote(&mut self, quote: &str, styles: StyleChain<'a>) {
self.full.push_str(quote);
self.push_segment(Segment::Text(quote.len(), styles), true);
self.push_segment(Segment::Text(len, styles));
}
fn push_item(&mut self, item: Item<'a>) {
self.full.push_str(item.textual());
self.push_segment(Segment::Item(item), false);
}
fn push_segment(&mut self, segment: Segment<'a>, is_quote: bool) {
if let Some(last) = self.full.chars().rev().find(|c| !is_default_ignorable(*c)) {
self.quoter.last(last, is_quote);
self.push_segment(Segment::Item(item));
}
fn push_segment(&mut self, segment: Segment<'a>) {
if let (Some(Segment::Text(last_len, last_styles)), Segment::Text(len, styles)) =
(self.segments.last_mut(), &segment)
{

View File

@ -159,7 +159,7 @@ impl Show for Packed<QuoteElem> {
let block = self.block(styles);
if self.quotes(styles) == Smart::Custom(true) || !block {
let quotes = SmartQuotes::new(
let quotes = SmartQuotes::get(
SmartQuoteElem::quotes_in(styles),
TextElem::lang_in(styles),
TextElem::region_in(styles),

View File

@ -97,68 +97,80 @@ impl PlainText for Packed<SmartQuoteElem> {
}
}
/// State machine for smart quote substitution.
/// A smart quote substitutor with zero lookahead.
#[derive(Debug, Clone)]
pub struct SmartQuoter {
/// How many quotes have been opened.
quote_depth: usize,
/// Whether an opening quote might follow.
expect_opening: bool,
/// Whether the last character was numeric.
last_num: bool,
/// The previous type of quote character, if it was an opening quote.
prev_quote_type: Option<bool>,
/// The amount of quotes that have been opened.
depth: u8,
/// Each bit indicates whether the quote at this nesting depth is a double.
/// Maximum supported depth is thus 32.
kinds: u32,
}
impl SmartQuoter {
/// Start quoting.
pub fn new() -> Self {
Self {
quote_depth: 0,
expect_opening: true,
last_num: false,
prev_quote_type: None,
}
Self { depth: 0, kinds: 0 }
}
/// Process the last seen character.
pub fn last(&mut self, c: char, is_quote: bool) {
self.expect_opening = is_exterior_to_quote(c) || is_opening_bracket(c);
self.last_num = c.is_numeric();
if !is_quote {
self.prev_quote_type = None;
}
}
/// Process and substitute a quote.
/// Determine which smart quote to substitute given this quoter's nesting
/// state and the character immediately preceding the quote.
pub fn quote<'a>(
&mut self,
before: Option<char>,
quotes: &SmartQuotes<'a>,
double: bool,
peeked: Option<char>,
) -> &'a str {
let peeked = peeked.unwrap_or(' ');
let mut expect_opening = self.expect_opening;
if let Some(prev_double) = self.prev_quote_type.take() {
if double != prev_double {
expect_opening = true;
let opened = self.top();
let before = before.unwrap_or(' ');
// If we are after a number and haven't most recently opened a quote of
// this kind, produce a prime. Otherwise, we prefer a closing quote.
if before.is_numeric() && opened != Some(double) {
return if double { "″" } else { "′" };
}
// If we have a single smart quote, didn't recently open a single
// quotation, and are after an alphabetic char, interpret this as an
// apostrophe.
if !double && opened != Some(false) && before.is_alphabetic() {
return "’";
}
// If the most recently opened quotation is of this kind and the
// previous char does not indicate a nested quotation, close it.
if opened == Some(double)
&& !before.is_whitespace()
&& !is_newline(before)
&& !is_opening_bracket(before)
{
self.pop();
return quotes.close(double);
}
// Otherwise, open a new quotation.
self.push(double);
quotes.open(double)
}
/// Peek at the innermost open quotation.
///
/// Yields `Some(true)` when the most recently opened quote is a double
/// quote, `Some(false)` when it is a single quote, and `None` when no
/// quotation is currently open.
fn top(&self) -> Option<bool> {
    if self.depth == 0 {
        return None;
    }
    // The innermost quotation lives in the bit just below `depth`.
    let innermost = self.depth - 1;
    Some((self.kinds >> innermost) & 1 == 1)
}
/// Record a newly opened quotation of the given kind.
///
/// The kind is stored in the bit of `kinds` at index `depth`. Once 32
/// quotations are open, further pushes are ignored because `kinds` has no
/// bits left to hold them.
fn push(&mut self, double: bool) {
    if self.depth >= 32 {
        return;
    }
    let kind_bit = u32::from(double) << self.depth;
    self.kinds |= kind_bit;
    self.depth += 1;
}
if expect_opening {
self.quote_depth += 1;
self.prev_quote_type = Some(double);
quotes.open(double)
} else if self.quote_depth > 0
&& (peeked.is_ascii_punctuation() || is_exterior_to_quote(peeked))
{
self.quote_depth -= 1;
quotes.close(double)
} else if self.last_num {
quotes.prime(double)
} else {
quotes.fallback(double)
}
/// Discard the innermost open quotation.
///
/// Decrements the depth and clears the bit that held the popped
/// quotation's kind (along with any bits above it). The depth is
/// decremented unconditionally, so this must only be called while at least
/// one quotation is open.
fn pop(&mut self) {
    let remaining = self.depth - 1;
    // Keep only the bits belonging to quotations that are still open.
    let keep_mask: u32 = (1u32 << remaining) - 1;
    self.kinds &= keep_mask;
    self.depth = remaining;
}
}
@ -168,10 +180,7 @@ impl Default for SmartQuoter {
}
}
fn is_exterior_to_quote(c: char) -> bool {
c.is_whitespace() || is_newline(c)
}
/// Returns `true` if `c` is one of the opening delimiters `(`, `{`, or `[`.
fn is_opening_bracket(c: char) -> bool {
    ['(', '{', '['].contains(&c)
}
@ -196,13 +205,13 @@ impl<'s> SmartQuotes<'s> {
/// region as an all-uppercase ISO 3166-alpha2 code.
///
/// Currently, the supported languages are: English, Czech, Danish, German,
/// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin, Lithuanian,
/// Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish, Swedish, French,
/// Hungarian, Polish, Romanian, Japanese, Traditional Chinese, Russian, and
/// Norwegian.
/// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin,
/// Lithuanian, Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish,
/// Swedish, French, Hungarian, Polish, Romanian, Japanese, Traditional
/// Chinese, Russian, and Norwegian.
///
/// For unknown languages, the English quotes are used as fallback.
pub fn new(
pub fn get(
quotes: &'s Smart<SmartQuoteDict>,
lang: Lang,
region: Option<Region>,
@ -281,24 +290,6 @@ impl<'s> SmartQuotes<'s> {
self.single_close
}
}
/// Which character should be used as a prime.
pub fn prime(&self, double: bool) -> &'static str {
if double {
"″"
} else {
"′"
}
}
/// Which character should be used as a fallback quote.
pub fn fallback(&self, double: bool) -> &'static str {
if double {
"\""
} else {
"’"
}
}
}
/// An opening and closing quote.

Binary file not shown.

After

Width:  |  Height:  |  Size: 563 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 742 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

View File

@ -56,7 +56,28 @@ He said "I'm a big fella."
--- smartquote-escape ---
// Test escape sequences.
The 5\'11\" 'quick\' brown fox jumps over the \"lazy" dog\'s ear.
The 5\'11\" 'quick\' brown fox jumps over the \"lazy' dog\'s ear.
--- smartquote-slash ---
// Test that smartquotes can open before non-whitespace if not nested.
"Hello"/"World" \
'"Hello"/"World"' \
""Hello"/"World""
--- smartquote-close-before-letter ---
// Test that smartquotes can close before alphabetic letters.
Straight "A"s and "B"s
--- smartquote-prime ---
// Test that primes result after numbers when possible.
A 2" nail. \
'A 2" nail.' \
"A 2" nail."
--- smartquote-bracket ---
// Test that brackets indicate an opening quote.
"a ["b"] c" \
"a b"c"d e"
--- smartquote-disable ---
// Test turning smart quotes off.