From ef4482ce4b084aa97155dbde89f02dda3f7fb219 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Wed, 28 Aug 2024 10:21:21 +0200 Subject: [PATCH] Better smartquotes (#4849) --- crates/typst/src/layout/inline/collect.rs | 51 ++----- crates/typst/src/model/quote.rs | 2 +- crates/typst/src/text/smartquote.rs | 135 +++++++++---------- tests/ref/smartquote-bracket.png | Bin 0 -> 563 bytes tests/ref/smartquote-close-before-letter.png | Bin 0 -> 614 bytes tests/ref/smartquote-escape.png | Bin 1307 -> 1306 bytes tests/ref/smartquote-prime.png | Bin 0 -> 742 bytes tests/ref/smartquote-slash.png | Bin 0 -> 1243 bytes tests/suite/text/smartquote.typ | 23 +++- 9 files changed, 97 insertions(+), 114 deletions(-) create mode 100644 tests/ref/smartquote-bracket.png create mode 100644 tests/ref/smartquote-close-before-letter.png create mode 100644 tests/ref/smartquote-prime.png create mode 100644 tests/ref/smartquote-slash.png diff --git a/crates/typst/src/layout/inline/collect.rs b/crates/typst/src/layout/inline/collect.rs index 53c684d15..624eedf32 100644 --- a/crates/typst/src/layout/inline/collect.rs +++ b/crates/typst/src/layout/inline/collect.rs @@ -16,8 +16,6 @@ use crate::utils::Numeric; // paragraph's full text. const SPACING_REPLACE: &str = " "; // Space const OBJ_REPLACE: &str = "\u{FFFC}"; // Object Replacement Character -const SPACING_REPLACE_CHAR: char = ' '; -const OBJ_REPLACE_CHAR: char = '\u{FFFC}'; // Unicode BiDi control characters. 
const LTR_EMBEDDING: &str = "\u{202A}"; @@ -125,8 +123,8 @@ pub fn collect<'a>( consecutive: bool, ) -> SourceResult<(String, Vec>, SpanMapper)> { let mut collector = Collector::new(2 + children.len()); - let mut iter = children.iter(styles).peekable(); let mut locator = locator.split(); + let mut quoter = SmartQuoter::new(); let outer_dir = TextElem::dir_in(*styles); let first_line_indent = ParElem::first_line_indent_in(*styles); @@ -144,7 +142,7 @@ pub fn collect<'a>( collector.spans.push(1, Span::detached()); } - while let Some((child, styles)) = iter.next() { + for (child, styles) in children.iter(styles) { let prev_len = collector.full.len(); if child.is::() { @@ -191,32 +189,16 @@ pub fn collect<'a>( } else if let Some(elem) = child.to_packed::() { let double = elem.double(styles); if elem.enabled(styles) { - let quotes = SmartQuotes::new( + let quotes = SmartQuotes::get( elem.quotes(styles), TextElem::lang_in(styles), TextElem::region_in(styles), elem.alternative(styles), ); - let peeked = iter.peek().and_then(|(child, _)| { - if let Some(elem) = child.to_packed::() { - elem.text().chars().find(|c| !is_default_ignorable(*c)) - } else if child.is::() { - Some('"') - } else if child.is::() - || child.is::() - || child.is::() - // This is a temporary hack. We should rather skip these - // and peek at the next child. 
- || child.is::() - { - Some(SPACING_REPLACE_CHAR) - } else { - Some(OBJ_REPLACE_CHAR) - } - }); - - let quote = collector.quoter.quote("es, double, peeked); - collector.push_quote(quote, styles); + let before = + collector.full.chars().rev().find(|&c| !is_default_ignorable(c)); + let quote = quoter.quote(before, "es, double); + collector.push_text(quote, styles); } else { collector.push_text(if double { "\"" } else { "'" }, styles); } @@ -261,7 +243,6 @@ struct Collector<'a> { full: String, segments: Vec>, spans: SpanMapper, - quoter: SmartQuoter, } impl<'a> Collector<'a> { @@ -270,13 +251,12 @@ impl<'a> Collector<'a> { full: String::new(), segments: Vec::with_capacity(capacity), spans: SpanMapper::new(), - quoter: SmartQuoter::new(), } } fn push_text(&mut self, text: &str, styles: StyleChain<'a>) { self.full.push_str(text); - self.push_segment(Segment::Text(text.len(), styles), false); + self.push_segment(Segment::Text(text.len(), styles)); } fn build_text(&mut self, styles: StyleChain<'a>, f: F) @@ -286,24 +266,15 @@ impl<'a> Collector<'a> { let prev = self.full.len(); f(&mut self.full); let len = self.full.len() - prev; - self.push_segment(Segment::Text(len, styles), false); - } - - fn push_quote(&mut self, quote: &str, styles: StyleChain<'a>) { - self.full.push_str(quote); - self.push_segment(Segment::Text(quote.len(), styles), true); + self.push_segment(Segment::Text(len, styles)); } fn push_item(&mut self, item: Item<'a>) { self.full.push_str(item.textual()); - self.push_segment(Segment::Item(item), false); + self.push_segment(Segment::Item(item)); } - fn push_segment(&mut self, segment: Segment<'a>, is_quote: bool) { - if let Some(last) = self.full.chars().rev().find(|c| !is_default_ignorable(*c)) { - self.quoter.last(last, is_quote); - } - + fn push_segment(&mut self, segment: Segment<'a>) { if let (Some(Segment::Text(last_len, last_styles)), Segment::Text(len, styles)) = (self.segments.last_mut(), &segment) { diff --git a/crates/typst/src/model/quote.rs 
b/crates/typst/src/model/quote.rs index 65a809dca..528c0998e 100644 --- a/crates/typst/src/model/quote.rs +++ b/crates/typst/src/model/quote.rs @@ -159,7 +159,7 @@ impl Show for Packed { let block = self.block(styles); if self.quotes(styles) == Smart::Custom(true) || !block { - let quotes = SmartQuotes::new( + let quotes = SmartQuotes::get( SmartQuoteElem::quotes_in(styles), TextElem::lang_in(styles), TextElem::region_in(styles), diff --git a/crates/typst/src/text/smartquote.rs b/crates/typst/src/text/smartquote.rs index 64fecb768..02c93fd6b 100644 --- a/crates/typst/src/text/smartquote.rs +++ b/crates/typst/src/text/smartquote.rs @@ -97,68 +97,80 @@ impl PlainText for Packed { } } -/// State machine for smart quote substitution. +/// A smart quote substitutor with zero lookahead. #[derive(Debug, Clone)] pub struct SmartQuoter { - /// How many quotes have been opened. - quote_depth: usize, - /// Whether an opening quote might follow. - expect_opening: bool, - /// Whether the last character was numeric. - last_num: bool, - /// The previous type of quote character, if it was an opening quote. - prev_quote_type: Option, + /// The amount of quotes that have been opened. + depth: u8, + /// Each bit indicates whether the quote at this nesting depth is a double. + /// Maximum supported depth is thus 32. + kinds: u32, } impl SmartQuoter { /// Start quoting. pub fn new() -> Self { - Self { - quote_depth: 0, - expect_opening: true, - last_num: false, - prev_quote_type: None, - } + Self { depth: 0, kinds: 0 } } - /// Process the last seen character. - pub fn last(&mut self, c: char, is_quote: bool) { - self.expect_opening = is_exterior_to_quote(c) || is_opening_bracket(c); - self.last_num = c.is_numeric(); - if !is_quote { - self.prev_quote_type = None; - } - } - - /// Process and substitute a quote. + /// Determine which smart quote to substitute given this quoter's nesting + /// state and the character immediately preceding the quote. 
pub fn quote<'a>( &mut self, + before: Option, quotes: &SmartQuotes<'a>, double: bool, - peeked: Option, ) -> &'a str { - let peeked = peeked.unwrap_or(' '); - let mut expect_opening = self.expect_opening; - if let Some(prev_double) = self.prev_quote_type.take() { - if double != prev_double { - expect_opening = true; - } + let opened = self.top(); + let before = before.unwrap_or(' '); + + // If we are after a number and haven't most recently opened a quote of + // this kind, produce a prime. Otherwise, we prefer a closing quote. + if before.is_numeric() && opened != Some(double) { + return if double { "″" } else { "′" }; } - if expect_opening { - self.quote_depth += 1; - self.prev_quote_type = Some(double); - quotes.open(double) - } else if self.quote_depth > 0 - && (peeked.is_ascii_punctuation() || is_exterior_to_quote(peeked)) - { - self.quote_depth -= 1; - quotes.close(double) - } else if self.last_num { - quotes.prime(double) - } else { - quotes.fallback(double) + // If we have a single smart quote, didn't recently open a single + // quotation, and are after an alphabetic char, interpret this as an + // apostrophe. + if !double && opened != Some(false) && before.is_alphabetic() { + return "’"; } + + // If the most recently opened quotation is of this kind and the + // previous char does not indicate a nested quotation, close it. + if opened == Some(double) + && !before.is_whitespace() + && !is_newline(before) + && !is_opening_bracket(before) + { + self.pop(); + return quotes.close(double); + } + + // Otherwise, open a new quotation. + self.push(double); + quotes.open(double) + } + + /// The top of our quotation stack. Returns `Some(double)` for the most + /// recently opened quote or `None` if we didn't open one. + fn top(&self) -> Option { + self.depth.checked_sub(1).map(|i| (self.kinds >> i) & 1 == 1) + } + + /// Push onto the quotation stack. 
+ fn push(&mut self, double: bool) { + if self.depth < 32 { + self.kinds |= (double as u32) << self.depth; + self.depth += 1; + } + } + + /// Pop from the quotation stack. + fn pop(&mut self) { + self.depth -= 1; + self.kinds &= (1 << self.depth) - 1; } } @@ -168,10 +180,7 @@ impl Default for SmartQuoter { } } -fn is_exterior_to_quote(c: char) -> bool { - c.is_whitespace() || is_newline(c) -} - +/// Whether the character is an opening bracket, parenthesis, or brace. fn is_opening_bracket(c: char) -> bool { matches!(c, '(' | '{' | '[') } @@ -196,13 +205,13 @@ impl<'s> SmartQuotes<'s> { /// region as an all-uppercase ISO 3166-alpha2 code. /// /// Currently, the supported languages are: English, Czech, Danish, German, - /// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin, Lithuanian, - /// Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish, Swedish, French, - /// Hungarian, Polish, Romanian, Japanese, Traditional Chinese, Russian, and - /// Norwegian. + /// Swiss / Liechtensteinian German, Estonian, Icelandic, Italian, Latin, + /// Lithuanian, Latvian, Slovak, Slovenian, Spanish, Bosnian, Finnish, + /// Swedish, French, Hungarian, Polish, Romanian, Japanese, Traditional + /// Chinese, Russian, and Norwegian. /// /// For unknown languages, the English quotes are used as fallback. - pub fn new( + pub fn get( quotes: &'s Smart, lang: Lang, region: Option, @@ -281,24 +290,6 @@ impl<'s> SmartQuotes<'s> { self.single_close } } - - /// Which character should be used as a prime. - pub fn prime(&self, double: bool) -> &'static str { - if double { - "″" - } else { - "′" - } - } - - /// Which character should be used as a fallback quote. - pub fn fallback(&self, double: bool) -> &'static str { - if double { - "\"" - } else { - "’" - } - } } /// An opening and closing quote. 
diff --git a/tests/ref/smartquote-bracket.png b/tests/ref/smartquote-bracket.png new file mode 100644 index 0000000000000000000000000000000000000000..7efcccf8a6b89de0b5d2ff0f9fb3abb56ac77ef4 GIT binary patch literal 563 zcmV-30?hr1P)!xqBhwBicIMZr9Cw1ML}9%(Sxj@sL0+a z$s{OZji~v`q|DQuai)>h-U6GmxaZ8B#do>)aFXDUB8i82n1>SycAOkLnKCe-h-{iK zj0dQ`C3nLr$s6;BuQ2#kAYE;=~L|uBD)CDF)7miB=9SbdX!P z;OmJtNeo;JhgBS$5(dBpX9cIl9Q-VRUM&absKJB_){Lk{?r+%LsoDSzwlzwSj*^LHkfUo;m1F2ahgAU~@U0rs5golI%8qUFYCrayLUHvplN5IZyj4{?TNhNcM9 zOx1FWJ7I|0?$<#cfM)lOa@BY zbZTm*#iMB!y`Vm6n_5e;`Lelr)gOi(C~S1%fzR&?TsPd@cMv-yNC_)pCA=HLA9}&( zSKsS75UjIfY&FvYcw5K6f-pMY5lDDJ*ATzKPd24E5ZW#SEP8GgldE%)qcJw5>W7h79W^J?EW>z1ctZ0Kw9N@aD;_2H=_jalaz zzS5r3z-&QLFg^$E<$8pU`b>XeUPYgGT66L4S#;L9%9@F`bw1=Y@=5peT%vyClh|{! z7-046$_P-ygKmyaIwOg5asr?IFLViR0QHv1`U_{RkR$b^ExSS>(p^!4YGSZKhKX3L zNvQ0rAf&n=b_|e4Wi=vLXduM99!kXE+&NB{r;07*qoM6N<$f=lZb AO#lD@ literal 0 HcmV?d00001 diff --git a/tests/ref/smartquote-escape.png b/tests/ref/smartquote-escape.png index 45d8f6027e8730ffd0facefc600233487c0e587c..ea4aef9c4110fc7d3a2c375f01047c03b4f1430e 100644 GIT binary patch delta 1286 zcmV+h1^N1$3YrR#B!A&aL_t(|+U?cpPZI_h$8rCLWqaqFIgQ)mmY@=3BAR8g%{d0% zqHJIg5tK`XaVp1D(8@p>-<$5Z z`0I**ewTtl4S!&hv#VL>-HrC;{s}-?nXMAW z;qh#>hWUh-VmI3V6-x(Haz#K<9RS#9RiCo<#E|4L@gnsS++;%%+)iQjT6hs%>+f5s zx`zQsIQSA+|2ZsR&v}+;{#?p*BPhPMdhiT}nQT-NSbvP%mmiz=tjm4E<`!&?uY%Ke zp;{*@ib^aslw9cqK?QAoX+uY zzPC*ignRCe&k&di&>g;O*j^WV~P70*yuw)+1F8!vY= zsf?|v3Bf_FLA;PuZYHbMy5>#@w#TGlEoS9gkKn{a;G$2YoXq&mAodxEE(Kk1NqHbb zk=mmwTzo39rO8w7C#P1j2OvY7NlXx_X}avxhJVftvM}ghy!)mgRS=-3Ap4A2@aUJ+ z?jS8*zu{6QKvD)))O)fujSqXCUgmTReP(oiR1M@ECiFz710qdSZsiv^6y686G$9DZ zwds6F%j_{TVL=+LD^b5c3?R(JhNdm}*J4BKCTw8Yof_*DV7rWxoj7)7vACqMjqe}v z41bmqs0iNQ*kCjV9$bjroy)PL6?*{)#mqIHUyFA&b7*wM8sjqkNxiimL{Vf(C|cNy zyO75?bOUUyGakrcTe;AU0khy-*D6wv{HiV3JG@B>B<7PJDRHgyO1`@8N}A`xd1=Wv z2o(mAsmlO8w%pjjhkJe72cVR?=(PCb~^Qvv|V62?;srWo$; zZO6ICP<-?u`?BI2qZ2rm4gi&hzm-svo@JmmKPAGTM4D0yJeH(t^+oADnsE6p&3`0d 
z;3-^Hc_WDYK?mIhcd`ZnXbU7x(zOxE?jaXeJcjn|+HzpU{euY=)lDJ9dbcoXSd)Ep zIe6BBjXDDW5``p7vSaDB*RLIO(Sd%G(*qcsb@767^ti~0L-|Y=Uf#X8wD&vC?4}&i z5)}aCdhm*h^5Fc{w_J|Jlrpde z02uR#jPlSVCht=lSNXUWLV4`|CZJ?vYM1Qiyfs%h`woWPoVo#NuL4ZFD8=aerk5+n wpMTWoTynbRO~VFWl77(gBrLE6{?EaG0ac`M`T68~qyPW_07*qoM6N<$f{4I#ivR!s delta 1288 zcmV+j1^4=z3Y!X$B!A*bL_t(|+U?c(Pt$iC$MO9Q%YHOV=3Lwsx5R@W6VxnsDWWhgwJ6{LIz_ZHDndCdg+VLefpUsPK)K`=1Vj*2U~-MJ6@=3E`<%&C!Zex! z&dv1ssoz&vk)#96X@FV?X{*Y6*gD=178Wx9KPu+EH3a01@iMP zT3{T0nuXS|fPic)M+TO$T-~KA@lG!X0CR2Er`V-xl1>s5)4vp_Tul;A=a~CS*ou1_ zI_GwE3;_^&`~|T7b6CiM9F~s$xttjSl&qNh+Y=aO@qc{_J{hPy*jRU9z48%T>ah@B z2FLDVbRoCbjLm4L3&_V+1qdGj>0Ug`wiRe&1>dI94+6(M#Y5`+VyzdEQm*Xs`B)8V2N2C4k)RB7K3r%{;BkKYAYhdu8JD z3a`2c?tfxU*q#w-C;;J+@nd&(h(Q*U0LZxB%Ke>jjUJb#X+Rddq20sY!}U`DPaQ zAdTtYTofQb=Tl$nqViNxmUI9>CXeHwh{$MNVt-_Hn?tTY3@q8-D2^0+>q$$zU=%#^ zC1pE^4$(VYjRTO#m`jBV+oJiP`|;15R?TOEO>Z%9<|KZpZPvt$1}d+Af%TUd$Y795F*!sXwbX(?d)k@kyjF z>cLsc?^t&LY%3RzDR3{4I-xcSRyr0D8TE@c?eNg16d){_u*H6Je(gL zehWW&EMPbm6A6YB(C2ovyxvIyaM`_Dbbn}Idx&oDzKkgiNYE`NWcDvDmroKCy76wj zX{+A`xNVFH4Gwhxz(DuwgX=O*waw~aLU|&)Rsi8gG0z%|n_gI@@L)|h6KW_;&;y{9 z@T>v9>4gV+Ze!DJ$T_ML7^nX0svJKTM4yMvQcVMPX_3kNQ=q?G>QlfS%0$o zLQsxKJ$V>-3>RI$B_esyT6fu*_(1^ZbRkyKZ$pl|ZY@r2(CZ*v0qomk0XMTcT!>iT zsGll4ns|B@xRil;zYG9K{28Sr+Z){+?6giAz;i_#2s5taHtLiLE8-(yYVfL#!`VGw zb74OP#4POs0CH`R2H#nR+Lqm!eSbh@f1V5Qemlbv%27*V!{{HI3fyEPRrw~bYqn1~ zn2q{7p60PT*x?V^iQrBg-8UvdRe-n%D6|i6F7-)SSK+?3=V;lq4an^xo~tqn++Xu@ yDZF^Twv`#@3tuyAc=79i=}4Gh6Z~I;{{mpHZ>dee<$C}C002ovPDHLkU;%=0IADwb diff --git a/tests/ref/smartquote-prime.png b/tests/ref/smartquote-prime.png new file mode 100644 index 0000000000000000000000000000000000000000..35c3761225189db4fa6c6e06d24c4b64ef8febf0 GIT binary patch literal 742 zcmVW5TOIO+Y;mDOaXUZm1Yc`vyIcF{t zm2~QCw3^|Zx<(qgiEHZfur{|(apui))y)neKJS}z;5j_6i*xwyz<+875A!e&Z!6f{ zOASu(kY&}-9Y&^Nl=JPjV|ouQ*dlmJ)-2pn`b}-%6a*4z!Bzf5vd*Ad3DV-p_c!jt zt(SSc=WWzruW)FegY14~1rRN*8*zB!)z>0-k{Vn!F{P9M03&9xS%w4qF;gqO4#7uS 
zFk1`+;=%w5+zSnecJ4FwhEDzHEJfBxWH-L}D8SXXJQ%y$1fVESXa>nu++xo+zLR&0 zCrh6g^F}mN{N&={Ed}@CU$LVbU}j?t+tmQ1V<3(7h47$L%{klA>)OR)@nWoDMggo} z5s-bww-<$I)CM?bnE(z2u`od6jx{UsApo#MMlwJO6q;lrg!FZD8zRGBn9}1a_?p=T zl3*Uu9*S|oWidO2s8Laa2b6=lvt%cm!Df%!kZoUAX;RRjrUn-Pi1z_djYm|7R{0$1 z)_XlalcA_X!rJl@1-L!e0cV~x1V}EVHG;HSBw5N;Oq6xxSW8Mr(_b3({N&={tphU_ z`!YCv_b~uBXBa-4bv352+EoUiWeB8sGlB<=V$NYx|MX5jsy*e1DQs)4@*t8QozX3W zx?HB!KF&G3;=^GU9}E?2?C;p%JV5?B%sP`n`dx08%Mj^NFNCEgB(anp*G;cT3sSim z(JBkWN9Qd4v-wawqXs)fuQbQVu4x#X)o5|)Q&8J{1da97U@;3ME&(vLLx*VcCS*^_ zU;C3#UyDRPJ#B+KN5lY<)YQ8mtzEw6Nqsb2qB~<13e3lwyEOdd;$a@<;cW{4 Y0p`jm9TyX!`Tzg`07*qoM6N<$f(3P) z5Qqp$!BRm4IUgJagw_JmGJy&fsz49dOz)78`N%e8%hKxw+@JjKU-DhRztjRJ?1Y{0 z{|O$hs~rVqo04m_#n`GEX#%KyR?b^Bv;fL#Ds-Lp!YK`YlfflZ#@+d&F$9hxUJ)#GPrhKD!>zB=`{e%=>Vi3r}%wqZy_m}Vrzw8 zT*@1``E1}G0EggpSy@>haw|x&WXc8y)#BLDe7a>9}ZpJ9W)8X`6eAQd6E8^!XU;`ue7hX?r_A>Wq)S1EN-G$L{274p*Uo6k)eE($uW33HmH%jz#MDhG5Jtd zP^Ll1SONvW)!#F?8z|#U#PhO{vcszR8ju&x$#lYwz-1^0Wz5M?Lw^?lW~IgeP}cW^ ze`abP18axQsOp|Js_cL}q=Yp{dv^Jc8iU>106IV5%#Q#}xD{Q1g2&Pr{ZUUt#rium z?(9P7g-at$?aS5fO}nApX9sM&M1IF*!m~l)jjeUm9}zx&r?DF(#+1q?-D>cv;(0o= ziCTek`C5g1Xi#DQjiYWt^&yN|i7EkBCnTu>sObP;Q#byHNgbbvmsAm#8PNL%E=>A5 zA0hkU1Ifw3hwzgZx(o0I#?~@tt|ceO&aXs_Z#Um)OE*8C5w+Hz;I!kg@Zx24t-^~( z*Tb4gz$(F(e$C}^pNvvIv2aADhyYP*>=n*A42NEWaxcEUjh+=e5Ls7FumzyH6QQlq zlz(i&qZZ!*Supt4W&044?jYO~@QdCUb)c=!?}D=0Q+CFGuq`H0Ipvt)eh?z%>F3CB7(SZ_ZQIz9i;*=mU7oBO$8{P| z$r17lq;`=3?+QYjo4g^c^!9GzFFFVh_#V=lj%}~g>=i$3a?QQyu4)OtqyR{_Aat)D z#4sEx$>7v3