diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs index a026df5da..8fd48df1e 100644 --- a/crates/typst-library/src/text/linebreak.rs +++ b/crates/typst-library/src/text/linebreak.rs @@ -5,6 +5,7 @@ use icu_provider_blob::BlobDataProvider; use icu_segmenter::LineSegmenter; use once_cell::sync::Lazy; use typst::doc::Lang; +use typst::syntax::link_prefix; use super::TextElem; use crate::layout::Preparation; @@ -82,25 +83,40 @@ pub(crate) fn breakpoints<'a>( p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint), ) { + let text = p.bidi.text; + let hyphenate = p.hyphenate != Some(false); let lb = LINEBREAK_DATA.as_borrowed(); let segmenter = match p.lang { Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER, _ => &SEGMENTER, }; - let hyphenate = p.hyphenate != Some(false); let mut last = 0; + let mut iter = segmenter.segment_str(text).peekable(); + + loop { + // Special case for links. UAX #14 doesn't handle them well. + let (head, tail) = text.split_at(last); + if head.ends_with("://") || tail.starts_with("www.") { + let (link, _) = link_prefix(tail); + let end = last + link.len(); + linebreak_link(link, |i| f(last + i, Breakpoint::Normal)); + while iter.peek().map_or(false, |&p| p <= end) { + iter.next(); + } + } + + // Get the UAX #14 linebreak opportunities. + let Some(point) = iter.next() else { break }; - // Walk over all UAX #14 linebreak opportunities. - for point in segmenter.segment_str(p.bidi.text) { // Skip breakpoint if there is no char before it. icu4x generates one // at offset 0, but we don't want it. - let Some(c) = p.bidi.text[..point].chars().next_back() else { continue }; + let Some(c) = text[..point].chars().next_back() else { continue }; // Find out whether the last break was mandatory by checking against // rules LB4 and LB5, special-casing the end of text according to LB3. 
// See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html - let breakpoint = if point == p.bidi.text.len() { + let breakpoint = if point == text.len() { Breakpoint::Mandatory } else { match lb.get(c) { @@ -121,8 +137,7 @@ pub(crate) fn breakpoints<'a>( } // Extract a hyphenatable "word". - let word = - &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic()); + let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic()); if word.is_empty() { break 'hyphenate; } @@ -166,6 +181,69 @@ pub(crate) fn breakpoints<'a>( } } +/// Produce linebreak opportunities for a link. +fn linebreak_link(link: &str, mut f: impl FnMut(usize)) { + #[derive(PartialEq)] + enum Class { + Alphabetic, + Digit, + Open, + Other, + } + + impl Class { + fn of(c: char) -> Self { + if c.is_alphabetic() { + Class::Alphabetic + } else if c.is_numeric() { + Class::Digit + } else if matches!(c, '(' | '[') { + Class::Open + } else { + Class::Other + } + } + } + + let mut offset = 0; + let mut emit = |end: usize| { + let piece = &link[offset..end]; + if piece.len() < 16 { + // For bearably long segments, emit them as one. + offset = end; + f(offset); + } else { + // If it gets very long (e.g. a hash in the URL), just allow a + // break at every char. + for c in piece.chars() { + offset += c.len_utf8(); + f(offset); + } + } + }; + + let mut prev = Class::Other; + for (end, c) in link.char_indices() { + let class = Class::of(c); + + // Emit opportunities when going from + // - other -> other + // - alphabetic -> numeric + // - numeric -> alphabetic + // Never directly after opening delimiters. + if end > 0 + && prev != Class::Open + && if class == Class::Other { prev == Class::Other } else { class != prev } + { + emit(end); + } + + prev = class; + } + + emit(link.len()); +} + /// Whether hyphenation is enabled at the given offset. 
fn hyphenate_at(p: &Preparation, offset: usize) -> bool { p.hyphenate diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 18622154e..a909dfa0a 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -253,43 +253,16 @@ impl Lexer<'_> { } fn link(&mut self) -> SyntaxKind { - let mut brackets = Vec::new(); + let (link, balanced) = link_prefix(self.s.after()); + self.s.jump(self.s.cursor() + link.len()); - #[rustfmt::skip] - self.s.eat_while(|c: char| { - match c { - | '0' ..= '9' - | 'a' ..= 'z' - | 'A' ..= 'Z' - | '!' | '#' | '$' | '%' | '&' | '*' | '+' - | ',' | '-' | '.' | '/' | ':' | ';' | '=' - | '?' | '@' | '_' | '~' | '\'' => true, - '[' => { - brackets.push(SyntaxKind::LeftBracket); - true - } - '(' => { - brackets.push(SyntaxKind::LeftParen); - true - } - ']' => brackets.pop() == Some(SyntaxKind::LeftBracket), - ')' => brackets.pop() == Some(SyntaxKind::LeftParen), - _ => false, - } - }); - - if !brackets.is_empty() { + if !balanced { return self.error( "automatic links cannot contain unbalanced brackets, \ use the `link` function instead", ); } - // Don't include the trailing characters likely to be part of text. - while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) { - self.s.uneat(); - } - SyntaxKind::Link } @@ -662,6 +635,43 @@ pub fn is_newline(character: char) -> bool { ) } +/// Extracts a prefix of the text that is a link and also returns whether the +/// parentheses and brackets in the link were balanced. +pub fn link_prefix(text: &str) -> (&str, bool) { + let mut s = unscanny::Scanner::new(text); + let mut brackets = Vec::new(); + + #[rustfmt::skip] + s.eat_while(|c: char| { + match c { + | '0' ..= '9' + | 'a' ..= 'z' + | 'A' ..= 'Z' + | '!' | '#' | '$' | '%' | '&' | '*' | '+' + | ',' | '-' | '.' | '/' | ':' | ';' | '=' + | '?' 
| '@' | '_' | '~' | '\'' => true, + '[' => { + brackets.push(b'['); + true + } + '(' => { + brackets.push(b'('); + true + } + ']' => brackets.pop() == Some(b'['), + ')' => brackets.pop() == Some(b'('), + _ => false, + } + }); + + // Don't include the trailing characters likely to be part of text. + while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) { + s.uneat(); + } + + (s.before(), brackets.is_empty()) +} + /// Split text at newlines. pub(super) fn split_newlines(text: &str) -> Vec<&str> { let mut s = Scanner::new(text); diff --git a/crates/typst-syntax/src/lib.rs b/crates/typst-syntax/src/lib.rs index 4ee370969..5cf740e7d 100644 --- a/crates/typst-syntax/src/lib.rs +++ b/crates/typst-syntax/src/lib.rs @@ -15,7 +15,7 @@ mod span; pub use self::file::{FileId, PackageSpec, PackageVersion, VirtualPath}; pub use self::highlight::{highlight, highlight_html, Tag}; pub use self::kind::SyntaxKind; -pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline}; +pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline, link_prefix}; pub use self::node::{LinkedChildren, LinkedNode, SyntaxError, SyntaxNode}; pub use self::parser::{parse, parse_code, parse_math}; pub use self::source::Source; diff --git a/tests/ref/meta/link.png b/tests/ref/meta/link.png index 1232f413d..01946e236 100644 Binary files a/tests/ref/meta/link.png and b/tests/ref/meta/link.png differ diff --git a/tests/ref/text/linebreak-link.png b/tests/ref/text/linebreak-link.png new file mode 100644 index 000000000..ab4e580f9 Binary files /dev/null and b/tests/ref/text/linebreak-link.png differ diff --git a/tests/typ/text/linebreak-link.typ b/tests/typ/text/linebreak-link.typ new file mode 100644 index 000000000..18b6d936b --- /dev/null +++ b/tests/typ/text/linebreak-link.typ @@ -0,0 +1,16 @@ +// Test linebreaking of links. 
+ +--- +#link("https://example.com/(ab") \ +#link("https://example.com/(ab)") \ +#link("https://example.com/(paren)") \ +#link("https://example.com/paren)") \ +#link("https://hi.com/%%%%%%%%abcdef") \ + +--- +#set page(width: 240pt) +#set par(justify: true) + +Here's a link https://url.com/data/extern12840%data_urlenc and then there are more +links #link("www.url.com/data/extern12840%data_urlenc") in my text of links +http://mydataurl/hash/12098541029831025981024980124124214/incremental/progress%linkdata_information_setup_my_link_just_never_stops_going/on?query=false