Linebreaking for links

2025-07-03 10:42:52 +08:00 · 2023-10-28 23:35:13 +02:00 · 2023-10-28 23:35:13 +02:00 · 29130a26f8
commit 29130a26f8
parent 4c75adbb04
6 changed files with 142 additions and 38 deletions
--- a/crates/typst-library/src/text/linebreak.rs
+++ b/crates/typst-library/src/text/linebreak.rs
@ -5,6 +5,7 @@ use icu_provider_blob::BlobDataProvider;
 use icu_segmenter::LineSegmenter;
 use once_cell::sync::Lazy;
 use typst::doc::Lang;
 use typst::syntax::link_prefix;
 use super::TextElem;
 use crate::layout::Preparation;
@ -82,25 +83,40 @@ pub(crate) fn breakpoints<'a>(
    p: &'a Preparation<'a>,
    mut f: impl FnMut(usize, Breakpoint),
 ) {
    let text = p.bidi.text;
    let hyphenate = p.hyphenate != Some(false);
    let lb = LINEBREAK_DATA.as_borrowed();
    let segmenter = match p.lang {
        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
        _ => &SEGMENTER,
    };
    let hyphenate = p.hyphenate != Some(false);
    let mut last = 0;
    let mut iter = segmenter.segment_str(text).peekable();
    loop {
        // Special case for links. UAX #14 doesn't handle them well.
        let (head, tail) = text.split_at(last);
        if head.ends_with("://") || tail.starts_with("www.") {
            let (link, _) = link_prefix(tail);
            let end = last + link.len();
            linebreak_link(link, |i| f(last + i, Breakpoint::Normal));
            while iter.peek().map_or(false, |&p| p <= end) {
                iter.next();
            }
        }
        // Get the UAX #14 linebreak opportunities.
        let Some(point) = iter.next() else { break };
    // Walk over all UAX #14 linebreak opportunities.
    for point in segmenter.segment_str(p.bidi.text) {
        // Skip breakpoint if there is no char before it. icu4x generates one
        // at offset 0, but we don't want it.
-        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+        let Some(c) = text[..point].chars().next_back() else { continue };
        // Find out whether the last break was mandatory by checking against
        // rules LB4 and LB5, special-casing the end of text according to LB3.
        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
-        let breakpoint = if point == p.bidi.text.len() {
+        let breakpoint = if point == text.len() {
            Breakpoint::Mandatory
        } else {
            match lb.get(c) {
@ -121,8 +137,7 @@ pub(crate) fn breakpoints<'a>(
            }
            // Extract a hyphenatable "word".
-            let word =
+            let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
                &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
            if word.is_empty() {
                break 'hyphenate;
            }
@ -166,6 +181,69 @@ pub(crate) fn breakpoints<'a>(
    }
 }
 /// Produce linebreak opportunities for a link.
 ///
 /// Calls `f` with byte offsets into `link` after which a line may be broken.
 /// Offsets are strictly increasing, are never `0`, and the last one reported
 /// is always `link.len()`. An empty `link` yields no opportunities at all.
 fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
    /// Coarse character classes used to find segment boundaries in a link.
    #[derive(PartialEq)]
    enum Class {
        Alphabetic,
        Digit,
        Open,
        Other,
    }

    impl Class {
        /// Classifies a single character.
        fn of(c: char) -> Self {
            if c.is_alphabetic() {
                Class::Alphabetic
            } else if c.is_numeric() {
                Class::Digit
            } else if matches!(c, '(' | '[') {
                Class::Open
            } else {
                Class::Other
            }
        }
    }

    // An empty link has no interior and thus nothing to break. Without this
    // guard, the final `emit` below would report a break at offset 0, i.e.
    // *before* the link, which the `end > 0` check otherwise rules out.
    if link.is_empty() {
        return;
    }

    // Byte offset of the last opportunity that was reported.
    let mut offset = 0;
    let mut emit = |end: usize| {
        let piece = &link[offset..end];
        if piece.len() < 16 {
            // For bearably long segments (fewer than 16 bytes), emit them as
            // one.
            offset = end;
            f(offset);
        } else {
            // If it gets very long (e.g. a hash in the URL), just allow a
            // break at every char.
            for c in piece.chars() {
                offset += c.len_utf8();
                f(offset);
            }
        }
    };

    let mut prev = Class::Other;
    for (end, c) in link.char_indices() {
        let class = Class::of(c);

        // A segment boundary lies before `c` when
        // - an "other" char follows another "other" char, or
        // - a letter, digit or opening delimiter follows a char of a
        //   different class (alphabetic -> numeric, numeric -> alphabetic,
        //   other -> alphabetic, ...).
        let boundary =
            if class == Class::Other { prev == Class::Other } else { class != prev };

        // Never break at the very start or directly after an opening
        // delimiter.
        if end > 0 && prev != Class::Open && boundary {
            emit(end);
        }

        prev = class;
    }

    // The end of the link is always an opportunity.
    emit(link.len());
 }
 /// Whether hyphenation is enabled at the given offset.
 fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
    p.hyphenate
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@ -253,43 +253,16 @@ impl Lexer<'_> {
    }
    fn link(&mut self) -> SyntaxKind {
-        let mut brackets = Vec::new();
+        let (link, balanced) = link_prefix(self.s.after());
        self.s.jump(self.s.cursor() + link.len());
-        #[rustfmt::skip]
+        if !balanced {
        self.s.eat_while(|c: char| {
            match c {
                | '0' ..= '9'
                | 'a' ..= 'z'
                | 'A' ..= 'Z'
                | '!' | '#' | '$' | '%' | '&' | '*' | '+'
                | ',' | '-' | '.' | '/' | ':' | ';' | '='
                | '?' | '@' | '_' | '~' | '\'' => true,
                '[' => {
                    brackets.push(SyntaxKind::LeftBracket);
                    true
                }
                '(' => {
                    brackets.push(SyntaxKind::LeftParen);
                    true
                }
                ']' => brackets.pop() == Some(SyntaxKind::LeftBracket),
                ')' => brackets.pop() == Some(SyntaxKind::LeftParen),
                _ => false,
            }
        });
        if !brackets.is_empty() {
            return self.error(
                "automatic links cannot contain unbalanced brackets, \
                 use the `link` function instead",
            );
        }
        // Don't include the trailing characters likely to be part of text.
        while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
            self.s.uneat();
        }
        SyntaxKind::Link
    }
@ -662,6 +635,43 @@ pub fn is_newline(character: char) -> bool {
    )
 }
 /// Extracts a prefix of the text that is a link and also returns whether the
 /// parentheses and brackets in the link were balanced.
 ///
 /// Scanning stops at the first character that cannot be part of a link or at
 /// a closing delimiter that does not match the innermost open one. Trailing
 /// punctuation that is likely part of the surrounding text is then stripped
 /// from the prefix.
 ///
 /// The returned flag is `false` exactly when the returned prefix contains an
 /// opening delimiter that was never closed.
 pub fn link_prefix(text: &str) -> (&str, bool) {
    // Stack of currently open delimiters, innermost last.
    let mut brackets: Vec<char> = Vec::new();

    // Byte offset at which the link-like prefix ends.
    let mut end = text.len();
    for (i, c) in text.char_indices() {
        let accepted = match c {
            '0'..='9' | 'a'..='z' | 'A'..='Z' => true,
            '!' | '#' | '$' | '%' | '&' | '*' | '+' => true,
            ',' | '-' | '.' | '/' | ':' | ';' | '=' => true,
            '?' | '@' | '_' | '~' | '\'' => true,
            '[' | '(' => {
                brackets.push(c);
                true
            }
            ']' | ')' => {
                // Only consume the closer if it matches the innermost opener.
                // On a mismatch the opener must stay on the stack: it remains
                // unclosed within the returned prefix, so the link is
                // unbalanced.
                let opener = if c == ']' { '[' } else { '(' };
                if brackets.last() == Some(&opener) {
                    brackets.pop();
                    true
                } else {
                    false
                }
            }
            _ => false,
        };

        if !accepted {
            end = i;
            break;
        }
    }

    // Don't include the trailing characters likely to be part of text.
    let link = text[..end]
        .trim_end_matches(|c: char| matches!(c, '!' | ',' | '.' | ':' | ';' | '?' | '\''));

    (link, brackets.is_empty())
 }
 /// Split text at newlines.
 pub(super) fn split_newlines(text: &str) -> Vec<&str> {
    let mut s = Scanner::new(text);
--- a/crates/typst-syntax/src/lib.rs
+++ b/crates/typst-syntax/src/lib.rs
@ -15,7 +15,7 @@ mod span;
 pub use self::file::{FileId, PackageSpec, PackageVersion, VirtualPath};
 pub use self::highlight::{highlight, highlight_html, Tag};
 pub use self::kind::SyntaxKind;
-pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline};
+pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline, link_prefix};
 pub use self::node::{LinkedChildren, LinkedNode, SyntaxError, SyntaxNode};
 pub use self::parser::{parse, parse_code, parse_math};
 pub use self::source::Source;
--- a/tests/ref/meta/link.png
+++ b/tests/ref/meta/link.png
--- a/tests/ref/text/linebreak-link.png
+++ b/tests/ref/text/linebreak-link.png
--- a/tests/typ/text/linebreak-link.typ
+++ b/tests/typ/text/linebreak-link.typ
@ -0,0 +1,16 @@
 // Test linebreaking of links.
 ---
 #link("https://example.com/(ab") \
 #link("https://example.com/(ab)") \
 #link("https://example.com/(paren)") \
 #link("https://example.com/paren)") \
 #link("https://hi.com/%%%%%%%%abcdef") \
 ---
 #set page(width: 240pt)
 #set par(justify: true)
 Here's a link https://url.com/data/extern12840%data_urlenc and then there are more
 links #link("www.url.com/data/extern12840%data_urlenc") in my text of links
 http://mydataurl/hash/12098541029831025981024980124124214/incremental/progress%linkdata_information_setup_my_link_just_never_stops_going/on?query=false