mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Create Raw nodes entirely within the lexer
parent 1cecae0333
commit 09975d1133
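After this change the lexer emits a complete `Raw` subtree in one step, so by the time the parser sees a raw block its children — delimiters, optional language tag, trimmed whitespace, and text — already exist. A rough sketch of the child sequence one would expect for a simple block, based on the lexing code below (the exact grouping for a given input is an assumption, not taken from the commit):

```rust
fn main() {
    // Hypothetical child sequence for the input "```rs fn f() {}```",
    // using the node kind names from the diff; illustration only.
    let children: Vec<(&str, &str)> = vec![
        ("RawDelim", "```"),
        ("RawLang", "rs"),
        ("RawTrimmed", " "),
        ("Text", "fn f() {}"),
        ("RawDelim", "```"),
    ];
    assert_eq!(children.len(), 5);
}
```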
@@ -16,8 +16,6 @@ pub(super) struct Lexer<'s> {
     mode: LexMode,
     /// Whether the last token contained a newline.
     newline: bool,
-    /// The state held by raw line lexing.
-    raw: Vec<(SyntaxKind, usize)>,
     /// An error for the last token.
     error: Option<SyntaxError>,
 }
@@ -31,8 +29,6 @@ pub(super) enum LexMode {
     Math,
     /// Keywords, literals and operators.
     Code,
-    /// The contents of a raw block.
-    Raw,
 }
 
 impl<'s> Lexer<'s> {
@@ -44,7 +40,6 @@ impl<'s> Lexer<'s> {
             mode,
             newline: false,
             error: None,
-            raw: Vec::new(),
         }
     }
 
@@ -97,16 +92,6 @@ impl Lexer<'_> {
     pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
         debug_assert!(self.error.is_none());
         let start = self.s.cursor();
-        if self.mode == LexMode::Raw {
-            let kind = if let Some((kind, end)) = self.raw.pop() {
-                self.s.jump(end);
-                kind
-            } else {
-                SyntaxKind::End
-            };
-            let node = SyntaxNode::leaf(kind, self.s.from(start));
-            return (kind, node);
-        }
 
         self.newline = false;
         let kind = match self.s.eat() {
@@ -121,12 +106,11 @@ impl Lexer<'_> {
                 );
                 kind
             }
-
+            Some('`') if self.mode != LexMode::Math => return self.raw(),
             Some(c) => match self.mode {
                 LexMode::Markup => self.markup(start, c),
                 LexMode::Math => self.math(start, c),
                 LexMode::Code => self.code(start, c),
-                LexMode::Raw => unreachable!(),
             },
 
             None => SyntaxKind::End,
@@ -193,7 +177,6 @@ impl Lexer<'_> {
     fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
         match c {
             '\\' => self.backslash(),
-            '`' => self.raw(),
             'h' if self.s.eat_if("ttp://") => self.link(),
             'h' if self.s.eat_if("ttps://") => self.link(),
             '<' if self.s.at(is_id_continue) => self.label(),
@@ -258,9 +241,10 @@ impl Lexer<'_> {
         }
     }
 
-    fn raw(&mut self) -> SyntaxKind {
+    /// Lex an entire raw segment at once. This is a convenience to avoid going
+    /// to and from the parser for each raw section.
+    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
         let start = self.s.cursor() - 1;
-        self.raw.clear();
 
         // Determine number of opening backticks.
         let mut backticks = 1;
@@ -270,9 +254,11 @@ impl Lexer<'_> {
 
         // Special case for ``.
         if backticks == 2 {
-            self.push_raw(SyntaxKind::RawDelim);
-            self.s.jump(start + 1);
-            return SyntaxKind::RawDelim;
+            let nodes = vec![
+                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
+                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
+            ];
+            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
         }
 
         // Find end of raw text.
@@ -281,43 +267,55 @@ impl Lexer<'_> {
             match self.s.eat() {
                 Some('`') => found += 1,
                 Some(_) => found = 0,
-                None => break,
+                None => {
+                    let msg = SyntaxError::new("unclosed raw text");
+                    let error = SyntaxNode::error(msg, self.s.from(start));
+                    return (SyntaxKind::Error, error);
+                }
             }
         }
 
-        if found != backticks {
-            return self.error("unclosed raw text");
-        }
-
         let end = self.s.cursor();
-        if backticks >= 3 {
-            self.blocky_raw(start, end, backticks);
-        } else {
-            self.inline_raw(start, end, backticks);
-        }
 
-        // Closing delimiter.
-        self.push_raw(SyntaxKind::RawDelim);
+        let mut nodes = Vec::with_capacity(3); // Will have at least 3.
 
-        // The saved tokens will be removed in reverse.
-        self.raw.reverse();
+        // A closure for pushing a node onto our raw vector. Assumes the caller
+        // will move the scanner to the next location at each step.
+        let mut prev_start = start;
+        let mut push_raw = |kind, s: &Scanner| {
+            nodes.push(SyntaxNode::leaf(kind, s.from(prev_start)));
+            prev_start = s.cursor();
+        };
 
         // Opening delimiter.
         self.s.jump(start + backticks);
-        SyntaxKind::RawDelim
-    }
+        push_raw(SyntaxKind::RawDelim, &self.s);
 
-    fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
+        if backticks >= 3 {
+            self.blocky_raw(end - backticks, &mut push_raw);
+        } else {
+            self.inline_raw(end - backticks, &mut push_raw);
+        }
+
+        // Closing delimiter.
+        self.s.jump(end);
+        push_raw(SyntaxKind::RawDelim, &self.s);
+
+        (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
+    }
+
+    fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
+    where
+        F: FnMut(SyntaxKind, &Scanner),
+    {
         // Language tag.
-        self.s.jump(start + backticks);
         if self.s.eat_if(is_id_start) {
             self.s.eat_while(is_id_continue);
-            self.push_raw(SyntaxKind::RawLang);
+            push_raw(SyntaxKind::RawLang, &self.s);
         }
 
         // Determine inner content between backticks.
         self.s.eat_if(' ');
-        let inner = self.s.to(end - backticks);
+        let inner = self.s.to(inner_end);
 
         // Determine dedent level.
         let mut lines = split_newlines(inner);
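The `push_raw` closure above replaces the old `raw: Vec<(SyntaxKind, usize)>` buffer: instead of recording segment end offsets and replaying them later, each call slices the source between the previous cut point and the scanner's current cursor and wraps it in a leaf right away. A minimal standalone sketch of that cut-point pattern, with stand-in types rather than typst's real `Scanner`/`SyntaxNode`:

```rust
// Stand-in leaf type for illustration; typst's real SyntaxNode differs.
#[derive(Debug)]
struct Leaf<'a> {
    kind: &'static str,
    text: &'a str,
}

fn main() {
    let src = "`code`";
    let mut nodes = Vec::new();

    // Remember where the previous segment ended; each push emits the slice
    // between that mark and the current cursor, then advances the mark.
    let mut prev_start = 0usize;
    let mut push_raw = |kind: &'static str, cursor: usize| {
        nodes.push(Leaf { kind, text: &src[prev_start..cursor] });
        prev_start = cursor;
    };

    push_raw("RawDelim", 1);          // opening backtick
    push_raw("Text", src.len() - 1);  // inner text
    push_raw("RawDelim", src.len());  // closing backtick

    println!("{nodes:?}");
}
```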
@@ -363,41 +361,32 @@ impl Lexer<'_> {
             let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
             self.s.eat_newline();
             self.s.advance(offset);
-            self.push_raw(SyntaxKind::RawTrimmed);
+            push_raw(SyntaxKind::RawTrimmed, &self.s);
             self.s.advance(line.len() - offset);
-            self.push_raw(SyntaxKind::Text);
+            push_raw(SyntaxKind::Text, &self.s);
         }
 
         // Add final trimmed.
-        if self.s.cursor() < end - backticks {
-            self.s.jump(end - backticks);
-            self.push_raw(SyntaxKind::RawTrimmed);
+        if self.s.cursor() < inner_end {
+            self.s.jump(inner_end);
+            push_raw(SyntaxKind::RawTrimmed, &self.s);
         }
-        self.s.jump(end);
     }
 
-    fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) {
-        self.s.jump(start + backticks);
-
-        while self.s.cursor() < end - backticks {
+    fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
+    where
+        F: FnMut(SyntaxKind, &Scanner),
+    {
+        while self.s.cursor() < inner_end {
             if self.s.at(is_newline) {
-                self.push_raw(SyntaxKind::Text);
+                push_raw(SyntaxKind::Text, &self.s);
                 self.s.eat_newline();
-                self.push_raw(SyntaxKind::RawTrimmed);
+                push_raw(SyntaxKind::RawTrimmed, &self.s);
                 continue;
             }
             self.s.eat();
         }
-        self.push_raw(SyntaxKind::Text);
-
-        self.s.jump(end);
-    }
-
-    /// Push the current cursor that marks the end of a raw segment of
-    /// the given `kind`.
-    fn push_raw(&mut self, kind: SyntaxKind) {
-        let end = self.s.cursor();
-        self.raw.push((kind, end));
+        push_raw(SyntaxKind::Text, &self.s);
     }
 
     fn link(&mut self) -> SyntaxKind {
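For inline raw (one or two backticks), the loop in `inline_raw` cuts a `Text` leaf before every newline, a `RawTrimmed` leaf for the newline itself, and a final `Text` leaf for the remainder. A toy version of just that splitting step, assuming a two-line segment splits as shown (not the real scanner code):

```rust
fn main() {
    // Toy re-implementation of the inline_raw splitting loop.
    let inner = "a\nb";
    let mut leaves: Vec<(&str, &str)> = Vec::new();
    let mut prev = 0;
    let mut i = 0;
    let bytes = inner.as_bytes();
    while i < bytes.len() {
        if bytes[i] == b'\n' {
            leaves.push(("Text", &inner[prev..i]));         // text before the newline
            leaves.push(("RawTrimmed", &inner[i..i + 1]));  // the newline itself
            i += 1;
            prev = i;
            continue;
        }
        i += 1;
    }
    leaves.push(("Text", &inner[prev..]));                  // trailing text
    assert_eq!(leaves, vec![("Text", "a"), ("RawTrimmed", "\n"), ("Text", "b")]);
}
```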
@@ -605,7 +594,6 @@ impl Lexer<'_> {
 impl Lexer<'_> {
     fn code(&mut self, start: usize, c: char) -> SyntaxKind {
         match c {
-            '`' => self.raw(),
             '<' if self.s.at(is_id_continue) => self.label(),
             '0'..='9' => self.number(start, c),
             '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
@@ -116,10 +116,11 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
         | SyntaxKind::Link
         | SyntaxKind::Label => p.eat(),
 
+        SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer.
+
         SyntaxKind::Hash => embedded_code_expr(p),
         SyntaxKind::Star => strong(p),
         SyntaxKind::Underscore => emph(p),
-        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::HeadingMarker if *at_start => heading(p),
         SyntaxKind::ListMarker if *at_start => list_item(p),
         SyntaxKind::EnumMarker if *at_start => enum_item(p),
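Since the lexer now hands the parser a finished inner `Raw` node, the markup grammar (and, below, the code grammar) no longer needs a dedicated `raw` parsing function or the `LexMode::Raw` round-trip; a single `p.eat()` appends the prebuilt subtree. A minimal sketch of why one `eat` is enough, with illustrative stand-in types rather than the real parser API:

```rust
#[derive(Debug)]
enum Node {
    Leaf(&'static str),
    Inner(&'static str, Vec<Node>),
}

fn main() {
    // What the lexer now yields for "`x`": one finished Raw subtree.
    let raw = Node::Inner(
        "Raw",
        vec![Node::Leaf("RawDelim"), Node::Leaf("Text"), Node::Leaf("RawDelim")],
    );

    // "Eating" it is just pushing it into the surrounding markup node;
    // no re-parsing of the delimiters is needed.
    let markup = Node::Inner("Markup", vec![raw]);
    println!("{markup:?}");
}
```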
@@ -162,22 +163,6 @@ fn emph(p: &mut Parser) {
     p.wrap(m, SyntaxKind::Emph);
 }
 
-/// Parses raw text with optional syntax highlighting: `` `...` ``.
-fn raw(p: &mut Parser) {
-    let m = p.marker();
-    p.enter(LexMode::Raw);
-    p.assert(SyntaxKind::RawDelim);
-
-    // Eats until the closing delimiter.
-    while !p.end() && !p.at(SyntaxKind::RawDelim) {
-        p.eat();
-    }
-
-    p.expect(SyntaxKind::RawDelim);
-    p.exit();
-    p.wrap(m, SyntaxKind::Raw);
-}
-
 /// Parses a section heading: `= Introduction`.
 fn heading(p: &mut Parser) {
     let m = p.marker();
@@ -767,7 +752,6 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         SyntaxKind::LeftBrace => code_block(p),
         SyntaxKind::LeftBracket => content_block(p),
         SyntaxKind::LeftParen => expr_with_paren(p, atomic),
-        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::Dollar => equation(p),
         SyntaxKind::Let => let_binding(p),
         SyntaxKind::Set => set_rule(p),
@@ -782,6 +766,8 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         SyntaxKind::Continue => continue_stmt(p),
         SyntaxKind::Return => return_stmt(p),
 
+        SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer.
+
         SyntaxKind::None
         | SyntaxKind::Auto
         | SyntaxKind::Int
@@ -104,7 +104,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = syntax_set!(
     Numeric,
     Str,
     Label,
-    RawDelim,
+    Raw,
 );
 
 /// Syntax kinds that are unary operators.
|
Loading…
x
Reference in New Issue
Block a user