mirror of
https://github.com/typst/typst
synced 2025-05-19 03:25:27 +08:00
Optimize scanner and tokenizer
This commit is contained in:
parent
0481192a77
commit
81f2f8f4c3
@ -94,6 +94,7 @@ pub fn search_column(src: &str) -> usize {
|
||||
}
|
||||
|
||||
/// Whether this character denotes a newline.
|
||||
#[inline]
|
||||
pub fn is_newline(character: char) -> bool {
|
||||
matches!(
|
||||
character,
|
||||
|
@ -13,11 +13,13 @@ pub struct Scanner<'s> {
|
||||
|
||||
impl<'s> Scanner<'s> {
|
||||
/// Create a new char scanner.
|
||||
#[inline]
|
||||
pub fn new(src: &'s str) -> Self {
|
||||
Self { src, index: 0 }
|
||||
}
|
||||
|
||||
/// Consume the next char.
|
||||
#[inline]
|
||||
pub fn eat(&mut self) -> Option<char> {
|
||||
let next = self.peek();
|
||||
if let Some(c) = next {
|
||||
@ -29,6 +31,7 @@ impl<'s> Scanner<'s> {
|
||||
/// Consume the next char if it is the given one.
|
||||
///
|
||||
/// Returns whether the char was consumed.
|
||||
#[inline]
|
||||
pub fn eat_if(&mut self, c: char) -> bool {
|
||||
let matches = self.peek() == Some(c);
|
||||
if matches {
|
||||
@ -38,12 +41,14 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// Consume the next char, debug-asserting that it is the given one.
|
||||
#[inline]
|
||||
pub fn eat_assert(&mut self, c: char) {
|
||||
let next = self.eat();
|
||||
debug_assert_eq!(next, Some(c));
|
||||
}
|
||||
|
||||
/// Consume the next char, coalescing `\r\n` to just `\n`.
|
||||
#[inline]
|
||||
pub fn eat_merging_crlf(&mut self) -> Option<char> {
|
||||
if self.rest().starts_with("\r\n") {
|
||||
self.index += 2;
|
||||
@ -54,6 +59,7 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// Eat chars while the condition is true.
|
||||
#[inline]
|
||||
pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
|
||||
where
|
||||
F: FnMut(char) -> bool,
|
||||
@ -62,6 +68,7 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// Eat chars until the condition is true.
|
||||
#[inline]
|
||||
pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
|
||||
where
|
||||
F: FnMut(char) -> bool,
|
||||
@ -77,11 +84,13 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// Uneat the last eaten char.
|
||||
#[inline]
|
||||
pub fn uneat(&mut self) {
|
||||
self.index = self.last_index();
|
||||
}
|
||||
|
||||
/// Peek at the next char without consuming it.
|
||||
#[inline]
|
||||
pub fn peek(&self) -> Option<char> {
|
||||
self.rest().chars().next()
|
||||
}
|
||||
@ -89,6 +98,7 @@ impl<'s> Scanner<'s> {
|
||||
/// Checks whether the next char fulfills a condition.
|
||||
///
|
||||
/// Returns `default` if there is no next char.
|
||||
#[inline]
|
||||
pub fn check_or<F>(&self, default: bool, f: F) -> bool
|
||||
where
|
||||
F: FnOnce(char) -> bool,
|
||||
@ -97,6 +107,7 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// The previous index in the source string.
|
||||
#[inline]
|
||||
pub fn last_index(&self) -> usize {
|
||||
self.eaten()
|
||||
.chars()
|
||||
@ -105,43 +116,53 @@ impl<'s> Scanner<'s> {
|
||||
}
|
||||
|
||||
/// The current index in the source string.
|
||||
#[inline]
|
||||
pub fn index(&self) -> usize {
|
||||
self.index
|
||||
}
|
||||
|
||||
/// Jump to an index in the source string.
|
||||
#[inline]
|
||||
pub fn jump(&mut self, index: usize) {
|
||||
// Make sure that the index is in bounds and on a codepoint boundary.
|
||||
self.src.get(index ..).expect("jumped to invalid index");
|
||||
self.index = index;
|
||||
}
|
||||
|
||||
/// Slice a part out of the source string.
|
||||
/// Slice out part of the source string.
|
||||
#[inline]
|
||||
pub fn get<I>(&self, index: I) -> &'s str
|
||||
where
|
||||
I: SliceIndex<str, Output = str>,
|
||||
{
|
||||
&self.src[index]
|
||||
// See `eaten_from` for details about `unwrap_or_default`.
|
||||
self.src.get(index).unwrap_or_default()
|
||||
}
|
||||
|
||||
/// The full source string up to the current index.
|
||||
pub fn eaten(&self) -> &'s str {
|
||||
/// The remaining source string after the current index.
|
||||
#[inline]
|
||||
pub fn rest(&self) -> &'s str {
|
||||
// SAFETY: The index is always in bounds and on a codepoint boundary
|
||||
// since it is:
|
||||
// - either increased by the length of a scanned character,
|
||||
// - or checked upon jumping.
|
||||
unsafe { self.src.get_unchecked(self.index ..) }
|
||||
}
|
||||
|
||||
/// The full source string up to the current index.
|
||||
#[inline]
|
||||
pub fn eaten(&self) -> &'s str {
|
||||
// SAFETY: The index is always okay, for details see `rest()`.
|
||||
unsafe { self.src.get_unchecked(.. self.index) }
|
||||
}
|
||||
|
||||
/// The source string from `start` to the current index.
|
||||
#[inline]
|
||||
pub fn eaten_from(&self, start: usize) -> &'s str {
|
||||
&self.src[start .. self.index]
|
||||
}
|
||||
|
||||
/// The remaining source string after the current index.
|
||||
pub fn rest(&self) -> &'s str {
|
||||
// SAFETY: The index is always okay, for details see `eaten()`.
|
||||
unsafe { self.src.get_unchecked(self.index ..) }
|
||||
// Using `unwrap_or_default` is much faster than unwrap, probably
|
||||
// because then the whole call to `eaten_from` is pure and can be
|
||||
// optimized away in some cases.
|
||||
self.src.get(start .. self.index).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,22 +22,26 @@ pub enum TokenMode {
|
||||
|
||||
impl<'s> Tokens<'s> {
|
||||
/// Create a new token iterator with the given mode.
|
||||
#[inline]
|
||||
pub fn new(src: &'s str, mode: TokenMode) -> Self {
|
||||
Self { s: Scanner::new(src), mode }
|
||||
}
|
||||
|
||||
/// Get the current token mode.
|
||||
#[inline]
|
||||
pub fn mode(&self) -> TokenMode {
|
||||
self.mode
|
||||
}
|
||||
|
||||
/// Change the token mode.
|
||||
#[inline]
|
||||
pub fn set_mode(&mut self, mode: TokenMode) {
|
||||
self.mode = mode;
|
||||
}
|
||||
|
||||
/// The index in the string at which the last token ends and next token
|
||||
/// will start.
|
||||
#[inline]
|
||||
pub fn index(&self) -> usize {
|
||||
self.s.index()
|
||||
}
|
||||
@ -45,11 +49,13 @@ impl<'s> Tokens<'s> {
|
||||
/// Jump to the given index in the string.
|
||||
///
|
||||
/// You need to know the correct column.
|
||||
#[inline]
|
||||
pub fn jump(&mut self, index: usize) {
|
||||
self.s.jump(index);
|
||||
}
|
||||
|
||||
/// The underlying scanner.
|
||||
#[inline]
|
||||
pub fn scanner(&self) -> Scanner<'s> {
|
||||
self.s
|
||||
}
|
||||
@ -59,6 +65,7 @@ impl<'s> Iterator for Tokens<'s> {
|
||||
type Item = Token<'s>;
|
||||
|
||||
/// Parse the next token in the source code.
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let start = self.s.index();
|
||||
let c = self.s.eat()?;
|
||||
@ -70,7 +77,8 @@ impl<'s> Iterator for Tokens<'s> {
|
||||
'}' => Token::RightBrace,
|
||||
|
||||
// Whitespace.
|
||||
c if c.is_whitespace() => self.whitespace(c),
|
||||
' ' if self.s.check_or(true, |c| !c.is_whitespace()) => Token::Space(0),
|
||||
c if c.is_whitespace() => self.whitespace(),
|
||||
|
||||
// Comments with special case for URLs.
|
||||
'/' if self.s.eat_if('*') => self.block_comment(),
|
||||
@ -87,6 +95,7 @@ impl<'s> Iterator for Tokens<'s> {
|
||||
}
|
||||
|
||||
impl<'s> Tokens<'s> {
|
||||
#[inline]
|
||||
fn markup(&mut self, start: usize, c: char) -> Token<'s> {
|
||||
match c {
|
||||
// Escape sequences.
|
||||
@ -158,54 +167,49 @@ impl<'s> Tokens<'s> {
|
||||
}
|
||||
}
|
||||
|
||||
fn whitespace(&mut self, first: char) -> Token<'s> {
|
||||
// Fast path for just a single space
|
||||
if first == ' ' && self.s.check_or(true, |c| !c.is_whitespace()) {
|
||||
Token::Space(0)
|
||||
} else {
|
||||
self.s.uneat();
|
||||
|
||||
// Count the number of newlines.
|
||||
let mut newlines = 0;
|
||||
while let Some(c) = self.s.eat_merging_crlf() {
|
||||
if !c.is_whitespace() {
|
||||
self.s.uneat();
|
||||
break;
|
||||
}
|
||||
|
||||
if is_newline(c) {
|
||||
newlines += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Token::Space(newlines)
|
||||
#[inline]
|
||||
fn text(&mut self, start: usize) -> Token<'s> {
|
||||
macro_rules! table {
|
||||
($($c:literal)|*) => {{
|
||||
let mut t = [false; 128];
|
||||
$(t[$c as usize] = true;)*
|
||||
t
|
||||
}}
|
||||
}
|
||||
|
||||
const TABLE: [bool; 128] = table! {
|
||||
// Ascii whitespace.
|
||||
' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' |
|
||||
// Comments, parentheses, code.
|
||||
'/' | '[' | ']' | '{' | '}' | '#' |
|
||||
// Markup
|
||||
'~' | '*' | '_' | '`' | '$' | '-' | '\\'
|
||||
};
|
||||
|
||||
self.s.eat_until(|c| {
|
||||
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
|
||||
});
|
||||
|
||||
Token::Text(self.s.eaten_from(start))
|
||||
}
|
||||
|
||||
fn text(&mut self, start: usize) -> Token<'s> {
|
||||
while let Some(c) = self.s.eat() {
|
||||
if match c {
|
||||
// Whitespace.
|
||||
c if c.is_whitespace() => true,
|
||||
// Comments.
|
||||
'/' => true,
|
||||
// Parentheses.
|
||||
'[' | ']' | '{' | '}' => true,
|
||||
// Code.
|
||||
'#' => true,
|
||||
// Markup.
|
||||
'~' | '*' | '_' | '`' | '$' | '-' => true,
|
||||
// Escaping.
|
||||
'\\' => true,
|
||||
// Just text.
|
||||
_ => false,
|
||||
} {
|
||||
fn whitespace(&mut self) -> Token<'s> {
|
||||
self.s.uneat();
|
||||
|
||||
// Count the number of newlines.
|
||||
let mut newlines = 0;
|
||||
while let Some(c) = self.s.eat_merging_crlf() {
|
||||
if !c.is_whitespace() {
|
||||
self.s.uneat();
|
||||
break;
|
||||
}
|
||||
|
||||
if is_newline(c) {
|
||||
newlines += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Token::Text(self.s.eaten_from(start))
|
||||
Token::Space(newlines)
|
||||
}
|
||||
|
||||
fn backslash(&mut self) -> Token<'s> {
|
||||
@ -238,6 +242,7 @@ impl<'s> Tokens<'s> {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn hash(&mut self) -> Token<'s> {
|
||||
if self.s.check_or(false, is_id_start) {
|
||||
let read = self.s.eat_while(is_id_continue);
|
||||
|
Loading…
x
Reference in New Issue
Block a user