Optimize scanner and tokenizer

This commit is contained in:
Laurenz 2021-07-13 15:11:42 +02:00
parent 0481192a77
commit 81f2f8f4c3
3 changed files with 79 additions and 52 deletions

View File

@ -94,6 +94,7 @@ pub fn search_column(src: &str) -> usize {
} }
/// Whether this character denotes a newline. /// Whether this character denotes a newline.
#[inline]
pub fn is_newline(character: char) -> bool { pub fn is_newline(character: char) -> bool {
matches!( matches!(
character, character,

View File

@ -13,11 +13,13 @@ pub struct Scanner<'s> {
impl<'s> Scanner<'s> { impl<'s> Scanner<'s> {
/// Create a new char scanner. /// Create a new char scanner.
#[inline]
pub fn new(src: &'s str) -> Self { pub fn new(src: &'s str) -> Self {
Self { src, index: 0 } Self { src, index: 0 }
} }
/// Consume the next char. /// Consume the next char.
#[inline]
pub fn eat(&mut self) -> Option<char> { pub fn eat(&mut self) -> Option<char> {
let next = self.peek(); let next = self.peek();
if let Some(c) = next { if let Some(c) = next {
@ -29,6 +31,7 @@ impl<'s> Scanner<'s> {
/// Consume the next char if it is the given one. /// Consume the next char if it is the given one.
/// ///
/// Returns whether the char was consumed. /// Returns whether the char was consumed.
#[inline]
pub fn eat_if(&mut self, c: char) -> bool { pub fn eat_if(&mut self, c: char) -> bool {
let matches = self.peek() == Some(c); let matches = self.peek() == Some(c);
if matches { if matches {
@ -38,12 +41,14 @@ impl<'s> Scanner<'s> {
} }
/// Consume the next char, debug-asserting that it is the given one. /// Consume the next char, debug-asserting that it is the given one.
#[inline]
pub fn eat_assert(&mut self, c: char) { pub fn eat_assert(&mut self, c: char) {
let next = self.eat(); let next = self.eat();
debug_assert_eq!(next, Some(c)); debug_assert_eq!(next, Some(c));
} }
/// Consume the next char, coalescing `\r\n` to just `\n`. /// Consume the next char, coalescing `\r\n` to just `\n`.
#[inline]
pub fn eat_merging_crlf(&mut self) -> Option<char> { pub fn eat_merging_crlf(&mut self) -> Option<char> {
if self.rest().starts_with("\r\n") { if self.rest().starts_with("\r\n") {
self.index += 2; self.index += 2;
@ -54,6 +59,7 @@ impl<'s> Scanner<'s> {
} }
/// Eat chars while the condition is true. /// Eat chars while the condition is true.
#[inline]
pub fn eat_while<F>(&mut self, mut f: F) -> &'s str pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
where where
F: FnMut(char) -> bool, F: FnMut(char) -> bool,
@ -62,6 +68,7 @@ impl<'s> Scanner<'s> {
} }
/// Eat chars until the condition is true. /// Eat chars until the condition is true.
#[inline]
pub fn eat_until<F>(&mut self, mut f: F) -> &'s str pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
where where
F: FnMut(char) -> bool, F: FnMut(char) -> bool,
@ -77,11 +84,13 @@ impl<'s> Scanner<'s> {
} }
/// Uneat the last eaten char. /// Uneat the last eaten char.
#[inline]
pub fn uneat(&mut self) { pub fn uneat(&mut self) {
self.index = self.last_index(); self.index = self.last_index();
} }
/// Peek at the next char without consuming it. /// Peek at the next char without consuming it.
#[inline]
pub fn peek(&self) -> Option<char> { pub fn peek(&self) -> Option<char> {
self.rest().chars().next() self.rest().chars().next()
} }
@ -89,6 +98,7 @@ impl<'s> Scanner<'s> {
/// Checks whether the next char fulfills a condition. /// Checks whether the next char fulfills a condition.
/// ///
/// Returns `default` if there is no next char. /// Returns `default` if there is no next char.
#[inline]
pub fn check_or<F>(&self, default: bool, f: F) -> bool pub fn check_or<F>(&self, default: bool, f: F) -> bool
where where
F: FnOnce(char) -> bool, F: FnOnce(char) -> bool,
@ -97,6 +107,7 @@ impl<'s> Scanner<'s> {
} }
/// The previous index in the source string. /// The previous index in the source string.
#[inline]
pub fn last_index(&self) -> usize { pub fn last_index(&self) -> usize {
self.eaten() self.eaten()
.chars() .chars()
@ -105,43 +116,53 @@ impl<'s> Scanner<'s> {
} }
/// The current index in the source string. /// The current index in the source string.
#[inline]
pub fn index(&self) -> usize { pub fn index(&self) -> usize {
self.index self.index
} }
/// Jump to an index in the source string. /// Jump to an index in the source string.
#[inline]
pub fn jump(&mut self, index: usize) { pub fn jump(&mut self, index: usize) {
// Make sure that the index is in bounds and on a codepoint boundary. // Make sure that the index is in bounds and on a codepoint boundary.
self.src.get(index ..).expect("jumped to invalid index"); self.src.get(index ..).expect("jumped to invalid index");
self.index = index; self.index = index;
} }
/// Slice a part out of the source string. /// Slice out part of the source string.
#[inline]
pub fn get<I>(&self, index: I) -> &'s str pub fn get<I>(&self, index: I) -> &'s str
where where
I: SliceIndex<str, Output = str>, I: SliceIndex<str, Output = str>,
{ {
&self.src[index] // See `eaten_from` for details about `unwrap_or_default`.
self.src.get(index).unwrap_or_default()
} }
/// The full source string up to the current index. /// The remaining source string after the current index.
pub fn eaten(&self) -> &'s str { #[inline]
pub fn rest(&self) -> &'s str {
// SAFETY: The index is always in bounds and on a codepoint boundary // SAFETY: The index is always in bounds and on a codepoint boundary
// since it is: // since it is:
// - either increased by the length of a scanned character, // - either increased by the length of a scanned character,
// - or checked upon jumping. // - or checked upon jumping.
unsafe { self.src.get_unchecked(self.index ..) }
}
/// The full source string up to the current index.
#[inline]
pub fn eaten(&self) -> &'s str {
// SAFETY: The index is always okay, for details see `rest()`.
unsafe { self.src.get_unchecked(.. self.index) } unsafe { self.src.get_unchecked(.. self.index) }
} }
/// The source string from `start` to the current index. /// The source string from `start` to the current index.
#[inline]
pub fn eaten_from(&self, start: usize) -> &'s str { pub fn eaten_from(&self, start: usize) -> &'s str {
&self.src[start .. self.index] // Using `unwrap_or_default` is much faster than unwrap, probably
} // because then the whole call to `eaten_from` is pure and can be
// optimized away in some cases.
/// The remaining source string after the current index. self.src.get(start .. self.index).unwrap_or_default()
pub fn rest(&self) -> &'s str {
// SAFETY: The index is always okay, for details see `eaten()`.
unsafe { self.src.get_unchecked(self.index ..) }
} }
} }

View File

@ -22,22 +22,26 @@ pub enum TokenMode {
impl<'s> Tokens<'s> { impl<'s> Tokens<'s> {
/// Create a new token iterator with the given mode. /// Create a new token iterator with the given mode.
#[inline]
pub fn new(src: &'s str, mode: TokenMode) -> Self { pub fn new(src: &'s str, mode: TokenMode) -> Self {
Self { s: Scanner::new(src), mode } Self { s: Scanner::new(src), mode }
} }
/// Get the current token mode. /// Get the current token mode.
#[inline]
pub fn mode(&self) -> TokenMode { pub fn mode(&self) -> TokenMode {
self.mode self.mode
} }
/// Change the token mode. /// Change the token mode.
#[inline]
pub fn set_mode(&mut self, mode: TokenMode) { pub fn set_mode(&mut self, mode: TokenMode) {
self.mode = mode; self.mode = mode;
} }
/// The index in the string at which the last token ends and next token /// The index in the string at which the last token ends and next token
/// will start. /// will start.
#[inline]
pub fn index(&self) -> usize { pub fn index(&self) -> usize {
self.s.index() self.s.index()
} }
@ -45,11 +49,13 @@ impl<'s> Tokens<'s> {
/// Jump to the given index in the string. /// Jump to the given index in the string.
/// ///
/// You need to know the correct column. /// You need to know the correct column.
#[inline]
pub fn jump(&mut self, index: usize) { pub fn jump(&mut self, index: usize) {
self.s.jump(index); self.s.jump(index);
} }
/// The underlying scanner. /// The underlying scanner.
#[inline]
pub fn scanner(&self) -> Scanner<'s> { pub fn scanner(&self) -> Scanner<'s> {
self.s self.s
} }
@ -59,6 +65,7 @@ impl<'s> Iterator for Tokens<'s> {
type Item = Token<'s>; type Item = Token<'s>;
/// Parse the next token in the source code. /// Parse the next token in the source code.
#[inline]
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let start = self.s.index(); let start = self.s.index();
let c = self.s.eat()?; let c = self.s.eat()?;
@ -70,7 +77,8 @@ impl<'s> Iterator for Tokens<'s> {
'}' => Token::RightBrace, '}' => Token::RightBrace,
// Whitespace. // Whitespace.
c if c.is_whitespace() => self.whitespace(c), ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => Token::Space(0),
c if c.is_whitespace() => self.whitespace(),
// Comments with special case for URLs. // Comments with special case for URLs.
'/' if self.s.eat_if('*') => self.block_comment(), '/' if self.s.eat_if('*') => self.block_comment(),
@ -87,6 +95,7 @@ impl<'s> Iterator for Tokens<'s> {
} }
impl<'s> Tokens<'s> { impl<'s> Tokens<'s> {
#[inline]
fn markup(&mut self, start: usize, c: char) -> Token<'s> { fn markup(&mut self, start: usize, c: char) -> Token<'s> {
match c { match c {
// Escape sequences. // Escape sequences.
@ -158,54 +167,49 @@ impl<'s> Tokens<'s> {
} }
} }
fn whitespace(&mut self, first: char) -> Token<'s> { #[inline]
// Fast path for just a single space fn text(&mut self, start: usize) -> Token<'s> {
if first == ' ' && self.s.check_or(true, |c| !c.is_whitespace()) { macro_rules! table {
Token::Space(0) ($($c:literal)|*) => {{
} else { let mut t = [false; 128];
self.s.uneat(); $(t[$c as usize] = true;)*
t
// Count the number of newlines. }}
let mut newlines = 0;
while let Some(c) = self.s.eat_merging_crlf() {
if !c.is_whitespace() {
self.s.uneat();
break;
}
if is_newline(c) {
newlines += 1;
}
}
Token::Space(newlines)
} }
const TABLE: [bool; 128] = table! {
// Ascii whitespace.
' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' |
// Comments, parentheses, code.
'/' | '[' | ']' | '{' | '}' | '#' |
// Markup
'~' | '*' | '_' | '`' | '$' | '-' | '\\'
};
self.s.eat_until(|c| {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
});
Token::Text(self.s.eaten_from(start))
} }
fn text(&mut self, start: usize) -> Token<'s> { fn whitespace(&mut self) -> Token<'s> {
while let Some(c) = self.s.eat() { self.s.uneat();
if match c {
// Whitespace. // Count the number of newlines.
c if c.is_whitespace() => true, let mut newlines = 0;
// Comments. while let Some(c) = self.s.eat_merging_crlf() {
'/' => true, if !c.is_whitespace() {
// Parentheses.
'[' | ']' | '{' | '}' => true,
// Code.
'#' => true,
// Markup.
'~' | '*' | '_' | '`' | '$' | '-' => true,
// Escaping.
'\\' => true,
// Just text.
_ => false,
} {
self.s.uneat(); self.s.uneat();
break; break;
} }
if is_newline(c) {
newlines += 1;
}
} }
Token::Text(self.s.eaten_from(start)) Token::Space(newlines)
} }
fn backslash(&mut self) -> Token<'s> { fn backslash(&mut self) -> Token<'s> {
@ -238,6 +242,7 @@ impl<'s> Tokens<'s> {
} }
} }
#[inline]
fn hash(&mut self) -> Token<'s> { fn hash(&mut self) -> Token<'s> {
if self.s.check_or(false, is_id_start) { if self.s.check_or(false, is_id_start) {
let read = self.s.eat_while(is_id_continue); let read = self.s.eat_while(is_id_continue);