diff --git a/src/parse/lines.rs b/src/parse/lines.rs index b6d8a60f6..e42d110bd 100644 --- a/src/parse/lines.rs +++ b/src/parse/lines.rs @@ -94,6 +94,7 @@ pub fn search_column(src: &str) -> usize { } /// Whether this character denotes a newline. +#[inline] pub fn is_newline(character: char) -> bool { matches!( character, diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index af88aa684..c9c947691 100644 --- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -13,11 +13,13 @@ pub struct Scanner<'s> { impl<'s> Scanner<'s> { /// Create a new char scanner. + #[inline] pub fn new(src: &'s str) -> Self { Self { src, index: 0 } } /// Consume the next char. + #[inline] pub fn eat(&mut self) -> Option { let next = self.peek(); if let Some(c) = next { @@ -29,6 +31,7 @@ impl<'s> Scanner<'s> { /// Consume the next char if it is the given one. /// /// Returns whether the char was consumed. + #[inline] pub fn eat_if(&mut self, c: char) -> bool { let matches = self.peek() == Some(c); if matches { @@ -38,12 +41,14 @@ impl<'s> Scanner<'s> { } /// Consume the next char, debug-asserting that it is the given one. + #[inline] pub fn eat_assert(&mut self, c: char) { let next = self.eat(); debug_assert_eq!(next, Some(c)); } /// Consume the next char, coalescing `\r\n` to just `\n`. + #[inline] pub fn eat_merging_crlf(&mut self) -> Option { if self.rest().starts_with("\r\n") { self.index += 2; @@ -54,6 +59,7 @@ impl<'s> Scanner<'s> { } /// Eat chars while the condition is true. + #[inline] pub fn eat_while(&mut self, mut f: F) -> &'s str where F: FnMut(char) -> bool, @@ -62,6 +68,7 @@ impl<'s> Scanner<'s> { } /// Eat chars until the condition is true. + #[inline] pub fn eat_until(&mut self, mut f: F) -> &'s str where F: FnMut(char) -> bool, @@ -77,11 +84,13 @@ impl<'s> Scanner<'s> { } /// Uneat the last eaten char. + #[inline] pub fn uneat(&mut self) { self.index = self.last_index(); } /// Peek at the next char without consuming it. + #[inline] pub fn peek(&self) -> Option { self.rest().chars().next() } @@ -89,6 +98,7 @@ impl<'s> Scanner<'s> { /// Checks whether the next char fulfills a condition. /// /// Returns `default` if there is no next char. + #[inline] pub fn check_or(&self, default: bool, f: F) -> bool where F: FnOnce(char) -> bool, @@ -97,6 +107,7 @@ impl<'s> Scanner<'s> { } /// The previous index in the source string. + #[inline] pub fn last_index(&self) -> usize { self.eaten() .chars() @@ -105,43 +116,53 @@ impl<'s> Scanner<'s> { } /// The current index in the source string. + #[inline] pub fn index(&self) -> usize { self.index } /// Jump to an index in the source string. + #[inline] pub fn jump(&mut self, index: usize) { // Make sure that the index is in bounds and on a codepoint boundary. self.src.get(index ..).expect("jumped to invalid index"); self.index = index; } - /// Slice a part out of the source string. + /// Slice out part of the source string. + #[inline] pub fn get(&self, index: I) -> &'s str where I: SliceIndex, { - &self.src[index] + // See `eaten_from` for details about `unwrap_or_default`. + self.src.get(index).unwrap_or_default() } - /// The full source string up to the current index. - pub fn eaten(&self) -> &'s str { + /// The remaining source string after the current index. + #[inline] + pub fn rest(&self) -> &'s str { // SAFETY: The index is always in bounds and on a codepoint boundary // since it is: // - either increased by the length of a scanned character, // - or checked upon jumping. + unsafe { self.src.get_unchecked(self.index ..) } + } + + /// The full source string up to the current index. + #[inline] + pub fn eaten(&self) -> &'s str { + // SAFETY: The index is always okay, for details see `rest()`. unsafe { self.src.get_unchecked(.. self.index) } } /// The source string from `start` to the current index. + #[inline] pub fn eaten_from(&self, start: usize) -> &'s str { - &self.src[start .. self.index] - } - - /// The remaining source string after the current index. - pub fn rest(&self) -> &'s str { - // SAFETY: The index is always okay, for details see `eaten()`. - unsafe { self.src.get_unchecked(self.index ..) } + // Using `unwrap_or_default` is much faster than unwrap, probably + // because then the whole call to `eaten_from` is pure and can be + // optimized away in some cases. + self.src.get(start .. self.index).unwrap_or_default() } } diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index d979d0054..e9af7acb7 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -22,22 +22,26 @@ pub enum TokenMode { impl<'s> Tokens<'s> { /// Create a new token iterator with the given mode. + #[inline] pub fn new(src: &'s str, mode: TokenMode) -> Self { Self { s: Scanner::new(src), mode } } /// Get the current token mode. + #[inline] pub fn mode(&self) -> TokenMode { self.mode } /// Change the token mode. + #[inline] pub fn set_mode(&mut self, mode: TokenMode) { self.mode = mode; } /// The index in the string at which the last token ends and next token /// will start. + #[inline] pub fn index(&self) -> usize { self.s.index() } @@ -45,11 +49,13 @@ impl<'s> Tokens<'s> { /// Jump to the given index in the string. /// /// You need to know the correct column. + #[inline] pub fn jump(&mut self, index: usize) { self.s.jump(index); } /// The underlying scanner. + #[inline] pub fn scanner(&self) -> Scanner<'s> { self.s } @@ -59,6 +65,7 @@ impl<'s> Iterator for Tokens<'s> { type Item = Token<'s>; /// Parse the next token in the source code. + #[inline] fn next(&mut self) -> Option { let start = self.s.index(); let c = self.s.eat()?; @@ -70,7 +77,8 @@ impl<'s> Iterator for Tokens<'s> { '}' => Token::RightBrace, // Whitespace. - c if c.is_whitespace() => self.whitespace(c), + ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => Token::Space(0), + c if c.is_whitespace() => self.whitespace(), // Comments with special case for URLs. '/' if self.s.eat_if('*') => self.block_comment(), @@ -87,6 +95,7 @@ impl<'s> Iterator for Tokens<'s> { } impl<'s> Tokens<'s> { + #[inline] fn markup(&mut self, start: usize, c: char) -> Token<'s> { match c { // Escape sequences. @@ -158,54 +167,49 @@ impl<'s> Tokens<'s> { } } - fn whitespace(&mut self, first: char) -> Token<'s> { - // Fast path for just a single space - if first == ' ' && self.s.check_or(true, |c| !c.is_whitespace()) { - Token::Space(0) - } else { - self.s.uneat(); - - // Count the number of newlines. - let mut newlines = 0; - while let Some(c) = self.s.eat_merging_crlf() { - if !c.is_whitespace() { - self.s.uneat(); - break; - } - - if is_newline(c) { - newlines += 1; - } - } - - Token::Space(newlines) + #[inline] + fn text(&mut self, start: usize) -> Token<'s> { + macro_rules! table { + ($($c:literal)|*) => {{ + let mut t = [false; 128]; + $(t[$c as usize] = true;)* + t + }} } + + const TABLE: [bool; 128] = table! { + // Ascii whitespace. + ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | + // Comments, parentheses, code. + '/' | '[' | ']' | '{' | '}' | '#' | + // Markup + '~' | '*' | '_' | '`' | '$' | '-' | '\\' + }; + + self.s.eat_until(|c| { + TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) + }); + + Token::Text(self.s.eaten_from(start)) } - fn text(&mut self, start: usize) -> Token<'s> { - while let Some(c) = self.s.eat() { - if match c { - // Whitespace. - c if c.is_whitespace() => true, - // Comments. - '/' => true, - // Parentheses. - '[' | ']' | '{' | '}' => true, - // Code. - '#' => true, - // Markup. - '~' | '*' | '_' | '`' | '$' | '-' => true, - // Escaping. - '\\' => true, - // Just text. - _ => false, - } { + fn whitespace(&mut self) -> Token<'s> { + self.s.uneat(); + + // Count the number of newlines. + let mut newlines = 0; + while let Some(c) = self.s.eat_merging_crlf() { + if !c.is_whitespace() { self.s.uneat(); break; } + + if is_newline(c) { + newlines += 1; + } } - Token::Text(self.s.eaten_from(start)) + Token::Space(newlines) } fn backslash(&mut self) -> Token<'s> { @@ -238,6 +242,7 @@ impl<'s> Tokens<'s> { } } + #[inline] fn hash(&mut self) -> Token<'s> { if self.s.check_or(false, is_id_start) { let read = self.s.eat_while(is_id_continue);