use std::slice::SliceIndex;
use unicode_xid::UnicodeXID;

/// A featureful char-based scanner.
///
/// The scanner walks over a string slice one `char` at a time while keeping
/// track of a byte index into that slice.
#[derive(Copy, Clone)]
pub struct Scanner<'s> {
    /// The string to scan.
    src: &'s str,
    /// The index at which the peekable character starts. Must be in bounds and
    /// at a codepoint boundary to guarantee safety.
    index: usize,
}

impl<'s> Scanner<'s> {
    /// Create a new char scanner positioned at the start of `src`.
    #[inline]
    pub fn new(src: &'s str) -> Self {
        Self { src, index: 0 }
    }

    /// Whether the end of the string is reached.
    #[inline]
    pub fn eof(&self) -> bool {
        self.index == self.src.len()
    }

    /// Consume the next char.
    ///
    /// Returns `None` (and does not move) when the end is reached.
    #[inline]
    pub fn eat(&mut self) -> Option<char> {
        let next = self.peek();
        if let Some(c) = next {
            self.index += c.len_utf8();
        }
        next
    }

    /// Consume the next char if it is the given one.
    ///
    /// Returns whether the char was consumed.
    #[inline]
    pub fn eat_if(&mut self, c: char) -> bool {
        let matches = self.peek() == Some(c);
        if matches {
            self.index += c.len_utf8();
        }
        matches
    }

    /// Consume the next char, debug-asserting that it is the given one.
    ///
    /// The char is consumed regardless; the comparison only happens in builds
    /// with debug assertions enabled.
    #[inline]
    pub fn eat_assert(&mut self, c: char) {
        let next = self.eat();
        debug_assert_eq!(next, Some(c));
    }

    /// Eat chars while the condition is true.
    ///
    /// Returns the eaten part of the source string.
    #[inline]
    pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
    where
        F: FnMut(char) -> bool,
    {
        self.eat_until(|c| !f(c))
    }

    /// Eat chars until the condition is true.
    ///
    /// The char for which the condition holds is not consumed. Returns the
    /// eaten part of the source string.
    #[inline]
    pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
    where
        F: FnMut(char) -> bool,
    {
        let start = self.index;
        while let Some(c) = self.peek() {
            if f(c) {
                break;
            }
            self.index += c.len_utf8();
        }
        self.eaten_from(start)
    }

    /// Uneat the last eaten char.
    ///
    /// Does nothing when no char has been eaten yet.
    #[inline]
    pub fn uneat(&mut self) {
        self.index = self.last_index();
    }

    /// Peek at the next char without consuming it.
    #[inline]
    pub fn peek(&self) -> Option<char> {
        self.rest().chars().next()
    }

    /// Get the nth-previous eaten char.
    ///
    /// `prev(0)` is the char directly before the current index.
    #[inline]
    pub fn prev(&self, n: usize) -> Option<char> {
        self.eaten().chars().nth_back(n)
    }

    /// Checks whether the next char fulfills a condition.
    ///
    /// Returns `default` if there is no next char.
    #[inline]
    pub fn check_or<F>(&self, default: bool, f: F) -> bool
    where
        F: FnOnce(char) -> bool,
    {
        self.peek().map_or(default, f)
    }

    /// The previous index in the source string.
    ///
    /// This is the index at which the last eaten char starts, or zero if
    /// nothing has been eaten.
    #[inline]
    pub fn last_index(&self) -> usize {
        // Only the final char is needed, so read it from the back.
        self.eaten()
            .chars()
            .next_back()
            .map_or(0, |c| self.index - c.len_utf8())
    }

    /// The current index in the source string.
    #[inline]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Jump to an index in the source string.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of bounds or not on a codepoint boundary.
    #[inline]
    pub fn jump(&mut self, index: usize) {
        // Make sure that the index is in bounds and on a codepoint boundary.
        self.src.get(index ..).expect("jumped to invalid index");
        self.index = index;
    }

    /// The full source string.
    #[inline]
    pub fn src(&self) -> &'s str {
        self.src
    }

    /// Slice out part of the source string.
    ///
    /// Returns the empty string if the index is out of bounds or does not lie
    /// on codepoint boundaries.
    #[inline]
    pub fn get<I>(&self, index: I) -> &'s str
    where
        I: SliceIndex<str, Output = str>,
    {
        // See `eaten_from` for details about `unwrap_or_default`.
        self.src.get(index).unwrap_or_default()
    }

    /// The remaining source string after the current index.
    #[inline]
    pub fn rest(&self) -> &'s str {
        // SAFETY: The index is always in bounds and on a codepoint boundary
        // since it starts at zero and is:
        // - either increased by the length of a scanned character, advancing
        //   from one codepoint boundary to the next,
        // - or checked upon jumping.
        unsafe { self.src.get_unchecked(self.index ..) }
    }

    /// The full source string up to the current index.
    #[inline]
    pub fn eaten(&self) -> &'s str {
        // SAFETY: The index is always okay, for details see `rest()`.
        unsafe { self.src.get_unchecked(.. self.index) }
    }

    /// The source string from `start` to the current index.
    ///
    /// Returns the empty string if `start` is past the current index or not
    /// on a codepoint boundary.
    #[inline]
    pub fn eaten_from(&self, start: usize) -> &'s str {
        // Using `unwrap_or_default` is much faster than unwrap, probably
        // because then the whole call to `eaten_from` is pure and can be
        // optimized away in some cases.
        self.src.get(start .. self.index).unwrap_or_default()
    }
}

/// Whether this character denotes a newline.
#[inline] pub fn is_newline(character: char) -> bool { matches!( character, // Line Feed, Vertical Tab, Form Feed, Carriage Return. '\n' | '\x0B' | '\x0C' | '\r' | // Next Line, Line Separator, Paragraph Separator. '\u{0085}' | '\u{2028}' | '\u{2029}' ) } /// Whether a string is a valid unicode identifier. /// /// In addition to what is specified in the [Unicode Standard][uax31], we allow: /// - `_` as a starting character, /// - `_` and `-` as continuing characters. /// /// [uax31]: http://www.unicode.org/reports/tr31/ #[inline] pub fn is_ident(string: &str) -> bool { let mut chars = string.chars(); chars .next() .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) } /// Whether a character can start an identifier. #[inline] pub fn is_id_start(c: char) -> bool { c.is_xid_start() || c == '_' } /// Whether a character can continue an identifier. #[inline] pub fn is_id_continue(c: char) -> bool { c.is_xid_continue() || c == '_' || c == '-' }