Mirror of https://github.com/typst/typst (synced 2025-05-14 17:15:28 +08:00)

Rename CharParser to Scanner

Commit 16f0bd430e (parent c0998b4802)

The hunks below rename the CharParser type to Scanner, and the conventional local binding p to s, throughout the parsing, tokenization, and line-map code.
@@ -1,11 +1,11 @@
 //! Parsing and tokenization.

-mod chars;
 mod resolve;
+mod scanner;
 mod tokens;

-pub use chars::*;
 pub use resolve::*;
+pub use scanner::*;
 pub use tokens::*;

 use std::str::FromStr;
@@ -1,41 +1,41 @@
 //! Resolve strings and raw blocks.

-use super::{is_newline_char, CharParser};
+use super::{is_newline_char, Scanner};
 use crate::syntax::{Ident, Raw};

 /// Resolves all escape sequences in a string.
 pub fn resolve_string(string: &str) -> String {
     let mut out = String::with_capacity(string.len());
-    let mut p = CharParser::new(string);
+    let mut s = Scanner::new(string);

-    while let Some(c) = p.eat() {
+    while let Some(c) = s.eat() {
         if c != '\\' {
             out.push(c);
             continue;
         }

-        let start = p.prev_index();
-        match p.eat() {
+        let start = s.prev_index();
+        match s.eat() {
             Some('\\') => out.push('\\'),
             Some('"') => out.push('"'),

             Some('n') => out.push('\n'),
             Some('t') => out.push('\t'),
-            Some('u') if p.eat_if('{') => {
+            Some('u') if s.eat_if('{') => {
                 // TODO: Feedback if closing brace is missing.
-                let sequence = p.eat_while(|c| c.is_ascii_hexdigit());
-                let _terminated = p.eat_if('}');
+                let sequence = s.eat_while(|c| c.is_ascii_hexdigit());
+                let _terminated = s.eat_if('}');

                 if let Some(c) = resolve_hex(sequence) {
                     out.push(c);
                 } else {
                     // TODO: Feedback that escape sequence is wrong.
-                    out += p.eaten_from(start);
+                    out += s.eaten_from(start);
                 }
             }

             // TODO: Feedback about invalid escape sequence.
-            _ => out += p.eaten_from(start),
+            _ => out += s.eaten_from(start),
         }
     }

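As a quick orientation for the renamed call sites, a sketch of resolve_string in use; the escape handling (\\, \", \n, \t, \u{...}) is exactly what the hunk above shows, while the concrete input and the emoji code point are only illustrative:

    // Sketch of a call to resolve_string (hypothetical input).
    let out = resolve_string(r#"one\ntwo \u{1F600} quote: \" end"#);
    assert_eq!(out, "one\ntwo 😀 quote: \" end");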
@@ -69,10 +69,10 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> Raw {

 /// Parse the lang tag and return it alongside the remaining inner raw text.
 fn split_at_lang_tag(raw: &str) -> (&str, &str) {
-    let mut p = CharParser::new(raw);
+    let mut s = Scanner::new(raw);
     (
-        p.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)),
-        p.rest(),
+        s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)),
+        s.rest(),
     )
 }

@@ -104,11 +104,11 @@ fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
 /// Splits a string into a vector of lines (respecting Unicode & Windows line
 /// breaks).
 pub fn split_lines(text: &str) -> Vec<String> {
-    let mut p = CharParser::new(text);
+    let mut s = Scanner::new(text);
     let mut line = String::new();
     let mut lines = Vec::new();

-    while let Some(c) = p.eat_merging_crlf() {
+    while let Some(c) = s.eat_merging_crlf() {
         if is_newline_char(c) {
             lines.push(std::mem::take(&mut line));
         } else {
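A sketch of the expected behaviour of split_lines, assuming the trailing partial line is pushed after the loop (that part lies outside this hunk):

    let lines = split_lines("first\r\nsecond\nthird");
    assert_eq!(lines, vec!["first", "second", "third"]);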
@@ -1,18 +1,18 @@
-//! Low-level char parser.
+//! Low-level char-based scanner.

 use std::fmt::{self, Debug, Formatter};
 use std::slice::SliceIndex;
 use std::str::Chars;

-/// A low-level featureful char parser.
-pub struct CharParser<'s> {
+/// A low-level featureful char scanner.
+pub struct Scanner<'s> {
     src: &'s str,
     iter: Chars<'s>,
     index: usize,
 }

-impl<'s> CharParser<'s> {
-    /// Create a new char parser.
+impl<'s> Scanner<'s> {
+    /// Create a new char scanner.
     pub fn new(src: &'s str) -> Self {
         Self { src, iter: src.chars(), index: 0 }
     }
@@ -104,7 +104,7 @@ impl<'s> CharParser<'s> {
     }
 }

-impl<'s> CharParser<'s> {
+impl<'s> Scanner<'s> {
     /// Slice a part out of the source string.
     pub fn get<I>(&self, index: I) -> &'s str
     where
@@ -153,9 +153,9 @@ impl<'s> CharParser<'s> {
     }
 }

-impl Debug for CharParser<'_> {
+impl Debug for Scanner<'_> {
     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        write!(f, "CharParser({}|{})", self.eaten(), self.rest())
+        write!(f, "Scanner({}|{})", self.eaten(), self.rest())
     }
 }

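Before the tokens.rs hunks, a minimal usage sketch of the renamed type, restricted to methods that appear elsewhere in this commit (new, eat_while, eat_if, rest); the input string is an arbitrary example:

    let mut s = Scanner::new("pt: 12");
    let unit = s.eat_while(|c| c.is_ascii_alphabetic()); // "pt"
    let has_colon = s.eat_if(':');                       // true
    s.eat_while(|c| c.is_whitespace());
    let rest = s.rest();                                 // "12"

Because the scanner keeps the source &str alongside its byte index, methods like eaten_from and rest can hand back slices of the original string without copying.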
@@ -1,6 +1,6 @@
 //! Tokenization.

-use super::{is_newline_char, CharParser};
+use super::{is_newline_char, Scanner};
 use crate::length::Length;
 use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token};

@@ -9,7 +9,7 @@ use TokenMode::*;
 /// An iterator over the tokens of a string of source code.
 #[derive(Debug)]
 pub struct Tokens<'s> {
-    p: CharParser<'s>,
+    s: Scanner<'s>,
     mode: TokenMode,
     stack: Vec<TokenMode>,
 }
@@ -27,7 +27,7 @@ impl<'s> Tokens<'s> {
     /// Create a new token iterator with the given mode.
     pub fn new(src: &'s str, mode: TokenMode) -> Self {
         Self {
-            p: CharParser::new(src),
+            s: Scanner::new(src),
             mode,
             stack: vec![],
         }
@@ -48,7 +48,7 @@ impl<'s> Tokens<'s> {
     /// The position in the string at which the last token ends and next token
     /// will start.
     pub fn pos(&self) -> Pos {
-        self.p.index().into()
+        self.s.index().into()
     }
 }

@@ -57,15 +57,15 @@ impl<'s> Iterator for Tokens<'s> {

     /// Parse the next token in the source code.
     fn next(&mut self) -> Option<Self::Item> {
-        let start = self.p.index();
-        let token = match self.p.eat()? {
+        let start = self.s.index();
+        let token = match self.s.eat()? {
             // Whitespace.
             c if c.is_whitespace() => self.read_whitespace(c),

             // Comments.
-            '/' if self.p.eat_if('/') => self.read_line_comment(),
-            '/' if self.p.eat_if('*') => self.read_block_comment(),
-            '*' if self.p.eat_if('/') => Token::Invalid("*/"),
+            '/' if self.s.eat_if('/') => self.read_line_comment(),
+            '/' if self.s.eat_if('*') => self.read_block_comment(),
+            '*' if self.s.eat_if('/') => Token::Invalid("*/"),

             // Functions.
             '[' => Token::LeftBracket,
@@ -87,7 +87,7 @@ impl<'s> Iterator for Tokens<'s> {
             ':' if self.mode == Header => Token::Colon,
             ',' if self.mode == Header => Token::Comma,
             '=' if self.mode == Header => Token::Equals,
-            '>' if self.mode == Header && self.p.eat_if('>') => Token::Chain,
+            '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain,

             // Expressions in headers.
             '+' if self.mode == Header => Token::Plus,
@@ -101,7 +101,7 @@ impl<'s> Iterator for Tokens<'s> {
             _ => self.read_text_or_expr(start),
         };

-        let end = self.p.index();
+        let end = self.s.index();
         Some(token.span_with(Span::new(start, end)))
     }
 }
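After the rename, driving the tokenizer looks roughly like this; a sketch that assumes TokenMode also has a Body variant for markup (only Header appears in these hunks):

    // Tokens implements Iterator; each item is a token paired with its span.
    for spanned in Tokens::new("Hello // comment", TokenMode::Body) {
        let _ = spanned; // e.g. inspect or collect the Spanned<Token> values
    }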
@@ -109,21 +109,21 @@ impl<'s> Iterator for Tokens<'s> {
 impl<'s> Tokens<'s> {
     fn read_whitespace(&mut self, first: char) -> Token<'s> {
         // Shortcut for common case of exactly one space.
-        if first == ' ' && !self.p.check(|c| c.is_whitespace()) {
+        if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
             return Token::Space(0);
         }

         // Uneat the first char if it's a newline, so that it's counted in the
         // loop.
         if is_newline_char(first) {
-            self.p.uneat();
+            self.s.uneat();
         }

         // Count the number of newlines.
         let mut newlines = 0;
-        while let Some(c) = self.p.eat_merging_crlf() {
+        while let Some(c) = self.s.eat_merging_crlf() {
             if !c.is_whitespace() {
-                self.p.uneat();
+                self.s.uneat();
                 break;
             }

@@ -136,17 +136,17 @@ impl<'s> Tokens<'s> {
     }

     fn read_line_comment(&mut self) -> Token<'s> {
-        Token::LineComment(self.p.eat_until(is_newline_char))
+        Token::LineComment(self.s.eat_until(is_newline_char))
     }

     fn read_block_comment(&mut self) -> Token<'s> {
-        let start = self.p.index();
+        let start = self.s.index();

         let mut state = '_';
         let mut depth = 1;

         // Find the first `*/` that does not correspond to a nested `/*`.
-        while let Some(c) = self.p.eat() {
+        while let Some(c) = self.s.eat() {
             state = match (state, c) {
                 ('*', '/') => {
                     depth -= 1;
@@ -164,21 +164,21 @@ impl<'s> Tokens<'s> {
         }

         let terminated = depth == 0;
-        let end = self.p.index() - if terminated { 2 } else { 0 };
+        let end = self.s.index() - if terminated { 2 } else { 0 };

-        Token::BlockComment(self.p.get(start .. end))
+        Token::BlockComment(self.s.get(start .. end))
     }

     fn read_hex(&mut self) -> Token<'s> {
         // This parses more than the permissable 0-9, a-f, A-F character ranges
         // to provide nicer error messages later.
-        Token::Hex(self.p.eat_while(|c| c.is_ascii_alphanumeric()))
+        Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric()))
     }

     fn read_string(&mut self) -> Token<'s> {
         let mut escaped = false;
         Token::Str {
-            string: self.p.eat_until(|c| {
+            string: self.s.eat_until(|c| {
                 if c == '"' && !escaped {
                     true
                 } else {
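The depth counter above is what makes block comments nest. Below is a standalone sketch of the same two-character state machine; the ('/', '*') arm that increments depth is assumed, since it falls outside these hunks, and the function is illustrative rather than the actual method:

    /// Sketch (not the actual method): byte length of a block comment body,
    /// given the text that follows an opening `/*`; None if unterminated.
    fn block_comment_body_len(src: &str) -> Option<usize> {
        let mut state = '_';
        let mut depth = 1;
        for (i, c) in src.char_indices() {
            state = match (state, c) {
                ('*', '/') => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(i - 1); // exclude the closing `*/`
                    }
                    '_'
                }
                ('/', '*') => {
                    depth += 1;
                    '_'
                }
                _ => c,
            };
        }
        None
    }

    // block_comment_body_len("a /* nested */ b */ tail") == Some(17)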
@@ -186,21 +186,21 @@ impl<'s> Tokens<'s> {
                     false
                 }
             }),
-            terminated: self.p.eat_if('"'),
+            terminated: self.s.eat_if('"'),
         }
     }

     fn read_raw(&mut self) -> Token<'s> {
         let mut backticks = 1;
-        while self.p.eat_if('`') {
+        while self.s.eat_if('`') {
             backticks += 1;
         }

-        let start = self.p.index();
+        let start = self.s.index();

         let mut found = 0;
         while found < backticks {
-            match self.p.eat() {
+            match self.s.eat() {
                 Some('`') => found += 1,
                 Some(_) => found = 0,
                 None => break,
@@ -208,29 +208,29 @@ impl<'s> Tokens<'s> {
         }

         let terminated = found == backticks;
-        let end = self.p.index() - if terminated { found } else { 0 };
+        let end = self.s.index() - if terminated { found } else { 0 };

         Token::Raw {
-            raw: self.p.get(start .. end),
+            raw: self.s.get(start .. end),
             backticks,
             terminated,
         }
     }

     fn read_escaped(&mut self) -> Token<'s> {
-        if let Some(c) = self.p.peek() {
+        if let Some(c) = self.s.peek() {
             match c {
                 '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => {
-                    let start = self.p.index();
-                    self.p.eat_assert(c);
-                    Token::Text(&self.p.eaten_from(start))
+                    let start = self.s.index();
+                    self.s.eat_assert(c);
+                    Token::Text(&self.s.eaten_from(start))
                 }
-                'u' if self.p.peek_nth(1) == Some('{') => {
-                    self.p.eat_assert('u');
-                    self.p.eat_assert('{');
+                'u' if self.s.peek_nth(1) == Some('{') => {
+                    self.s.eat_assert('u');
+                    self.s.eat_assert('{');
                     Token::UnicodeEscape {
-                        sequence: self.p.eat_while(|c| c.is_ascii_hexdigit()),
-                        terminated: self.p.eat_if('}'),
+                        sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()),
+                        terminated: self.s.eat_if('}'),
                     }
                 }
                 c if c.is_whitespace() => Token::Backslash,
@@ -246,7 +246,7 @@ impl<'s> Tokens<'s> {
         let header = self.mode == Header;

         let mut last_was_e = false;
-        self.p.eat_until(|c| {
+        self.s.eat_until(|c| {
             let end = match c {
                 c if c.is_whitespace() => true,
                 '[' | ']' | '*' | '/' => true,
@@ -259,7 +259,7 @@ impl<'s> Tokens<'s> {
             end
         });

-        let read = self.p.eaten_from(start);
+        let read = self.s.eaten_from(start);
         if self.mode == Header {
             parse_expr(read)
         } else {
@@ -3,7 +3,7 @@
 use std::fmt::{self, Debug, Display, Formatter};

 use super::Pos;
-use crate::parse::{is_newline_char, CharParser};
+use crate::parse::{is_newline_char, Scanner};

 /// Enables conversion of byte position to locations.
 pub struct LineMap<'s> {
@@ -15,11 +15,11 @@ impl<'s> LineMap<'s> {
     /// Create a new line map for a source string.
     pub fn new(src: &'s str) -> Self {
         let mut line_starts = vec![Pos::ZERO];
-        let mut p = CharParser::new(src);
+        let mut s = Scanner::new(src);

-        while let Some(c) = p.eat_merging_crlf() {
+        while let Some(c) = s.eat_merging_crlf() {
             if is_newline_char(c) {
-                line_starts.push(p.index().into());
+                line_starts.push(s.index().into());
             }
         }
