mirror of
https://github.com/typst/typst
synced 2025-05-14 04:56:26 +08:00
Implement low-level char parser 🥜
This commit is contained in:
parent
38607b8bea
commit
4b9bc66028
171
src/parse/chars.rs
Normal file
@ -0,0 +1,171 @@
//! Low-level char parser.

use std::fmt::{self, Debug, Formatter};
use std::slice::SliceIndex;
use std::str::Chars;

/// A low-level featureful char parser.
pub struct CharParser<'s> {
    src: &'s str,
    iter: Chars<'s>,
    index: usize,
}

impl<'s> CharParser<'s> {
    /// Create a new char parser.
    pub fn new(src: &'s str) -> Self {
        Self { src, iter: src.chars(), index: 0 }
    }

    /// Consume the next char.
    pub fn eat(&mut self) -> Option<char> {
        let next = self.iter.next();
        if let Some(c) = next {
            self.index += c.len_utf8();
        }
        next
    }

    /// Consume the next char if it is the given one.
    ///
    /// Returns whether the char was consumed.
    pub fn eat_if(&mut self, c: char) -> bool {
        // Don't decode the char twice through peek() and eat().
        //
        // TODO: Benchmark this vs. the naive version.
        if self.iter.next() == Some(c) {
            self.index += c.len_utf8();
            true
        } else {
            self.reset();
            false
        }
    }

    /// Consume the next char, debug-asserting that it is the given one.
    pub fn eat_assert(&mut self, c: char) {
        let next = self.eat();
        debug_assert_eq!(next, Some(c));
    }

    /// Consume the next char, coalescing `\r\n` to just `\n`.
    pub fn eat_merging_crlf(&mut self) -> Option<char> {
        let c = self.eat();
        if c == Some('\r') && self.eat_if('\n') {
            Some('\n')
        } else {
            c
        }
    }

    /// Eat chars while the condition is true.
    pub fn eat_while(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str {
        self.eat_until(|c| !f(c))
    }

    /// Eat chars until the condition is true.
    pub fn eat_until(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str {
        let start = self.index;
        while let Some(c) = self.iter.next() {
            if f(c) {
                // Undo the previous `next()` without peeking all the time
                // during iteration.
                //
                // TODO: Benchmark this vs. the naive peeking version.
                self.reset();
                break;
            }
            self.index += c.len_utf8();
        }
        &self.src[start .. self.index]
    }

    /// Uneat the last eaten character.
    pub fn uneat(&mut self) {
        self.index = self.prev_index();
        self.reset();
    }

    /// Peek at the next char without consuming it.
    pub fn peek(&self) -> Option<char> {
        self.iter.clone().next()
    }

    /// Peek at the nth-next char without consuming anything.
    pub fn peek_nth(&self, n: usize) -> Option<char> {
        self.iter.clone().nth(n)
    }

    /// Checks whether the next character fulfills a condition.
    ///
    /// Returns `false` if there is no next character.
    pub fn check(&self, f: impl FnMut(char) -> bool) -> bool {
        self.peek().map(f).unwrap_or(false)
    }
}

impl<'s> CharParser<'s> {
    /// Slice a part out of the source string.
    pub fn get<I>(&self, index: I) -> &'s str
    where
        I: SliceIndex<str, Output = str>,
    {
        &self.src[index]
    }

    /// The full source string.
    pub fn src(&self) -> &'s str {
        self.src
    }

    /// The full string up to the current index.
    pub fn eaten(&self) -> &'s str {
        &self.src[.. self.index]
    }

    /// The string from `start` to the current index.
    pub fn eaten_from(&self, start: usize) -> &'s str {
        &self.src[start .. self.index]
    }

    /// The remaining string after the current index.
    pub fn rest(&self) -> &'s str {
        &self.src[self.index ..]
    }

    /// The current index in the string.
    pub fn index(&self) -> usize {
        self.index
    }

    /// The previous index in the string.
    pub fn prev_index(&self) -> usize {
        self.src[.. self.index]
            .chars()
            .next_back()
            .map(|c| self.index - c.len_utf8())
            .unwrap_or(0)
    }

    /// Go back to where the index says.
    fn reset(&mut self) {
        self.iter = self.src[self.index ..].chars();
    }
}

impl Debug for CharParser<'_> {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(f, "CharParser({}|{})", self.eaten(), self.rest())
    }
}

/// Whether this character denotes a newline.
pub fn is_newline_char(character: char) -> bool {
    match character {
        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
        '\n' | '\x0B' | '\x0C' | '\r' |
        // Next Line, Line Separator, Paragraph Separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
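A quick usage sketch of the new parser (illustrative only, not part of the commit; it assumes the CharParser API defined above and the re-export added to the parse module below):

fn charparser_demo() {
    let mut p = CharParser::new("abc\r\nrest");
    assert_eq!(p.eat(), Some('a'));                        // consume one char; index advances by its UTF-8 length
    assert_eq!(p.eat_while(|c| c.is_alphabetic()), "bc");  // returns the eaten slice
    assert_eq!(p.eat_merging_crlf(), Some('\n'));          // "\r\n" is coalesced into a single '\n'
    assert_eq!(p.eaten(), "abc\r\n");                      // everything before the current index
    assert_eq!(p.rest(), "rest");                          // everything after it
}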
@ -1,8 +1,11 @@
 //! Parsing and tokenization.
 
-mod postprocess;
+mod chars;
+mod resolve;
 mod tokens;
 
+pub use chars::*;
+pub use resolve::*;
 pub use tokens::*;
 
 use std::str::FromStr;
@ -110,16 +113,7 @@ impl Parser<'_> {
             error!(@self.feedback, end, "expected backtick(s)");
         }
 
-        let raw = if backticks > 1 {
-            postprocess::process_raw(raw)
-        } else {
-            Raw {
-                lang: None,
-                lines: postprocess::split_lines(raw),
-                inline: true,
-            }
-        };
-
+        let raw = resolve::resolve_raw(raw, backticks);
         self.with_span(SyntaxNode::Raw(raw))
     }
@ -131,10 +125,11 @@ impl Parser<'_> {
             error!(@self.feedback, end, "expected closing brace");
         }
 
-        if let Some(c) = postprocess::hex_to_char(sequence) {
+        if let Some(c) = resolve::resolve_hex(sequence) {
             self.with_span(SyntaxNode::Text(c.to_string()))
         } else {
             error!(@self.feedback, token.span, "invalid unicode escape sequence");
+            // TODO: Decide whether to render the escape sequence.
             self.eat();
             return None;
         }
@ -407,7 +402,7 @@ impl Parser<'_> {
         if !terminated {
             self.expected_at("quote", span.end);
         }
-        self.with_span(Expr::Str(postprocess::unescape_string(string)))
+        self.with_span(Expr::Str(resolve::resolve_string(string)))
     }
 
     Token::Bool(b) => self.with_span(Expr::Bool(b)),
@ -1,95 +1,79 @@
-//! Post-processing of strings and raw blocks.
+//! Resolve strings and raw blocks.
 
-use super::is_newline_char;
+use super::{is_newline_char, CharParser};
 use crate::syntax::{Ident, Raw};
 
 /// Resolves all escape sequences in a string.
-pub fn unescape_string(string: &str) -> String {
-    let mut iter = string.chars().peekable();
+pub fn resolve_string(string: &str) -> String {
     let mut out = String::with_capacity(string.len());
+    let mut p = CharParser::new(string);
 
-    while let Some(c) = iter.next() {
+    while let Some(c) = p.eat() {
        if c != '\\' {
            out.push(c);
            continue;
        }
 
-        match iter.next() {
+        let start = p.prev_index();
+        match p.eat() {
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
 
            Some('n') => out.push('\n'),
            Some('t') => out.push('\t'),
-            Some('u') if iter.peek() == Some(&'{') => {
-                iter.next();
-
+            Some('u') if p.eat_if('{') => {
                // TODO: Feedback if closing brace is missing.
-                let mut sequence = String::new();
-                let terminated = loop {
-                    match iter.peek() {
-                        Some('}') => {
-                            iter.next();
-                            break true;
-                        }
-                        Some(&c) if c.is_ascii_hexdigit() => {
-                            iter.next();
-                            sequence.push(c);
-                        }
-                        _ => break false,
-                    }
-                };
+                let sequence = p.eat_while(|c| c.is_ascii_hexdigit());
+                let _terminated = p.eat_if('}');
 
-                if let Some(c) = hex_to_char(&sequence) {
+                if let Some(c) = resolve_hex(sequence) {
                    out.push(c);
                } else {
                    // TODO: Feedback that escape sequence is wrong.
-                    out.push_str("\\u{");
-                    out.push_str(&sequence);
-                    if terminated {
-                        out.push('}');
-                    }
+                    out += p.eaten_from(start);
                }
            }
 
-            other => {
-                out.push('\\');
-                out.extend(other);
-            }
+            // TODO: Feedback about invalid escape sequence.
+            _ => out += p.eaten_from(start),
        }
    }
 
    out
 }
 
+/// Resolve a hexademical escape sequence (only the inner hex letters without
+/// braces or `\u`) into a character.
+pub fn resolve_hex(sequence: &str) -> Option<char> {
+    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
 /// Resolves the language tag and trims the raw text.
-///
-/// Returns:
-/// - The language tag
-/// - The raw lines
-/// - Whether at least one newline was present in the untrimmed text.
-pub fn process_raw(raw: &str) -> Raw {
-    let (lang, inner) = split_after_lang_tag(raw);
-    let (lines, had_newline) = trim_and_split_raw(inner);
-    Raw { lang, lines, inline: !had_newline }
+pub fn resolve_raw(raw: &str, backticks: usize) -> Raw {
+    if backticks > 1 {
+        let (tag, inner) = split_at_lang_tag(raw);
+        let (lines, had_newline) = trim_and_split_raw(inner);
+        Raw {
+            lang: Ident::new(tag),
+            lines,
+            inline: !had_newline,
+        }
+    } else {
+        Raw {
+            lang: None,
+            lines: split_lines(raw),
+            inline: true,
+        }
+    }
 }
 
 /// Parse the lang tag and return it alongside the remaining inner raw text.
-fn split_after_lang_tag(raw: &str) -> (Option<Ident>, &str) {
-    let mut lang = String::new();
-    let mut inner = raw;
-    let mut iter = raw.chars();
-    while let Some(c) = iter.next() {
-        if c == '`' || c.is_whitespace() || is_newline_char(c) {
-            break;
-        }
-
-        inner = iter.as_str();
-        lang.push(c);
-    }
-
-    (Ident::new(lang), inner)
+fn split_at_lang_tag(raw: &str) -> (&str, &str) {
+    let mut p = CharParser::new(raw);
+    (
+        p.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)),
+        p.rest(),
+    )
 }
 
 /// Trims raw text and splits it into lines.
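For illustration, the observable behavior of the new functions on a few inputs (a sketch derived from the code above, not tests shipped in this commit):

fn resolve_demo() {
    // Simple escapes are rewritten to the characters they name.
    assert_eq!(resolve_string(r"hello\nworld"), "hello\nworld");
    assert_eq!(resolve_string(r#"say \"hi\""#), r#"say "hi""#);
    // `\u{...}` sequences go through resolve_hex.
    assert_eq!(resolve_hex("1f30d"), Some('🌍'));
    assert_eq!(resolve_hex("zzz"), None);
    // Unknown or malformed escapes are kept verbatim via `eaten_from(start)`.
    assert_eq!(resolve_string(r"\z"), r"\z");
}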
@ -117,18 +101,15 @@ fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
     (lines, had_newline)
 }
 
-/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks).
+/// Splits a string into a vector of lines (respecting Unicode & Windows line
+/// breaks).
 pub fn split_lines(text: &str) -> Vec<String> {
-    let mut iter = text.chars().peekable();
+    let mut p = CharParser::new(text);
     let mut line = String::new();
     let mut lines = Vec::new();
 
-    while let Some(c) = iter.next() {
+    while let Some(c) = p.eat_merging_crlf() {
         if is_newline_char(c) {
-            if c == '\r' && iter.peek() == Some(&'\n') {
-                iter.next();
-            }
-
             lines.push(std::mem::take(&mut line));
         } else {
             line.push(c);
@ -139,11 +120,6 @@ pub fn split_lines(text: &str) -> Vec<String> {
     lines
 }
 
-/// Converts a hexademical sequence (without braces or "\u") into a character.
-pub fn hex_to_char(sequence: &str) -> Option<char> {
-    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
-}
-
 #[cfg(test)]
 #[rustfmt::skip]
 mod tests {
@ -152,7 +128,7 @@ mod tests {
     #[test]
     fn test_unescape_strings() {
         fn test(string: &str, expected: &str) {
-            assert_eq!(unescape_string(string), expected.to_string());
+            assert_eq!(resolve_string(string), expected.to_string());
         }
 
         test(r#"hello world"#, "hello world");
@ -170,19 +146,17 @@
     }
 
     #[test]
-    fn test_split_after_lang_tag() {
-        fn test(raw: &str, lang: Option<&str>, inner: &str) {
-            let (found_lang, found_inner) = split_after_lang_tag(raw);
-            assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang);
-            assert_eq!(found_inner, inner);
+    fn test_split_at_lang_tag() {
+        fn test(raw: &str, lang: &str, inner: &str) {
+            assert_eq!(split_at_lang_tag(raw), (lang, inner));
         }
 
-        test("typst it!", Some("typst"), " it!");
-        test("typst\n it!", Some("typst"), "\n it!");
-        test("typst\n it!", Some("typst"), "\n it!");
-        test("abc`", Some("abc"), "`");
-        test(" hi", None, " hi");
-        test("`", None, "`");
+        test("typst it!", "typst", " it!");
+        test("typst\n it!", "typst", "\n it!");
+        test("typst\n it!", "typst", "\n it!");
+        test("abc`", "abc", "`");
+        test(" hi", "", " hi");
+        test("`", "", "`");
     }
 
     #[test]
@ -1,23 +1,17 @@
 //! Tokenization.
 
-use std::iter::Peekable;
-use std::str::Chars;
-use unicode_xid::UnicodeXID;
-
+use super::{is_newline_char, CharParser};
 use crate::length::Length;
-use crate::syntax::{Pos, Span, SpanWith, Spanned, Token};
+use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token};
 
-use Token::*;
 use TokenMode::*;
 
 /// An iterator over the tokens of a string of source code.
 #[derive(Debug)]
 pub struct Tokens<'s> {
-    src: &'s str,
-    iter: Peekable<Chars<'s>>,
+    p: CharParser<'s>,
     mode: TokenMode,
     stack: Vec<TokenMode>,
-    index: usize,
 }
 
 /// Whether to tokenize in header mode which yields expression, comma and
|
|||||||
/// Create a new token iterator with the given mode.
|
/// Create a new token iterator with the given mode.
|
||||||
pub fn new(src: &'s str, mode: TokenMode) -> Self {
|
pub fn new(src: &'s str, mode: TokenMode) -> Self {
|
||||||
Self {
|
Self {
|
||||||
src,
|
p: CharParser::new(src),
|
||||||
iter: src.chars().peekable(),
|
|
||||||
mode,
|
mode,
|
||||||
stack: vec![],
|
stack: vec![],
|
||||||
index: 0,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -56,7 +48,7 @@ impl<'s> Tokens<'s> {
|
|||||||
/// The position in the string at which the last token ends and next token
|
/// The position in the string at which the last token ends and next token
|
||||||
/// will start.
|
/// will start.
|
||||||
pub fn pos(&self) -> Pos {
|
pub fn pos(&self) -> Pos {
|
||||||
self.index.into()
|
self.p.index().into()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,183 +57,153 @@ impl<'s> Iterator for Tokens<'s> {
 
     /// Parse the next token in the source code.
     fn next(&mut self) -> Option<Self::Item> {
-        let start = self.pos();
-        let first = self.eat()?;
-
-        let token = match first {
-            // Comments.
-            '/' if self.peek() == Some('/') => self.read_line_comment(),
-            '/' if self.peek() == Some('*') => self.read_block_comment(),
-            '*' if self.peek() == Some('/') => {
-                self.eat();
-                Invalid("*/")
-            }
-
-            // Whitespace.
-            c if c.is_whitespace() => self.read_whitespace(c),
-
-            // Functions and blocks.
-            '[' => LeftBracket,
-            ']' => RightBracket,
-            '{' => LeftBrace,
-            '}' => RightBrace,
-
-            // Syntactic elements in function headers.
-            '(' if self.mode == Header => LeftParen,
-            ')' if self.mode == Header => RightParen,
-            ':' if self.mode == Header => Colon,
-            ',' if self.mode == Header => Comma,
-            '=' if self.mode == Header => Equals,
-            '>' if self.mode == Header && self.peek() == Some('>') => self.read_chain(),
-
-            // Expression operators.
-            '+' if self.mode == Header => Plus,
-            '-' if self.mode == Header => Hyphen,
-            '/' if self.mode == Header => Slash,
-
-            // Star serves a double purpose as a style modifier
-            // and a expression operator in the header.
-            '*' => Star,
-
-            // A hex expression.
-            '#' if self.mode == Header => self.read_hex(),
-
-            // String values.
-            '"' if self.mode == Header => self.read_string(),
-
-            // Style toggles.
-            '_' if self.mode == Body => Underscore,
-            '`' if self.mode == Body => self.read_raw(),
-
-            // Sections.
-            '#' if self.mode == Body => Hashtag,
-
-            // Non-breaking spaces.
-            '~' if self.mode == Body => Text("\u{00A0}"),
-
-            // An escaped thing.
-            '\\' if self.mode == Body => self.read_escaped(),
-
-            // Expressions or just strings.
-            c => {
-                let body = self.mode == Body;
-                let start_offset = -(c.len_utf8() as isize);
-                let mut last_was_e = false;
-
-                let (text, _) = self.read_string_until(false, start_offset, 0, |n| {
-                    let val = match n {
-                        c if c.is_whitespace() => true,
-                        '[' | ']' | '{' | '}' | '/' | '*' => true,
-                        '\\' | '_' | '`' | '#' | '~' if body => true,
-                        ':' | '=' | ',' | '"' | '(' | ')' if !body => true,
-                        '+' | '-' if !body && !last_was_e => true,
-                        _ => false,
-                    };
-
-                    last_was_e = n == 'e' || n == 'E';
-                    val
-                });
-
-                if self.mode == Header {
-                    self.read_expr(text)
-                } else {
-                    Text(text)
-                }
-            }
-        };
-
-        let end = self.pos();
-
+        let start = self.p.index();
+        let token = match self.p.eat()? {
+            // Whitespace.
+            c if c.is_whitespace() => self.read_whitespace(c),
+
+            // Comments.
+            '/' if self.p.eat_if('/') => self.read_line_comment(),
+            '/' if self.p.eat_if('*') => self.read_block_comment(),
+            '*' if self.p.eat_if('/') => Token::Invalid("*/"),
+
+            // Functions.
+            '[' => Token::LeftBracket,
+            ']' => Token::RightBracket,
+            '{' => Token::LeftBrace,
+            '}' => Token::RightBrace,
+
+            // Syntactic elements in body text.
+            '_' if self.mode == Body => Token::Underscore,
+            '`' if self.mode == Body => self.read_raw(),
+            '#' if self.mode == Body => Token::Hashtag,
+            '~' if self.mode == Body => Token::Text("\u{00A0}"),
+            '\\' if self.mode == Body => self.read_escaped(),
+
+            // Syntactic elements in headers.
+            '(' if self.mode == Header => Token::LeftParen,
+            ')' if self.mode == Header => Token::RightParen,
+            ':' if self.mode == Header => Token::Colon,
+            ',' if self.mode == Header => Token::Comma,
+            '=' if self.mode == Header => Token::Equals,
+            '>' if self.mode == Header && self.p.eat_if('>') => Token::Chain,
+
+            // Expressions.
+            '+' if self.mode == Header => Token::Plus,
+            '-' if self.mode == Header => Token::Hyphen,
+            '/' if self.mode == Header => Token::Slash,
+            '#' if self.mode == Header => self.read_hex(),
+            '"' if self.mode == Header => self.read_string(),
+
+            // Star serves a double purpose as a style modifier
+            // and a expression operator in the header.
+            '*' => Token::Star,
+
+            // Expressions or just plain text.
+            _ => self.read_text_or_expr(start),
+        };
+
+        let end = self.p.index();
         Some(token.span_with(Span::new(start, end)))
     }
 }
 
 impl<'s> Tokens<'s> {
+    fn read_whitespace(&mut self, first: char) -> Token<'s> {
+        // Shortcut for common case of exactly one space.
+        if first == ' ' && !self.p.check(|c| c.is_whitespace()) {
+            return Token::Space(0);
+        }
+
+        // Uneat the first char if it's a newline, so it's counted in the loop.
+        if is_newline_char(first) {
+            self.p.uneat();
+        }
+
+        // Count the number of newlines.
+        let mut newlines = 0;
+        while let Some(c) = self.p.eat_merging_crlf() {
+            if !c.is_whitespace() {
+                self.p.uneat();
+                break;
+            }
+
+            if is_newline_char(c) {
+                newlines += 1;
+            }
+        }
+
+        Token::Space(newlines)
+    }
+
     fn read_line_comment(&mut self) -> Token<'s> {
-        self.eat();
-        LineComment(self.read_string_until(false, 0, 0, is_newline_char).0)
+        Token::LineComment(self.p.eat_until(is_newline_char))
     }
 
     fn read_block_comment(&mut self) -> Token<'s> {
-        enum Last {
-            Slash,
-            Star,
-            Other,
-        }
-
-        let mut depth = 0;
-        let mut last = Last::Other;
-
-        // Find the first `*/` that does not correspond to a nested `/*`.
-        // Remove the last two bytes to obtain the raw inner text without `*/`.
-        self.eat();
-        let (content, _) = self.read_string_until(true, 0, -2, |c| {
-            match c {
-                '/' => match last {
-                    Last::Star if depth == 0 => return true,
-                    Last::Star => depth -= 1,
-                    _ => last = Last::Slash,
-                },
-                '*' => match last {
-                    Last::Slash => depth += 1,
-                    _ => last = Last::Star,
-                },
-                _ => last = Last::Other,
-            }
-
-            false
-        });
-
-        BlockComment(content)
-    }
-
-    fn read_chain(&mut self) -> Token<'s> {
-        assert!(self.eat() == Some('>'));
-        Chain
-    }
-
-    fn read_whitespace(&mut self, mut c: char) -> Token<'s> {
-        let mut newlines = 0;
-
-        loop {
-            if is_newline_char(c) {
-                if c == '\r' && self.peek() == Some('\n') {
-                    self.eat();
-                }
-
-                newlines += 1;
-            }
-
-            match self.peek() {
-                Some(n) if n.is_whitespace() => {
-                    self.eat();
-                    c = n;
-                }
-                _ => break,
-            }
-        }
-
-        Space(newlines)
+        let start = self.p.index();
+        let mut depth = 1;
+        let mut state = ' ';
+
+        // Find the first `*/` that does not correspond to a nested `/*`.
+        while let Some(c) = self.p.eat() {
+            state = match (state, c) {
+                ('*', '/') if depth == 1 => {
+                    depth = 0;
+                    break;
+                }
+                ('*', '/') => {
+                    depth -= 1;
+                    ' '
+                }
+                ('/', '*') => {
+                    depth += 1;
+                    ' '
+                }
+                _ => c,
+            }
+        }
+
+        let mut read = self.p.eaten_from(start);
+        if depth == 0 {
+            read = read.strip_suffix("*/").unwrap_or(read);
+        }
+
+        Token::BlockComment(read)
+    }
+
+    fn read_hex(&mut self) -> Token<'s> {
+        // This parses more than the permissable 0-9, a-f, A-F character ranges
+        // to provide nicer error messages later.
+        Token::Hex(self.p.eat_while(|c| c.is_ascii_alphanumeric()))
     }
 
     fn read_string(&mut self) -> Token<'s> {
-        let (string, terminated) = self.read_until_unescaped('"');
-        Str { string, terminated }
+        let mut escaped = false;
+        Token::Str {
+            string: self.p.eat_until(|c| {
+                if c == '"' && !escaped {
+                    true
+                } else {
+                    escaped = c == '\\' && !escaped;
+                    false
+                }
+            }),
+            terminated: self.p.eat_if('"'),
+        }
     }
 
     fn read_raw(&mut self) -> Token<'s> {
         let mut backticks = 1;
-        while self.peek() == Some('`') {
-            self.eat();
+        while self.p.eat_if('`') {
             backticks += 1;
         }
 
-        let start = self.index;
-
+        let start = self.p.index();
         let mut found = 0;
         while found < backticks {
-            match self.eat() {
+            match self.p.eat() {
                 Some('`') => found += 1,
                 Some(_) => found = 0,
                 None => break,
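The nested block-comment scan above can be illustrated in isolation. This standalone sketch (not from the commit) uses the same (state, c) match and depth counter to find the `*/` that closes the outermost `/*`:

fn inner_block_comment(src: &str) -> &str {
    // `src` is the text that follows the opening `/*`.
    let mut depth = 1;
    let mut state = ' ';
    let mut end = src.len();
    for (i, c) in src.char_indices() {
        state = match (state, c) {
            ('*', '/') if depth == 1 => {
                end = i - 1; // drop the closing `*/`
                break;
            }
            ('*', '/') => { depth -= 1; ' ' }
            ('/', '*') => { depth += 1; ' ' }
            _ => c,
        };
    }
    &src[.. end]
}

// inner_block_comment("a/*b*/c*/ tail") == "a/*b*/c", mirroring the
// t!(Body, "/*/*abc*/" => BC("/*abc*/")) expectation added to the tests below.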
@ -249,134 +211,83 @@ impl<'s> Tokens<'s> {
             }
         }
 
         let terminated = found == backticks;
-        let end = self.index - if terminated { found } else { 0 };
+        let end = self.p.index() - if terminated { found } else { 0 };
 
-        Raw {
-            raw: &self.src[start .. end],
+        Token::Raw {
+            raw: self.p.get(start .. end),
             backticks,
             terminated,
         }
     }
 
-    fn read_until_unescaped(&mut self, end: char) -> (&'s str, bool) {
-        let mut escaped = false;
-        self.read_string_until(true, 0, -1, |c| {
-            match c {
-                c if c == end && !escaped => return true,
-                '\\' => escaped = !escaped,
-                _ => escaped = false,
-            }
-
-            false
-        })
-    }
-
     fn read_escaped(&mut self) -> Token<'s> {
-        fn is_escapable(c: char) -> bool {
-            match c {
-                '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => true,
-                _ => false,
-            }
-        }
-
-        match self.peek() {
-            Some('u') => {
-                self.eat();
-                if self.peek() == Some('{') {
-                    self.eat();
-                    let (sequence, _) =
-                        self.read_string_until(false, 0, 0, |c| !c.is_ascii_hexdigit());
-
-                    let terminated = self.peek() == Some('}');
-                    if terminated {
-                        self.eat();
-                    }
-
-                    UnicodeEscape { sequence, terminated }
-                } else {
-                    Text("\\u")
-                }
-            }
-            Some(c) if is_escapable(c) => {
-                let index = self.index;
-                self.eat();
-                Text(&self.src[index .. index + c.len_utf8()])
-            }
-            Some(c) if c.is_whitespace() => Backslash,
-            Some(_) => Text("\\"),
-            None => Backslash,
-        }
-    }
-
-    fn read_hex(&mut self) -> Token<'s> {
-        // This will parse more than the permissable 0-9, a-f, A-F character
-        // ranges to provide nicer error messages later.
-        Hex(self.read_string_until(false, 0, 0, |n| !n.is_ascii_alphanumeric()).0)
-    }
-
-    fn read_expr(&mut self, text: &'s str) -> Token<'s> {
-        if let Ok(b) = text.parse::<bool>() {
-            Bool(b)
-        } else if let Ok(num) = text.parse::<f64>() {
-            Number(num)
-        } else if let Some(num) = parse_percentage(text) {
-            Number(num / 100.0)
-        } else if let Ok(length) = text.parse::<Length>() {
-            Length(length)
-        } else if is_identifier(text) {
-            Ident(text)
-        } else {
-            Invalid(text)
-        }
-    }
-
-    /// Will read the input stream until `f` evaluates to `true`. When
-    /// `eat_match` is true, the token for which `f` was true is consumed.
-    /// Returns the string from the index where this was called offset by
-    /// `offset_start` to the end offset by `offset_end`. The end is before or
-    /// after the match depending on `eat_match`.
-    fn read_string_until(
-        &mut self,
-        eat_match: bool,
-        offset_start: isize,
-        offset_end: isize,
-        mut f: impl FnMut(char) -> bool,
-    ) -> (&'s str, bool) {
-        let start = ((self.index as isize) + offset_start) as usize;
-        let mut matched = false;
-
-        while let Some(c) = self.peek() {
-            if f(c) {
-                matched = true;
-                if eat_match {
-                    self.eat();
-                }
-                break;
-            }
-
-            self.eat();
-        }
-
-        let mut end = self.index;
-        if matched {
-            end = ((end as isize) + offset_end) as usize;
-        }
-
-        (&self.src[start .. end], matched)
-    }
-
-    fn eat(&mut self) -> Option<char> {
-        let c = self.iter.next()?;
-        self.index += c.len_utf8();
-        Some(c)
-    }
-
-    fn peek(&mut self) -> Option<char> {
-        self.iter.peek().copied()
-    }
-}
-
-fn parse_percentage(text: &str) -> Option<f64> {
+        if let Some(c) = self.p.peek() {
+            match c {
+                '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => {
+                    let start = self.p.index();
+                    self.p.eat_assert(c);
+                    Token::Text(&self.p.eaten_from(start))
+                }
+                'u' if self.p.peek_nth(1) == Some('{') => {
+                    self.p.eat_assert('u');
+                    self.p.eat_assert('{');
+                    Token::UnicodeEscape {
+                        sequence: self.p.eat_while(|c| c.is_ascii_hexdigit()),
+                        terminated: self.p.eat_if('}'),
+                    }
+                }
+                c if c.is_whitespace() => Token::Backslash,
+                _ => Token::Text("\\"),
+            }
+        } else {
+            Token::Backslash
+        }
+    }
+
+    fn read_text_or_expr(&mut self, start: usize) -> Token<'s> {
+        let body = self.mode == Body;
+        let header = self.mode == Header;
+
+        let mut last_was_e = false;
+        self.p.eat_until(|c| {
+            let end = match c {
+                c if c.is_whitespace() => true,
+                '[' | ']' | '*' | '/' => true,
+                '_' | '`' | '~' | '\\' if body => true,
+                '(' | ')' | '{' | '}' | ':' | ',' | '=' | '"' | '#' if header => true,
+                '+' | '-' if header && !last_was_e => true,
+                _ => false,
+            };
+            last_was_e = c == 'e' || c == 'E';
+            end
+        });
+
+        let read = self.p.eaten_from(start);
+        if self.mode == Header {
+            parse_expr(read)
+        } else {
+            Token::Text(read)
+        }
+    }
+}
+
+fn parse_expr(text: &str) -> Token<'_> {
+    if let Ok(b) = text.parse::<bool>() {
+        Token::Bool(b)
+    } else if let Ok(num) = text.parse::<f64>() {
+        Token::Number(num)
+    } else if let Some(num) = parse_percent(text) {
+        Token::Number(num / 100.0)
+    } else if let Ok(length) = text.parse::<Length>() {
+        Token::Length(length)
+    } else if Ident::is_ident(text) {
+        Token::Ident(text)
+    } else {
+        Token::Invalid(text)
+    }
+}
+
+fn parse_percent(text: &str) -> Option<f64> {
     if text.ends_with('%') {
         text[.. text.len() - 1].parse::<f64>().ok()
     } else {
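As a small aside (not part of the commit), the percentage handling that parse_percent and parse_expr perform together amounts to this self-contained sketch:

fn percent_to_number(text: &str) -> Option<f64> {
    if text.ends_with('%') {
        // Strip the percent sign, parse the rest as f64, and scale it down by 100,
        // which is what parse_expr does with the value parse_percent returns.
        text[.. text.len() - 1].parse::<f64>().ok().map(|n| n / 100.0)
    } else {
        None
    }
}

// percent_to_number("120%") == Some(1.2) and percent_to_number("12e4%") == Some(1200.0),
// matching the Num(1.2) and Num(1200.0) expectations in the tests below.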
@ -384,39 +295,6 @@ fn parse_percentage(text: &str) -> Option<f64> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Whether this character denotes a newline.
|
|
||||||
pub fn is_newline_char(character: char) -> bool {
|
|
||||||
match character {
|
|
||||||
// Line Feed, Vertical Tab, Form Feed, Carriage Return.
|
|
||||||
'\x0A' ..= '\x0D' => true,
|
|
||||||
// Next Line, Line Separator, Paragraph Separator.
|
|
||||||
'\u{0085}' | '\u{2028}' | '\u{2029}' => true,
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Whether this word is a valid identifier.
|
|
||||||
pub fn is_identifier(string: &str) -> bool {
|
|
||||||
fn is_extra_allowed(c: char) -> bool {
|
|
||||||
c == '.' || c == '-' || c == '_'
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut chars = string.chars();
|
|
||||||
match chars.next() {
|
|
||||||
Some(c) if UnicodeXID::is_xid_start(c) || is_extra_allowed(c) => {}
|
|
||||||
_ => return false,
|
|
||||||
}
|
|
||||||
|
|
||||||
for c in chars {
|
|
||||||
match c {
|
|
||||||
c if UnicodeXID::is_xid_continue(c) || is_extra_allowed(c) => {}
|
|
||||||
_ => return false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[allow(non_snake_case)]
|
#[allow(non_snake_case)]
|
||||||
mod tests {
|
mod tests {
|
||||||
@ -428,7 +306,7 @@ mod tests {
|
|||||||
BlockComment as BC, Bool, Chain, Hex, Hyphen as Min, Ident as Id,
|
BlockComment as BC, Bool, Chain, Hex, Hyphen as Min, Ident as Id,
|
||||||
LeftBrace as LB, LeftBracket as L, LeftParen as LP, Length as Len,
|
LeftBrace as LB, LeftBracket as L, LeftParen as LP, Length as Len,
|
||||||
LineComment as LC, Number as Num, Plus, RightBrace as RB, RightBracket as R,
|
LineComment as LC, Number as Num, Plus, RightBrace as RB, RightBracket as R,
|
||||||
RightParen as RP, Slash, Space as S, Star, Text as T,
|
RightParen as RP, Slash, Space as S, Star, Text as T, *,
|
||||||
};
|
};
|
||||||
|
|
||||||
fn Str(string: &str, terminated: bool) -> Token {
|
fn Str(string: &str, terminated: bool) -> Token {
|
||||||
@ -482,10 +360,11 @@ mod tests {
|
|||||||
t!(Body, "/***/" => BC("*"));
|
t!(Body, "/***/" => BC("*"));
|
||||||
t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/"));
|
t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/"));
|
||||||
t!(Body, "/*abc" => BC("abc"));
|
t!(Body, "/*abc" => BC("abc"));
|
||||||
|
t!(Body, "/*/*abc*/" => BC("/*abc*/"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_body_only_tokens() {
|
fn tokenize_body_tokens() {
|
||||||
t!(Body, "_*" => Underscore, Star);
|
t!(Body, "_*" => Underscore, Star);
|
||||||
t!(Body, "***" => Star, Star, Star);
|
t!(Body, "***" => Star, Star, Star);
|
||||||
t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
|
t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
|
||||||
@ -517,40 +396,36 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_header_only_tokens() {
|
fn tokenize_header_tokens() {
|
||||||
t!(Body, "a: b" => T("a:"), S(0), T("b"));
|
t!(Header, "__main__" => Id("__main__"));
|
||||||
t!(Body, "c=d, " => T("c=d,"), S(0));
|
t!(Header, "_func_box" => Id("_func_box"));
|
||||||
t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
|
t!(Header, ">main" => Invalid(">main"));
|
||||||
|
t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
|
||||||
|
t!(Header, "{abc}" => LB, Id("abc"), RB);
|
||||||
|
t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP);
|
||||||
|
t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0)));
|
||||||
|
t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
|
||||||
|
t!(Header, "=3.14" => Equals, Num(3.14));
|
||||||
|
t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
|
||||||
t!(Header, "a:b" => Id("a"), Colon, Id("b"));
|
t!(Header, "a:b" => Id("a"), Colon, Id("b"));
|
||||||
t!(Header, "#6ae6dd" => Hex("6ae6dd"));
|
t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
|
||||||
t!(Header, "#8A083c" => Hex("8A083c"));
|
t!(Body, "c=d, " => T("c=d,"), S(0));
|
||||||
|
t!(Body, "a: b" => T("a:"), S(0), T("b"));
|
||||||
t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
|
t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
|
||||||
Id("x"), Equals, Num(1.0));
|
Id("x"), Equals, Num(1.0));
|
||||||
t!(Header, "=3.14" => Equals, Num(3.14));
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_numeric_values() {
|
||||||
t!(Header, "12.3e5" => Num(12.3e5));
|
t!(Header, "12.3e5" => Num(12.3e5));
|
||||||
t!(Header, "120%" => Num(1.2));
|
t!(Header, "120%" => Num(1.2));
|
||||||
t!(Header, "12e4%" => Num(1200.0));
|
t!(Header, "12e4%" => Num(1200.0));
|
||||||
t!(Header, "__main__" => Id("__main__"));
|
|
||||||
t!(Header, ">main" => Invalid(">main"));
|
|
||||||
t!(Header, ".func.box" => Id(".func.box"));
|
|
||||||
t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
|
|
||||||
t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
|
|
||||||
t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0)));
|
|
||||||
t!(Header, "1e5in" => Len(Length::inches(100000.0)));
|
t!(Header, "1e5in" => Len(Length::inches(100000.0)));
|
||||||
t!(Header, "2.3cm" => Len(Length::cm(2.3)));
|
t!(Header, "2.3cm" => Len(Length::cm(2.3)));
|
||||||
t!(Header, "12e-3in" => Len(Length::inches(12e-3)));
|
|
||||||
t!(Header, "6.1cm + 4pt,a=1*2" => Len(Length::cm(6.1)), S(0), Plus, S(0), Len(Length::pt(4.0)),
|
|
||||||
Comma, Id("a"), Equals, Num(1.0), Star, Num(2.0));
|
|
||||||
t!(Header, "(5 - 1) / 2.1" => LP, Num(5.0), S(0), Min, S(0), Num(1.0), RP,
|
|
||||||
S(0), Slash, S(0), Num(2.1));
|
|
||||||
t!(Header, "-1" => Min, Num(1.0));
|
|
||||||
t!(Header, "--1" => Min, Min, Num(1.0));
|
|
||||||
t!(Header, "- 1" => Min, S(0), Num(1.0));
|
|
||||||
t!(Header, "02.4mm" => Len(Length::mm(2.4)));
|
t!(Header, "02.4mm" => Len(Length::mm(2.4)));
|
||||||
t!(Header, "2.4.cm" => Invalid("2.4.cm"));
|
t!(Header, "2.4.cm" => Invalid("2.4.cm"));
|
||||||
t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP);
|
t!(Header, "#6ae6dd" => Hex("6ae6dd"));
|
||||||
t!(Header, "{abc}" => LB, Id("abc"), RB);
|
t!(Header, "#8A083c" => Hex("8A083c"));
|
||||||
t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -566,6 +441,18 @@ mod tests {
|
|||||||
t!(Header, "\"🌎\"" => Str("🌎", true));
|
t!(Header, "\"🌎\"" => Str("🌎", true));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_math() {
|
||||||
|
t!(Header, "12e-3in" => Len(Length::inches(12e-3)));
|
||||||
|
t!(Header, "-1" => Min, Num(1.0));
|
||||||
|
t!(Header, "--1" => Min, Min, Num(1.0));
|
||||||
|
t!(Header, "- 1" => Min, S(0), Num(1.0));
|
||||||
|
t!(Header, "6.1cm + 4pt,a=1*2" => Len(Length::cm(6.1)), S(0), Plus, S(0), Len(Length::pt(4.0)),
|
||||||
|
Comma, Id("a"), Equals, Num(1.0), Star, Num(2.0));
|
||||||
|
t!(Header, "(5 - 1) / 2.1" => LP, Num(5.0), S(0), Min, S(0), Num(1.0), RP,
|
||||||
|
S(0), Slash, S(0), Num(2.1));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_escaped_symbols() {
|
fn tokenize_escaped_symbols() {
|
||||||
t!(Body, r"\\" => T(r"\"));
|
t!(Body, r"\\" => T(r"\"));
|
||||||
@ -587,7 +474,7 @@ mod tests {
|
|||||||
t!(Body, r"\=" => T(r"\"), T("="));
|
t!(Body, r"\=" => T(r"\"), T("="));
|
||||||
t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
|
t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
|
||||||
t!(Body, r"\u{ " => UE("", false), Space(0));
|
t!(Body, r"\u{ " => UE("", false), Space(0));
|
||||||
t!(Body, r"\u" => T(r"\u"));
|
t!(Body, r"\u" => T("\\"), T("u"));
|
||||||
t!(Header, r"\\\\" => Invalid(r"\\\\"));
|
t!(Header, r"\\\\" => Invalid(r"\\\\"));
|
||||||
t!(Header, r"\a" => Invalid(r"\a"));
|
t!(Header, r"\a" => Invalid(r"\a"));
|
||||||
t!(Header, r"\:" => Invalid(r"\"), Colon);
|
t!(Header, r"\:" => Invalid(r"\"), Colon);
|
||||||
|
@ -3,7 +3,7 @@
 use std::fmt::{self, Debug, Display, Formatter};
 
 use super::Pos;
-use crate::parse::is_newline_char;
+use crate::parse::{is_newline_char, CharParser};
 
 /// Enables conversion of byte position to locations.
 pub struct LineMap<'s> {
@ -15,17 +15,11 @@ impl<'s> LineMap<'s> {
     /// Create a new line map for a source string.
     pub fn new(src: &'s str) -> Self {
         let mut line_starts = vec![Pos::ZERO];
-        let mut iter = src.char_indices().peekable();
+        let mut p = CharParser::new(src);
 
-        while let Some((mut i, c)) = iter.next() {
+        while let Some(c) = p.eat_merging_crlf() {
             if is_newline_char(c) {
-                i += c.len_utf8();
-                if c == '\r' && matches!(iter.peek(), Some((_, '\n'))) {
-                    i += '\n'.len_utf8();
-                    iter.next();
-                }
-
-                line_starts.push(Pos(i as u32));
+                line_starts.push(p.index().into());
             }
         }
 
@ -19,14 +19,15 @@ pub enum Token<'s> {
     LeftBracket,
     /// A right bracket ending a function invocation or body: `]`.
     RightBracket,
+    /// A left brace indicating the start of content: `{`.
+    LeftBrace,
+    /// A right brace indicating the end of content: `}`.
+    RightBrace,
     /// A left parenthesis in a function header: `(`.
     LeftParen,
     /// A right parenthesis in a function header: `)`.
     RightParen,
-    /// A left brace in a function header: `{`.
-    LeftBrace,
-    /// A right brace in a function header: `}`.
-    RightBrace,
     /// A double forward chevron in a function header: `>>`.
     Chain,
 
@ -2,6 +2,8 @@
 
 use std::fmt::{self, Debug, Formatter};
 
+use unicode_xid::UnicodeXID;
+
 use super::span::{SpanVec, SpanWith, Spanned};
 use super::Decoration;
 use crate::color::RgbaColor;
@ -9,7 +11,6 @@ use crate::compute::table::{SpannedEntry, Table};
 use crate::compute::value::{TableValue, Value};
 use crate::layout::LayoutContext;
 use crate::length::Length;
-use crate::parse::is_identifier;
 use crate::{DynFuture, Feedback};
 
 /// A collection of nodes which form a tree together with the nodes' children.
@ -233,7 +234,7 @@ pub struct Ident(pub String);
 impl Ident {
     /// Create a new identifier from a string checking that it is a valid.
     pub fn new(ident: impl AsRef<str> + Into<String>) -> Option<Self> {
-        if is_identifier(ident.as_ref()) {
+        if Self::is_ident(ident.as_ref()) {
             Some(Self(ident.into()))
         } else {
             None
@ -244,6 +245,20 @@ impl Ident {
|
|||||||
pub fn as_str(&self) -> &str {
|
pub fn as_str(&self) -> &str {
|
||||||
self.0.as_str()
|
self.0.as_str()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Whether the string is a valid identifier.
|
||||||
|
pub fn is_ident(string: &str) -> bool {
|
||||||
|
fn is_ok(c: char) -> bool {
|
||||||
|
c == '-' || c == '_'
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut chars = string.chars();
|
||||||
|
if matches!(chars.next(), Some(c) if c.is_xid_start() || is_ok(c)) {
|
||||||
|
chars.all(|c| c.is_xid_continue() || is_ok(c))
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Debug for Ident {
|
impl Debug for Ident {
|
||||||
|
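To illustrate the new validity check (a sketch, assuming the Ident type as defined above; the expectations mirror the tokenizer tests earlier in this commit):

fn ident_demo() {
    assert!(Ident::is_ident("__main__"));   // tokenized as Id("__main__")
    assert!(Ident::is_ident("_func_box"));
    assert!(!Ident::is_ident(">main"));     // tokenized as Invalid(">main")
    assert!(!Ident::is_ident("12_pt"));     // identifiers cannot start with a digit
}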