From d68367f32a9e698923b554984c59f0671e27ba5f Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Sat, 29 Aug 2020 17:20:04 +0200 Subject: [PATCH] =?UTF-8?q?Newlines=20are=20complicated,=20y'all=20?= =?UTF-8?q?=F0=9F=98=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: laurmaedje@outlook.de --- src/layout/tree.rs | 35 +++---- src/syntax/parsing.rs | 228 ++++++++++++++++++++++-------------------- src/syntax/tokens.rs | 89 +++++++++-------- src/syntax/tree.rs | 11 +- 4 files changed, 188 insertions(+), 175 deletions(-) diff --git a/src/layout/tree.rs b/src/layout/tree.rs index 714cfe274..16a2930ae 100644 --- a/src/layout/tree.rs +++ b/src/layout/tree.rs @@ -3,7 +3,7 @@ use crate::style::LayoutStyle; use crate::syntax::decoration::Decoration; use crate::syntax::span::{Span, Spanned}; -use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, CodeBlockExpr}; +use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, Code}; use crate::{DynFuture, Feedback, Pass}; use super::line::{LineContext, LineLayouter}; use super::text::{layout_text, TextContext}; @@ -63,10 +63,7 @@ impl<'a> TreeLayouter<'a> { match &node.v { SyntaxNode::Spacing => self.layout_space(), SyntaxNode::Linebreak => self.layouter.finish_line(), - SyntaxNode::Parbreak => self.layouter.add_secondary_spacing( - self.style.text.paragraph_spacing(), - SpacingKind::PARAGRAPH, - ), + SyntaxNode::Parbreak => self.layout_parbreak(), SyntaxNode::ToggleItalic => { self.style.text.italic = !self.style.text.italic; @@ -84,7 +81,7 @@ impl<'a> TreeLayouter<'a> { } SyntaxNode::Raw(lines) => self.layout_raw(lines).await, - SyntaxNode::CodeBlock(block) => self.layout_code(block).await, + SyntaxNode::Code(block) => self.layout_code(block).await, SyntaxNode::Call(call) => { self.layout_call(Spanned::new(call, node.span)).await; @@ -99,6 +96,13 @@ impl<'a> TreeLayouter<'a> { ); } + fn layout_parbreak(&mut self) { + self.layouter.add_secondary_spacing( + self.style.text.paragraph_spacing(), + SpacingKind::PARAGRAPH, + ); + } + async fn layout_text(&mut self, text: &str) { self.layouter.add( layout_text( @@ -133,19 +137,16 @@ impl<'a> TreeLayouter<'a> { self.style.text.fallback = fallback; } - async fn layout_code(&mut self, block: &CodeBlockExpr) { - let fallback = self.style.text.fallback.clone(); - self.style.text.fallback - .list_mut() - .insert(0, "monospace".to_string()); - self.style.text.fallback.flatten(); - - for line in &block.raw { - self.layout_text(line).await; - self.layouter.finish_line(); + async fn layout_code(&mut self, code: &Code) { + if code.block { + self.layout_parbreak(); } - self.style.text.fallback = fallback; + self.layout_raw(&code.lines).await; + + if code.block { + self.layout_parbreak() + } } async fn layout_call(&mut self, call: Spanned<&CallExpr>) { diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 76509faee..0d12f6e18 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -7,14 +7,9 @@ use crate::color::RgbaColor; use crate::compute::table::SpannedEntry; use super::decoration::Decoration; use super::span::{Pos, Span, Spanned}; -use super::tokens::{is_newline_char, Token, TokenMode, Tokens, is_identifier}; +use super::tokens::{is_newline_char, Token, TokenMode, Tokens}; use super::tree::{ - CallExpr, - Expr, - SyntaxNode, - SyntaxTree, - TableExpr, - CodeBlockExpr, + CallExpr, Expr, SyntaxNode, SyntaxTree, TableExpr, Code, }; use super::Ident; @@ -88,28 +83,27 @@ impl Parser<'_> { if !terminated { error!( @self.feedback, Span::at(token.span.end), - "expected code block to close", + "expected backticks", ); } - let mut valid_ident = false; - let mut lang = lang.map(|s| s.map(|v| { - if is_identifier(v) { - valid_ident = true; - } - Ident(v.to_string()) - })); - if !valid_ident { - if let Some(l) = lang { - error!( - @self.feedback, l.span, - "expected language to be a valid identifier", - ); + let lang = lang.and_then(|lang| { + if let Some(ident) = Ident::new(lang.v) { + Some(Spanned::new(ident, lang.span)) + } else { + error!(@self.feedback, lang.span, "invalid identifier"); + None } - lang = None; + }); + + let mut lines = unescape_code(raw); + let block = lines.len() > 1; + + if lines.last().map(|s| s.is_empty()).unwrap_or(false) { + lines.pop(); } - self.with_span(SyntaxNode::CodeBlock(CodeBlockExpr { raw: unescape_code(raw), lang })) + self.with_span(SyntaxNode::Code(Code { lang, lines, block })) } Token::Text(text) => { @@ -624,45 +618,99 @@ fn unescape_string(string: &str) -> String { /// Unescape raw markup and split it into into lines. fn unescape_raw(raw: &str) -> Vec { let mut iter = raw.chars().peekable(); - let mut line = String::new(); - let mut lines = Vec::new(); + let mut text = String::new(); while let Some(c) = iter.next() { if c == '\\' { - match iter.next() { - Some('`') => line.push('`'), - Some(c) => { line.push('\\'); line.push(c); } - None => line.push('\\'), - } - } else if is_newline_char(c) { - if c == '\r' && iter.peek() == Some(&'\n') { - iter.next(); - } + if let Some(c) = iter.next() { + if c != '\\' && c != '`' { + text.push('\\'); + } - lines.push(std::mem::take(&mut line)); + text.push(c); + } else { + text.push('\\'); + } } else { - line.push(c); + text.push(c); } } - lines.push(line); - lines + split_lines(&text) } /// Unescape raw markup and split it into into lines. fn unescape_code(raw: &str) -> Vec { let mut iter = raw.chars().peekable(); - let mut line = String::new(); - let mut lines = Vec::new(); - let mut backticks: usize = 0; - - // This assignment is used in line 731, 733; - // the compiler does not want to acknowledge that, however. - #[allow(unused_assignments)] - let mut update_backtick_count = true; + let mut text = String::new(); + let mut backticks = 0u32; + let mut update_backtick_count; while let Some(c) = iter.next() { update_backtick_count = true; + + if c == '\\' && backticks > 0 { + let mut tail = String::new(); + let mut escape_success = false; + let mut backticks_after_slash = 0u32; + + while let Some(&s) = iter.peek() { + match s { + '\\' => { + if backticks_after_slash == 0 { + tail.push('\\'); + } else { + // Pattern like `\`\` should fail + // escape and just be printed verbantim. + break; + } + } + '`' => { + tail.push(s); + backticks_after_slash += 1; + if backticks_after_slash == 2 { + escape_success = true; + iter.next(); + break; + } + } + _ => break, + } + + iter.next(); + } + + if !escape_success { + text.push(c); + backticks = backticks_after_slash; + update_backtick_count = false; + } else { + backticks = 0; + } + + text.push_str(&tail); + } else { + text.push(c); + } + + if update_backtick_count { + if c == '`' { + backticks += 1; + } else { + backticks = 0; + } + } + } + + split_lines(&text) +} + +fn split_lines(text: &str) -> Vec { + let mut iter = text.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + + while let Some(c) = iter.next() { if is_newline_char(c) { if c == '\r' && iter.peek() == Some(&'\n') { iter.next(); @@ -670,56 +718,7 @@ fn unescape_code(raw: &str) -> Vec { lines.push(std::mem::take(&mut line)); } else { - if c == '\\' && backticks > 0 { - let mut tail = String::new(); - let mut escape_success = false; - - let mut backticks_after_slash: u8 = 0; - - while let Some(&s) = iter.peek() { - match s { - '\\' => { - if backticks_after_slash == 0 { - tail.push(s); - } else { - // Pattern like `\`\` should fail - // escape and just be printed verbantim. - break; - } - } - '`' => { - tail.push(s); - backticks_after_slash += 1; - if backticks_after_slash == 2 { - escape_success = true; - iter.next(); - break; - } - } - _ => { break } - } - - iter.next(); - } - - if !escape_success { - line.push(c); - backticks = backticks_after_slash as usize; - update_backtick_count = false; - } else { - backticks = 0; - } - - line.push_str(&tail); - } else { - line.push(c); - } - } - - if update_backtick_count && c == '`' { - backticks += 1; - } else if update_backtick_count { - backticks = 0; + line.push(c); } } @@ -753,13 +752,23 @@ mod tests { }; } - - fn Lang(text: &str) -> Option> { Some(Spanned::zero(Ident(text.to_string()))) } - macro_rules! C { - ($lang:expr, $($line:expr),* $(,)?) => { - SyntaxNode::CodeBlock(CodeBlockExpr { raw: vec![$($line.to_string()) ,*], lang: $lang }) - }; + (None, $($line:expr),* $(,)?) => {{ + let lines = vec![$($line.to_string()) ,*]; + SyntaxNode::Code(Code { + lang: None, + block: lines.len() > 1, + lines, + }) + }}; + (Some($lang:expr), $($line:expr),* $(,)?) => {{ + let lines = vec![$($line.to_string()) ,*]; + SyntaxNode::Code(Code { + lang: Some(Into::>::into($lang).map(|s| Ident(s.to_string()))), + block: lines.len() > 1, + lines, + }) + }}; } macro_rules! F { @@ -896,6 +905,7 @@ mod tests { } test("raw\\`", vec!["raw`"]); + test("raw\\\\`", vec!["raw\\`"]); test("raw\ntext", vec!["raw", "text"]); test("a\r\nb", vec!["a", "b"]); test("a\n\nb", vec!["a", "", "b"]); @@ -942,16 +952,16 @@ mod tests { t!("`hi\\`du`" => R!["hi`du"]); t!("```java System.out.print```" => C![ - Lang("java"), "System.out.print" - ]); + Some("java"), "System.out.print" + ]); t!("``` console.log(\n\"alert\"\n)" => C![ None, "console.log(", "\"alert\"", ")" - ]); + ]); t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ - Lang("typst"), " Typst uses ``` to indicate code blocks" - ]); - e!("``` hi\nyou" => s(1,3, 1,3, "expected code block to close")); - e!("```🌍 hi\nyou```" => s(0,3, 0,4, "expected language to be a valid identifier")); + Some("typst"), " Typst uses ``` to indicate code blocks" + ]); + e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks")); + e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier")); t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); ts!("hi" => s(0,0, 0,2, T("hi"))); diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index dbba175dc..7ecb05fe4 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -252,7 +252,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw_and_code(), + '`' if self.mode == Body => self.read_raw_or_code(), // An escaped thing. '\\' if self.mode == Body => self.read_escaped(), @@ -341,66 +341,67 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw_and_code(&mut self) -> Token<'s> { + fn read_raw_or_code(&mut self) -> Token<'s> { let (raw, terminated) = self.read_until_unescaped('`'); - if raw.len() == 0 && terminated && self.peek() == Some('`') { - // Third tick found; this is a code block + if raw.is_empty() && terminated && self.peek() == Some('`') { + // Third tick found; this is a code block. self.eat(); - let mut backticks = 0; - let mut terminated = true; - // Reads the lang tag (until newline or whitespace) - let lang_start = self.pos(); - let (lang_opt, _) = self.read_string_until( - |c| c == '`' || c.is_whitespace() || is_newline_char(c), - false, 0, 0); - let lang_end = self.pos(); - #[derive(Debug, PartialEq)] - enum WhitespaceIngestion { All, ExceptNewline, Never } - let mut ingest_whitespace = WhitespaceIngestion::Never; - let mut start = self.index(); + // Reads the lang tag (until newline or whitespace). + let start = self.pos(); + let lang = self.read_string_until( + |c| c == '`' || c.is_whitespace() || is_newline_char(c), + false, 0, 0, + ).0; + let end = self.pos(); + let lang = if !lang.is_empty() { + Some(Spanned::new(lang, Span::new(start, end))) + } else { + None + }; + + // Skip to start of raw contents. + while let Some(c) = self.peek() { + if is_newline_char(c) { + self.eat(); + if c == '\r' && self.peek() == Some('\n') { + self.eat(); + } + + break; + } else if c.is_whitespace() { + self.eat(); + } else { + break; + } + } + + let start = self.index(); + let mut backticks = 0u32; while backticks < 3 { match self.eat() { Some('`') => backticks += 1, + // Escaping of triple backticks. Some('\\') if backticks == 1 && self.peek() == Some('`') => { backticks = 0; } - Some(c) => { - // Remove whitespace between language and content or - // first line break, deal with CRLF and CR line endings. - if ingest_whitespace != WhitespaceIngestion::All - && c == '\n' { - start += 1; - ingest_whitespace = WhitespaceIngestion::All; - } else if ingest_whitespace != WhitespaceIngestion::All - && c == '\r' { - start += 1; - ingest_whitespace = WhitespaceIngestion::ExceptNewline; - } else if ingest_whitespace == WhitespaceIngestion::Never - && c.is_whitespace() { - start += 1; - } else { - ingest_whitespace = WhitespaceIngestion::All; - } - } - None => { - terminated = false; - break; - } + Some(_) => {} + None => break, } } - let end = self.index() - (if terminated { 3 } else { 0 }); - return Code { - lang: if lang_opt.len() == 0 { None } else { - Some(Spanned::new(lang_opt, Span::new(lang_start, lang_end))) - }, + let terminated = backticks == 3; + let end = self.index() - if terminated { 3 } else { 0 }; + + Code { + lang, raw: &self.src[start..end], terminated } + } else { + Raw { raw, terminated } } - Raw { raw, terminated } } fn read_until_unescaped(&mut self, c: char) -> (&'s str, bool) { diff --git a/src/syntax/tree.rs b/src/syntax/tree.rs index 313e76a4a..44acd0234 100644 --- a/src/syntax/tree.rs +++ b/src/syntax/tree.rs @@ -33,8 +33,8 @@ pub enum SyntaxNode { Text(String), /// Lines of raw text. Raw(Vec), - /// An optionally highlighted multi-line code block. - CodeBlock(CodeBlockExpr), + /// An optionally highlighted (multi-line) code block. + Code(Code), /// A function call. Call(CallExpr), } @@ -201,9 +201,10 @@ impl CallExpr { } } } -/// An code block. +/// A code block. #[derive(Debug, Clone, PartialEq)] -pub struct CodeBlockExpr { +pub struct Code { pub lang: Option>, - pub raw: Vec, + pub lines: Vec, + pub block: bool, }