From 11917bb1835c0ecb5a4097e47ca97f4dc796a9ea Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Thu, 5 Dec 2024 17:16:45 +0100 Subject: [PATCH] Even more lexer improvements, increased test coverage too! --- onihime/Cargo.toml | 3 ++ onihime/src/lexer/error.rs | 8 ++-- onihime/src/lexer/mod.rs | 89 +++++++++++++++++++++----------------- onihime/src/span.rs | 6 +-- 4 files changed, 58 insertions(+), 48 deletions(-) diff --git a/onihime/Cargo.toml b/onihime/Cargo.toml index ef48661..76c5369 100644 --- a/onihime/Cargo.toml +++ b/onihime/Cargo.toml @@ -8,3 +8,6 @@ repository.workspace = true license.workspace = true [dependencies] + +[lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] } diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs index 58e6d7c..f50081a 100644 --- a/onihime/src/lexer/error.rs +++ b/onihime/src/lexer/error.rs @@ -1,10 +1,8 @@ use crate::span::Span; /// Errors that can occur during lexical analysis. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum LexerErrorKind { - /// An invalid character was encountered. - InvalidChar, /// An invalid escape sequence was encountered. InvalidEscape(char), /// An invalid number was encountered. @@ -18,7 +16,7 @@ pub enum LexerErrorKind { } /// Lexer error, with a start and end location. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Hash)] pub struct LexerError { /// The type of error encountered. 
pub kind: LexerErrorKind, @@ -36,12 +34,12 @@ impl LexerError { impl std::error::Error for LexerError {} +#[cfg(not(tarpaulin_include))] impl std::fmt::Display for LexerError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use LexerErrorKind::*; match &self.kind { - InvalidChar => write!(f, "Invalid character literal"), InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"), InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"), InvalidString => write!(f, "Invalid string literal"), diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs index a4b53a7..c725a91 100644 --- a/onihime/src/lexer/mod.rs +++ b/onihime/src/lexer/mod.rs @@ -92,6 +92,14 @@ impl<'lexer> Lexer<'lexer> { Some(c) } + /// Advance the lexer by one character, and then return the specified + /// `TokenKind`: + fn advance_and(&mut self, kind: TokenKind) -> TokenKind { + self.advance(); + + kind + } + /// Read a word from the input until a separator is reached. 
fn read_word(&mut self) -> String { let mut word = String::new(); @@ -135,13 +143,12 @@ impl<'lexer> Lexer<'lexer> { let kind = match c { '#' if matches!(self.peek(1), Some('|')) => { self.advance(); // '#' - self.advance(); // '|#' + self.advance(); // '|' let mut comment = String::new(); while let Some(c) = self.advance() { match c { '|' if matches!(self.peek(0), Some('#')) => { - self.advance(); // '|' self.advance(); // '#' break; } @@ -154,44 +161,29 @@ impl<'lexer> Lexer<'lexer> { TokenKind::BlockComment(comment.trim().into()) } ';' => { + while self.current().is_some_and(|c| c == ';') { + self.advance(); + } + let mut comment = String::new(); while let Some(c) = self.advance() { - match c { - ';' => continue, - '\n' => break, - c => { - comment.push(c); - } + if c == '\n' { + break; } + + comment.push(c); } TokenKind::LineComment(comment.trim().into()) } - '(' => { - self.advance(); - TokenKind::OpenParen - } - ')' => { - self.advance(); - TokenKind::CloseParen - } - '{' => { - self.advance(); - TokenKind::OpenBrace - } - '}' => { - self.advance(); - TokenKind::CloseBrace - } - '[' => { - self.advance(); - TokenKind::OpenBracket - } - ']' => { - self.advance(); - TokenKind::CloseBracket - } + '(' => self.advance_and(TokenKind::OpenParen), + ')' => self.advance_and(TokenKind::CloseParen), + '{' => self.advance_and(TokenKind::OpenBrace), + '}' => self.advance_and(TokenKind::CloseBrace), + '[' => self.advance_and(TokenKind::OpenBracket), + ']' => self.advance_and(TokenKind::CloseBracket), + '#' if matches!(self.peek(1), Some('{')) => { self.advance(); // '#' self.advance(); // '{' @@ -207,6 +199,7 @@ impl<'lexer> Lexer<'lexer> { Some('n') => '\n', Some('e') => '\x1b', Some(c) => { + self.read_word(); // Recover from the error return Err(LexerError::new( LexerErrorKind::InvalidEscape(c), span.join(&self.span()), @@ -229,9 +222,9 @@ impl<'lexer> Lexer<'lexer> { }; if self.advance() != Some('\'') { - self.read_word(); + self.read_word(); // Recover from the error 
return Err(LexerError::new( - LexerErrorKind::InvalidChar, + LexerErrorKind::UnclosedChar, span.join(&self.span()), )); } @@ -277,7 +270,7 @@ impl<'lexer> Lexer<'lexer> { } if self.current().is_some_and(|c| !c.is_separator()) { - self.read_word(); + self.read_word(); // Recover from the error return Err(LexerError::new( LexerErrorKind::InvalidString, span.join(&self.span()), @@ -339,13 +332,17 @@ mod tests { }; } - test!(block_comment: "#| foo\nbar |#", [ - Ok(TokenKind::BlockComment("foo\nbar".into())) + test!(block_comment: "#| foo\nbar |#(- 1)", [ + Ok(TokenKind::BlockComment("foo\nbar".into())), + Ok(TokenKind::OpenParen), + Ok(TokenKind::Symbol(Symbol::from("-"))), + Ok(TokenKind::Number(1.0)), + Ok(TokenKind::CloseParen), ]); - test!(line_comment: "; foo\n;; bar baz qux", [ + test!(line_comment: "; foo\n;; bar baz; qux", [ Ok(TokenKind::LineComment("foo".into())), - Ok(TokenKind::LineComment("bar baz qux".into())), + Ok(TokenKind::LineComment("bar baz; qux".into())), ]); test!(hashset: "{{} true false}", [ @@ -369,9 +366,21 @@ mod tests { Ok(TokenKind::CloseParen), ]); - test!(char_literal: "'x' '\n'", [ + test!(vector: "[0 1 2]", [ + Ok(TokenKind::OpenBracket), + Ok(TokenKind::Number(0.0)), + Ok(TokenKind::Number(1.0)), + Ok(TokenKind::Number(2.0)), + Ok(TokenKind::CloseBracket), + ]); + + test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [ Ok(TokenKind::Char('x')), Ok(TokenKind::Char('\n')), + Ok(TokenKind::Char('\x1b')), + Ok(TokenKind::Char('\\')), + Err(LexerErrorKind::InvalidEscape('q')), + Err(LexerErrorKind::UnclosedChar), ]); test!(lex: "(+ 14 25.5 333 (* 2 5))", [ diff --git a/onihime/src/span.rs b/onihime/src/span.rs index 1c78a8b..22d61c5 100644 --- a/onihime/src/span.rs +++ b/onihime/src/span.rs @@ -1,7 +1,7 @@ use std::{cmp::Ordering, iter, ops::Range, sync::Arc}; /// A location within some source text. 
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] pub struct Location { line: usize, column: usize, @@ -25,7 +25,7 @@ impl PartialOrd for Location { } /// Some (optionally named) source text. -#[derive(Debug, Default, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)] pub struct Source { name: Option, contents: String, @@ -87,7 +87,7 @@ impl Source { } /// A contiguous sequence of bytes within some source. -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default, Clone, Hash)] pub struct Span { bytes: Range, source: Arc,