Even more lexer improvements, increased test coverage too!

This commit is contained in:
Jesse Braham 2024-12-05 17:16:45 +01:00
parent 0839bd542c
commit 11917bb183
4 changed files with 58 additions and 48 deletions

View File

@ -8,3 +8,6 @@ repository.workspace = true
license.workspace = true license.workspace = true
[dependencies] [dependencies]
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }

View File

@ -1,10 +1,8 @@
use crate::span::Span; use crate::span::Span;
/// Errors that can occur during lexical analysis. /// Errors that can occur during lexical analysis.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LexerErrorKind { pub enum LexerErrorKind {
/// An invalid character was encountered.
InvalidChar,
/// An invalid escape sequence was encountered. /// An invalid escape sequence was encountered.
InvalidEscape(char), InvalidEscape(char),
/// An invalid number was encountered. /// An invalid number was encountered.
@ -18,7 +16,7 @@ pub enum LexerErrorKind {
} }
/// Lexer error, with a start and end location. /// Lexer error, with a start and end location.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Hash)]
pub struct LexerError { pub struct LexerError {
/// The type of error encountered. /// The type of error encountered.
pub kind: LexerErrorKind, pub kind: LexerErrorKind,
@ -36,12 +34,12 @@ impl LexerError {
impl std::error::Error for LexerError {} impl std::error::Error for LexerError {}
#[cfg(not(tarpaulin_include))]
impl std::fmt::Display for LexerError { impl std::fmt::Display for LexerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use LexerErrorKind::*; use LexerErrorKind::*;
match &self.kind { match &self.kind {
InvalidChar => write!(f, "Invalid character literal"),
InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"), InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"), InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
InvalidString => write!(f, "Invalid string literal"), InvalidString => write!(f, "Invalid string literal"),

View File

@ -92,6 +92,14 @@ impl<'lexer> Lexer<'lexer> {
Some(c) Some(c)
} }
/// Advance the lexer by one character, and then return the specified
/// `TokenKind`.
///
/// The value produced by `advance` is discarded; `kind` is handed back
/// unchanged. This lets a single-character token (such as a paren, brace, or
/// bracket) be consumed and produced in one expression inside a `match` arm.
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
self.advance();
kind
}
/// Read a word from the input until a separator is reached. /// Read a word from the input until a separator is reached.
fn read_word(&mut self) -> String { fn read_word(&mut self) -> String {
let mut word = String::new(); let mut word = String::new();
@ -135,13 +143,12 @@ impl<'lexer> Lexer<'lexer> {
let kind = match c { let kind = match c {
'#' if matches!(self.peek(1), Some('|')) => { '#' if matches!(self.peek(1), Some('|')) => {
self.advance(); // '#' self.advance(); // '#'
self.advance(); // '|#' self.advance(); // '|'
let mut comment = String::new(); let mut comment = String::new();
while let Some(c) = self.advance() { while let Some(c) = self.advance() {
match c { match c {
'|' if matches!(self.peek(0), Some('#')) => { '|' if matches!(self.peek(0), Some('#')) => {
self.advance(); // '|'
self.advance(); // '#' self.advance(); // '#'
break; break;
} }
@ -154,44 +161,29 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::BlockComment(comment.trim().into()) TokenKind::BlockComment(comment.trim().into())
} }
';' => { ';' => {
while self.current().is_some_and(|c| c == ';') {
self.advance();
}
let mut comment = String::new(); let mut comment = String::new();
while let Some(c) = self.advance() { while let Some(c) = self.advance() {
match c { if c == '\n' {
';' => continue, break;
'\n' => break,
c => {
comment.push(c);
}
} }
comment.push(c);
} }
TokenKind::LineComment(comment.trim().into()) TokenKind::LineComment(comment.trim().into())
} }
'(' => { '(' => self.advance_and(TokenKind::OpenParen),
self.advance(); ')' => self.advance_and(TokenKind::CloseParen),
TokenKind::OpenParen '{' => self.advance_and(TokenKind::OpenBrace),
} '}' => self.advance_and(TokenKind::CloseBrace),
')' => { '[' => self.advance_and(TokenKind::OpenBracket),
self.advance(); ']' => self.advance_and(TokenKind::CloseBracket),
TokenKind::CloseParen
}
'{' => {
self.advance();
TokenKind::OpenBrace
}
'}' => {
self.advance();
TokenKind::CloseBrace
}
'[' => {
self.advance();
TokenKind::OpenBracket
}
']' => {
self.advance();
TokenKind::CloseBracket
}
'#' if matches!(self.peek(1), Some('{')) => { '#' if matches!(self.peek(1), Some('{')) => {
self.advance(); // '#' self.advance(); // '#'
self.advance(); // '{' self.advance(); // '{'
@ -207,6 +199,7 @@ impl<'lexer> Lexer<'lexer> {
Some('n') => '\n', Some('n') => '\n',
Some('e') => '\x1b', Some('e') => '\x1b',
Some(c) => { Some(c) => {
self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c), LexerErrorKind::InvalidEscape(c),
span.join(&self.span()), span.join(&self.span()),
@ -229,9 +222,9 @@ impl<'lexer> Lexer<'lexer> {
}; };
if self.advance() != Some('\'') { if self.advance() != Some('\'') {
self.read_word(); self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::InvalidChar, LexerErrorKind::UnclosedChar,
span.join(&self.span()), span.join(&self.span()),
)); ));
} }
@ -277,7 +270,7 @@ impl<'lexer> Lexer<'lexer> {
} }
if self.current().is_some_and(|c| !c.is_separator()) { if self.current().is_some_and(|c| !c.is_separator()) {
self.read_word(); self.read_word(); // Recover from the error
return Err(LexerError::new( return Err(LexerError::new(
LexerErrorKind::InvalidString, LexerErrorKind::InvalidString,
span.join(&self.span()), span.join(&self.span()),
@ -339,13 +332,17 @@ mod tests {
}; };
} }
test!(block_comment: "#| foo\nbar |#", [ test!(block_comment: "#| foo\nbar |#(- 1)", [
Ok(TokenKind::BlockComment("foo\nbar".into())) Ok(TokenKind::BlockComment("foo\nbar".into())),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::CloseParen),
]); ]);
test!(line_comment: "; foo\n;; bar baz qux", [ test!(line_comment: "; foo\n;; bar baz; qux", [
Ok(TokenKind::LineComment("foo".into())), Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz qux".into())), Ok(TokenKind::LineComment("bar baz; qux".into())),
]); ]);
test!(hashset: "{{} true false}", [ test!(hashset: "{{} true false}", [
@ -369,9 +366,21 @@ mod tests {
Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen),
]); ]);
test!(char_literal: "'x' '\n'", [ test!(vector: "[0 1 2]", [
Ok(TokenKind::OpenBracket),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::CloseBracket),
]);
test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')), Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')), Ok(TokenKind::Char('\n')),
Ok(TokenKind::Char('\x1b')),
Ok(TokenKind::Char('\\')),
Err(LexerErrorKind::InvalidEscape('q')),
Err(LexerErrorKind::UnclosedChar),
]); ]);
test!(lex: "(+ 14 25.5 333 (* 2 5))", [ test!(lex: "(+ 14 25.5 333 (* 2 5))", [

View File

@ -1,7 +1,7 @@
use std::{cmp::Ordering, iter, ops::Range, sync::Arc}; use std::{cmp::Ordering, iter, ops::Range, sync::Arc};
/// A location within some source text. /// A location within some source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Location { pub struct Location {
line: usize, line: usize,
column: usize, column: usize,
@ -25,7 +25,7 @@ impl PartialOrd for Location {
} }
/// Some (optionally named) source text. /// Some (optionally named) source text.
#[derive(Debug, Default, Clone, PartialEq, Eq)] #[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
pub struct Source { pub struct Source {
name: Option<String>, name: Option<String>,
contents: String, contents: String,
@ -87,7 +87,7 @@ impl Source {
} }
/// A contiguous sequence of bytes within some source. /// A contiguous sequence of bytes within some source.
#[derive(Debug, Default, Clone)] #[derive(Debug, Default, Clone, Hash)]
pub struct Span { pub struct Span {
bytes: Range<usize>, bytes: Range<usize>,
source: Arc<Source>, source: Arc<Source>,