Fix separator-related bug in lexer, rewrite lexer unit tests

Jesse Braham 2024-12-11 17:39:11 +01:00
parent ee08ffb28d
commit cc01cf97db


@@ -14,21 +14,17 @@ mod error;
mod symbol;
mod token;
/// A trait for checking if a character is a separator.
pub trait Separator {
/// Check if the character is a separator.
fn is_separator(&self) -> bool;
}
impl Separator for char {
fn is_separator(&self) -> bool {
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}')
}
/// Determine if the current character is a separator, performing 1-character
/// lookahead as needed to handle multi-character separators.
fn is_separator(current: char, next: Option<char>) -> bool {
current.is_ascii_whitespace()
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
}
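A minimal illustration (not part of the commit itself; it assumes is_separator sits in this module as shown above): the one-character lookahead means '#' only separates a word when it opens a block comment ("#|") or a hash-brace literal ("#{"), while ';', brackets, and whitespace always do.

#[cfg(test)]
mod separator_lookahead_sketch {
    // Hypothetical test module, added here only to illustrate the behavior.
    use super::is_separator;

    #[test]
    fn examples() {
        assert!(is_separator(' ', None)); // whitespace always separates
        assert!(is_separator(';', Some('x'))); // line-comment start
        assert!(is_separator('#', Some('|'))); // block-comment opener
        assert!(is_separator('#', Some('{'))); // hash-brace opener
        assert!(!is_separator('#', Some('x'))); // a lone '#' stays part of the word
    }
}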
/// A lexer, used by the parser.
#[derive(Debug)]
pub struct Lexer<'lexer> {
pub(crate) struct Lexer<'lexer> {
input: Chars<'lexer>,
byte: usize,
source: Arc<Source>,
@@ -49,24 +45,18 @@ impl<'lexer> Lexer<'lexer> {
/// Set the name of the [Source] being lexically analyzed.
pub fn set_name(&mut self, name: String) {
// FIXME: Avoid unwrapping here (if possible?)
// TODO: Avoid unwrapping here (if possible?)
Arc::get_mut(&mut self.source).unwrap().set_name(name);
}
/// The source being lexically analyzed.
#[cfg(test)]
#[must_use]
pub fn source(&self) -> Arc<Source> {
pub(crate) fn source(&self) -> Arc<Source> {
self.source.clone()
}
/// Get the unparsed input.
#[must_use]
pub fn get_unparsed(&self) -> &str {
self.input.as_str()
}
/// Get the current position of the lexer.
#[inline]
#[must_use]
pub(crate) fn span(&self) -> Span {
Span::new(self.byte..self.byte, self.source.clone())
@@ -79,7 +69,7 @@ impl<'lexer> Lexer<'lexer> {
}
/// Get the nth character ahead of the current character without advancing.
#[inline]
#[must_use]
fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n)
}
@@ -105,7 +95,7 @@ impl<'lexer> Lexer<'lexer> {
fn read_word(&mut self) -> String {
let mut word = String::new();
while let Some(c) = self.current() {
if c.is_separator() || (c == '#' && self.peek(1) == Some('{')) {
if is_separator(c, self.peek(1)) {
break;
}
@@ -117,6 +107,7 @@ impl<'lexer> Lexer<'lexer> {
}
/// Parse a value from the input or return an error.
#[must_use]
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
where
T: FromStr,
@@ -143,7 +134,9 @@ impl<'lexer> Lexer<'lexer> {
};
let mut span = self.span();
let kind = match c {
// TODO: This allows for unclosed block comments; do we care?
'#' if self.peek(1) == Some('|') => {
self.advance(); // '#'
self.advance(); // '|'
@@ -167,6 +160,7 @@ impl<'lexer> Lexer<'lexer> {
self.advance();
}
// Line comments continue until a newline character is encountered:
let mut comment = String::new();
while let Some(c) = self.advance() {
if c == '\n' {
@@ -199,6 +193,8 @@ impl<'lexer> Lexer<'lexer> {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some(c) => {
self.read_word(); // Recover from the error
@@ -250,12 +246,15 @@ impl<'lexer> Lexer<'lexer> {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some(c) => {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
ch_span.join(&self.span()),
))
));
}
None => {
return Err(LexerError::new(
@@ -271,7 +270,10 @@ impl<'lexer> Lexer<'lexer> {
});
}
if self.current().is_some_and(|c| !c.is_separator()) {
if self
.current()
.is_some_and(|c| !is_separator(c, self.peek(1)))
{
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidString,
@@ -282,7 +284,7 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::String(string)
}
':' => {
self.advance();
self.advance(); // ':'
TokenKind::Keyword(Symbol(self.read_word()))
}
@@ -320,35 +322,40 @@ mod tests {
#[test]
fn $name() {
let mut lexer = Lexer::new($input);
for token in $tokens {
let x = lexer.next().map(|r| match r {
let kind = lexer.next().map(|r| match r {
Ok(t) => Ok(t.kind),
Err(e) => Err(e.kind),
});
assert_eq!(x, Some(token));
assert_eq!(kind, Some(token));
}
assert_eq!(lexer.next(), None);
}
};
}
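For readers unfamiliar with macro_rules!, an invocation such as the vector case below expands, per the arm above, into an ordinary #[test] function. A rough sketch of that expansion (the function name is hypothetical; the token list matches the vector test further down):

#[test]
fn vector_expanded() {
    let mut lexer = Lexer::new("[0 10 200]");
    for token in [
        Ok(TokenKind::OpenBracket),
        Ok(TokenKind::Number(0.0)),
        Ok(TokenKind::Number(10.0)),
        Ok(TokenKind::Number(200.0)),
        Ok(TokenKind::CloseBracket),
    ] {
        // Compare only the kinds, discarding spans, exactly as the macro does.
        let kind = lexer.next().map(|r| match r {
            Ok(t) => Ok(t.kind),
            Err(e) => Err(e.kind),
        });
        assert_eq!(kind, Some(token));
    }
    // The input must be fully consumed.
    assert_eq!(lexer.next(), None);
}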
test!(block_comment: "#| foo\nbar |#(- 1)", [
Ok(TokenKind::BlockComment("foo\nbar".into())),
test!(block_comment: "foo#| bar\nbaz |#qux", [
Ok(TokenKind::Symbol(Symbol::from("foo"))),
Ok(TokenKind::BlockComment("bar\nbaz".into())),
Ok(TokenKind::Symbol(Symbol::from("qux"))),
]);
test!(line_comment: "nil ;; foo; bar\nnil", [
Ok(TokenKind::Nil),
Ok(TokenKind::LineComment("foo; bar".into())),
Ok(TokenKind::Nil),
]);
test!(list: "(1 () -2.3)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::CloseParen),
Ok(TokenKind::Number(-2.3)),
Ok(TokenKind::CloseParen),
]);
test!(line_comment: "; foo\n;; bar baz; qux", [
Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz; qux".into())),
]);
test!(hashset: "{{} true false}", [
test!(set: "{{} true false}", [
Ok(TokenKind::OpenBrace),
Ok(TokenKind::OpenBrace),
Ok(TokenKind::CloseBrace),
@@ -357,70 +364,34 @@ mod tests {
Ok(TokenKind::CloseBrace),
]);
test!(hashmap: "#{:bar 0 :baz 1}", [
Ok(TokenKind::OpenHashBrace),
Ok(TokenKind::Keyword(Symbol::from("bar"))),
test!(vector: "[0 10 200]", [
Ok(TokenKind::OpenBracket),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Keyword(Symbol::from("baz"))),
Ok(TokenKind::Number(10.0)),
Ok(TokenKind::Number(200.0)),
Ok(TokenKind::CloseBracket),
]);
test!(map: "#{:a 0 :b 1}", [
Ok(TokenKind::OpenHashBrace),
Ok(TokenKind::Keyword(Symbol::from("a"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Keyword(Symbol::from("b"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::CloseBrace),
]);
test!(vector: "[0 1 2]", [
Ok(TokenKind::OpenBracket),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::CloseBracket),
]);
test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')),
Ok(TokenKind::Char('\r')),
Ok(TokenKind::Char('\t')),
Ok(TokenKind::Char('\x1b')),
Ok(TokenKind::Char('\\')),
Err(LexerErrorKind::InvalidEscape('q')),
Err(LexerErrorKind::UnclosedChar),
]);
test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Err(LexerErrorKind::InvalidNumber("5.x".into())),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(negative_minus: "(- 1 -2 3)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Number(-2.0)),
Ok(TokenKind::Number(3.0)),
Ok(TokenKind::CloseParen),
]);
test!(error_unclosed_char_escape: r"'\", [
Err(LexerErrorKind::UnclosedChar),
]);
@@ -429,12 +400,30 @@ mod tests {
Err(LexerErrorKind::UnclosedChar),
]);
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(55.0)),
Ok(TokenKind::Number(3.144)),
Ok(TokenKind::Number(0.0001)),
test!(number: "0 -1 20.0 +0.003", [
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(-1.0)),
Ok(TokenKind::Number(20.0)),
Ok(TokenKind::Number(0.003)),
]);
test!(error_parse_number: "1.1.1 0.x", [
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
Err(LexerErrorKind::InvalidNumber("0.x".into())),
]);
test!(string: "\"\" \"xyz\" \"This is a string!\"", [
Ok(TokenKind::String("".into())),
Ok(TokenKind::String("xyz".into())),
Ok(TokenKind::String("This is a string!".into())),
]);
test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
]);
test!(error_invalid_escape_string: "\"oh no \\p\"", [
Err(LexerErrorKind::InvalidEscape('p')),
]);
test!(error_unclosed_string: "\"hiii", [
@@ -444,4 +433,20 @@ mod tests {
test!(error_invalid_string: "\"hiii\"222", [
Err(LexerErrorKind::InvalidString),
]);
test!(nested_lists: "(+ (- 0 -1)\n(* 2 3))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(-1.0)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(3.0)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
}