Fix separator-related bug in lexer, rewrite lexer unit tests
This commit is contained in:
parent
ee08ffb28d
commit
cc01cf97db
@ -14,21 +14,17 @@ mod error;
|
||||
mod symbol;
|
||||
mod token;
|
||||
|
||||
/// A trait for checking if a character is a separator.
///
/// The only method receives nothing but `&self`, so an implementor has to
/// decide from that single value; no lookahead can be passed in through this
/// interface.
|
||||
pub trait Separator {
|
||||
/// Check if the character is a separator.
///
/// Returns `true` when `self` is a separator and `false` otherwise.
|
||||
fn is_separator(&self) -> bool;
|
||||
}
|
||||
|
||||
impl Separator for char {
|
||||
fn is_separator(&self) -> bool {
|
||||
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}')
|
||||
}
|
||||
/// Determine whether `current` starts a separator, i.e. a position at which
/// the word being read must end.
///
/// The single-character separators are ASCII whitespace, the brackets
/// `(`, `)`, `[`, `]`, `{`, `}`, and `;`. A `#` is a separator only as the
/// first half of a two-character sequence (`#|` or `#{`), which is why the
/// caller supplies one character of lookahead.
///
/// * `current` - the character under consideration.
/// * `next` - the character that follows `current`, or `None` when `current`
///   is the last character of the input.
///
/// Returns `true` if `current` is a separator on its own or, for `#`,
/// together with `next`.
fn is_separator(current: char, next: Option<char>) -> bool {
    match current {
        '(' | ')' | '[' | ']' | '{' | '}' | ';' => true,
        // `#` alone is an ordinary word character; only `#|` and `#{` separate.
        '#' => matches!(next, Some('|' | '{')),
        other => other.is_ascii_whitespace(),
    }
}
|
||||
|
||||
/// A lexer, used by the parser.
|
||||
#[derive(Debug)]
|
||||
pub struct Lexer<'lexer> {
|
||||
pub(crate) struct Lexer<'lexer> {
|
||||
input: Chars<'lexer>,
|
||||
byte: usize,
|
||||
source: Arc<Source>,
|
||||
@ -49,24 +45,18 @@ impl<'lexer> Lexer<'lexer> {
|
||||
|
||||
/// Set the name of the [Source] being lexically analyzed.
///
/// # Panics
///
/// Panics if the lexer's `Arc<Source>` is not uniquely held at the time of
/// the call: `Arc::get_mut` returns `None` in that case and the result is
/// unwrapped. Every clone handed out by `source()` or embedded in a `Span`
/// by `span()` counts, for as long as that clone is alive.
|
||||
pub fn set_name(&mut self, name: String) {
|
||||
// FIXME: Avoid unwrapping here (if possible?)
|
||||
// TODO: Avoid unwrapping here (if possible?)
|
||||
Arc::get_mut(&mut self.source).unwrap().set_name(name);
|
||||
}
|
||||
|
||||
/// The source being lexically analyzed.
|
||||
#[cfg(test)]
|
||||
#[must_use]
|
||||
pub fn source(&self) -> Arc<Source> {
|
||||
pub(crate) fn source(&self) -> Arc<Source> {
|
||||
self.source.clone()
|
||||
}
|
||||
|
||||
/// Get the unparsed input.
///
/// Returns a borrowed view of whatever the underlying character iterator
/// has not yielded yet; the lexer itself is not advanced.
|
||||
#[must_use]
|
||||
pub fn get_unparsed(&self) -> &str {
|
||||
self.input.as_str()
|
||||
}
|
||||
|
||||
/// Get the current position of the lexer.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub(crate) fn span(&self) -> Span {
|
||||
Span::new(self.byte..self.byte, self.source.clone())
|
||||
@ -79,7 +69,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
}
|
||||
|
||||
/// Get the nth character ahead of the current character without advancing.
///
/// `n` is a zero-based index into the input that `self.input` has not
/// yielded yet, so `peek(0)` is the first remaining character.
/// NOTE(review): callers pair `current()` with `peek(1)` as "the next
/// character", which suggests `current()` is `peek(0)` -- confirm.
///
/// Returns `None` when fewer than `n + 1` characters remain.
///
/// A fresh iterator over the remaining input is walked on every call, so
/// `self.input` is left untouched and the cost grows with `n`.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
fn peek(&self, n: usize) -> Option<char> {
|
||||
self.input.as_str().chars().nth(n)
|
||||
}
|
||||
@ -105,7 +95,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
fn read_word(&mut self) -> String {
|
||||
let mut word = String::new();
|
||||
while let Some(c) = self.current() {
|
||||
if c.is_separator() || (c == '#' && self.peek(1) == Some('{')) {
|
||||
if is_separator(c, self.peek(1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -117,6 +107,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
}
|
||||
|
||||
/// Parse a value from the input or return an error.
|
||||
#[must_use]
|
||||
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
|
||||
where
|
||||
T: FromStr,
|
||||
@ -143,7 +134,9 @@ impl<'lexer> Lexer<'lexer> {
|
||||
};
|
||||
|
||||
let mut span = self.span();
|
||||
|
||||
let kind = match c {
|
||||
// TODO: This allows for unclosed block comments; do we care?
|
||||
'#' if self.peek(1) == Some('|') => {
|
||||
self.advance(); // '#'
|
||||
self.advance(); // '|'
|
||||
@ -167,6 +160,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
self.advance();
|
||||
}
|
||||
|
||||
// Line comments continue until a newline character is encountered:
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
if c == '\n' {
|
||||
@ -199,6 +193,8 @@ impl<'lexer> Lexer<'lexer> {
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('e') => '\x1b',
|
||||
Some(c) => {
|
||||
self.read_word(); // Recover from the error
|
||||
@ -250,12 +246,15 @@ impl<'lexer> Lexer<'lexer> {
|
||||
Some('\\') => match self.advance() {
|
||||
Some(c @ ('"' | '\\')) => c,
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('e') => '\x1b',
|
||||
Some(c) => {
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidEscape(c),
|
||||
ch_span.join(&self.span()),
|
||||
))
|
||||
));
|
||||
}
|
||||
None => {
|
||||
return Err(LexerError::new(
|
||||
@ -271,7 +270,10 @@ impl<'lexer> Lexer<'lexer> {
|
||||
});
|
||||
}
|
||||
|
||||
if self.current().is_some_and(|c| !c.is_separator()) {
|
||||
if self
|
||||
.current()
|
||||
.is_some_and(|c| !is_separator(c, self.peek(1)))
|
||||
{
|
||||
self.read_word(); // Recover from the error
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::InvalidString,
|
||||
@ -282,7 +284,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
TokenKind::String(string)
|
||||
}
|
||||
':' => {
|
||||
self.advance();
|
||||
self.advance(); // ':'
|
||||
|
||||
TokenKind::Keyword(Symbol(self.read_word()))
|
||||
}
|
||||
@ -320,35 +322,40 @@ mod tests {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let mut lexer = Lexer::new($input);
|
||||
|
||||
for token in $tokens {
|
||||
let x = lexer.next().map(|r| match r {
|
||||
let kind = lexer.next().map(|r| match r {
|
||||
Ok(t) => Ok(t.kind),
|
||||
Err(e) => Err(e.kind),
|
||||
});
|
||||
|
||||
assert_eq!(x, Some(token));
|
||||
assert_eq!(kind, Some(token));
|
||||
}
|
||||
|
||||
assert_eq!(lexer.next(), None);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
test!(block_comment: "#| foo\nbar |#(- 1)", [
|
||||
Ok(TokenKind::BlockComment("foo\nbar".into())),
|
||||
test!(block_comment: "foo#| bar\nbaz |#qux", [
|
||||
Ok(TokenKind::Symbol(Symbol::from("foo"))),
|
||||
Ok(TokenKind::BlockComment("bar\nbaz".into())),
|
||||
Ok(TokenKind::Symbol(Symbol::from("qux"))),
|
||||
]);
|
||||
|
||||
test!(line_comment: "nil ;; foo; bar\nnil", [
|
||||
Ok(TokenKind::Nil),
|
||||
Ok(TokenKind::LineComment("foo; bar".into())),
|
||||
Ok(TokenKind::Nil),
|
||||
]);
|
||||
|
||||
test!(list: "(1 () -2.3)", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::Number(-2.3)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(line_comment: "; foo\n;; bar baz; qux", [
|
||||
Ok(TokenKind::LineComment("foo".into())),
|
||||
Ok(TokenKind::LineComment("bar baz; qux".into())),
|
||||
]);
|
||||
|
||||
test!(hashset: "{{} true false}", [
|
||||
test!(set: "{{} true false}", [
|
||||
Ok(TokenKind::OpenBrace),
|
||||
Ok(TokenKind::OpenBrace),
|
||||
Ok(TokenKind::CloseBrace),
|
||||
@ -357,70 +364,34 @@ mod tests {
|
||||
Ok(TokenKind::CloseBrace),
|
||||
]);
|
||||
|
||||
test!(hashmap: "#{:bar 0 :baz 1}", [
|
||||
Ok(TokenKind::OpenHashBrace),
|
||||
Ok(TokenKind::Keyword(Symbol::from("bar"))),
|
||||
test!(vector: "[0 10 200]", [
|
||||
Ok(TokenKind::OpenBracket),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Keyword(Symbol::from("baz"))),
|
||||
Ok(TokenKind::Number(10.0)),
|
||||
Ok(TokenKind::Number(200.0)),
|
||||
Ok(TokenKind::CloseBracket),
|
||||
]);
|
||||
|
||||
test!(map: "#{:a 0 :b 1}", [
|
||||
Ok(TokenKind::OpenHashBrace),
|
||||
Ok(TokenKind::Keyword(Symbol::from("a"))),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Keyword(Symbol::from("b"))),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::CloseBrace),
|
||||
]);
|
||||
|
||||
test!(vector: "[0 1 2]", [
|
||||
Ok(TokenKind::OpenBracket),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::CloseBracket),
|
||||
]);
|
||||
|
||||
test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
|
||||
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
||||
Ok(TokenKind::Char('x')),
|
||||
Ok(TokenKind::Char('\n')),
|
||||
Ok(TokenKind::Char('\r')),
|
||||
Ok(TokenKind::Char('\t')),
|
||||
Ok(TokenKind::Char('\x1b')),
|
||||
Ok(TokenKind::Char('\\')),
|
||||
Err(LexerErrorKind::InvalidEscape('q')),
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
|
||||
test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
||||
Ok(TokenKind::Number(14.0)),
|
||||
Ok(TokenKind::Number(25.5)),
|
||||
Ok(TokenKind::Number(333.0)),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::Number(5.0)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
||||
Ok(TokenKind::Number(14.0)),
|
||||
Ok(TokenKind::Number(25.5)),
|
||||
Ok(TokenKind::Number(333.0)),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::Number(5.0)),
|
||||
Err(LexerErrorKind::InvalidNumber("5.x".into())),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(negative_minus: "(- 1 -2 3)", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::Number(-2.0)),
|
||||
Ok(TokenKind::Number(3.0)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(error_unclosed_char_escape: r"'\", [
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
@ -429,12 +400,30 @@ mod tests {
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
|
||||
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::Number(55.0)),
|
||||
Ok(TokenKind::Number(3.144)),
|
||||
Ok(TokenKind::Number(0.0001)),
|
||||
test!(number: "0 -1 20.0 +0.003", [
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(-1.0)),
|
||||
Ok(TokenKind::Number(20.0)),
|
||||
Ok(TokenKind::Number(0.003)),
|
||||
]);
|
||||
|
||||
test!(error_parse_number: "1.1.1 0.x", [
|
||||
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
|
||||
Err(LexerErrorKind::InvalidNumber("0.x".into())),
|
||||
]);
|
||||
|
||||
test!(string: "\"\" \"xyz\" \"This is a string!\"", [
|
||||
Ok(TokenKind::String("".into())),
|
||||
Ok(TokenKind::String("xyz".into())),
|
||||
Ok(TokenKind::String("This is a string!".into())),
|
||||
]);
|
||||
|
||||
test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
|
||||
Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
|
||||
]);
|
||||
|
||||
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
||||
Err(LexerErrorKind::InvalidEscape('p')),
|
||||
]);
|
||||
|
||||
test!(error_unclosed_string: "\"hiii", [
|
||||
@ -444,4 +433,20 @@ mod tests {
|
||||
test!(error_invalid_string: "\"hiii\"222", [
|
||||
Err(LexerErrorKind::InvalidString),
|
||||
]);
|
||||
|
||||
test!(nested_lists: "(+ (- 0 -1)\n(* 2 3))", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(-1.0)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::Number(3.0)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user