Fix separator-related bug in lexer, rewrite lexer unit tests
This commit is contained in:
parent
ee08ffb28d
commit
cc01cf97db
@ -14,21 +14,17 @@ mod error;
|
|||||||
mod symbol;
|
mod symbol;
|
||||||
mod token;
|
mod token;
|
||||||
|
|
||||||
/// A trait for checking if a character is a separator.
|
/// Determine if the current character is a separator, performing 1-character
|
||||||
pub trait Separator {
|
/// lookahead as needed to handle multi-character separators.
|
||||||
/// Check if the character is a separator.
|
fn is_separator(current: char, next: Option<char>) -> bool {
|
||||||
fn is_separator(&self) -> bool;
|
current.is_ascii_whitespace()
|
||||||
}
|
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|
||||||
|
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
|
||||||
impl Separator for char {
|
|
||||||
fn is_separator(&self) -> bool {
|
|
||||||
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}')
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A lexer, used by the parser.
|
/// A lexer, used by the parser.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Lexer<'lexer> {
|
pub(crate) struct Lexer<'lexer> {
|
||||||
input: Chars<'lexer>,
|
input: Chars<'lexer>,
|
||||||
byte: usize,
|
byte: usize,
|
||||||
source: Arc<Source>,
|
source: Arc<Source>,
|
||||||
@ -49,24 +45,18 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
|
|
||||||
/// Set the name of the [Source] being lexically analyzed.
|
/// Set the name of the [Source] being lexically analyzed.
|
||||||
pub fn set_name(&mut self, name: String) {
|
pub fn set_name(&mut self, name: String) {
|
||||||
// FIXME: Avoid unwrapping here (if possible?)
|
// TODO: Avoid unwrapping here (if possible?)
|
||||||
Arc::get_mut(&mut self.source).unwrap().set_name(name);
|
Arc::get_mut(&mut self.source).unwrap().set_name(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The source being lexically analyzed.
|
/// The source being lexically analyzed.
|
||||||
|
#[cfg(test)]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn source(&self) -> Arc<Source> {
|
pub(crate) fn source(&self) -> Arc<Source> {
|
||||||
self.source.clone()
|
self.source.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the unparsed input.
|
|
||||||
#[must_use]
|
|
||||||
pub fn get_unparsed(&self) -> &str {
|
|
||||||
self.input.as_str()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the current position of the lexer.
|
/// Get the current position of the lexer.
|
||||||
#[inline]
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub(crate) fn span(&self) -> Span {
|
pub(crate) fn span(&self) -> Span {
|
||||||
Span::new(self.byte..self.byte, self.source.clone())
|
Span::new(self.byte..self.byte, self.source.clone())
|
||||||
@ -79,7 +69,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Get the nth character ahead of the current character without advancing.
|
/// Get the nth character ahead of the current character without advancing.
|
||||||
#[inline]
|
#[must_use]
|
||||||
fn peek(&self, n: usize) -> Option<char> {
|
fn peek(&self, n: usize) -> Option<char> {
|
||||||
self.input.as_str().chars().nth(n)
|
self.input.as_str().chars().nth(n)
|
||||||
}
|
}
|
||||||
@ -105,7 +95,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
fn read_word(&mut self) -> String {
|
fn read_word(&mut self) -> String {
|
||||||
let mut word = String::new();
|
let mut word = String::new();
|
||||||
while let Some(c) = self.current() {
|
while let Some(c) = self.current() {
|
||||||
if c.is_separator() || (c == '#' && self.peek(1) == Some('{')) {
|
if is_separator(c, self.peek(1)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -117,6 +107,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a value from the input or return an error.
|
/// Parse a value from the input or return an error.
|
||||||
|
#[must_use]
|
||||||
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
|
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
|
||||||
where
|
where
|
||||||
T: FromStr,
|
T: FromStr,
|
||||||
@ -143,7 +134,9 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let mut span = self.span();
|
let mut span = self.span();
|
||||||
|
|
||||||
let kind = match c {
|
let kind = match c {
|
||||||
|
// TODO: This allows for unclosed block comments; do we care?
|
||||||
'#' if self.peek(1) == Some('|') => {
|
'#' if self.peek(1) == Some('|') => {
|
||||||
self.advance(); // '#'
|
self.advance(); // '#'
|
||||||
self.advance(); // '|'
|
self.advance(); // '|'
|
||||||
@ -167,6 +160,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
self.advance();
|
self.advance();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Line comments continue until a newline character is encountered:
|
||||||
let mut comment = String::new();
|
let mut comment = String::new();
|
||||||
while let Some(c) = self.advance() {
|
while let Some(c) = self.advance() {
|
||||||
if c == '\n' {
|
if c == '\n' {
|
||||||
@ -199,6 +193,8 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
Some('\\') => match self.advance() {
|
Some('\\') => match self.advance() {
|
||||||
Some(c @ ('"' | '\\')) => c,
|
Some(c @ ('"' | '\\')) => c,
|
||||||
Some('n') => '\n',
|
Some('n') => '\n',
|
||||||
|
Some('r') => '\r',
|
||||||
|
Some('t') => '\t',
|
||||||
Some('e') => '\x1b',
|
Some('e') => '\x1b',
|
||||||
Some(c) => {
|
Some(c) => {
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
@ -250,12 +246,15 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
Some('\\') => match self.advance() {
|
Some('\\') => match self.advance() {
|
||||||
Some(c @ ('"' | '\\')) => c,
|
Some(c @ ('"' | '\\')) => c,
|
||||||
Some('n') => '\n',
|
Some('n') => '\n',
|
||||||
|
Some('r') => '\r',
|
||||||
|
Some('t') => '\t',
|
||||||
Some('e') => '\x1b',
|
Some('e') => '\x1b',
|
||||||
Some(c) => {
|
Some(c) => {
|
||||||
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
LexerErrorKind::InvalidEscape(c),
|
LexerErrorKind::InvalidEscape(c),
|
||||||
ch_span.join(&self.span()),
|
ch_span.join(&self.span()),
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
@ -271,7 +270,10 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.current().is_some_and(|c| !c.is_separator()) {
|
if self
|
||||||
|
.current()
|
||||||
|
.is_some_and(|c| !is_separator(c, self.peek(1)))
|
||||||
|
{
|
||||||
self.read_word(); // Recover from the error
|
self.read_word(); // Recover from the error
|
||||||
return Err(LexerError::new(
|
return Err(LexerError::new(
|
||||||
LexerErrorKind::InvalidString,
|
LexerErrorKind::InvalidString,
|
||||||
@ -282,7 +284,7 @@ impl<'lexer> Lexer<'lexer> {
|
|||||||
TokenKind::String(string)
|
TokenKind::String(string)
|
||||||
}
|
}
|
||||||
':' => {
|
':' => {
|
||||||
self.advance();
|
self.advance(); // ':'
|
||||||
|
|
||||||
TokenKind::Keyword(Symbol(self.read_word()))
|
TokenKind::Keyword(Symbol(self.read_word()))
|
||||||
}
|
}
|
||||||
@ -320,35 +322,40 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn $name() {
|
fn $name() {
|
||||||
let mut lexer = Lexer::new($input);
|
let mut lexer = Lexer::new($input);
|
||||||
|
|
||||||
for token in $tokens {
|
for token in $tokens {
|
||||||
let x = lexer.next().map(|r| match r {
|
let kind = lexer.next().map(|r| match r {
|
||||||
Ok(t) => Ok(t.kind),
|
Ok(t) => Ok(t.kind),
|
||||||
Err(e) => Err(e.kind),
|
Err(e) => Err(e.kind),
|
||||||
});
|
});
|
||||||
|
assert_eq!(kind, Some(token));
|
||||||
assert_eq!(x, Some(token));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(lexer.next(), None);
|
assert_eq!(lexer.next(), None);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
test!(block_comment: "#| foo\nbar |#(- 1)", [
|
test!(block_comment: "foo#| bar\nbaz |#qux", [
|
||||||
Ok(TokenKind::BlockComment("foo\nbar".into())),
|
Ok(TokenKind::Symbol(Symbol::from("foo"))),
|
||||||
|
Ok(TokenKind::BlockComment("bar\nbaz".into())),
|
||||||
|
Ok(TokenKind::Symbol(Symbol::from("qux"))),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(line_comment: "nil ;; foo; bar\nnil", [
|
||||||
|
Ok(TokenKind::Nil),
|
||||||
|
Ok(TokenKind::LineComment("foo; bar".into())),
|
||||||
|
Ok(TokenKind::Nil),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(list: "(1 () -2.3)", [
|
||||||
Ok(TokenKind::OpenParen),
|
Ok(TokenKind::OpenParen),
|
||||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
|
||||||
Ok(TokenKind::Number(1.0)),
|
Ok(TokenKind::Number(1.0)),
|
||||||
|
Ok(TokenKind::OpenParen),
|
||||||
|
Ok(TokenKind::CloseParen),
|
||||||
|
Ok(TokenKind::Number(-2.3)),
|
||||||
Ok(TokenKind::CloseParen),
|
Ok(TokenKind::CloseParen),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(line_comment: "; foo\n;; bar baz; qux", [
|
test!(set: "{{} true false}", [
|
||||||
Ok(TokenKind::LineComment("foo".into())),
|
|
||||||
Ok(TokenKind::LineComment("bar baz; qux".into())),
|
|
||||||
]);
|
|
||||||
|
|
||||||
test!(hashset: "{{} true false}", [
|
|
||||||
Ok(TokenKind::OpenBrace),
|
Ok(TokenKind::OpenBrace),
|
||||||
Ok(TokenKind::OpenBrace),
|
Ok(TokenKind::OpenBrace),
|
||||||
Ok(TokenKind::CloseBrace),
|
Ok(TokenKind::CloseBrace),
|
||||||
@ -357,70 +364,34 @@ mod tests {
|
|||||||
Ok(TokenKind::CloseBrace),
|
Ok(TokenKind::CloseBrace),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(hashmap: "#{:bar 0 :baz 1}", [
|
test!(vector: "[0 10 200]", [
|
||||||
Ok(TokenKind::OpenHashBrace),
|
Ok(TokenKind::OpenBracket),
|
||||||
Ok(TokenKind::Keyword(Symbol::from("bar"))),
|
|
||||||
Ok(TokenKind::Number(0.0)),
|
Ok(TokenKind::Number(0.0)),
|
||||||
Ok(TokenKind::Keyword(Symbol::from("baz"))),
|
Ok(TokenKind::Number(10.0)),
|
||||||
|
Ok(TokenKind::Number(200.0)),
|
||||||
|
Ok(TokenKind::CloseBracket),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(map: "#{:a 0 :b 1}", [
|
||||||
|
Ok(TokenKind::OpenHashBrace),
|
||||||
|
Ok(TokenKind::Keyword(Symbol::from("a"))),
|
||||||
|
Ok(TokenKind::Number(0.0)),
|
||||||
|
Ok(TokenKind::Keyword(Symbol::from("b"))),
|
||||||
Ok(TokenKind::Number(1.0)),
|
Ok(TokenKind::Number(1.0)),
|
||||||
Ok(TokenKind::CloseBrace),
|
Ok(TokenKind::CloseBrace),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(vector: "[0 1 2]", [
|
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
||||||
Ok(TokenKind::OpenBracket),
|
|
||||||
Ok(TokenKind::Number(0.0)),
|
|
||||||
Ok(TokenKind::Number(1.0)),
|
|
||||||
Ok(TokenKind::Number(2.0)),
|
|
||||||
Ok(TokenKind::CloseBracket),
|
|
||||||
]);
|
|
||||||
|
|
||||||
test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
|
|
||||||
Ok(TokenKind::Char('x')),
|
Ok(TokenKind::Char('x')),
|
||||||
Ok(TokenKind::Char('\n')),
|
Ok(TokenKind::Char('\n')),
|
||||||
|
Ok(TokenKind::Char('\r')),
|
||||||
|
Ok(TokenKind::Char('\t')),
|
||||||
Ok(TokenKind::Char('\x1b')),
|
Ok(TokenKind::Char('\x1b')),
|
||||||
Ok(TokenKind::Char('\\')),
|
Ok(TokenKind::Char('\\')),
|
||||||
Err(LexerErrorKind::InvalidEscape('q')),
|
Err(LexerErrorKind::InvalidEscape('q')),
|
||||||
Err(LexerErrorKind::UnclosedChar),
|
Err(LexerErrorKind::UnclosedChar),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(nested_lists: "(+ 14 25.5 333 (* 2 5))", [
|
|
||||||
Ok(TokenKind::OpenParen),
|
|
||||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
|
||||||
Ok(TokenKind::Number(14.0)),
|
|
||||||
Ok(TokenKind::Number(25.5)),
|
|
||||||
Ok(TokenKind::Number(333.0)),
|
|
||||||
Ok(TokenKind::OpenParen),
|
|
||||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
|
||||||
Ok(TokenKind::Number(2.0)),
|
|
||||||
Ok(TokenKind::Number(5.0)),
|
|
||||||
Ok(TokenKind::CloseParen),
|
|
||||||
Ok(TokenKind::CloseParen),
|
|
||||||
]);
|
|
||||||
|
|
||||||
test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
|
|
||||||
Ok(TokenKind::OpenParen),
|
|
||||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
|
||||||
Ok(TokenKind::Number(14.0)),
|
|
||||||
Ok(TokenKind::Number(25.5)),
|
|
||||||
Ok(TokenKind::Number(333.0)),
|
|
||||||
Ok(TokenKind::OpenParen),
|
|
||||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
|
||||||
Ok(TokenKind::Number(2.0)),
|
|
||||||
Ok(TokenKind::Number(5.0)),
|
|
||||||
Err(LexerErrorKind::InvalidNumber("5.x".into())),
|
|
||||||
Ok(TokenKind::CloseParen),
|
|
||||||
Ok(TokenKind::CloseParen),
|
|
||||||
]);
|
|
||||||
|
|
||||||
test!(negative_minus: "(- 1 -2 3)", [
|
|
||||||
Ok(TokenKind::OpenParen),
|
|
||||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
|
||||||
Ok(TokenKind::Number(1.0)),
|
|
||||||
Ok(TokenKind::Number(-2.0)),
|
|
||||||
Ok(TokenKind::Number(3.0)),
|
|
||||||
Ok(TokenKind::CloseParen),
|
|
||||||
]);
|
|
||||||
|
|
||||||
test!(error_unclosed_char_escape: r"'\", [
|
test!(error_unclosed_char_escape: r"'\", [
|
||||||
Err(LexerErrorKind::UnclosedChar),
|
Err(LexerErrorKind::UnclosedChar),
|
||||||
]);
|
]);
|
||||||
@ -429,12 +400,30 @@ mod tests {
|
|||||||
Err(LexerErrorKind::UnclosedChar),
|
Err(LexerErrorKind::UnclosedChar),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
|
test!(number: "0 -1 20.0 +0.003", [
|
||||||
Ok(TokenKind::Number(2.0)),
|
Ok(TokenKind::Number(0.0)),
|
||||||
Ok(TokenKind::Number(55.0)),
|
Ok(TokenKind::Number(-1.0)),
|
||||||
Ok(TokenKind::Number(3.144)),
|
Ok(TokenKind::Number(20.0)),
|
||||||
Ok(TokenKind::Number(0.0001)),
|
Ok(TokenKind::Number(0.003)),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(error_parse_number: "1.1.1 0.x", [
|
||||||
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
|
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
|
||||||
|
Err(LexerErrorKind::InvalidNumber("0.x".into())),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(string: "\"\" \"xyz\" \"This is a string!\"", [
|
||||||
|
Ok(TokenKind::String("".into())),
|
||||||
|
Ok(TokenKind::String("xyz".into())),
|
||||||
|
Ok(TokenKind::String("This is a string!".into())),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
|
||||||
|
Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
|
||||||
|
]);
|
||||||
|
|
||||||
|
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
||||||
|
Err(LexerErrorKind::InvalidEscape('p')),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(error_unclosed_string: "\"hiii", [
|
test!(error_unclosed_string: "\"hiii", [
|
||||||
@ -444,4 +433,20 @@ mod tests {
|
|||||||
test!(error_invalid_string: "\"hiii\"222", [
|
test!(error_invalid_string: "\"hiii\"222", [
|
||||||
Err(LexerErrorKind::InvalidString),
|
Err(LexerErrorKind::InvalidString),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
test!(nested_lists: "(+ (- 0 -1)\n(* 2 3))", [
|
||||||
|
Ok(TokenKind::OpenParen),
|
||||||
|
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
||||||
|
Ok(TokenKind::OpenParen),
|
||||||
|
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
||||||
|
Ok(TokenKind::Number(0.0)),
|
||||||
|
Ok(TokenKind::Number(-1.0)),
|
||||||
|
Ok(TokenKind::CloseParen),
|
||||||
|
Ok(TokenKind::OpenParen),
|
||||||
|
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
||||||
|
Ok(TokenKind::Number(2.0)),
|
||||||
|
Ok(TokenKind::Number(3.0)),
|
||||||
|
Ok(TokenKind::CloseParen),
|
||||||
|
Ok(TokenKind::CloseParen),
|
||||||
|
]);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user