Even more lexer improvements, increased test coverage too!
parent 0839bd542c
commit 11917bb183
@@ -8,3 +8,6 @@ repository.workspace = true
 license.workspace = true
 
 [dependencies]
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
@@ -1,10 +1,8 @@
 use crate::span::Span;
 
 /// Errors that can occur during lexical analysis.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum LexerErrorKind {
-    /// An invalid character was encountered.
-    InvalidChar,
     /// An invalid escape sequence was encountered.
     InvalidEscape(char),
     /// An invalid number was encountered.
@@ -18,7 +16,7 @@ pub enum LexerErrorKind {
 }
 
 /// Lexer error, with a start and end location.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Hash)]
 pub struct LexerError {
     /// The type of error encountered.
     pub kind: LexerErrorKind,
@@ -36,12 +34,12 @@ impl LexerError {
 
 impl std::error::Error for LexerError {}
 
+#[cfg(not(tarpaulin_include))]
 impl std::fmt::Display for LexerError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         use LexerErrorKind::*;
 
         match &self.kind {
-            InvalidChar => write!(f, "Invalid character literal"),
             InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
             InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
             InvalidString => write!(f, "Invalid string literal"),
@@ -92,6 +92,14 @@ impl<'lexer> Lexer<'lexer> {
         Some(c)
     }
 
+    /// Advance the lexer by one character, and then return the specified
+    /// `TokenKind`:
+    fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
+        self.advance();
+
+        kind
+    }
+
     /// Read a word from the input until a separator is reached.
     fn read_word(&mut self) -> String {
         let mut word = String::new();
@@ -135,13 +143,12 @@ impl<'lexer> Lexer<'lexer> {
         let kind = match c {
             '#' if matches!(self.peek(1), Some('|')) => {
                 self.advance(); // '#'
-                self.advance(); // '|#'
+                self.advance(); // '|'
 
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
                     match c {
                         '|' if matches!(self.peek(0), Some('#')) => {
-                            self.advance(); // '|'
                             self.advance(); // '#'
                             break;
                         }
@@ -154,44 +161,29 @@ impl<'lexer> Lexer<'lexer> {
                 TokenKind::BlockComment(comment.trim().into())
             }
             ';' => {
+                while self.current().is_some_and(|c| c == ';') {
+                    self.advance();
+                }
+
                 let mut comment = String::new();
                 while let Some(c) = self.advance() {
-                    match c {
-                        ';' => continue,
-                        '\n' => break,
-                        c => {
-                            comment.push(c);
-                        }
-                    }
-                }
+                    if c == '\n' {
+                        break;
+                    }
+
+                    comment.push(c);
+                }
 
                 TokenKind::LineComment(comment.trim().into())
             }
 
-            '(' => {
-                self.advance();
-                TokenKind::OpenParen
-            }
-            ')' => {
-                self.advance();
-                TokenKind::CloseParen
-            }
-            '{' => {
-                self.advance();
-                TokenKind::OpenBrace
-            }
-            '}' => {
-                self.advance();
-                TokenKind::CloseBrace
-            }
-            '[' => {
-                self.advance();
-                TokenKind::OpenBracket
-            }
-            ']' => {
-                self.advance();
-                TokenKind::CloseBracket
-            }
+            '(' => self.advance_and(TokenKind::OpenParen),
+            ')' => self.advance_and(TokenKind::CloseParen),
+            '{' => self.advance_and(TokenKind::OpenBrace),
+            '}' => self.advance_and(TokenKind::CloseBrace),
+            '[' => self.advance_and(TokenKind::OpenBracket),
+            ']' => self.advance_and(TokenKind::CloseBracket),
+
             '#' if matches!(self.peek(1), Some('{')) => {
                 self.advance(); // '#'
                 self.advance(); // '{'
@@ -207,6 +199,7 @@ impl<'lexer> Lexer<'lexer> {
                     Some('n') => '\n',
                     Some('e') => '\x1b',
                     Some(c) => {
+                        self.read_word(); // Recover from the error
                         return Err(LexerError::new(
                             LexerErrorKind::InvalidEscape(c),
                             span.join(&self.span()),
@@ -229,9 +222,9 @@ impl<'lexer> Lexer<'lexer> {
                 };
 
                 if self.advance() != Some('\'') {
-                    self.read_word();
+                    self.read_word(); // Recover from the error
                     return Err(LexerError::new(
-                        LexerErrorKind::InvalidChar,
+                        LexerErrorKind::UnclosedChar,
                         span.join(&self.span()),
                     ));
                 }
@@ -277,7 +270,7 @@ impl<'lexer> Lexer<'lexer> {
                 }
 
                 if self.current().is_some_and(|c| !c.is_separator()) {
-                    self.read_word();
+                    self.read_word(); // Recover from the error
                     return Err(LexerError::new(
                         LexerErrorKind::InvalidString,
                         span.join(&self.span()),
@@ -339,13 +332,17 @@ mod tests {
         };
     }
 
-    test!(block_comment: "#| foo\nbar |#", [
-        Ok(TokenKind::BlockComment("foo\nbar".into()))
+    test!(block_comment: "#| foo\nbar |#(- 1)", [
+        Ok(TokenKind::BlockComment("foo\nbar".into())),
+        Ok(TokenKind::OpenParen),
+        Ok(TokenKind::Symbol(Symbol::from("-"))),
+        Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::CloseParen),
     ]);
 
-    test!(line_comment: "; foo\n;; bar baz qux", [
+    test!(line_comment: "; foo\n;; bar baz; qux", [
         Ok(TokenKind::LineComment("foo".into())),
-        Ok(TokenKind::LineComment("bar baz qux".into())),
+        Ok(TokenKind::LineComment("bar baz; qux".into())),
     ]);
 
     test!(hashset: "{{} true false}", [
@@ -369,9 +366,21 @@ mod tests {
         Ok(TokenKind::CloseParen),
     ]);
 
-    test!(char_literal: "'x' '\n'", [
+    test!(vector: "[0 1 2]", [
+        Ok(TokenKind::OpenBracket),
+        Ok(TokenKind::Number(0.0)),
+        Ok(TokenKind::Number(1.0)),
+        Ok(TokenKind::Number(2.0)),
+        Ok(TokenKind::CloseBracket),
+    ]);
+
+    test!(char_literal: r"'x' '\n' '\e' '\\' '\q' 'b", [
         Ok(TokenKind::Char('x')),
         Ok(TokenKind::Char('\n')),
+        Ok(TokenKind::Char('\x1b')),
+        Ok(TokenKind::Char('\\')),
+        Err(LexerErrorKind::InvalidEscape('q')),
+        Err(LexerErrorKind::UnclosedChar),
     ]);
 
     test!(lex: "(+ 14 25.5 333 (* 2 5))", [
@@ -1,7 +1,7 @@
 use std::{cmp::Ordering, iter, ops::Range, sync::Arc};
 
 /// A location within some source text.
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct Location {
     line: usize,
     column: usize,
@@ -25,7 +25,7 @@ impl PartialOrd for Location {
 }
 
 /// Some (optionally named) source text.
-#[derive(Debug, Default, Clone, PartialEq, Eq)]
+#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
 pub struct Source {
     name: Option<String>,
     contents: String,
@@ -87,7 +87,7 @@ impl Source {
 }
 
 /// A contiguous sequence of bytes within some source.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, Hash)]
 pub struct Span {
     bytes: Range<usize>,
     source: Arc<Source>,