Lexer improvements, more tests, some reorg

This commit is contained in:
Jesse Braham 2024-12-02 17:57:11 +01:00
parent de78b9840a
commit 0839bd542c
3 changed files with 150 additions and 105 deletions

View File

@ -22,7 +22,7 @@ pub trait Separator {
impl Separator for char { impl Separator for char {
fn is_separator(&self) -> bool { fn is_separator(&self) -> bool {
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}' | ',') self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}')
} }
} }
@ -49,11 +49,8 @@ impl<'lexer> Lexer<'lexer> {
/// Set the name of the [Source] being lexically analyzed. /// Set the name of the [Source] being lexically analyzed.
pub fn set_name(&mut self, name: String) { pub fn set_name(&mut self, name: String) {
if let Some(source) = Arc::get_mut(&mut self.source) { // FIXME: Avoid unwrapping here (if possible?)
source.set_name(name); Arc::get_mut(&mut self.source).unwrap().set_name(name);
} else {
unimplemented!(); // FIXME: What should we do in this case?
}
} }
/// The source being lexically analyzed. /// The source being lexically analyzed.
@ -62,12 +59,6 @@ impl<'lexer> Lexer<'lexer> {
self.source.clone() self.source.clone()
} }
/// Get the current character.
#[must_use]
pub fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
}
/// Get the unparsed input. /// Get the unparsed input.
#[must_use] #[must_use]
pub fn get_unparsed(&self) -> &str { pub fn get_unparsed(&self) -> &str {
@ -81,7 +72,14 @@ impl<'lexer> Lexer<'lexer> {
Span::new(self.byte..self.byte, self.source.clone()) Span::new(self.byte..self.byte, self.source.clone())
} }
/// Get the current character.
#[must_use]
fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
}
/// Get the nth character ahead of the current character without advancing. /// Get the nth character ahead of the current character without advancing.
#[inline]
fn peek(&self, n: usize) -> Option<char> { fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n) self.input.as_str().chars().nth(n)
} }
@ -98,7 +96,7 @@ impl<'lexer> Lexer<'lexer> {
fn read_word(&mut self) -> String { fn read_word(&mut self) -> String {
let mut word = String::new(); let mut word = String::new();
while let Some(c) = self.current() { while let Some(c) = self.current() {
if c.is_separator() { if c.is_separator() || (c == '#' && self.peek(1) == Some('{')) {
break; break;
} }
@ -125,7 +123,7 @@ impl<'lexer> Lexer<'lexer> {
pub fn read(&mut self) -> Result<Option<Token>, LexerError> { pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
let c = loop { let c = loop {
match self.current() { match self.current() {
Some(c) if c.is_ascii_whitespace() || c == ',' => { Some(c) if c.is_ascii_whitespace() => {
self.advance(); self.advance();
} }
Some(c) => break c, Some(c) => break c,
@ -135,6 +133,41 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span(); let mut span = self.span();
let kind = match c { let kind = match c {
'#' if matches!(self.peek(1), Some('|')) => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
'|' if matches!(self.peek(0), Some('#')) => {
self.advance(); // '|'
self.advance(); // '#'
break;
}
c => {
comment.push(c);
}
}
}
TokenKind::BlockComment(comment.trim().into())
}
';' => {
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
';' => continue,
'\n' => break,
c => {
comment.push(c);
}
}
}
TokenKind::LineComment(comment.trim().into())
}
'(' => { '(' => {
self.advance(); self.advance();
TokenKind::OpenParen TokenKind::OpenParen
@ -159,48 +192,56 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); self.advance();
TokenKind::CloseBracket TokenKind::CloseBracket
} }
'#' if matches!(self.peek(1), Some('{')) => {
self.advance(); // '#'
self.advance(); // '{'
TokenKind::OpenHashBrace
}
'\'' => {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
},
Some(c) => c,
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
))
}
};
if self.advance() != Some('\'') {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidChar,
span.join(&self.span()),
));
}
TokenKind::Char(c)
}
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?), '0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => { '+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?) TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
} }
';' => {
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
';' => continue,
'\n' => break,
c => {
comment.push(c);
}
}
}
TokenKind::LineComment(comment.trim().into())
}
'#' if matches!(self.peek(1), Some('|')) => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
'|' if matches!(self.peek(0), Some('#')) => {
self.advance(); // '|'
self.advance(); // '#'
break;
}
c => {
comment.push(c);
}
}
}
TokenKind::BlockComment(comment.trim().into())
}
':' => {
self.advance();
TokenKind::Keyword(Symbol(self.read_word()))
}
'"' => { '"' => {
self.advance(); // '"' self.advance(); // '"'
@ -245,45 +286,9 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::String(string) TokenKind::String(string)
} }
'\'' => { ':' => {
self.advance(); // '\'' self.advance();
TokenKind::Keyword(Symbol(self.read_word()))
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
},
Some(c) => c,
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
))
}
};
if self.advance() != Some('\'') {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidChar,
span.join(&self.span()),
));
}
TokenKind::Char(c)
} }
_ => { _ => {
let word = self.read_word(); let word = self.read_word();
@ -334,6 +339,41 @@ mod tests {
}; };
} }
test!(block_comment: "#| foo\nbar |#", [
Ok(TokenKind::BlockComment("foo\nbar".into()))
]);
test!(line_comment: "; foo\n;; bar baz qux", [
Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz qux".into())),
]);
test!(hashset: "{{} true false}", [
Ok(TokenKind::OpenBrace),
Ok(TokenKind::OpenBrace),
Ok(TokenKind::CloseBrace),
Ok(TokenKind::Bool(true)),
Ok(TokenKind::Bool(false)),
Ok(TokenKind::CloseBrace),
]);
test!(hashmap: "(foo #{:bar 0 :baz 1})", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("foo"))),
Ok(TokenKind::OpenHashBrace),
Ok(TokenKind::Keyword(Symbol::from("bar"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Keyword(Symbol::from("baz"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::CloseBrace),
Ok(TokenKind::CloseParen),
]);
test!(char_literal: "'x' '\n'", [
Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')),
]);
test!(lex: "(+ 14 25.5 333 (* 2 5))", [ test!(lex: "(+ 14 25.5 333 (* 2 5))", [
Ok(TokenKind::OpenParen), Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))), Ok(TokenKind::Symbol(Symbol::from("+"))),
@ -372,15 +412,6 @@ mod tests {
Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen),
]); ]);
test!(line_comment: "; foo\n;; bar baz qux", [
Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz qux".into())),
]);
test!(block_comment: "#| foo\nbar |#", [
Ok(TokenKind::BlockComment("foo\nbar".into()))
]);
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [ test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
Ok(TokenKind::Number(2.0)), Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(55.0)), Ok(TokenKind::Number(55.0)),

View File

@ -17,3 +17,15 @@ impl std::fmt::Display for Symbol {
write!(f, "{}", self.0) write!(f, "{}", self.0)
} }
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn display() {
assert_eq!(Symbol::from("foo").to_string(), "foo");
assert_eq!(Symbol::from("+").to_string(), "+");
assert_eq!(Symbol::from("bar0").to_string(), "bar0");
}
}

View File

@ -4,6 +4,11 @@ use crate::span::Span;
/// The type of a [Token]. /// The type of a [Token].
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
pub enum TokenKind { pub enum TokenKind {
/// Block comment, e.g. '#| ... |#'
BlockComment(String),
/// Line comment, e.g. '; ...'
LineComment(String),
/// Opening parenthesis, e.g. '(' /// Opening parenthesis, e.g. '('
OpenParen, OpenParen,
/// Closing parenthesis, e.g. ')' /// Closing parenthesis, e.g. ')'
@ -16,11 +21,8 @@ pub enum TokenKind {
OpenBracket, OpenBracket,
/// Closing bracket, e.g. ']' /// Closing bracket, e.g. ']'
CloseBracket, CloseBracket,
/// Opening hash-brace, e.g. '#{'
/// Block comment, e.g. '#| ... |#' OpenHashBrace,
BlockComment(String),
/// Line comment, e.g. '; ...'
LineComment(String),
/// Boolean, e.g. 'true', 'false' /// Boolean, e.g. 'true', 'false'
Bool(bool), Bool(bool),