Split the lexer's number token kind into float and integer, more refactoring

This commit is contained in:
Jesse Braham 2024-12-13 16:42:27 +01:00
parent ede65dcf3e
commit ffed778a71
2 changed files with 236 additions and 177 deletions

View File

@ -1,7 +1,4 @@
use std::{
str::{Chars, FromStr},
sync::Arc,
};
use std::{str::Chars, sync::Arc};
pub use self::{
error::{LexerError, LexerErrorKind},
@ -33,7 +30,7 @@ pub(crate) struct Lexer<'lexer> {
impl<'lexer> Lexer<'lexer> {
/// Create a new lexer instance from a string.
#[must_use]
pub(crate) fn new(input: &'lexer str) -> Self {
pub fn new(input: &'lexer str) -> Self {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
@ -44,21 +41,20 @@ impl<'lexer> Lexer<'lexer> {
}
/// Set the name of the [Source] being lexically analyzed.
pub(crate) fn set_name(&mut self, name: String) {
pub fn set_name(&mut self, name: String) {
// TODO: Avoid unwrapping here (if possible?)
Arc::get_mut(&mut self.source).unwrap().set_name(name);
}
/// The source being lexically analyzed.
#[cfg(test)]
#[must_use]
pub(crate) fn source(&self) -> Arc<Source> {
pub fn source(&self) -> Arc<Source> {
self.source.clone()
}
/// Get the current position of the lexer.
#[must_use]
pub(crate) fn span(&self) -> Span {
pub fn span(&self) -> Span {
Span::new(self.byte..self.byte, self.source.clone())
}
@ -106,21 +102,8 @@ impl<'lexer> Lexer<'lexer> {
word
}
/// Parse a value from the input or return an error.
#[must_use]
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
where
T: FromStr,
{
let span = self.span();
let word = self.read_word();
word.parse()
.map_err(|_| LexerError::new(err(word), span.join(&self.span())))
}
/// Read the next token from the input.
pub(crate) fn read(&mut self) -> Result<Option<Token>, LexerError> {
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
// Eat whitespace until we encounter a meaningful character, or simply return if
// we have reached the end of input and no additional characters can be read:
let c = loop {
@ -136,42 +119,8 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span();
let kind = match c {
// TODO: This allows for unclosed block comments; do we care?
'#' if self.peek(1) == Some('|') => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
if c == '|' && matches!(self.peek(0), Some('#')) {
self.advance(); // '#'
break;
}
comment.push(c);
}
TokenKind::BlockComment(comment.trim().into())
}
';' => {
// Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment:
while self.current().is_some_and(|c| c == ';') {
self.advance();
}
// Line comments continue until a newline character is encountered:
let mut comment = String::new();
while let Some(c) = self.advance() {
if c == '\n' {
break;
}
comment.push(c);
}
TokenKind::LineComment(comment.trim().into())
}
';' => self.line_comment(),
'#' if self.peek(1) == Some('|') => self.block_comment(),
'(' => self.advance_and(TokenKind::OpenParen),
')' => self.advance_and(TokenKind::CloseParen),
@ -186,108 +135,33 @@ impl<'lexer> Lexer<'lexer> {
TokenKind::OpenHashBrace
}
'\'' => {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some(c) => {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
},
Some(c) => c,
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
))
}
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
let radix = match self.peek(1) {
Some('b') => 2,
Some('o') => 8,
Some('x') => 16,
_ => unreachable!(),
};
if self.advance() != Some('\'') {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
let span = span.clone();
let word = self.read_word();
TokenKind::Char(c)
self.integer_literal(word, span, radix)?
}
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
'0'..='9' => self.numeric_literal(span.clone())?,
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
self.numeric_literal(span.clone())?
}
'"' => {
self.advance(); // '"'
let quote_span = span.clone().join(&self.span());
let mut string = String::new();
'\'' => self.char_literal(span.clone())?,
'"' => self.string_literal(span.clone())?,
loop {
let ch_span = self.span();
string.push(match self.advance() {
Some('"') => break,
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some(c) => {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
ch_span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedString,
quote_span,
))
}
},
Some(c) => c,
None => {
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
}
});
}
if self
.current()
.is_some_and(|c| !is_separator(c, self.peek(1)))
{
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidString,
span.join(&self.span()),
));
}
TokenKind::String(string)
}
':' => {
self.advance(); // ':'
TokenKind::Keyword(Symbol(self.read_word()))
TokenKind::Keyword(Symbol::from(self.read_word()))
}
_ => {
let word = self.read_word();
match word.as_str() {
@ -303,6 +177,174 @@ impl<'lexer> Lexer<'lexer> {
Ok(Some(Token::new(kind, span)))
}
/// Lex a line comment, starting at its leading `;`.
///
/// Every leading semicolon is skipped, then characters are consumed up to and
/// including the next newline (or until the input runs out). The comment
/// text is stored with surrounding whitespace trimmed.
fn line_comment(&mut self) -> TokenKind {
    // A comment may open with any number of semicolons; none of them are
    // part of the comment text:
    while self.current() == Some(';') {
        self.advance();
    }

    // Pull characters until the terminating newline; `take_while` consumes
    // (and discards) the newline itself, and stops on end of input:
    let text: String = std::iter::from_fn(|| self.advance())
        .take_while(|&ch| ch != '\n')
        .collect();

    TokenKind::LineComment(text.trim().into())
}
/// Lex a block comment of the form `#| ... |#`.
///
/// The two-character opener is skipped, then characters are collected until
/// the `|#` terminator has been consumed. The comment text is stored with
/// surrounding whitespace trimmed.
fn block_comment(&mut self) -> TokenKind {
    // TODO: This currently allows for unclosed block comments; do we care?
    self.advance(); // '#'
    self.advance(); // '|'

    let mut body = String::new();
    loop {
        match self.advance() {
            // End of input before a terminator: accept what we have so far.
            None => break,
            // A '|' immediately followed by '#' closes the comment:
            Some('|') if self.peek(0) == Some('#') => {
                self.advance(); // '#'
                break;
            }
            Some(ch) => body.push(ch),
        }
    }

    TokenKind::BlockComment(body.trim().into())
}
/// Convert an already-read `word` into a floating-point token.
///
/// # Errors
///
/// Returns [`LexerErrorKind::InvalidNumber`] carrying `word` when it cannot
/// be parsed as an `f64`. The error span runs from `span` to the lexer's
/// current position.
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
    match word.parse::<f64>() {
        Ok(value) => Ok(TokenKind::Float(value)),
        Err(_) => Err(LexerError::new(
            LexerErrorKind::InvalidNumber(word),
            span.join(&self.span()),
        )),
    }
}
/// Convert an already-read `word` into an integer token using `radix`.
///
/// For any radix other than 10 the first two characters of `word` are
/// treated as a prefix (e.g. `0b`, `0o`, `0x`) and skipped before parsing.
///
/// # Errors
///
/// Returns [`LexerErrorKind::InvalidNumber`] carrying the *complete* `word`
/// (prefix included) when the digits cannot be parsed as an `i64` in the
/// given radix, when the prefix cannot be stripped, or when a sign follows
/// the prefix. The error span runs from `span` to the lexer's current
/// position.
///
/// # Panics
///
/// `i64::from_str_radix` panics if `radix` is outside `2..=36`.
fn integer_literal(
    &self,
    word: String,
    span: Span,
    radix: u32,
) -> Result<TokenKind, LexerError> {
    let digits = if radix == 10 {
        Some(word.as_str())
    } else {
        // Checked slicing: a word shorter than its prefix becomes an error
        // rather than a panic. `from_str_radix` accepts a leading sign, so a
        // sign after the prefix (e.g. `0x-5`) is rejected explicitly.
        word.get(2..)
            .filter(|rest| !rest.starts_with(|c: char| c == '+' || c == '-'))
    };

    match digits.and_then(|d| i64::from_str_radix(d, radix).ok()) {
        Some(value) => Ok(TokenKind::Integer(value)),
        // Report the literal exactly as written, prefix and all:
        None => Err(LexerError::new(
            LexerErrorKind::InvalidNumber(word),
            span.join(&self.span()),
        )),
    }
}
/// Read the next word and lex it as a base-10 numeric literal.
///
/// A word containing a `.` is handed to `float_literal`; anything else is
/// handed to `integer_literal` with radix 10.
///
/// # Errors
///
/// Propagates the error returned by whichever of the two helpers was used.
fn numeric_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
    let literal = self.read_word();

    if literal.contains('.') {
        return self.float_literal(literal, span);
    }

    self.integer_literal(literal, span, 10)
}
/// Lex a character literal such as `'c'` or `'\n'`, starting at the opening
/// quote.
///
/// Supported escapes are `\'`, `\"`, `\\`, `\n`, `\r`, `\t` and `\e` (the
/// escape character, `0x1b`).
///
/// # Errors
///
/// - [`LexerErrorKind::InvalidEscape`] when a backslash is followed by any
///   other character.
/// - [`LexerErrorKind::UnclosedChar`] when the input ends inside the literal
///   or the character is not followed by a closing `'`.
///
/// Every error span runs from `span` to the lexer's position after recovery.
fn char_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
    self.advance(); // opening '\''

    let c = match self.advance() {
        Some('\\') => match self.advance() {
            // `\'` is accepted so the literal's own delimiter can be escaped,
            // mirroring how string literals accept `\"`:
            Some(c @ ('\'' | '"' | '\\')) => c,
            Some('n') => '\n',
            Some('r') => '\r',
            Some('t') => '\t',
            Some('e') => '\x1b',
            Some(c) => {
                self.read_word(); // Recover from the error
                return Err(LexerError::new(
                    LexerErrorKind::InvalidEscape(c),
                    span.join(&self.span()),
                ));
            }
            None => {
                return Err(LexerError::new(
                    LexerErrorKind::UnclosedChar,
                    span.join(&self.span()),
                ));
            }
        },
        Some(c) => c,
        None => {
            return Err(LexerError::new(
                LexerErrorKind::UnclosedChar,
                span.join(&self.span()),
            ));
        }
    };

    // Exactly one character is allowed between the quotes:
    if self.advance() != Some('\'') {
        self.read_word(); // Recover from the error
        return Err(LexerError::new(
            LexerErrorKind::UnclosedChar,
            span.join(&self.span()),
        ));
    }

    Ok(TokenKind::Char(c))
}
/// Lex a string literal, starting at the opening `"`.
///
/// Supported escapes are `\"`, `\\`, `\n`, `\r`, `\t` and `\e` (the escape
/// character, `0x1b`).
///
/// # Errors
///
/// - [`LexerErrorKind::UnclosedString`] when the input ends before the
///   closing quote; the span covers only the opening quote.
/// - [`LexerErrorKind::InvalidEscape`] when a backslash is followed by any
///   other character; the span starts at the backslash.
/// - [`LexerErrorKind::InvalidString`] when the closing quote is directly
///   followed by something `is_separator` does not accept.
fn string_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
    self.advance(); // opening '"'

    // Remember where the opening quote sits, for "unclosed" diagnostics:
    let quote_span = span.clone().join(&self.span());
    let mut contents = String::new();

    loop {
        let start = self.span();

        let next = match self.advance() {
            None => return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span)),
            Some('"') => break,
            Some('\\') => {
                let Some(escaped) = self.advance() else {
                    return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span));
                };

                match escaped {
                    '"' | '\\' => escaped,
                    'n' => '\n',
                    'r' => '\r',
                    't' => '\t',
                    'e' => '\x1b',
                    other => {
                        self.read_word(); // Recover from the error
                        return Err(LexerError::new(
                            LexerErrorKind::InvalidEscape(other),
                            start.join(&self.span()),
                        ));
                    }
                }
            }
            Some(plain) => plain,
        };

        contents.push(next);
    }

    // The closing quote must be followed by a separator or the end of input:
    let has_trailing_garbage = match self.current() {
        Some(c) => !is_separator(c, self.peek(1)),
        None => false,
    };

    if has_trailing_garbage {
        self.read_word(); // Recover from the error
        return Err(LexerError::new(
            LexerErrorKind::InvalidString,
            span.join(&self.span()),
        ));
    }

    Ok(TokenKind::String(contents))
}
}
impl Iterator for Lexer<'_> {
@ -348,10 +390,10 @@ mod tests {
test!(list: "(1 () -2.3)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Integer(1)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::CloseParen),
Ok(TokenKind::Number(-2.3)),
Ok(TokenKind::Float(-2.3)),
Ok(TokenKind::CloseParen),
]);
@ -366,21 +408,39 @@ mod tests {
test!(vector: "[0 10 200]", [
Ok(TokenKind::OpenBracket),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(10.0)),
Ok(TokenKind::Number(200.0)),
Ok(TokenKind::Integer(0)),
Ok(TokenKind::Integer(10)),
Ok(TokenKind::Integer(200)),
Ok(TokenKind::CloseBracket),
]);
test!(map: "#{:a 0 :b 1}", [
Ok(TokenKind::OpenHashBrace),
Ok(TokenKind::Keyword(Symbol::from("a"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Integer(0)),
Ok(TokenKind::Keyword(Symbol::from("b"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Integer(1)),
Ok(TokenKind::CloseBrace),
]);
test!(number: "0 -1 20.0 +0.003", [
Ok(TokenKind::Integer(0)),
Ok(TokenKind::Integer(-1)),
Ok(TokenKind::Float(20.0)),
Ok(TokenKind::Float(0.003)),
]);
test!(number_non_base_10: "0b0011 0o70 0xFF", [
Ok(TokenKind::Integer(3)),
Ok(TokenKind::Integer(56)),
Ok(TokenKind::Integer(255)),
]);
test!(error_parse_number: "1.1.1 0.x", [
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
Err(LexerErrorKind::InvalidNumber("0.x".into())),
]);
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')),
@ -400,26 +460,15 @@ mod tests {
Err(LexerErrorKind::UnclosedChar),
]);
test!(number: "0 -1 20.0 +0.003", [
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(-1.0)),
Ok(TokenKind::Number(20.0)),
Ok(TokenKind::Number(0.003)),
]);
test!(error_parse_number: "1.1.1 0.x", [
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
Err(LexerErrorKind::InvalidNumber("0.x".into())),
]);
test!(string: "\"\" \"xyz\" \"This is a string!\"", [
test!(string: "\"\" \"xyz\" \"This is a string!\" \"凄い😍\"", [
Ok(TokenKind::String("".into())),
Ok(TokenKind::String("xyz".into())),
Ok(TokenKind::String("This is a string!".into())),
Ok(TokenKind::String("凄い😍".into())),
]);
test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
test!(string_with_escapes: "\"\\e[0mfoo\\\"\\nbar\\r\\t\"", [
Ok(TokenKind::String("\x1b[0mfoo\"\nbar\r\t".into())),
]);
test!(error_invalid_escape_string: "\"oh no \\p\"", [
@ -439,14 +488,22 @@ mod tests {
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(0.0)),
Ok(TokenKind::Number(-1.0)),
Ok(TokenKind::Integer(0)),
Ok(TokenKind::Integer(-1)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(3.0)),
Ok(TokenKind::Integer(2)),
Ok(TokenKind::Integer(3)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(unicode_symbol: "(かわいい 🐕 😻)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
Ok(TokenKind::Symbol(Symbol::from("😻"))),
Ok(TokenKind::CloseParen),
]);
}

View File

@ -28,8 +28,10 @@ pub enum TokenKind {
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
/// Number, e.g. `1`, `2.0`, `0.003`
Number(f64),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`
Integer(i64),
/// String, e.g. `"foo bar"`
String(String),
/// Keyword, e.g. `:baz`