Split the lexer's number token kind into float and integer, more refactoring
This commit is contained in:
parent
ede65dcf3e
commit
ffed778a71
@ -1,7 +1,4 @@
|
||||
use std::{
|
||||
str::{Chars, FromStr},
|
||||
sync::Arc,
|
||||
};
|
||||
use std::{str::Chars, sync::Arc};
|
||||
|
||||
pub use self::{
|
||||
error::{LexerError, LexerErrorKind},
|
||||
@ -33,7 +30,7 @@ pub(crate) struct Lexer<'lexer> {
|
||||
impl<'lexer> Lexer<'lexer> {
|
||||
/// Create a new lexer instance from a string.
|
||||
#[must_use]
|
||||
pub(crate) fn new(input: &'lexer str) -> Self {
|
||||
pub fn new(input: &'lexer str) -> Self {
|
||||
let source = Arc::new(Source::new(None, input.to_string()));
|
||||
|
||||
Self {
|
||||
@ -44,21 +41,20 @@ impl<'lexer> Lexer<'lexer> {
|
||||
}
|
||||
|
||||
/// Set the name of the [Source] being lexically analyzed.
|
||||
pub(crate) fn set_name(&mut self, name: String) {
|
||||
pub fn set_name(&mut self, name: String) {
|
||||
// TODO: Avoid unwrapping here (if possible?)
|
||||
Arc::get_mut(&mut self.source).unwrap().set_name(name);
|
||||
}
|
||||
|
||||
/// The source being lexically analyzed.
|
||||
#[cfg(test)]
|
||||
#[must_use]
|
||||
pub(crate) fn source(&self) -> Arc<Source> {
|
||||
pub fn source(&self) -> Arc<Source> {
|
||||
self.source.clone()
|
||||
}
|
||||
|
||||
/// Get the current position of the lexer.
|
||||
#[must_use]
|
||||
pub(crate) fn span(&self) -> Span {
|
||||
pub fn span(&self) -> Span {
|
||||
Span::new(self.byte..self.byte, self.source.clone())
|
||||
}
|
||||
|
||||
@ -106,21 +102,8 @@ impl<'lexer> Lexer<'lexer> {
|
||||
word
|
||||
}
|
||||
|
||||
/// Parse a value from the input or return an error.
|
||||
#[must_use]
|
||||
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let span = self.span();
|
||||
let word = self.read_word();
|
||||
|
||||
word.parse()
|
||||
.map_err(|_| LexerError::new(err(word), span.join(&self.span())))
|
||||
}
|
||||
|
||||
/// Read the next token from the input.
|
||||
pub(crate) fn read(&mut self) -> Result<Option<Token>, LexerError> {
|
||||
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
|
||||
// Eat whitespace until we encounter a meaningful character, or simply return if
|
||||
// we have reached the end of input and no additional characters can be read:
|
||||
let c = loop {
|
||||
@ -136,24 +119,66 @@ impl<'lexer> Lexer<'lexer> {
|
||||
let mut span = self.span();
|
||||
|
||||
let kind = match c {
|
||||
// TODO: This allows for unclosed block comments; do we care?
|
||||
'#' if self.peek(1) == Some('|') => {
|
||||
';' => self.line_comment(),
|
||||
'#' if self.peek(1) == Some('|') => self.block_comment(),
|
||||
|
||||
'(' => self.advance_and(TokenKind::OpenParen),
|
||||
')' => self.advance_and(TokenKind::CloseParen),
|
||||
'{' => self.advance_and(TokenKind::OpenBrace),
|
||||
'}' => self.advance_and(TokenKind::CloseBrace),
|
||||
'[' => self.advance_and(TokenKind::OpenBracket),
|
||||
']' => self.advance_and(TokenKind::CloseBracket),
|
||||
'#' if self.peek(1) == Some('{') => {
|
||||
self.advance(); // '#'
|
||||
self.advance(); // '|'
|
||||
self.advance(); // '{'
|
||||
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
if c == '|' && matches!(self.peek(0), Some('#')) {
|
||||
self.advance(); // '#'
|
||||
break;
|
||||
TokenKind::OpenHashBrace
|
||||
}
|
||||
|
||||
comment.push(c);
|
||||
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
|
||||
let radix = match self.peek(1) {
|
||||
Some('b') => 2,
|
||||
Some('o') => 8,
|
||||
Some('x') => 16,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let span = span.clone();
|
||||
let word = self.read_word();
|
||||
|
||||
self.integer_literal(word, span, radix)?
|
||||
}
|
||||
'0'..='9' => self.numeric_literal(span.clone())?,
|
||||
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
|
||||
self.numeric_literal(span.clone())?
|
||||
}
|
||||
|
||||
TokenKind::BlockComment(comment.trim().into())
|
||||
'\'' => self.char_literal(span.clone())?,
|
||||
'"' => self.string_literal(span.clone())?,
|
||||
|
||||
':' => {
|
||||
self.advance(); // ':'
|
||||
|
||||
TokenKind::Keyword(Symbol::from(self.read_word()))
|
||||
}
|
||||
';' => {
|
||||
|
||||
_ => {
|
||||
let word = self.read_word();
|
||||
match word.as_str() {
|
||||
"true" => TokenKind::Bool(true),
|
||||
"false" => TokenKind::Bool(false),
|
||||
"nil" => TokenKind::Nil,
|
||||
_ => TokenKind::Symbol(Symbol::from(word)),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
span.extend(&self.span());
|
||||
|
||||
Ok(Some(Token::new(kind, span)))
|
||||
}
|
||||
|
||||
fn line_comment(&mut self) -> TokenKind {
|
||||
// Line comments may start with any number of semicolons, so consume however
|
||||
// many are present at the beginning of the comment:
|
||||
while self.current().is_some_and(|c| c == ';') {
|
||||
@ -173,20 +198,65 @@ impl<'lexer> Lexer<'lexer> {
|
||||
TokenKind::LineComment(comment.trim().into())
|
||||
}
|
||||
|
||||
'(' => self.advance_and(TokenKind::OpenParen),
|
||||
')' => self.advance_and(TokenKind::CloseParen),
|
||||
'{' => self.advance_and(TokenKind::OpenBrace),
|
||||
'}' => self.advance_and(TokenKind::CloseBrace),
|
||||
'[' => self.advance_and(TokenKind::OpenBracket),
|
||||
']' => self.advance_and(TokenKind::CloseBracket),
|
||||
'#' if self.peek(1) == Some('{') => {
|
||||
fn block_comment(&mut self) -> TokenKind {
|
||||
// TODO: This currently allows for unclosed block comments; do we care?
|
||||
self.advance(); // '#'
|
||||
self.advance(); // '{'
|
||||
self.advance(); // '|'
|
||||
|
||||
TokenKind::OpenHashBrace
|
||||
let mut comment = String::new();
|
||||
while let Some(c) = self.advance() {
|
||||
if c == '|' && matches!(self.peek(0), Some('#')) {
|
||||
self.advance(); // '#'
|
||||
break;
|
||||
}
|
||||
|
||||
'\'' => {
|
||||
comment.push(c);
|
||||
}
|
||||
|
||||
TokenKind::BlockComment(comment.trim().into())
|
||||
}
|
||||
|
||||
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
|
||||
let float = word.parse().map_err(|_| {
|
||||
LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span()))
|
||||
})?;
|
||||
|
||||
Ok(TokenKind::Float(float))
|
||||
}
|
||||
|
||||
fn integer_literal(
|
||||
&self,
|
||||
word: String,
|
||||
span: Span,
|
||||
radix: u32,
|
||||
) -> Result<TokenKind, LexerError> {
|
||||
// For numbers which are not base-10, strip the prefix (e.g. '0b', '0o', '0x'):
|
||||
let word = if radix == 10 {
|
||||
word
|
||||
} else {
|
||||
word[2..].to_string()
|
||||
};
|
||||
|
||||
let integer = i64::from_str_radix(&word, radix).map_err(|_| {
|
||||
LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span()))
|
||||
})?;
|
||||
|
||||
Ok(TokenKind::Integer(integer))
|
||||
}
|
||||
|
||||
/// Read the next word and lex it as a base-10 numeric literal.
///
/// A word containing a `.` is handed to `float_literal`; any other word is
/// handed to `integer_literal` with radix 10. Errors from either are returned
/// unchanged.
fn numeric_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
    let lexeme = self.read_word();

    // The presence of a decimal point is the only thing that selects the float path.
    if lexeme.contains('.') {
        self.float_literal(lexeme, span)
    } else {
        self.integer_literal(lexeme, span, 10)
    }
}
|
||||
|
||||
fn char_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
|
||||
self.advance(); // '\''
|
||||
|
||||
let c = match self.advance() {
|
||||
@ -227,13 +297,10 @@ impl<'lexer> Lexer<'lexer> {
|
||||
));
|
||||
}
|
||||
|
||||
TokenKind::Char(c)
|
||||
Ok(TokenKind::Char(c))
|
||||
}
|
||||
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
|
||||
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
|
||||
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
|
||||
}
|
||||
'"' => {
|
||||
|
||||
fn string_literal(&mut self, span: Span) -> Result<TokenKind, LexerError> {
|
||||
self.advance(); // '"'
|
||||
|
||||
let quote_span = span.clone().join(&self.span());
|
||||
@ -257,16 +324,11 @@ impl<'lexer> Lexer<'lexer> {
|
||||
));
|
||||
}
|
||||
None => {
|
||||
return Err(LexerError::new(
|
||||
LexerErrorKind::UnclosedString,
|
||||
quote_span,
|
||||
))
|
||||
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
|
||||
}
|
||||
},
|
||||
Some(c) => c,
|
||||
None => {
|
||||
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
|
||||
}
|
||||
None => return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span)),
|
||||
});
|
||||
}
|
||||
|
||||
@ -281,27 +343,7 @@ impl<'lexer> Lexer<'lexer> {
|
||||
));
|
||||
}
|
||||
|
||||
TokenKind::String(string)
|
||||
}
|
||||
':' => {
|
||||
self.advance(); // ':'
|
||||
|
||||
TokenKind::Keyword(Symbol(self.read_word()))
|
||||
}
|
||||
_ => {
|
||||
let word = self.read_word();
|
||||
match word.as_str() {
|
||||
"true" => TokenKind::Bool(true),
|
||||
"false" => TokenKind::Bool(false),
|
||||
"nil" => TokenKind::Nil,
|
||||
_ => TokenKind::Symbol(Symbol::from(word)),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
span.extend(&self.span());
|
||||
|
||||
Ok(Some(Token::new(kind, span)))
|
||||
Ok(TokenKind::String(string))
|
||||
}
|
||||
}
|
||||
|
||||
@ -348,10 +390,10 @@ mod tests {
|
||||
|
||||
test!(list: "(1 () -2.3)", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::Integer(1)),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::Number(-2.3)),
|
||||
Ok(TokenKind::Float(-2.3)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
@ -366,21 +408,39 @@ mod tests {
|
||||
|
||||
test!(vector: "[0 10 200]", [
|
||||
Ok(TokenKind::OpenBracket),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(10.0)),
|
||||
Ok(TokenKind::Number(200.0)),
|
||||
Ok(TokenKind::Integer(0)),
|
||||
Ok(TokenKind::Integer(10)),
|
||||
Ok(TokenKind::Integer(200)),
|
||||
Ok(TokenKind::CloseBracket),
|
||||
]);
|
||||
|
||||
test!(map: "#{:a 0 :b 1}", [
|
||||
Ok(TokenKind::OpenHashBrace),
|
||||
Ok(TokenKind::Keyword(Symbol::from("a"))),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Integer(0)),
|
||||
Ok(TokenKind::Keyword(Symbol::from("b"))),
|
||||
Ok(TokenKind::Number(1.0)),
|
||||
Ok(TokenKind::Integer(1)),
|
||||
Ok(TokenKind::CloseBrace),
|
||||
]);
|
||||
|
||||
test!(number: "0 -1 20.0 +0.003", [
|
||||
Ok(TokenKind::Integer(0)),
|
||||
Ok(TokenKind::Integer(-1)),
|
||||
Ok(TokenKind::Float(20.0)),
|
||||
Ok(TokenKind::Float(0.003)),
|
||||
]);
|
||||
|
||||
test!(number_non_base_10: "0b0011 0o70 0xFF", [
|
||||
Ok(TokenKind::Integer(3)),
|
||||
Ok(TokenKind::Integer(56)),
|
||||
Ok(TokenKind::Integer(255)),
|
||||
]);
|
||||
|
||||
test!(error_parse_number: "1.1.1 0.x", [
|
||||
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
|
||||
Err(LexerErrorKind::InvalidNumber("0.x".into())),
|
||||
]);
|
||||
|
||||
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
|
||||
Ok(TokenKind::Char('x')),
|
||||
Ok(TokenKind::Char('\n')),
|
||||
@ -400,26 +460,15 @@ mod tests {
|
||||
Err(LexerErrorKind::UnclosedChar),
|
||||
]);
|
||||
|
||||
test!(number: "0 -1 20.0 +0.003", [
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(-1.0)),
|
||||
Ok(TokenKind::Number(20.0)),
|
||||
Ok(TokenKind::Number(0.003)),
|
||||
]);
|
||||
|
||||
test!(error_parse_number: "1.1.1 0.x", [
|
||||
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
|
||||
Err(LexerErrorKind::InvalidNumber("0.x".into())),
|
||||
]);
|
||||
|
||||
test!(string: "\"\" \"xyz\" \"This is a string!\"", [
|
||||
test!(string: "\"\" \"xyz\" \"This is a string!\" \"凄い😍\"", [
|
||||
Ok(TokenKind::String("".into())),
|
||||
Ok(TokenKind::String("xyz".into())),
|
||||
Ok(TokenKind::String("This is a string!".into())),
|
||||
Ok(TokenKind::String("凄い😍".into())),
|
||||
]);
|
||||
|
||||
test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [
|
||||
Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())),
|
||||
test!(string_with_escapes: "\"\\e[0mfoo\\\"\\nbar\\r\\t\"", [
|
||||
Ok(TokenKind::String("\x1b[0mfoo\"\nbar\r\t".into())),
|
||||
]);
|
||||
|
||||
test!(error_invalid_escape_string: "\"oh no \\p\"", [
|
||||
@ -439,14 +488,22 @@ mod tests {
|
||||
Ok(TokenKind::Symbol(Symbol::from("+"))),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("-"))),
|
||||
Ok(TokenKind::Number(0.0)),
|
||||
Ok(TokenKind::Number(-1.0)),
|
||||
Ok(TokenKind::Integer(0)),
|
||||
Ok(TokenKind::Integer(-1)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("*"))),
|
||||
Ok(TokenKind::Number(2.0)),
|
||||
Ok(TokenKind::Number(3.0)),
|
||||
Ok(TokenKind::Integer(2)),
|
||||
Ok(TokenKind::Integer(3)),
|
||||
Ok(TokenKind::CloseParen),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
|
||||
test!(unicode_symbol: "(かわいい 🐕 😻)", [
|
||||
Ok(TokenKind::OpenParen),
|
||||
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
|
||||
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
|
||||
Ok(TokenKind::Symbol(Symbol::from("😻"))),
|
||||
Ok(TokenKind::CloseParen),
|
||||
]);
|
||||
}
|
||||
|
@ -28,8 +28,10 @@ pub enum TokenKind {
|
||||
Bool(bool),
|
||||
/// Character, e.g. `'c'`, `'\n'`
|
||||
Char(char),
|
||||
/// Number, e.g. `1`, `2.0`, `0.003`
|
||||
Number(f64),
|
||||
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
|
||||
Float(f64),
|
||||
/// Integer, e.g. `0`, `-1`, `+200`
|
||||
Integer(i64),
|
||||
/// String, e.g. `"foo bar"`
|
||||
String(String),
|
||||
/// Keyword, e.g. `:baz`
|
||||
|
Loading…
Reference in New Issue
Block a user