From ffed778a71c390a30b82e390577d59d695d5ae0a Mon Sep 17 00:00:00 2001 From: Jesse Braham Date: Fri, 13 Dec 2024 16:42:27 +0100 Subject: [PATCH] Split the lexer's number token kind into float and integer, more refactoring --- onihime/src/lexer/mod.rs | 407 +++++++++++++++++++++---------------- onihime/src/lexer/token.rs | 6 +- 2 files changed, 236 insertions(+), 177 deletions(-) diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs index b996410..f423401 100644 --- a/onihime/src/lexer/mod.rs +++ b/onihime/src/lexer/mod.rs @@ -1,7 +1,4 @@ -use std::{ - str::{Chars, FromStr}, - sync::Arc, -}; +use std::{str::Chars, sync::Arc}; pub use self::{ error::{LexerError, LexerErrorKind}, @@ -33,7 +30,7 @@ pub(crate) struct Lexer<'lexer> { impl<'lexer> Lexer<'lexer> { /// Create a new lexer instance from a string. #[must_use] - pub(crate) fn new(input: &'lexer str) -> Self { + pub fn new(input: &'lexer str) -> Self { let source = Arc::new(Source::new(None, input.to_string())); Self { @@ -44,21 +41,20 @@ impl<'lexer> Lexer<'lexer> { } /// Set the name of the [Source] being lexically analyzed. - pub(crate) fn set_name(&mut self, name: String) { + pub fn set_name(&mut self, name: String) { // TODO: Avoid unwrapping here (if possible?) Arc::get_mut(&mut self.source).unwrap().set_name(name); } /// The source being lexically analyzed. - #[cfg(test)] #[must_use] - pub(crate) fn source(&self) -> Arc { + pub fn source(&self) -> Arc { self.source.clone() } /// Get the current position of the lexer. #[must_use] - pub(crate) fn span(&self) -> Span { + pub fn span(&self) -> Span { Span::new(self.byte..self.byte, self.source.clone()) } @@ -106,21 +102,8 @@ impl<'lexer> Lexer<'lexer> { word } - /// Parse a value from the input or return an error. - #[must_use] - fn parse_or(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result - where - T: FromStr, - { - let span = self.span(); - let word = self.read_word(); - - word.parse() - .map_err(|_| LexerError::new(err(word), span.join(&self.span()))) - } - /// Read the next token from the input. - pub(crate) fn read(&mut self) -> Result, LexerError> { + pub fn read(&mut self) -> Result, LexerError> { // Eat whitespace until we encounter a meaningful character, or simply return if // we have reached the end of input and no additional characters can be read: let c = loop { @@ -136,42 +119,8 @@ impl<'lexer> Lexer<'lexer> { let mut span = self.span(); let kind = match c { - // TODO: This allows for unclosed block comments; do we care? - '#' if self.peek(1) == Some('|') => { - self.advance(); // '#' - self.advance(); // '|' - - let mut comment = String::new(); - while let Some(c) = self.advance() { - if c == '|' && matches!(self.peek(0), Some('#')) { - self.advance(); // '#' - break; - } - - comment.push(c); - } - - TokenKind::BlockComment(comment.trim().into()) - } - ';' => { - // Line comments may start with any number of semicolons, so consume however - // many are present at the beginning of the comment: - while self.current().is_some_and(|c| c == ';') { - self.advance(); - } - - // Line comments continue until a newline character is encountered: - let mut comment = String::new(); - while let Some(c) = self.advance() { - if c == '\n' { - break; - } - - comment.push(c); - } - - TokenKind::LineComment(comment.trim().into()) - } + ';' => self.line_comment(), + '#' if self.peek(1) == Some('|') => self.block_comment(), '(' => self.advance_and(TokenKind::OpenParen), ')' => self.advance_and(TokenKind::CloseParen), @@ -186,108 +135,33 @@ impl<'lexer> Lexer<'lexer> { TokenKind::OpenHashBrace } - '\'' => { - self.advance(); // '\'' - - let c = match self.advance() { - Some('\\') => match self.advance() { - Some(c @ ('"' | '\\')) => c, - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('e') => '\x1b', - Some(c) => { - self.read_word(); // Recover from the error - return Err(LexerError::new( - LexerErrorKind::InvalidEscape(c), - span.join(&self.span()), - )); - } - None => { - return Err(LexerError::new( - LexerErrorKind::UnclosedChar, - span.join(&self.span()), - )); - } - }, - Some(c) => c, - None => { - return Err(LexerError::new( - LexerErrorKind::UnclosedChar, - span.join(&self.span()), - )) - } + '0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => { + let radix = match self.peek(1) { + Some('b') => 2, + Some('o') => 8, + Some('x') => 16, + _ => unreachable!(), }; - if self.advance() != Some('\'') { - self.read_word(); // Recover from the error - return Err(LexerError::new( - LexerErrorKind::UnclosedChar, - span.join(&self.span()), - )); - } + let span = span.clone(); + let word = self.read_word(); - TokenKind::Char(c) + self.integer_literal(word, span, radix)? } - '0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?), + '0'..='9' => self.numeric_literal(span.clone())?, '+' | '-' if matches!(self.peek(1), Some('0'..='9')) => { - TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?) + self.numeric_literal(span.clone())? } - '"' => { - self.advance(); // '"' - let quote_span = span.clone().join(&self.span()); - let mut string = String::new(); + '\'' => self.char_literal(span.clone())?, + '"' => self.string_literal(span.clone())?, - loop { - let ch_span = self.span(); - string.push(match self.advance() { - Some('"') => break, - Some('\\') => match self.advance() { - Some(c @ ('"' | '\\')) => c, - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('e') => '\x1b', - Some(c) => { - self.read_word(); // Recover from the error - return Err(LexerError::new( - LexerErrorKind::InvalidEscape(c), - ch_span.join(&self.span()), - )); - } - None => { - return Err(LexerError::new( - LexerErrorKind::UnclosedString, - quote_span, - )) - } - }, - Some(c) => c, - None => { - return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span)) - } - }); - } - - if self - .current() - .is_some_and(|c| !is_separator(c, self.peek(1))) - { - self.read_word(); // Recover from the error - return Err(LexerError::new( - LexerErrorKind::InvalidString, - span.join(&self.span()), - )); - } - - TokenKind::String(string) - } ':' => { self.advance(); // ':' - TokenKind::Keyword(Symbol(self.read_word())) + TokenKind::Keyword(Symbol::from(self.read_word())) } + _ => { let word = self.read_word(); match word.as_str() { @@ -303,6 +177,174 @@ impl<'lexer> Lexer<'lexer> { Ok(Some(Token::new(kind, span))) } + + fn line_comment(&mut self) -> TokenKind { + // Line comments may start with any number of semicolons, so consume however + // many are present at the beginning of the comment: + while self.current().is_some_and(|c| c == ';') { + self.advance(); + } + + // Line comments continue until a newline character is encountered: + let mut comment = String::new(); + while let Some(c) = self.advance() { + if c == '\n' { + break; + } + + comment.push(c); + } + + TokenKind::LineComment(comment.trim().into()) + } + + fn block_comment(&mut self) -> TokenKind { + // TODO: This currently allows for unclosed block comments; do we care? + self.advance(); // '#' + self.advance(); // '|' + + let mut comment = String::new(); + while let Some(c) = self.advance() { + if c == '|' && matches!(self.peek(0), Some('#')) { + self.advance(); // '#' + break; + } + + comment.push(c); + } + + TokenKind::BlockComment(comment.trim().into()) + } + + fn float_literal(&self, word: String, span: Span) -> Result { + let float = word.parse().map_err(|_| { + LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span())) + })?; + + Ok(TokenKind::Float(float)) + } + + fn integer_literal( + &self, + word: String, + span: Span, + radix: u32, + ) -> Result { + // For numbers which are not base-10, strip the prefix (e.g. '0b', '0o', '0x'): + let word = if radix == 10 { + word + } else { + word[2..].to_string() + }; + + let integer = i64::from_str_radix(&word, radix).map_err(|_| { + LexerError::new(LexerErrorKind::InvalidNumber(word), span.join(&self.span())) + })?; + + Ok(TokenKind::Integer(integer)) + } + + fn numeric_literal(&mut self, span: Span) -> Result { + let word = self.read_word(); + + let kind = if word.contains('.') { + self.float_literal(word, span)? + } else { + self.integer_literal(word, span, 10)? + }; + + Ok(kind) + } + + fn char_literal(&mut self, span: Span) -> Result { + self.advance(); // '\'' + + let c = match self.advance() { + Some('\\') => match self.advance() { + Some(c @ ('"' | '\\')) => c, + Some('n') => '\n', + Some('r') => '\r', + Some('t') => '\t', + Some('e') => '\x1b', + Some(c) => { + self.read_word(); // Recover from the error + return Err(LexerError::new( + LexerErrorKind::InvalidEscape(c), + span.join(&self.span()), + )); + } + None => { + return Err(LexerError::new( + LexerErrorKind::UnclosedChar, + span.join(&self.span()), + )); + } + }, + Some(c) => c, + None => { + return Err(LexerError::new( + LexerErrorKind::UnclosedChar, + span.join(&self.span()), + )) + } + }; + + if self.advance() != Some('\'') { + self.read_word(); // Recover from the error + return Err(LexerError::new( + LexerErrorKind::UnclosedChar, + span.join(&self.span()), + )); + } + + Ok(TokenKind::Char(c)) + } + + fn string_literal(&mut self, span: Span) -> Result { + self.advance(); // '"' + + let quote_span = span.clone().join(&self.span()); + let mut string = String::new(); + + loop { + let ch_span = self.span(); + string.push(match self.advance() { + Some('"') => break, + Some('\\') => match self.advance() { + Some(c @ ('"' | '\\')) => c, + Some('n') => '\n', + Some('r') => '\r', + Some('t') => '\t', + Some('e') => '\x1b', + Some(c) => { + self.read_word(); // Recover from the error + return Err(LexerError::new( + LexerErrorKind::InvalidEscape(c), + ch_span.join(&self.span()), + )); + } + None => { + return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span)) + } + }, + Some(c) => c, + None => return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span)), + }); + } + + if self + .current() + .is_some_and(|c| !is_separator(c, self.peek(1))) + { + self.read_word(); // Recover from the error + return Err(LexerError::new( + LexerErrorKind::InvalidString, + span.join(&self.span()), + )); + } + + Ok(TokenKind::String(string)) + } } impl Iterator for Lexer<'_> { @@ -348,10 +390,10 @@ mod tests { test!(list: "(1 () -2.3)", [ Ok(TokenKind::OpenParen), - Ok(TokenKind::Number(1.0)), + Ok(TokenKind::Integer(1)), Ok(TokenKind::OpenParen), Ok(TokenKind::CloseParen), - Ok(TokenKind::Number(-2.3)), + Ok(TokenKind::Float(-2.3)), Ok(TokenKind::CloseParen), ]); @@ -366,21 +408,39 @@ mod tests { test!(vector: "[0 10 200]", [ Ok(TokenKind::OpenBracket), - Ok(TokenKind::Number(0.0)), - Ok(TokenKind::Number(10.0)), - Ok(TokenKind::Number(200.0)), + Ok(TokenKind::Integer(0)), + Ok(TokenKind::Integer(10)), + Ok(TokenKind::Integer(200)), Ok(TokenKind::CloseBracket), ]); test!(map: "#{:a 0 :b 1}", [ Ok(TokenKind::OpenHashBrace), Ok(TokenKind::Keyword(Symbol::from("a"))), - Ok(TokenKind::Number(0.0)), + Ok(TokenKind::Integer(0)), Ok(TokenKind::Keyword(Symbol::from("b"))), - Ok(TokenKind::Number(1.0)), + Ok(TokenKind::Integer(1)), Ok(TokenKind::CloseBrace), ]); + test!(number: "0 -1 20.0 +0.003", [ + Ok(TokenKind::Integer(0)), + Ok(TokenKind::Integer(-1)), + Ok(TokenKind::Float(20.0)), + Ok(TokenKind::Float(0.003)), + ]); + + test!(number_non_base_10: "0b0011 0o70 0xFF", [ + Ok(TokenKind::Integer(3)), + Ok(TokenKind::Integer(56)), + Ok(TokenKind::Integer(255)), + ]); + + test!(error_parse_number: "1.1.1 0.x", [ + Err(LexerErrorKind::InvalidNumber("1.1.1".into())), + Err(LexerErrorKind::InvalidNumber("0.x".into())), + ]); + test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [ Ok(TokenKind::Char('x')), Ok(TokenKind::Char('\n')), @@ -400,26 +460,15 @@ mod tests { Err(LexerErrorKind::UnclosedChar), ]); - test!(number: "0 -1 20.0 +0.003", [ - Ok(TokenKind::Number(0.0)), - Ok(TokenKind::Number(-1.0)), - Ok(TokenKind::Number(20.0)), - Ok(TokenKind::Number(0.003)), - ]); - - test!(error_parse_number: "1.1.1 0.x", [ - Err(LexerErrorKind::InvalidNumber("1.1.1".into())), - Err(LexerErrorKind::InvalidNumber("0.x".into())), - ]); - - test!(string: "\"\" \"xyz\" \"This is a string!\"", [ + test!(string: "\"\" \"xyz\" \"This is a string!\" \"凄い😍\"", [ Ok(TokenKind::String("".into())), Ok(TokenKind::String("xyz".into())), Ok(TokenKind::String("This is a string!".into())), + Ok(TokenKind::String("凄い😍".into())), ]); - test!(string_with_escapes: "\"\\e[0mfoo\\nbar\\r\\t\"", [ - Ok(TokenKind::String("\x1b[0mfoo\nbar\r\t".into())), + test!(string_with_escapes: "\"\\e[0mfoo\\\"\\nbar\\r\\t\"", [ + Ok(TokenKind::String("\x1b[0mfoo\"\nbar\r\t".into())), ]); test!(error_invalid_escape_string: "\"oh no \\p\"", [ @@ -439,14 +488,22 @@ mod tests { Ok(TokenKind::Symbol(Symbol::from("+"))), Ok(TokenKind::OpenParen), Ok(TokenKind::Symbol(Symbol::from("-"))), - Ok(TokenKind::Number(0.0)), - Ok(TokenKind::Number(-1.0)), + Ok(TokenKind::Integer(0)), + Ok(TokenKind::Integer(-1)), Ok(TokenKind::CloseParen), Ok(TokenKind::OpenParen), Ok(TokenKind::Symbol(Symbol::from("*"))), - Ok(TokenKind::Number(2.0)), - Ok(TokenKind::Number(3.0)), + Ok(TokenKind::Integer(2)), + Ok(TokenKind::Integer(3)), Ok(TokenKind::CloseParen), Ok(TokenKind::CloseParen), ]); + + test!(unicode_symbol: "(かわいい 🐕 😻)", [ + Ok(TokenKind::OpenParen), + Ok(TokenKind::Symbol(Symbol::from("かわいい"))), + Ok(TokenKind::Symbol(Symbol::from("🐕"))), + Ok(TokenKind::Symbol(Symbol::from("😻"))), + Ok(TokenKind::CloseParen), + ]); } diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs index 1263c14..62c8a54 100644 --- a/onihime/src/lexer/token.rs +++ b/onihime/src/lexer/token.rs @@ -28,8 +28,10 @@ pub enum TokenKind { Bool(bool), /// Character, e.g. `'c'`, `'\n'` Char(char), - /// Number, e.g. `1`, `2.0`, `0.003` - Number(f64), + /// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003` + Float(f64), + /// Integer, e.g. `0`, `-1`, `+200` + Integer(i64), /// String, e.g. `"foo bar"` String(String), /// Keyword, e.g. `:baz`