Remove currently unhandled tokens, improvements to lexer module

This commit is contained in:
Jesse Braham 2025-01-26 11:29:04 +01:00
parent 4a5b7321e8
commit 7add446d14
3 changed files with 164 additions and 148 deletions

View File

@ -1,38 +1,74 @@
/// Errors during lexical analysis. use std::fmt;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerError { /// Kinds of errors which can occur during lexical analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexerErrorKind {
/// An invalid character literal was encountered. /// An invalid character literal was encountered.
InvalidChar, InvalidChar,
/// An invalid keyword literal was encountered. /// An invalid keyword was encountered.
InvalidKeyword, InvalidKeyword,
/// An invalid number literal was encountered. /// An invalid number literal was encountered.
InvalidNumber, InvalidNumber,
/// An invalid token was encountered. /// An invalid symbol was encountered.
InvalidToken, InvalidSymbol,
/// An unclosed string was encountered. /// An unclosed string literal was encountered.
UnclosedString, UnclosedString,
/// Invalid UTF-8 sequence was encountered.
Utf8Error(std::str::Utf8Error),
} }
impl From<std::str::Utf8Error> for LexerError { #[cfg(not(tarpaulin_include))]
fn from(err: std::str::Utf8Error) -> Self { impl fmt::Display for LexerErrorKind {
Self::Utf8Error(err) fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use LexerErrorKind::*;
match self {
InvalidChar => write!(f, "Invalid character literal"),
InvalidKeyword => write!(f, "Invalid keyword"),
InvalidNumber => write!(f, "Invalid number literal"),
InvalidSymbol => write!(f, "Invalid symbol"),
UnclosedString => write!(f, "Unclosed string literal"),
}
}
}
/// Errors which occur during lexical analysis.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LexerError {
/// The kind of lexer error.
pub kind: LexerErrorKind,
/// Additional context regarding the lexer error.
pub context: Option<String>,
}
impl LexerError {
/// Construct a new instance of a lexer error.
#[must_use]
pub const fn new(kind: LexerErrorKind) -> Self {
Self {
kind,
context: None,
}
}
/// Provide additional context for a lexer error.
#[must_use]
pub fn with_context<C>(mut self, f: impl FnOnce() -> C) -> Self
where
C: fmt::Display,
{
self.context = Some(f().to_string());
self
} }
} }
impl std::error::Error for LexerError {} impl std::error::Error for LexerError {}
#[cfg(not(tarpaulin_include))] #[cfg(not(tarpaulin_include))]
impl std::fmt::Display for LexerError { impl fmt::Display for LexerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self { if let Some(ref context) = self.context {
LexerError::InvalidChar => write!(f, "An invalid character literal was encountered"), write!(f, "{}: {}", self.kind, context)
LexerError::InvalidKeyword => write!(f, "An invalid keyword literal was encountered"), } else {
LexerError::InvalidNumber => write!(f, "An invalid number literal was encountered"), write!(f, "{}", self.kind)
LexerError::InvalidToken => write!(f, "An invalid token was encountered"),
LexerError::UnclosedString => write!(f, "An unclosed string was encountered"),
LexerError::Utf8Error(err) => write!(f, "{err}"),
} }
} }
} }

View File

@ -9,14 +9,14 @@
use std::{ use std::{
iter::Peekable, iter::Peekable,
ops::Range,
str::{self, Chars}, str::{self, Chars},
}; };
pub use self::{ pub use self::{
error::LexerError, error::{LexerError, LexerErrorKind},
token::{Token, TokenKind}, token::{Token, TokenKind},
}; };
use crate::Span;
mod error; mod error;
mod token; mod token;
@ -31,7 +31,7 @@ pub struct Lexer<'a> {
} }
impl<'a> Lexer<'a> { impl<'a> Lexer<'a> {
/// Construct a new lexer instance. /// Construct a new instance of a lexer.
#[must_use] #[must_use]
pub fn new(source: &'a str) -> Self { pub fn new(source: &'a str) -> Self {
Self { Self {
@ -52,8 +52,8 @@ impl<'a> Lexer<'a> {
/// Return the span of the current [Token]. /// Return the span of the current [Token].
#[inline] #[inline]
#[must_use] #[must_use]
pub fn span(&self) -> Range<usize> { pub fn span(&self) -> Span {
self.token_start..self.cursor Span::new(self.token_start, self.cursor)
} }
/// Return the next [Token] in the input stream. /// Return the next [Token] in the input stream.
@ -62,7 +62,7 @@ impl<'a> Lexer<'a> {
self.token_start = self.cursor; self.token_start = self.cursor;
let Some(c) = self.advance() else { let Some(c) = self.advance() else {
return Ok(None); // EOF reached return Ok(None); // EOF reached, no input left to tokenize
}; };
let kind = match c { let kind = match c {
@ -78,36 +78,17 @@ impl<'a> Lexer<'a> {
'{' => TokenKind::OpenBrace, '{' => TokenKind::OpenBrace,
'}' => TokenKind::CloseBrace, '}' => TokenKind::CloseBrace,
// Dispatch:
'#' => match self.advance() {
Some('{') => TokenKind::OpenHashBrace,
Some('_') => TokenKind::Discard,
_ => {
self.read_word(); // Recover
return Err(LexerError::InvalidToken);
}
},
// Macros:
'\'' => TokenKind::Quote,
'`' => TokenKind::BackQuote,
',' if self.peek().is_some_and(|c| *c == '@') => {
self.advance(); // '@'
TokenKind::CommaAt
}
',' => TokenKind::Comma,
// Literals: // Literals:
'\\' => self.read_char()?, '\\' => self.read_char()?,
':' => self.read_keyword()?, ':' => self.read_keyword()?,
'0'..='9' => self.read_number(c)?, '0'..='9' => self.read_number(c)?,
'+' | '-' if self.peek().is_some_and(|c| c.is_ascii_digit()) => self.read_number(c)?, '+' | '-' if self.peek().is_some_and(|c| c.is_ascii_digit()) => self.read_number(c)?,
'"' => self.read_string()?, '"' => self.read_string()?,
_ if is_symbol_prefix(c) => { _ if is_symbol_prefix(&c) => {
self.read_word(); self.read_word();
match str::from_utf8(self.slice())? { match self.slice() {
"true" | "false" => TokenKind::Bool, b"true" | b"false" => TokenKind::Bool,
"nil" => TokenKind::Nil, b"nil" => TokenKind::Nil,
_ => TokenKind::Symbol, _ => TokenKind::Symbol,
} }
} }
@ -115,11 +96,11 @@ impl<'a> Lexer<'a> {
// Invalid tokens: // Invalid tokens:
_ => { _ => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidToken); return Err(LexerError::new(LexerErrorKind::InvalidSymbol));
} }
}; };
Ok(Some(Token::new(kind))) Ok(Some(Token::new(kind, self.span())))
} }
#[inline] #[inline]
@ -140,12 +121,12 @@ impl<'a> Lexer<'a> {
} }
fn read_word(&mut self) { fn read_word(&mut self) {
self.take_while(|c| !is_separator(*c)); self.take_while(|c| !is_separator(c));
} }
fn read_comment(&mut self) -> TokenKind { fn read_comment(&mut self) -> TokenKind {
self.take_while(|c| *c != '\n'); self.take_while(|c| *c != '\n');
TokenKind::LineComment TokenKind::Comment
} }
fn read_whitespace(&mut self) -> TokenKind { fn read_whitespace(&mut self) -> TokenKind {
@ -155,16 +136,19 @@ impl<'a> Lexer<'a> {
fn read_char(&mut self) -> Result<TokenKind, LexerError> { fn read_char(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial '\' when this function is invoked // NOTE: We have already consumed the initial '\' when this function is invoked
let c = if self.peek().is_some_and(|c| !is_separator(*c)) {
let c = if self.peek().is_some_and(|c| !is_separator(c)) {
self.advance().unwrap() // SAFETY: This will never panic self.advance().unwrap() // SAFETY: This will never panic
} else { } else {
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
}; };
match c { match c {
'u' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_unicode_escape(), 'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
'x' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_ascii_escape(), 'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
_ if self.peek().is_some_and(|c| !is_separator(*c)) => Err(LexerError::InvalidChar), _ if self.peek().is_some_and(|c| !is_separator(c)) => {
Err(LexerError::new(LexerErrorKind::InvalidChar))
}
_ => Ok(TokenKind::Char), _ => Ok(TokenKind::Char),
} }
} }
@ -177,7 +161,7 @@ impl<'a> Lexer<'a> {
self.advance(); self.advance();
} else { } else {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
} }
// Expect a single hexadecimal digit: // Expect a single hexadecimal digit:
@ -185,14 +169,14 @@ impl<'a> Lexer<'a> {
self.advance(); self.advance();
} else { } else {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
} }
// We should be at the end of the literal now, i.e. next char should be a // We should be at the end of the literal now, i.e. next char should be a
// separator: // separator:
if self.peek().is_some_and(|c| !is_separator(*c)) { if self.peek().is_some_and(|c| !is_separator(c)) {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
} }
Ok(TokenKind::Char) Ok(TokenKind::Char)
@ -203,21 +187,21 @@ impl<'a> Lexer<'a> {
// Expect between 1 and 6 hexadecimal digits: // Expect between 1 and 6 hexadecimal digits:
let mut count = 0; let mut count = 0;
while self.peek().is_some_and(|c| !is_separator(*c)) && count < 6 { while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
match self.advance() { match self.advance() {
Some(c) if c.is_ascii_hexdigit() => count += 1, Some(c) if c.is_ascii_hexdigit() => count += 1,
_ => { _ => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
} }
}; };
} }
// If no hexadecimal digits were found, or digits were found but we are still // If no hexadecimal digits were found, or digits were found but we are still
// not at the end of the literal, then the literal is invalid: // not at the end of the literal, then the literal is invalid:
if count == 0 || self.peek().is_some_and(|c| !is_separator(*c)) { if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidChar); return Err(LexerError::new(LexerErrorKind::InvalidChar));
} }
Ok(TokenKind::Char) Ok(TokenKind::Char)
@ -225,12 +209,13 @@ impl<'a> Lexer<'a> {
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> { fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial ':' when this function is invoked // NOTE: We have already consumed the initial ':' when this function is invoked
if self.peek().is_some_and(|c| is_symbol_prefix(*c)) {
if self.peek().is_some_and(|c| !is_separator(c)) {
self.read_word(); self.read_word();
Ok(TokenKind::Keyword) Ok(TokenKind::Keyword)
} else { } else {
self.read_word(); // Recover self.read_word(); // Recover
Err(LexerError::InvalidKeyword) Err(LexerError::new(LexerErrorKind::InvalidKeyword))
} }
} }
@ -244,14 +229,14 @@ impl<'a> Lexer<'a> {
} }
} }
while self.peek().is_some_and(|c| !is_separator(*c)) { while self.peek().is_some_and(|c| !is_separator(c)) {
match self.advance() { match self.advance() {
Some(c) if c.is_ascii_digit() => {} Some(c) if c.is_ascii_digit() => {}
Some('.') => return self.complete_decimal(), Some('.') => return self.complete_decimal(),
Some('/') => return self.complete_ratio(), Some('/') => return self.complete_ratio(),
_ => { _ => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
} }
} }
@ -261,26 +246,27 @@ impl<'a> Lexer<'a> {
fn read_number_radix(&mut self, radix: u32) -> Result<TokenKind, LexerError> { fn read_number_radix(&mut self, radix: u32) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial '0' when this function is invoked // NOTE: We have already consumed the initial '0' when this function is invoked
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X') self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
let mut digit_found = false; let mut digit_found = false;
while let Some(c) = self.peek() { while let Some(c) = self.peek() {
match c { match c {
_ if is_separator(*c) => break, _ if is_separator(c) => break,
_ if c.is_digit(radix) => { _ if c.is_digit(radix) => {
digit_found = true;
self.advance(); self.advance();
digit_found = true;
} }
_ => { _ => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
}; };
} }
if !digit_found { if !digit_found {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
Ok(TokenKind::Integer) Ok(TokenKind::Integer)
@ -289,18 +275,19 @@ impl<'a> Lexer<'a> {
fn complete_decimal(&mut self) -> Result<TokenKind, LexerError> { fn complete_decimal(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the leading digits and '.' when this function // NOTE: We have already consumed the leading digits and '.' when this function
// is invoked // is invoked
let mut digit_found = false; let mut digit_found = false;
let mut exp_found = false; let mut exp_found = false;
let mut sign_found = false; let mut sign_found = false;
while self.peek().is_some_and(|c| !is_separator(*c)) { while self.peek().is_some_and(|c| !is_separator(c)) {
match self.advance() { match self.advance() {
Some(c) if c.is_ascii_digit() => digit_found = true, Some(c) if c.is_ascii_digit() => digit_found = true,
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true, Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
Some('+') | Some('-') if exp_found && !sign_found => sign_found = true, Some('+') | Some('-') if exp_found && !sign_found => sign_found = true,
Some(_) => { Some(_) => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
None => unreachable!(), None => unreachable!(),
}; };
@ -312,16 +299,17 @@ impl<'a> Lexer<'a> {
fn complete_ratio(&mut self) -> Result<TokenKind, LexerError> { fn complete_ratio(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the leading digits and '/' when this function // NOTE: We have already consumed the leading digits and '/' when this function
// is invoked // is invoked
let mut sign_found = false; let mut sign_found = false;
let mut digit_found = false; let mut digit_found = false;
while self.peek().is_some_and(|c| !is_separator(*c)) { while self.peek().is_some_and(|c| !is_separator(c)) {
match self.advance() { match self.advance() {
Some(c) if c.is_ascii_digit() => digit_found = true, Some(c) if c.is_ascii_digit() => digit_found = true,
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true, Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
Some(_) => { Some(_) => {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
None => unreachable!(), None => unreachable!(),
}; };
@ -329,7 +317,7 @@ impl<'a> Lexer<'a> {
if !digit_found { if !digit_found {
self.read_word(); // Recover self.read_word(); // Recover
return Err(LexerError::InvalidNumber); return Err(LexerError::new(LexerErrorKind::InvalidNumber));
} }
Ok(TokenKind::Ratio) Ok(TokenKind::Ratio)
@ -337,6 +325,7 @@ impl<'a> Lexer<'a> {
fn read_string(&mut self) -> Result<TokenKind, LexerError> { fn read_string(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial '"' when this function is invoked // NOTE: We have already consumed the initial '"' when this function is invoked
loop { loop {
match self.advance() { match self.advance() {
Some('"') => break, Some('"') => break,
@ -344,7 +333,7 @@ impl<'a> Lexer<'a> {
self.advance(); // '"' self.advance(); // '"'
} }
Some(_) => {} Some(_) => {}
None => return Err(LexerError::UnclosedString), None => return Err(LexerError::new(LexerErrorKind::UnclosedString)),
} }
} }
@ -361,12 +350,12 @@ impl Iterator for Lexer<'_> {
} }
#[inline] #[inline]
fn is_separator(c: char) -> bool { fn is_separator(c: &char) -> bool {
c.is_ascii_whitespace() | matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | ';') c.is_ascii_whitespace() | matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | ';')
} }
#[inline] #[inline]
fn is_symbol_prefix(c: char) -> bool { fn is_symbol_prefix(c: &char) -> bool {
c.is_alphabetic() c.is_alphabetic()
| matches!( | matches!(
c, c,
@ -382,7 +371,7 @@ mod tests {
fn empty() { fn empty() {
let mut lexer = Lexer::new(""); let mut lexer = Lexer::new("");
assert_eq!(lexer.next(), None); assert_eq!(lexer.next(), None);
assert_eq!(lexer.span(), 0..0); assert_eq!(lexer.span(), Span::default());
assert_eq!(lexer.slice(), &[]); assert_eq!(lexer.slice(), &[]);
} }
@ -397,7 +386,7 @@ mod tests {
Err(e) => Err(e), Err(e) => Err(e),
}); });
assert_eq!(kind, Some(token)); assert_eq!(kind, Some(token));
assert_eq!(span, lexer.span()); assert_eq!(span, lexer.span().into());
assert_eq!(slice.as_bytes(), lexer.slice()); assert_eq!(slice.as_bytes(), lexer.slice());
} }
assert_eq!(lexer.next(), None); assert_eq!(lexer.next(), None);
@ -406,11 +395,11 @@ mod tests {
} }
test!(line_comment: ";; foobar\nnil ; bar; baz" => [ test!(line_comment: ";; foobar\nnil ; bar; baz" => [
(Ok(TokenKind::LineComment), 0..9, ";; foobar"), (Ok(TokenKind::Comment), 0..9, ";; foobar"),
(Ok(TokenKind::Whitespace), 9..10, "\n"), (Ok(TokenKind::Whitespace), 9..10, "\n"),
(Ok(TokenKind::Nil), 10..13, "nil"), (Ok(TokenKind::Nil), 10..13, "nil"),
(Ok(TokenKind::Whitespace), 13..14, " "), (Ok(TokenKind::Whitespace), 13..14, " "),
(Ok(TokenKind::LineComment), 14..24, "; bar; baz"), (Ok(TokenKind::Comment), 14..24, "; bar; baz"),
]); ]);
test!(list: "(0 1.2 -3/4 +5.6e-7)" => [ test!(list: "(0 1.2 -3/4 +5.6e-7)" => [
@ -449,33 +438,21 @@ mod tests {
(Ok(TokenKind::CloseBracket), 11..12, "]"), (Ok(TokenKind::CloseBracket), 11..12, "]"),
]); ]);
test!(dispatch: "#{} #_() #_ 4" => [ test!(keyword: ":m :0 :this-is-an-keyword-too! :😻" => [
(Ok(TokenKind::OpenHashBrace), 0..2, "#{"),
(Ok(TokenKind::CloseBrace), 2..3, "}"),
(Ok(TokenKind::Whitespace), 3..4, " "),
(Ok(TokenKind::Discard), 4..6, "#_"),
(Ok(TokenKind::OpenParen), 6..7, "("),
(Ok(TokenKind::CloseParen), 7..8, ")"),
(Ok(TokenKind::Whitespace), 8..9, " "),
(Ok(TokenKind::Discard), 9..11, "#_"),
(Ok(TokenKind::Whitespace), 11..12, " "),
(Ok(TokenKind::Integer), 12..13, "4"),
]);
test!(err_invalid_dispatch: "#@" => [
(Err(LexerError::InvalidToken), 0..2, "#@"),
]);
test!(keyword: ":m :x0 :this-is-an-keyword-too!" => [
(Ok(TokenKind::Keyword), 0..2, ":m"), (Ok(TokenKind::Keyword), 0..2, ":m"),
(Ok(TokenKind::Whitespace), 2..3, " "), (Ok(TokenKind::Whitespace), 2..3, " "),
(Ok(TokenKind::Keyword), 3..6, ":x0"), (Ok(TokenKind::Keyword), 3..5, ":0"),
(Ok(TokenKind::Whitespace), 6..7, " "), (Ok(TokenKind::Whitespace), 5..6, " "),
(Ok(TokenKind::Keyword), 7..31, ":this-is-an-keyword-too!"), (Ok(TokenKind::Keyword), 6..30, ":this-is-an-keyword-too!"),
(Ok(TokenKind::Whitespace), 30..31, " "),
(Ok(TokenKind::Keyword), 31..36, ":😻"),
]); ]);
test!(err_invalid_keyword: ":0" => [ test!(err_invalid_keyword: ": :;" => [
(Err(LexerError::InvalidKeyword), 0..2, ":0"), (Err(LexerError::new(LexerErrorKind::InvalidKeyword)), 0..1, ":"),
(Ok(TokenKind::Whitespace), 1..2, " "),
(Err(LexerError::new(LexerErrorKind::InvalidKeyword)), 2..3, ":"),
(Ok(TokenKind::Comment), 3..4, ";"),
]); ]);
test!(char: r"\a \? \7 \λ \\ \u \x" => [ test!(char: r"\a \? \7 \λ \\ \u \x" => [
@ -495,47 +472,47 @@ mod tests {
]); ]);
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [ test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
(Err(LexerError::InvalidChar), 0..1, r"\"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 0..1, r"\"),
(Ok(TokenKind::Whitespace), 1..2, " "), (Ok(TokenKind::Whitespace), 1..2, " "),
(Err(LexerError::InvalidChar), 2..5, r"\xF"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 2..5, r"\xF"),
(Ok(TokenKind::Whitespace), 5..6, " "), (Ok(TokenKind::Whitespace), 5..6, " "),
(Err(LexerError::InvalidChar), 6..9, r"\x0"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 6..9, r"\x0"),
(Ok(TokenKind::Whitespace), 9..10, " "), (Ok(TokenKind::Whitespace), 9..10, " "),
(Err(LexerError::InvalidChar), 10..15, r"\x111"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 10..15, r"\x111"),
(Ok(TokenKind::Whitespace), 15..16, " "), (Ok(TokenKind::Whitespace), 15..16, " "),
(Err(LexerError::InvalidChar), 16..19, r"\uG"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 16..19, r"\uG"),
(Ok(TokenKind::Whitespace), 19..20, " "), (Ok(TokenKind::Whitespace), 19..20, " "),
(Err(LexerError::InvalidChar), 20..29, r"\u2222222"), (Err(LexerError::new(LexerErrorKind::InvalidChar)), 20..29, r"\u2222222"),
]); ]);
test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [ test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [
(Err(LexerError::InvalidNumber), 0..5, "0b012"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 0..5, "0b012"),
(Ok(TokenKind::Whitespace), 5..6, " "), (Ok(TokenKind::Whitespace), 5..6, " "),
(Err(LexerError::InvalidNumber), 6..9, "0o8"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 6..9, "0o8"),
(Ok(TokenKind::Whitespace), 9..10, " "), (Ok(TokenKind::Whitespace), 9..10, " "),
(Err(LexerError::InvalidNumber), 10..14, "0xFG"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 10..14, "0xFG"),
(Ok(TokenKind::Whitespace), 14..15, " "), (Ok(TokenKind::Whitespace), 14..15, " "),
(Err(LexerError::InvalidNumber), 15..17, "1N"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 15..17, "1N"),
(Ok(TokenKind::Whitespace), 17..18, " "), (Ok(TokenKind::Whitespace), 17..18, " "),
(Err(LexerError::InvalidNumber), 18..20, "0x"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 18..20, "0x"),
]); ]);
test!(err_invalid_decimal: "1.2.3 4.e6 7.8+ 9.0+e1" => [ test!(err_invalid_decimal: "1.2.3 4.e6 7.8+ 9.0+e1" => [
(Err(LexerError::InvalidNumber), 0..5, "1.2.3"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 0..5, "1.2.3"),
(Ok(TokenKind::Whitespace), 5..6, " "), (Ok(TokenKind::Whitespace), 5..6, " "),
(Err(LexerError::InvalidNumber), 6..10, "4.e6"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 6..10, "4.e6"),
(Ok(TokenKind::Whitespace), 10..11, " "), (Ok(TokenKind::Whitespace), 10..11, " "),
(Err(LexerError::InvalidNumber), 11..15, "7.8+"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 11..15, "7.8+"),
(Ok(TokenKind::Whitespace), 15..16, " "), (Ok(TokenKind::Whitespace), 15..16, " "),
(Err(LexerError::InvalidNumber), 16..22, "9.0+e1"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 16..22, "9.0+e1"),
]); ]);
test!(err_invalid_ratio: "1/ -2/3+ 4/-" => [ test!(err_invalid_ratio: "1/ -2/3+ 4/-" => [
(Err(LexerError::InvalidNumber), 0..2, "1/"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 0..2, "1/"),
(Ok(TokenKind::Whitespace), 2..3, " "), (Ok(TokenKind::Whitespace), 2..3, " "),
(Err(LexerError::InvalidNumber), 3..8, "-2/3+"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 3..8, "-2/3+"),
(Ok(TokenKind::Whitespace), 8..9, " "), (Ok(TokenKind::Whitespace), 8..9, " "),
(Err(LexerError::InvalidNumber), 9..12, "4/-"), (Err(LexerError::new(LexerErrorKind::InvalidNumber)), 9..12, "4/-"),
]); ]);
test!(string: "\"föö bar1\nbaz\" \"\" \"凄い 😍\"" => [ test!(string: "\"föö bar1\nbaz\" \"\" \"凄い 😍\"" => [
@ -547,7 +524,7 @@ mod tests {
]); ]);
test!(err_unclosed_string: "\"oops" => [ test!(err_unclosed_string: "\"oops" => [
(Err(LexerError::UnclosedString), 0..5, "\"oops"), (Err(LexerError::new(LexerErrorKind::UnclosedString)), 0..5, "\"oops"),
]); ]);
test!(symbol: "+ rev fold0 nil? x str-cat 猫" => [ test!(symbol: "+ rev fold0 nil? x str-cat 猫" => [
@ -572,9 +549,9 @@ mod tests {
#[test] #[test]
fn $name(x in $input) { fn $name(x in $input) {
let mut lexer = Lexer::new(&x); let mut lexer = Lexer::new(&x);
assert_eq!(lexer.next(), Some(Ok(Token { kind: TokenKind::$kind }))); assert_eq!(lexer.next(), Some(Ok(Token::new(TokenKind::$kind, lexer.span()))));
assert_eq!(lexer.slice(), x.as_bytes()); assert_eq!(lexer.slice(), x.as_bytes());
assert_eq!(lexer.span(), 0..x.len()); assert_eq!(lexer.span(), Span::new(0, x.len()));
} }
} }
}; };

View File

@ -1,12 +1,12 @@
use crate::Span;
/// Kinds of tokens which are valid in Onihime source code. /// Kinds of tokens which are valid in Onihime source code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind { pub enum TokenKind {
/// Line comment, e.g. `; ...`
Comment,
/// Whitespace, e.g. ' ', '\t', '\n' /// Whitespace, e.g. ' ', '\t', '\n'
Whitespace, Whitespace,
/// Line comment, e.g. `; ...`
LineComment,
/// Discard, e.g. `#_ 4`, `#_( ... )`
Discard,
/// Opening parenthesis, e.g. `(` /// Opening parenthesis, e.g. `(`
OpenParen, OpenParen,
@ -20,8 +20,6 @@ pub enum TokenKind {
OpenBracket, OpenBracket,
/// Closing bracket, e.g. `]` /// Closing bracket, e.g. `]`
CloseBracket, CloseBracket,
/// Opening hash-brace, e.g. `#{`
OpenHashBrace,
/// Boolean, e.g. `true`, `false` /// Boolean, e.g. `true`, `false`
Bool, Bool,
@ -41,15 +39,18 @@ pub enum TokenKind {
Symbol, Symbol,
/// Nil, e.g. `nil` /// Nil, e.g. `nil`
Nil, Nil,
}
/// Comma, e.g. `,` impl TokenKind {
Comma, /// Returns `true` if the token type is an atom.
/// Comma followed by at sign, e.g. `,@` pub fn is_atom(&self) -> bool {
CommaAt, use TokenKind::*;
/// Backtick quote, e.g. `` ` ``
BackQuote, matches!(
/// Single quote, e.g. `'` self,
Quote, Bool | Char | Keyword | Decimal | Integer | Ratio | String | Symbol | Nil
)
}
} }
/// A valid token found in Onihime source code. /// A valid token found in Onihime source code.
@ -57,12 +58,14 @@ pub enum TokenKind {
pub struct Token { pub struct Token {
/// Kind of token which was found. /// Kind of token which was found.
pub kind: TokenKind, pub kind: TokenKind,
/// The token's span.
pub span: Span,
} }
impl Token { impl Token {
/// Construct a new instance of `Token`. /// Construct a new instance of a token.
#[must_use] #[must_use]
pub const fn new(kind: TokenKind) -> Self { pub const fn new(kind: TokenKind, span: Span) -> Self {
Self { kind } Self { kind, span }
} }
} }