diff --git a/.gitignore b/.gitignore index 65b40a0..81245fb 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ Cargo.lock # Miscellaneous files .DS_Store +tarpaulin-report.html diff --git a/onihime/Cargo.toml b/onihime/Cargo.toml index 1775b96..ccccd85 100644 --- a/onihime/Cargo.toml +++ b/onihime/Cargo.toml @@ -6,3 +6,9 @@ edition.workspace = true homepage.workspace = true repository.workspace = true license.workspace = true + +[dev-dependencies] +proptest = "1.6.0" + +[lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] } diff --git a/onihime/src/lexer/error.rs b/onihime/src/lexer/error.rs new file mode 100644 index 0000000..ef6e3c7 --- /dev/null +++ b/onihime/src/lexer/error.rs @@ -0,0 +1,38 @@ +/// Errors during lexical analysis. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LexerError { + /// An invalid character literal was encountered. + InvalidChar, + /// An invalid keyword literal was encountered. + InvalidKeyword, + /// An invalid number literal was encountered. + InvalidNumber, + /// An invalid token was encountered. + InvalidToken, + /// An unclosed string was encountered. + UnclosedString, + /// Invalid UTF-8 sequence was encountered. 
+ Utf8Error(std::str::Utf8Error), +} + +impl From<std::str::Utf8Error> for LexerError { + fn from(err: std::str::Utf8Error) -> Self { + Self::Utf8Error(err) + } +} + +impl std::error::Error for LexerError {} + +#[cfg(not(tarpaulin_include))] +impl std::fmt::Display for LexerError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LexerError::InvalidChar => write!(f, "An invalid character literal was encountered"), + LexerError::InvalidKeyword => write!(f, "An invalid keyword literal was encountered"), + LexerError::InvalidNumber => write!(f, "An invalid number literal was encountered"), + LexerError::InvalidToken => write!(f, "An invalid token was encountered"), + LexerError::UnclosedString => write!(f, "An unclosed string was encountered"), + LexerError::Utf8Error(err) => write!(f, "{err}"), + } + } +} diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs new file mode 100644 index 0000000..78c589f --- /dev/null +++ b/onihime/src/lexer/mod.rs @@ -0,0 +1,594 @@ +//! Lexer for the Onihime programming language. +//! +//! The job of a lexer is to perform lexical analysis on some input text. In +//! other words, it converts character sequences into token sequences. For a +//! more in-depth explanation of this process, see the [Wikipedia page for +//! Lexical Analysis]. +//! +//! [Wikipedia page for Lexical Analysis]: https://en.wikipedia.org/wiki/Lexical_analysis + +use std::{ + iter::Peekable, + ops::Range, + str::{self, Chars}, +}; + +pub use self::{ + error::LexerError, + token::{Token, TokenKind}, +}; + +mod error; +mod token; + +/// Performs lexical analysis and produces tokens from an input string. +#[derive(Debug)] +pub struct Lexer<'a> { + bytes: &'a [u8], + chars: Peekable<Chars<'a>>, + token_start: usize, + cursor: usize, +} + +impl<'a> Lexer<'a> { + /// Construct a new lexer instance. 
+ #[must_use] + pub fn new(source: &'a str) -> Self { + Self { + bytes: source.as_bytes(), + chars: source.chars().peekable(), + token_start: 0, + cursor: 0, + } + } + + /// Return a byte slice containing the contents of the current [Token]. + #[inline] + #[must_use] + pub fn slice(&self) -> &'a [u8] { + &self.bytes[self.token_start..self.cursor] + } + + /// Return the span of the current [Token]. + #[inline] + #[must_use] + pub fn span(&self) -> Range<usize> { + self.token_start..self.cursor + } + + /// Return the next [Token] in the input stream. + #[inline] + pub fn next_token(&mut self) -> Result<Option<Token>, LexerError> { + self.token_start = self.cursor; + + let Some(c) = self.advance() else { + return Ok(None); // EOF reached + }; + + let kind = match c { + // Comments and whitespace: + ';' => self.read_comment(), + _ if c.is_ascii_whitespace() => self.read_whitespace(), + + // Sequence delimiters: + '(' => TokenKind::OpenParen, + ')' => TokenKind::CloseParen, + '[' => TokenKind::OpenBracket, + ']' => TokenKind::CloseBracket, + '{' => TokenKind::OpenBrace, + '}' => TokenKind::CloseBrace, + + // Dispatch: + '#' => match self.advance() { + Some('{') => TokenKind::OpenHashBrace, + Some('_') => TokenKind::Discard, + _ => { + self.read_word(); // Recover + return Err(LexerError::InvalidToken); + } + }, + + // Macros: + '\'' => TokenKind::Quote, + '`' => TokenKind::BackQuote, + ',' if self.peek().is_some_and(|c| *c == '@') => { + self.advance(); // '@' + TokenKind::CommaAt + } + ',' => TokenKind::Comma, + + // Literals: + '\\' => self.read_char()?, + ':' => self.read_keyword()?, + '0'..='9' => self.read_number(c)?, + '+' | '-' if self.peek().is_some_and(|c| c.is_ascii_digit()) => self.read_number(c)?, + '"' => self.read_string()?, + _ if is_symbol_prefix(c) => { + self.read_word(); + match str::from_utf8(self.slice())? 
{ + "true" | "false" => TokenKind::Bool, + "nil" => TokenKind::Nil, + _ => TokenKind::Symbol, + } + } + + // Invalid tokens: + _ => { + self.read_word(); // Recover + return Err(LexerError::InvalidToken); + } + }; + + Ok(Some(Token::new(kind))) + } + + #[inline] + fn peek(&mut self) -> Option<&char> { + self.chars.peek() + } + + fn advance(&mut self) -> Option<char> { + self.chars.next().inspect(|c| { + self.cursor += c.len_utf8(); + }) + } + + fn take_while(&mut self, predicate: impl Fn(&char) -> bool) { + while self.chars.peek().is_some_and(&predicate) { + self.advance(); + } + } + + fn read_word(&mut self) { + self.take_while(|c| !is_separator(*c)); + } + + fn read_comment(&mut self) -> TokenKind { + self.take_while(|c| *c != '\n'); + TokenKind::LineComment + } + + fn read_whitespace(&mut self) -> TokenKind { + self.take_while(|c| c.is_ascii_whitespace()); + TokenKind::Whitespace + } + + fn read_char(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial '\' when this function is invoked + let c = if self.peek().is_some_and(|c| !is_separator(*c)) { + self.advance().unwrap() // SAFETY: This will never panic + } else { + return Err(LexerError::InvalidChar); + }; + + match c { + 'u' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_unicode_escape(), + 'x' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_ascii_escape(), + _ if self.peek().is_some_and(|c| !is_separator(*c)) => Err(LexerError::InvalidChar), + _ => Ok(TokenKind::Char), + } + } + + fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial '\x' when this function is invoked + + // Expect a single octal digit: + if self.peek().is_some_and(|c| c.is_digit(8)) { + self.advance(); + } else { + self.read_word(); // Recover + return Err(LexerError::InvalidChar); + } + + // Expect a single hexadecimal digit: + if self.peek().is_some_and(|c| c.is_ascii_hexdigit()) { + self.advance(); + } else { + self.read_word(); // Recover + 
return Err(LexerError::InvalidChar); + } + + // We should be at the end of the literal now, i.e. next char should be a + // separator: + if self.peek().is_some_and(|c| !is_separator(*c)) { + self.read_word(); // Recover + return Err(LexerError::InvalidChar); + } + + Ok(TokenKind::Char) + } + + fn complete_unicode_escape(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial '\u' when this function is invoked + + // Expect between 1 and 6 hexadecimal digits: + let mut count = 0; + while self.peek().is_some_and(|c| !is_separator(*c)) && count < 6 { + match self.advance() { + Some(c) if c.is_ascii_hexdigit() => count += 1, + _ => { + self.read_word(); // Recover + return Err(LexerError::InvalidChar); + } + }; + } + + // If no hexadecimal digits were found, or digits were found but we are still + // not at the end of the literal, then the literal is invalid: + if count == 0 || self.peek().is_some_and(|c| !is_separator(*c)) { + self.read_word(); // Recover + return Err(LexerError::InvalidChar); + } + + Ok(TokenKind::Char) + } + + fn read_keyword(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial ':' when this function is invoked + if self.peek().is_some_and(|c| is_symbol_prefix(*c)) { + self.read_word(); + Ok(TokenKind::Keyword) + } else { + self.read_word(); // Recover + Err(LexerError::InvalidKeyword) + } + } + + fn read_number(&mut self, first_char: char) -> Result<TokenKind, LexerError> { + if first_char == '0' { + match self.peek() { + Some('b') | Some('B') => return self.read_number_radix(2), + Some('o') | Some('O') => return self.read_number_radix(8), + Some('x') | Some('X') => return self.read_number_radix(16), + _ => {} + } + } + + while self.peek().is_some_and(|c| !is_separator(*c)) { + match self.advance() { + Some(c) if c.is_ascii_digit() => {} + Some('.') => return self.complete_decimal(), + Some('/') => return self.complete_ratio(), + _ => { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + } + } + + 
Ok(TokenKind::Integer) + } + + fn read_number_radix(&mut self, radix: u32) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial '0' when this function is invoked + self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X') + + let mut digit_found = false; + while let Some(c) = self.peek() { + match c { + _ if is_separator(*c) => break, + _ if c.is_digit(radix) => { + digit_found = true; + self.advance(); + } + _ => { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + }; + } + + if !digit_found { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + + Ok(TokenKind::Integer) + } + + fn complete_decimal(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the leading digits and '.' when this function + // is invoked + let mut digit_found = false; + let mut exp_found = false; + let mut sign_found = false; + + while self.peek().is_some_and(|c| !is_separator(*c)) { + match self.advance() { + Some(c) if c.is_ascii_digit() => digit_found = true, + Some('e') | Some('E') if digit_found && !exp_found => exp_found = true, + Some('+') | Some('-') if exp_found && !sign_found => sign_found = true, + Some(_) => { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + None => unreachable!(), + }; + } + + Ok(TokenKind::Decimal) + } + + fn complete_ratio(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the leading digits and '/' when this function + // is invoked + let mut sign_found = false; + let mut digit_found = false; + + while self.peek().is_some_and(|c| !is_separator(*c)) { + match self.advance() { + Some(c) if c.is_ascii_digit() => digit_found = true, + Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true, + Some(_) => { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + None => unreachable!(), + }; + } + + if !digit_found { + self.read_word(); // Recover + return Err(LexerError::InvalidNumber); + } + + Ok(TokenKind::Ratio) + } 
+ + fn read_string(&mut self) -> Result<TokenKind, LexerError> { + // NOTE: We have already consumed the initial '"' when this function is invoked + loop { + match self.advance() { + Some('"') => break, + Some(c) if c == '\\' && self.peek().is_some_and(|c| *c == '"') => { + self.advance(); // '"' + } + Some(_) => {} + None => return Err(LexerError::UnclosedString), + } + } + + Ok(TokenKind::String) + } +} + +impl Iterator for Lexer<'_> { + type Item = Result<Token, LexerError>; + + fn next(&mut self) -> Option<Self::Item> { + self.next_token().transpose() + } +} + +#[inline] +fn is_separator(c: char) -> bool { + c.is_ascii_whitespace() | matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | ';') +} + +#[inline] +fn is_symbol_prefix(c: char) -> bool { + c.is_alphabetic() + | matches!( + c, + '~' | '!' | '$' | '%' | '^' | '&' | '*' | '-' | '_' | '+' | '=' | '<' | '>' | '/' | '?' + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty() { + let mut lexer = Lexer::new(""); + assert_eq!(lexer.next(), None); + assert_eq!(lexer.span(), 0..0); + assert_eq!(lexer.slice(), &[]); + } + + macro_rules! 
test { + ( $name:ident: $input:literal => $expected:expr ) => { + #[test] + fn $name() { + let mut lexer = Lexer::new($input); + for (token, span, slice) in $expected { + let kind = lexer.next().map(|r| match r { + Ok(t) => Ok(t.kind), + Err(e) => Err(e), + }); + assert_eq!(kind, Some(token)); + assert_eq!(span, lexer.span()); + assert_eq!(slice.as_bytes(), lexer.slice()); + } + assert_eq!(lexer.next(), None); + } + }; + } + + test!(line_comment: ";; foobar\nnil ; bar; baz" => [ + (Ok(TokenKind::LineComment), 0..9, ";; foobar"), + (Ok(TokenKind::Whitespace), 9..10, "\n"), + (Ok(TokenKind::Nil), 10..13, "nil"), + (Ok(TokenKind::Whitespace), 13..14, " "), + (Ok(TokenKind::LineComment), 14..24, "; bar; baz"), + ]); + + test!(list: "(0 1.2 -3/4 +5.6e-7)" => [ + (Ok(TokenKind::OpenParen), 0..1, "("), + (Ok(TokenKind::Integer), 1..2, "0"), + (Ok(TokenKind::Whitespace), 2..3, " "), + (Ok(TokenKind::Decimal), 3..6, "1.2"), + (Ok(TokenKind::Whitespace), 6..7, " "), + (Ok(TokenKind::Ratio), 7..11, "-3/4"), + (Ok(TokenKind::Whitespace), 11..12, " "), + (Ok(TokenKind::Decimal), 12..19, "+5.6e-7"), + (Ok(TokenKind::CloseParen), 19..20, ")"), + ]); + + test!(map: r"{:a \a :b {:c nil}}" => [ + (Ok(TokenKind::OpenBrace), 0..1, "{"), + (Ok(TokenKind::Keyword), 1..3, ":a"), + (Ok(TokenKind::Whitespace), 3..4, " "), + (Ok(TokenKind::Char), 4..6, "\\a"), + (Ok(TokenKind::Whitespace), 6..7, " "), + (Ok(TokenKind::Keyword), 7..9, ":b"), + (Ok(TokenKind::Whitespace), 9..10, " "), + (Ok(TokenKind::OpenBrace), 10..11, "{"), + (Ok(TokenKind::Keyword), 11..13, ":c"), + (Ok(TokenKind::Whitespace), 13..14, " "), + (Ok(TokenKind::Nil), 14..17, "nil"), + (Ok(TokenKind::CloseBrace), 17..18, "}"), + (Ok(TokenKind::CloseBrace), 18..19, "}"), + ]); + + test!(vector: "[true false]" => [ + (Ok(TokenKind::OpenBracket), 0..1, "["), + (Ok(TokenKind::Bool), 1..5, "true"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Ok(TokenKind::Bool), 6..11, "false"), + (Ok(TokenKind::CloseBracket), 11..12, "]"), + ]); 
+ + test!(dispatch: "#{} #_() #_ 4" => [ + (Ok(TokenKind::OpenHashBrace), 0..2, "#{"), + (Ok(TokenKind::CloseBrace), 2..3, "}"), + (Ok(TokenKind::Whitespace), 3..4, " "), + (Ok(TokenKind::Discard), 4..6, "#_"), + (Ok(TokenKind::OpenParen), 6..7, "("), + (Ok(TokenKind::CloseParen), 7..8, ")"), + (Ok(TokenKind::Whitespace), 8..9, " "), + (Ok(TokenKind::Discard), 9..11, "#_"), + (Ok(TokenKind::Whitespace), 11..12, " "), + (Ok(TokenKind::Integer), 12..13, "4"), + ]); + + test!(err_invalid_dispatch: "#@" => [ + (Err(LexerError::InvalidToken), 0..2, "#@"), + ]); + + test!(keyword: ":m :x0 :this-is-an-keyword-too!" => [ + (Ok(TokenKind::Keyword), 0..2, ":m"), + (Ok(TokenKind::Whitespace), 2..3, " "), + (Ok(TokenKind::Keyword), 3..6, ":x0"), + (Ok(TokenKind::Whitespace), 6..7, " "), + (Ok(TokenKind::Keyword), 7..31, ":this-is-an-keyword-too!"), + ]); + + test!(err_invalid_keyword: ":0" => [ + (Err(LexerError::InvalidKeyword), 0..2, ":0"), + ]); + + test!(char: r"\a \? \7 \λ \\ \u \x" => [ + (Ok(TokenKind::Char), 0..2, r"\a"), + (Ok(TokenKind::Whitespace), 2..3, " "), + (Ok(TokenKind::Char), 3..5, r"\?"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Ok(TokenKind::Char), 6..8, r"\7"), + (Ok(TokenKind::Whitespace), 8..9, " "), + (Ok(TokenKind::Char), 9..12, r"\λ"), + (Ok(TokenKind::Whitespace), 12..13, " "), + (Ok(TokenKind::Char), 13..15, r"\\"), + (Ok(TokenKind::Whitespace), 15..16, " "), + (Ok(TokenKind::Char), 16..18, r"\u"), + (Ok(TokenKind::Whitespace), 18..19, " "), + (Ok(TokenKind::Char), 19..21, r"\x"), + ]); + + test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [ + (Err(LexerError::InvalidChar), 0..1, r"\"), + (Ok(TokenKind::Whitespace), 1..2, " "), + (Err(LexerError::InvalidChar), 2..5, r"\xF"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Err(LexerError::InvalidChar), 6..9, r"\x0"), + (Ok(TokenKind::Whitespace), 9..10, " "), + (Err(LexerError::InvalidChar), 10..15, r"\x111"), + (Ok(TokenKind::Whitespace), 15..16, " "), + (Err(LexerError::InvalidChar), 
16..19, r"\uG"), + (Ok(TokenKind::Whitespace), 19..20, " "), + (Err(LexerError::InvalidChar), 20..29, r"\u2222222"), + ]); + + test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [ + (Err(LexerError::InvalidNumber), 0..5, "0b012"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Err(LexerError::InvalidNumber), 6..9, "0o8"), + (Ok(TokenKind::Whitespace), 9..10, " "), + (Err(LexerError::InvalidNumber), 10..14, "0xFG"), + (Ok(TokenKind::Whitespace), 14..15, " "), + (Err(LexerError::InvalidNumber), 15..17, "1N"), + (Ok(TokenKind::Whitespace), 17..18, " "), + (Err(LexerError::InvalidNumber), 18..20, "0x"), + ]); + + test!(err_invalid_decimal: "1.2.3 4.e6 7.8+ 9.0+e1" => [ + (Err(LexerError::InvalidNumber), 0..5, "1.2.3"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Err(LexerError::InvalidNumber), 6..10, "4.e6"), + (Ok(TokenKind::Whitespace), 10..11, " "), + (Err(LexerError::InvalidNumber), 11..15, "7.8+"), + (Ok(TokenKind::Whitespace), 15..16, " "), + (Err(LexerError::InvalidNumber), 16..22, "9.0+e1"), + ]); + + test!(err_invalid_ratio: "1/ -2/3+ 4/-" => [ + (Err(LexerError::InvalidNumber), 0..2, "1/"), + (Ok(TokenKind::Whitespace), 2..3, " "), + (Err(LexerError::InvalidNumber), 3..8, "-2/3+"), + (Ok(TokenKind::Whitespace), 8..9, " "), + (Err(LexerError::InvalidNumber), 9..12, "4/-"), + ]); + + test!(string: "\"föö bar1\nbaz\" \"\" \"凄い 😍\"" => [ + (Ok(TokenKind::String), 0..16, "\"föö bar1\nbaz\""), + (Ok(TokenKind::Whitespace), 16..17, " "), + (Ok(TokenKind::String), 17..19, "\"\""), + (Ok(TokenKind::Whitespace), 19..20, " "), + (Ok(TokenKind::String), 20..33, "\"凄い 😍\""), + ]); + + test!(err_unclosed_string: "\"oops" => [ + (Err(LexerError::UnclosedString), 0..5, "\"oops"), + ]); + + test!(symbol: "+ rev fold0 nil? 
x str-cat 猫" => [ + (Ok(TokenKind::Symbol), 0..1, "+"), + (Ok(TokenKind::Whitespace), 1..2, " "), + (Ok(TokenKind::Symbol), 2..5, "rev"), + (Ok(TokenKind::Whitespace), 5..6, " "), + (Ok(TokenKind::Symbol), 6..11, "fold0"), + (Ok(TokenKind::Whitespace), 11..12, " "), + (Ok(TokenKind::Symbol), 12..16, "nil?"), + (Ok(TokenKind::Whitespace), 16..17, " "), + (Ok(TokenKind::Symbol), 17..18, "x"), + (Ok(TokenKind::Whitespace), 18..19, " "), + (Ok(TokenKind::Symbol), 19..26, "str-cat"), + (Ok(TokenKind::Whitespace), 26..27, " "), + (Ok(TokenKind::Symbol), 27..30, "猫"), + ]); + + macro_rules! ptest { + ( $name:ident: $input:literal => $kind:ident ) => { + proptest::proptest! { + #[test] + fn $name(x in $input) { + let mut lexer = Lexer::new(&x); + assert_eq!(lexer.next(), Some(Ok(Token { kind: TokenKind::$kind }))); + assert_eq!(lexer.slice(), x.as_bytes()); + assert_eq!(lexer.span(), 0..x.len()); + } + } + }; + } + + ptest!(all_valid_ascii_escapes: r"\\x[0-7][0-9a-fA-F]" => Char); + ptest!(all_valid_unicode_escaps: r"\\u[0-9a-fA-F]{1,6}" => Char); + + ptest!(all_valid_base10_integers: "[+-]?[0-9]+" => Integer); + ptest!(all_valid_binary_integers: "0[bB][01]+" => Integer); + ptest!(all_valid_octal_integers: "0[oO][0-7]+" => Integer); + ptest!(all_valid_hexadecimal_integers: "0[xX][0-9a-fA-F]+" => Integer); + + ptest!(all_valid_decimals: r"[+-]?[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?" => Decimal); + + ptest!(all_valid_ratios: "[+-]?[0-9]+/[+-]?[0-9]+" => Ratio); +} diff --git a/onihime/src/lexer/token.rs b/onihime/src/lexer/token.rs new file mode 100644 index 0000000..cb43a34 --- /dev/null +++ b/onihime/src/lexer/token.rs @@ -0,0 +1,68 @@ +/// Kinds of tokens which are valid in Onihime source code. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum TokenKind { + /// Whitespace, e.g. ' ', '\t', '\n' + Whitespace, + /// Line comment, e.g. `; ...` + LineComment, + /// Discard, e.g. `#_ 4`, `#_( ... )` + Discard, + + /// Opening parenthesis, e.g. 
`(` + OpenParen, + /// Closing parenthesis, e.g. `)` + CloseParen, + /// Opening brace, e.g. `{` + OpenBrace, + /// Closing brace, e.g. `}` + CloseBrace, + /// Opening bracket, e.g. `[` + OpenBracket, + /// Closing bracket, e.g. `]` + CloseBracket, + /// Opening hash-brace, e.g. `#{` + OpenHashBrace, + + /// Boolean, e.g. `true`, `false` + Bool, + /// Character, e.g. `\a`, `\x1e`, `\u03BB`, `\newline` + Char, + /// Keyword, e.g. `:foo-bar`, `:baz`, `:qux0` + Keyword, + /// Floating-point number, e.g. `-1.0`, `2.0`, `3.0e-4` + Decimal, + /// Integer, e.g. `0`, `-1`, `0b1010`, `0o7`, `0xDECAFBAD` + Integer, + /// Ratio, e.g. `1/3`, `-5/7` + Ratio, + /// String, e.g. `"foo bar"` + String, + /// Symbol, e.g. `baz`, `*qux*`, `nil?`, `+` + Symbol, + /// Nil, e.g. `nil` + Nil, + + /// Comma, e.g. `,` + Comma, + /// Comma followed by at sign, e.g. `,@` + CommaAt, + /// Backtick quote, e.g. `` ` `` + BackQuote, + /// Single quote, e.g. `'` + Quote, +} + +/// A valid token found in Onihime source code. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct Token { + /// Kind of token which was found. + pub kind: TokenKind, +} + +impl Token { + /// Construct a new instance of `Token`. + #[must_use] + pub const fn new(kind: TokenKind) -> Self { + Self { kind } + } +} diff --git a/onihime/src/lib.rs b/onihime/src/lib.rs index 9aadc49..62a8b9d 100644 --- a/onihime/src/lib.rs +++ b/onihime/src/lib.rs @@ -6,3 +6,5 @@ rust_2018_idioms, unsafe_code )] + +pub mod lexer;