Add the initial implementation of the lexer
This commit is contained in:
parent
26db9d0398
commit
8cc6b9d415
1
.gitignore
vendored
1
.gitignore
vendored
@ -12,3 +12,4 @@ Cargo.lock
|
||||
|
||||
# Miscellaneous files
|
||||
.DS_Store
|
||||
tarpaulin-report.html
|
||||
|
@ -6,3 +6,9 @@ edition.workspace = true
|
||||
homepage.workspace = true
|
||||
repository.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.6.0"
|
||||
|
||||
[lints.rust]
|
||||
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
|
||||
|
38
onihime/src/lexer/error.rs
Normal file
38
onihime/src/lexer/error.rs
Normal file
@ -0,0 +1,38 @@
|
||||
/// Errors which can arise while lexing Onihime source code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerError {
    /// An invalid character literal was encountered.
    InvalidChar,
    /// An invalid keyword literal was encountered.
    InvalidKeyword,
    /// An invalid number literal was encountered.
    InvalidNumber,
    /// An invalid token was encountered.
    InvalidToken,
    /// An unclosed string was encountered.
    UnclosedString,
    /// Invalid UTF-8 sequence was encountered.
    Utf8Error(std::str::Utf8Error),
}

impl From<std::str::Utf8Error> for LexerError {
    /// Wrap a UTF-8 decoding failure so it can propagate through `?`.
    fn from(err: std::str::Utf8Error) -> Self {
        Self::Utf8Error(err)
    }
}

impl std::error::Error for LexerError {}

#[cfg(not(tarpaulin_include))]
impl std::fmt::Display for LexerError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every variant except `Utf8Error` maps to a fixed message; the
        // UTF-8 case defers to the wrapped error's own `Display`.
        let message = match self {
            Self::InvalidChar => "An invalid character literal was encountered",
            Self::InvalidKeyword => "An invalid keyword literal was encountered",
            Self::InvalidNumber => "An invalid number literal was encountered",
            Self::InvalidToken => "An invalid token was encountered",
            Self::UnclosedString => "An unclosed string was encountered",
            Self::Utf8Error(err) => return write!(f, "{err}"),
        };
        f.write_str(message)
    }
}
|
594
onihime/src/lexer/mod.rs
Normal file
594
onihime/src/lexer/mod.rs
Normal file
@ -0,0 +1,594 @@
|
||||
//! Lexer for the Onihime programming language.
|
||||
//!
|
||||
//! The job of a lexer is to perform lexical analysis on some input text. In
|
||||
//! other words, it converts character sequences into token sequences. For a
|
||||
//! more in-depth explanation of this process, see the [Wikipedia page for
|
||||
//! Lexical Analysis].
|
||||
//!
|
||||
//! [Wikipedia page for Lexical Analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
|
||||
|
||||
use std::{
|
||||
iter::Peekable,
|
||||
ops::Range,
|
||||
str::{self, Chars},
|
||||
};
|
||||
|
||||
pub use self::{
|
||||
error::LexerError,
|
||||
token::{Token, TokenKind},
|
||||
};
|
||||
|
||||
mod error;
|
||||
mod token;
|
||||
|
||||
/// Performs lexical analysis and produces tokens from an input string.
#[derive(Debug)]
pub struct Lexer<'a> {
    // Raw bytes of the source; `slice()` is cut from this using the two
    // byte offsets below.
    bytes: &'a [u8],
    // Peekable character iterator over the same source text.
    chars: Peekable<Chars<'a>>,
    // Byte offset at which the token currently being read starts.
    token_start: usize,
    // Byte offset just past the last character consumed.
    cursor: usize,
}
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
/// Construct a new lexer instance.
|
||||
#[must_use]
|
||||
pub fn new(source: &'a str) -> Self {
|
||||
Self {
|
||||
bytes: source.as_bytes(),
|
||||
chars: source.chars().peekable(),
|
||||
token_start: 0,
|
||||
cursor: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a byte slice containing the contents of the current [Token].
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn slice(&self) -> &'a [u8] {
|
||||
&self.bytes[self.token_start..self.cursor]
|
||||
}
|
||||
|
||||
/// Return the span of the current [Token].
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn span(&self) -> Range<usize> {
|
||||
self.token_start..self.cursor
|
||||
}
|
||||
|
||||
/// Return the next [Token] in the input stream.
|
||||
#[inline]
|
||||
pub fn next_token(&mut self) -> Result<Option<Token>, LexerError> {
|
||||
self.token_start = self.cursor;
|
||||
|
||||
let Some(c) = self.advance() else {
|
||||
return Ok(None); // EOF reached
|
||||
};
|
||||
|
||||
let kind = match c {
|
||||
// Comments and whitespace:
|
||||
';' => self.read_comment(),
|
||||
_ if c.is_ascii_whitespace() => self.read_whitespace(),
|
||||
|
||||
// Sequence delimiters:
|
||||
'(' => TokenKind::OpenParen,
|
||||
')' => TokenKind::CloseParen,
|
||||
'[' => TokenKind::OpenBracket,
|
||||
']' => TokenKind::CloseBracket,
|
||||
'{' => TokenKind::OpenBrace,
|
||||
'}' => TokenKind::CloseBrace,
|
||||
|
||||
// Dispatch:
|
||||
'#' => match self.advance() {
|
||||
Some('{') => TokenKind::OpenHashBrace,
|
||||
Some('_') => TokenKind::Discard,
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidToken);
|
||||
}
|
||||
},
|
||||
|
||||
// Macros:
|
||||
'\'' => TokenKind::Quote,
|
||||
'`' => TokenKind::BackQuote,
|
||||
',' if self.peek().is_some_and(|c| *c == '@') => {
|
||||
self.advance(); // '@'
|
||||
TokenKind::CommaAt
|
||||
}
|
||||
',' => TokenKind::Comma,
|
||||
|
||||
// Literals:
|
||||
'\\' => self.read_char()?,
|
||||
':' => self.read_keyword()?,
|
||||
'0'..='9' => self.read_number(c)?,
|
||||
'+' | '-' if self.peek().is_some_and(|c| c.is_ascii_digit()) => self.read_number(c)?,
|
||||
'"' => self.read_string()?,
|
||||
_ if is_symbol_prefix(c) => {
|
||||
self.read_word();
|
||||
match str::from_utf8(self.slice())? {
|
||||
"true" | "false" => TokenKind::Bool,
|
||||
"nil" => TokenKind::Nil,
|
||||
_ => TokenKind::Symbol,
|
||||
}
|
||||
}
|
||||
|
||||
// Invalid tokens:
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidToken);
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(Token::new(kind)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn peek(&mut self) -> Option<&char> {
|
||||
self.chars.peek()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> Option<char> {
|
||||
self.chars.next().inspect(|c| {
|
||||
self.cursor += c.len_utf8();
|
||||
})
|
||||
}
|
||||
|
||||
fn take_while(&mut self, predicate: impl Fn(&char) -> bool) {
|
||||
while self.chars.peek().is_some_and(&predicate) {
|
||||
self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
fn read_word(&mut self) {
|
||||
self.take_while(|c| !is_separator(*c));
|
||||
}
|
||||
|
||||
fn read_comment(&mut self) -> TokenKind {
|
||||
self.take_while(|c| *c != '\n');
|
||||
TokenKind::LineComment
|
||||
}
|
||||
|
||||
fn read_whitespace(&mut self) -> TokenKind {
|
||||
self.take_while(|c| c.is_ascii_whitespace());
|
||||
TokenKind::Whitespace
|
||||
}
|
||||
|
||||
fn read_char(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '\' when this function is invoked
|
||||
let c = if self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
self.advance().unwrap() // SAFETY: This will never panic
|
||||
} else {
|
||||
return Err(LexerError::InvalidChar);
|
||||
};
|
||||
|
||||
match c {
|
||||
'u' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_unicode_escape(),
|
||||
'x' if self.peek().is_some_and(|c| !is_separator(*c)) => self.complete_ascii_escape(),
|
||||
_ if self.peek().is_some_and(|c| !is_separator(*c)) => Err(LexerError::InvalidChar),
|
||||
_ => Ok(TokenKind::Char),
|
||||
}
|
||||
}
|
||||
|
||||
fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '\x' when this function is invoked
|
||||
|
||||
// Expect a single octal digit:
|
||||
if self.peek().is_some_and(|c| c.is_digit(8)) {
|
||||
self.advance();
|
||||
} else {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidChar);
|
||||
}
|
||||
|
||||
// Expect a single hexadecimal digit:
|
||||
if self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
|
||||
self.advance();
|
||||
} else {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidChar);
|
||||
}
|
||||
|
||||
// We should be at the end of the literal now, i.e. next char should be a
|
||||
// separator:
|
||||
if self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidChar);
|
||||
}
|
||||
|
||||
Ok(TokenKind::Char)
|
||||
}
|
||||
|
||||
fn complete_unicode_escape(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '\u' when this function is invoked
|
||||
|
||||
// Expect between 1 and 6 hexadecimal digits:
|
||||
let mut count = 0;
|
||||
while self.peek().is_some_and(|c| !is_separator(*c)) && count < 6 {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_hexdigit() => count += 1,
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidChar);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// If no hexadecimal digits were found, or digits were found but we are still
|
||||
// not at the end of the literal, then the literal is invalid:
|
||||
if count == 0 || self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidChar);
|
||||
}
|
||||
|
||||
Ok(TokenKind::Char)
|
||||
}
|
||||
|
||||
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial ':' when this function is invoked
|
||||
if self.peek().is_some_and(|c| is_symbol_prefix(*c)) {
|
||||
self.read_word();
|
||||
Ok(TokenKind::Keyword)
|
||||
} else {
|
||||
self.read_word(); // Recover
|
||||
Err(LexerError::InvalidKeyword)
|
||||
}
|
||||
}
|
||||
|
||||
fn read_number(&mut self, first_char: char) -> Result<TokenKind, LexerError> {
|
||||
if first_char == '0' {
|
||||
match self.peek() {
|
||||
Some('b') | Some('B') => return self.read_number_radix(2),
|
||||
Some('o') | Some('O') => return self.read_number_radix(8),
|
||||
Some('x') | Some('X') => return self.read_number_radix(16),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => {}
|
||||
Some('.') => return self.complete_decimal(),
|
||||
Some('/') => return self.complete_ratio(),
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TokenKind::Integer)
|
||||
}
|
||||
|
||||
fn read_number_radix(&mut self, radix: u32) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '0' when this function is invoked
|
||||
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
|
||||
|
||||
let mut digit_found = false;
|
||||
while let Some(c) = self.peek() {
|
||||
match c {
|
||||
_ if is_separator(*c) => break,
|
||||
_ if c.is_digit(radix) => {
|
||||
digit_found = true;
|
||||
self.advance();
|
||||
}
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
if !digit_found {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
|
||||
Ok(TokenKind::Integer)
|
||||
}
|
||||
|
||||
fn complete_decimal(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the leading digits and '.' when this function
|
||||
// is invoked
|
||||
let mut digit_found = false;
|
||||
let mut exp_found = false;
|
||||
let mut sign_found = false;
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
|
||||
Some('+') | Some('-') if exp_found && !sign_found => sign_found = true,
|
||||
Some(_) => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
None => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
Ok(TokenKind::Decimal)
|
||||
}
|
||||
|
||||
fn complete_ratio(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the leading digits and '/' when this function
|
||||
// is invoked
|
||||
let mut sign_found = false;
|
||||
let mut digit_found = false;
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(*c)) {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
|
||||
Some(_) => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
None => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
||||
if !digit_found {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::InvalidNumber);
|
||||
}
|
||||
|
||||
Ok(TokenKind::Ratio)
|
||||
}
|
||||
|
||||
fn read_string(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '"' when this function is invoked
|
||||
loop {
|
||||
match self.advance() {
|
||||
Some('"') => break,
|
||||
Some(c) if c == '\\' && self.peek().is_some_and(|c| *c == '"') => {
|
||||
self.advance(); // '"'
|
||||
}
|
||||
Some(_) => {}
|
||||
None => return Err(LexerError::UnclosedString),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TokenKind::String)
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for Lexer<'_> {
    type Item = Result<Token, LexerError>;

    // `next_token` returns `Ok(None)` at EOF; `transpose` maps that to
    // `None` so iteration terminates, while errors are yielded as items.
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token().transpose()
    }
}
|
||||
|
||||
/// Returns `true` if `c` terminates the current token: ASCII whitespace,
/// any sequence delimiter, or the start of a line comment.
#[inline]
fn is_separator(c: char) -> bool {
    // Short-circuiting `||` (the original used bitwise `|` on bools, which
    // always evaluates both operands):
    c.is_ascii_whitespace() || matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | ';')
}
|
||||
|
||||
/// Returns `true` if `c` may appear as the first character of a symbol:
/// any alphabetic character (including non-ASCII) or one of the permitted
/// punctuation characters.
#[inline]
fn is_symbol_prefix(c: char) -> bool {
    // Short-circuiting `||` (the original used bitwise `|` on bools, which
    // always evaluates both operands):
    c.is_alphabetic()
        || matches!(
            c,
            '~' | '!' | '$' | '%' | '^' | '&' | '*' | '-' | '_' | '+' | '=' | '<' | '>' | '/' | '?'
        )
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // An empty input must yield no tokens and leave the span/slice empty.
    #[test]
    fn empty() {
        let mut lexer = Lexer::new("");
        assert_eq!(lexer.next(), None);
        assert_eq!(lexer.span(), 0..0);
        assert_eq!(lexer.slice(), &[]);
    }

    // Table-driven test helper: lexes `$input` and checks each produced
    // token's kind (or error), byte span, and raw slice against `$expected`,
    // then asserts the stream is exhausted.
    macro_rules! test {
        ( $name:ident: $input:literal => $expected:expr ) => {
            #[test]
            fn $name() {
                let mut lexer = Lexer::new($input);
                for (token, span, slice) in $expected {
                    let kind = lexer.next().map(|r| match r {
                        Ok(t) => Ok(t.kind),
                        Err(e) => Err(e),
                    });
                    assert_eq!(kind, Some(token));
                    assert_eq!(span, lexer.span());
                    assert_eq!(slice.as_bytes(), lexer.slice());
                }
                assert_eq!(lexer.next(), None);
            }
        };
    }

    test!(line_comment: ";; foobar\nnil ; bar; baz" => [
        (Ok(TokenKind::LineComment), 0..9, ";; foobar"),
        (Ok(TokenKind::Whitespace), 9..10, "\n"),
        (Ok(TokenKind::Nil), 10..13, "nil"),
        (Ok(TokenKind::Whitespace), 13..14, " "),
        (Ok(TokenKind::LineComment), 14..24, "; bar; baz"),
    ]);

    test!(list: "(0 1.2 -3/4 +5.6e-7)" => [
        (Ok(TokenKind::OpenParen), 0..1, "("),
        (Ok(TokenKind::Integer), 1..2, "0"),
        (Ok(TokenKind::Whitespace), 2..3, " "),
        (Ok(TokenKind::Decimal), 3..6, "1.2"),
        (Ok(TokenKind::Whitespace), 6..7, " "),
        (Ok(TokenKind::Ratio), 7..11, "-3/4"),
        (Ok(TokenKind::Whitespace), 11..12, " "),
        (Ok(TokenKind::Decimal), 12..19, "+5.6e-7"),
        (Ok(TokenKind::CloseParen), 19..20, ")"),
    ]);

    test!(map: r"{:a \a :b {:c nil}}" => [
        (Ok(TokenKind::OpenBrace), 0..1, "{"),
        (Ok(TokenKind::Keyword), 1..3, ":a"),
        (Ok(TokenKind::Whitespace), 3..4, " "),
        (Ok(TokenKind::Char), 4..6, "\\a"),
        (Ok(TokenKind::Whitespace), 6..7, " "),
        (Ok(TokenKind::Keyword), 7..9, ":b"),
        (Ok(TokenKind::Whitespace), 9..10, " "),
        (Ok(TokenKind::OpenBrace), 10..11, "{"),
        (Ok(TokenKind::Keyword), 11..13, ":c"),
        (Ok(TokenKind::Whitespace), 13..14, " "),
        (Ok(TokenKind::Nil), 14..17, "nil"),
        (Ok(TokenKind::CloseBrace), 17..18, "}"),
        (Ok(TokenKind::CloseBrace), 18..19, "}"),
    ]);

    test!(vector: "[true false]" => [
        (Ok(TokenKind::OpenBracket), 0..1, "["),
        (Ok(TokenKind::Bool), 1..5, "true"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Ok(TokenKind::Bool), 6..11, "false"),
        (Ok(TokenKind::CloseBracket), 11..12, "]"),
    ]);

    test!(dispatch: "#{} #_() #_ 4" => [
        (Ok(TokenKind::OpenHashBrace), 0..2, "#{"),
        (Ok(TokenKind::CloseBrace), 2..3, "}"),
        (Ok(TokenKind::Whitespace), 3..4, " "),
        (Ok(TokenKind::Discard), 4..6, "#_"),
        (Ok(TokenKind::OpenParen), 6..7, "("),
        (Ok(TokenKind::CloseParen), 7..8, ")"),
        (Ok(TokenKind::Whitespace), 8..9, " "),
        (Ok(TokenKind::Discard), 9..11, "#_"),
        (Ok(TokenKind::Whitespace), 11..12, " "),
        (Ok(TokenKind::Integer), 12..13, "4"),
    ]);

    test!(err_invalid_dispatch: "#@" => [
        (Err(LexerError::InvalidToken), 0..2, "#@"),
    ]);

    test!(keyword: ":m :x0 :this-is-an-keyword-too!" => [
        (Ok(TokenKind::Keyword), 0..2, ":m"),
        (Ok(TokenKind::Whitespace), 2..3, " "),
        (Ok(TokenKind::Keyword), 3..6, ":x0"),
        (Ok(TokenKind::Whitespace), 6..7, " "),
        (Ok(TokenKind::Keyword), 7..31, ":this-is-an-keyword-too!"),
    ]);

    test!(err_invalid_keyword: ":0" => [
        (Err(LexerError::InvalidKeyword), 0..2, ":0"),
    ]);

    test!(char: r"\a \? \7 \λ \\ \u \x" => [
        (Ok(TokenKind::Char), 0..2, r"\a"),
        (Ok(TokenKind::Whitespace), 2..3, " "),
        (Ok(TokenKind::Char), 3..5, r"\?"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Ok(TokenKind::Char), 6..8, r"\7"),
        (Ok(TokenKind::Whitespace), 8..9, " "),
        (Ok(TokenKind::Char), 9..12, r"\λ"),
        (Ok(TokenKind::Whitespace), 12..13, " "),
        (Ok(TokenKind::Char), 13..15, r"\\"),
        (Ok(TokenKind::Whitespace), 15..16, " "),
        (Ok(TokenKind::Char), 16..18, r"\u"),
        (Ok(TokenKind::Whitespace), 18..19, " "),
        (Ok(TokenKind::Char), 19..21, r"\x"),
    ]);

    test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
        (Err(LexerError::InvalidChar), 0..1, r"\"),
        (Ok(TokenKind::Whitespace), 1..2, " "),
        (Err(LexerError::InvalidChar), 2..5, r"\xF"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Err(LexerError::InvalidChar), 6..9, r"\x0"),
        (Ok(TokenKind::Whitespace), 9..10, " "),
        (Err(LexerError::InvalidChar), 10..15, r"\x111"),
        (Ok(TokenKind::Whitespace), 15..16, " "),
        (Err(LexerError::InvalidChar), 16..19, r"\uG"),
        (Ok(TokenKind::Whitespace), 19..20, " "),
        (Err(LexerError::InvalidChar), 20..29, r"\u2222222"),
    ]);

    test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [
        (Err(LexerError::InvalidNumber), 0..5, "0b012"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Err(LexerError::InvalidNumber), 6..9, "0o8"),
        (Ok(TokenKind::Whitespace), 9..10, " "),
        (Err(LexerError::InvalidNumber), 10..14, "0xFG"),
        (Ok(TokenKind::Whitespace), 14..15, " "),
        (Err(LexerError::InvalidNumber), 15..17, "1N"),
        (Ok(TokenKind::Whitespace), 17..18, " "),
        (Err(LexerError::InvalidNumber), 18..20, "0x"),
    ]);

    test!(err_invalid_decimal: "1.2.3 4.e6 7.8+ 9.0+e1" => [
        (Err(LexerError::InvalidNumber), 0..5, "1.2.3"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Err(LexerError::InvalidNumber), 6..10, "4.e6"),
        (Ok(TokenKind::Whitespace), 10..11, " "),
        (Err(LexerError::InvalidNumber), 11..15, "7.8+"),
        (Ok(TokenKind::Whitespace), 15..16, " "),
        (Err(LexerError::InvalidNumber), 16..22, "9.0+e1"),
    ]);

    test!(err_invalid_ratio: "1/ -2/3+ 4/-" => [
        (Err(LexerError::InvalidNumber), 0..2, "1/"),
        (Ok(TokenKind::Whitespace), 2..3, " "),
        (Err(LexerError::InvalidNumber), 3..8, "-2/3+"),
        (Ok(TokenKind::Whitespace), 8..9, " "),
        (Err(LexerError::InvalidNumber), 9..12, "4/-"),
    ]);

    test!(string: "\"föö bar1\nbaz\" \"\" \"凄い 😍\"" => [
        (Ok(TokenKind::String), 0..16, "\"föö bar1\nbaz\""),
        (Ok(TokenKind::Whitespace), 16..17, " "),
        (Ok(TokenKind::String), 17..19, "\"\""),
        (Ok(TokenKind::Whitespace), 19..20, " "),
        (Ok(TokenKind::String), 20..33, "\"凄い 😍\""),
    ]);

    test!(err_unclosed_string: "\"oops" => [
        (Err(LexerError::UnclosedString), 0..5, "\"oops"),
    ]);

    test!(symbol: "+ rev fold0 nil? x str-cat 猫" => [
        (Ok(TokenKind::Symbol), 0..1, "+"),
        (Ok(TokenKind::Whitespace), 1..2, " "),
        (Ok(TokenKind::Symbol), 2..5, "rev"),
        (Ok(TokenKind::Whitespace), 5..6, " "),
        (Ok(TokenKind::Symbol), 6..11, "fold0"),
        (Ok(TokenKind::Whitespace), 11..12, " "),
        (Ok(TokenKind::Symbol), 12..16, "nil?"),
        (Ok(TokenKind::Whitespace), 16..17, " "),
        (Ok(TokenKind::Symbol), 17..18, "x"),
        (Ok(TokenKind::Whitespace), 18..19, " "),
        (Ok(TokenKind::Symbol), 19..26, "str-cat"),
        (Ok(TokenKind::Whitespace), 26..27, " "),
        (Ok(TokenKind::Symbol), 27..30, "猫"),
    ]);

    // Property-based test helper (uses the `proptest` dev-dependency):
    // `$input` is a regex generating inputs which must each lex as exactly
    // one token of kind `$kind`, consuming the whole input.
    macro_rules! ptest {
        ( $name:ident: $input:literal => $kind:ident ) => {
            proptest::proptest! {
                #[test]
                fn $name(x in $input) {
                    let mut lexer = Lexer::new(&x);
                    assert_eq!(lexer.next(), Some(Ok(Token { kind: TokenKind::$kind })));
                    assert_eq!(lexer.slice(), x.as_bytes());
                    assert_eq!(lexer.span(), 0..x.len());
                }
            }
        };
    }

    ptest!(all_valid_ascii_escapes: r"\\x[0-7][0-9a-fA-F]" => Char);
    ptest!(all_valid_unicode_escaps: r"\\u[0-9a-fA-F]{1,6}" => Char);

    ptest!(all_valid_base10_integers: "[+-]?[0-9]+" => Integer);
    ptest!(all_valid_binary_integers: "0[bB][01]+" => Integer);
    ptest!(all_valid_octal_integers: "0[oO][0-7]+" => Integer);
    ptest!(all_valid_hexadecimal_integers: "0[xX][0-9a-fA-F]+" => Integer);

    ptest!(all_valid_decimals: r"[+-]?[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?" => Decimal);

    ptest!(all_valid_ratios: "[+-]?[0-9]+/[+-]?[0-9]+" => Ratio);
}
|
68
onihime/src/lexer/token.rs
Normal file
68
onihime/src/lexer/token.rs
Normal file
@ -0,0 +1,68 @@
|
||||
/// Kinds of tokens which are valid in Onihime source code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    // Trivia:
    /// Whitespace, e.g. ' ', '\t', '\n'
    Whitespace,
    /// Line comment, e.g. `; ...`
    LineComment,
    /// Discard, e.g. `#_ 4`, `#_( ... )`
    Discard,

    // Sequence delimiters:
    /// Opening parenthesis, e.g. `(`
    OpenParen,
    /// Closing parenthesis, e.g. `)`
    CloseParen,
    /// Opening brace, e.g. `{`
    OpenBrace,
    /// Closing brace, e.g. `}`
    CloseBrace,
    /// Opening bracket, e.g. `[`
    OpenBracket,
    /// Closing bracket, e.g. `]`
    CloseBracket,
    /// Opening hash-brace, e.g. `#{`
    OpenHashBrace,

    // Literals:
    /// Boolean, e.g. `true`, `false`
    Bool,
    /// Character, e.g. `\a`, `\x1e`, `\u03BB`, `\newline`
    Char,
    /// Keyword, e.g. `:foo-bar`, `:baz`, `:qux0`
    Keyword,
    /// Floating-point number, e.g. `-1.0`, `2.0`, `3.0e-4`
    Decimal,
    /// Integer, e.g. `0`, `-1`, `0b1010`, `0o7`, `0xDECAFBAD`
    Integer,
    /// Ratio, e.g. `1/3`, `-5/7`
    Ratio,
    /// String, e.g. `"foo bar"`
    String,
    /// Symbol, e.g. `baz`, `*qux*`, `nil?`, `+`
    Symbol,
    /// Nil, e.g. `nil`
    Nil,

    // Macro characters:
    /// Comma, e.g. `,`
    Comma,
    /// Comma followed by at sign, e.g. `,@`
    CommaAt,
    /// Backtick quote, e.g. `` ` ``
    BackQuote,
    /// Single quote, e.g. `'`
    Quote,
}

/// A valid token found in Onihime source code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Token {
    /// Kind of token which was found.
    pub kind: TokenKind,
}

impl Token {
    /// Construct a new instance of `Token`.
    #[must_use]
    pub const fn new(kind: TokenKind) -> Self {
        Self { kind }
    }
}
|
@ -6,3 +6,5 @@
|
||||
rust_2018_idioms,
|
||||
unsafe_code
|
||||
)]
|
||||
|
||||
pub mod lexer;
|
||||
|
Loading…
Reference in New Issue
Block a user