Compare commits

...

2 Commits

6 changed files with 670 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
use crate::span::Span;
/// Errors that can occur during lexical analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerErrorKind {
/// An invalid character literal was encountered.
InvalidChar,
/// An invalid escape sequence was encountered.
InvalidEscape(char),
/// An invalid numeric literal was encountered.
InvalidNumber(String),
/// An invalid string literal was encountered.
InvalidString,
/// An unclosed character literal was encountered.
UnclosedChar,
/// An unclosed string literal was encountered.
UnclosedString,
}
/// Lexer error, with a start and end location.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
/// The type of error encountered.
pub kind: LexerErrorKind,
/// The span in which the error occurred.
pub span: Span,
}
impl LexerError {
/// Construct a new instance of `LexerError`.
#[must_use]
pub const fn new(kind: LexerErrorKind, span: Span) -> Self {
Self { kind, span }
}
}
impl std::error::Error for LexerError {}
impl std::fmt::Display for LexerError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use LexerErrorKind::*;
match &self.kind {
InvalidChar => write!(f, "Invalid character literal"),
InvalidEscape(c) => write!(f, "Unknown escape sequence '\\{c}' in string"),
InvalidNumber(n) => write!(f, "`{n}` is not a valid numeric literal"),
InvalidString => write!(f, "Invalid string literal"),
UnclosedChar => write!(f, "Unclosed character literal"),
UnclosedString => write!(f, "Unclosed string literal"),
}
}
}
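
Below is a minimal usage sketch, not part of this commit, showing how one of these errors renders through the `Display` impl. It assumes the `Span` and `Source` types from onihime/src/span.rs later in this diff, is written as crate-internal code since the modules are private, and the `unclosed_string_report` name is illustrative.

use std::sync::Arc;

use crate::lexer::{LexerError, LexerErrorKind};
use crate::span::{Source, Span};

fn unclosed_string_report() {
    // An unterminated string literal: the span covers the opening quote.
    let source = Arc::new(Source::new(None, "\"abc".to_string()));
    let span = Span::new(0..1, source);
    let err = LexerError::new(LexerErrorKind::UnclosedString, span);
    assert_eq!(err.to_string(), "Unclosed string literal");
}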

399
onihime/src/lexer/mod.rs Normal file
View File

@@ -0,0 +1,399 @@
use std::{
str::{Chars, FromStr},
sync::Arc,
};
pub use self::{
error::{LexerError, LexerErrorKind},
symbol::Symbol,
token::{Token, TokenKind},
};
use crate::span::{Source, Span};
mod error;
mod symbol;
mod token;
/// A trait for checking if a character is a separator.
pub trait Separator {
/// Check if the character is a separator.
fn is_separator(&self) -> bool;
}
impl Separator for char {
fn is_separator(&self) -> bool {
self.is_ascii_whitespace() || matches!(self, '(' | ')' | '[' | ']' | '{' | '}' | ',')
}
}
/// A lexer, used by the parser.
#[derive(Debug)]
pub struct Lexer<'lexer> {
input: Chars<'lexer>,
byte: usize,
source: Arc<Source>,
}
impl<'lexer> Lexer<'lexer> {
/// Create a new lexer instance from a string.
#[must_use]
pub fn new(input: &'lexer str) -> Self {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
input: input.chars(),
byte: 0,
source,
}
}
/// Set the name of the [Source] being lexically analyzed.
pub fn set_name(&mut self, name: String) {
if let Some(source) = Arc::get_mut(&mut self.source) {
source.set_name(name);
} else {
unimplemented!(); // FIXME: What should we do in this case?
}
}
/// The source being lexically analyzed.
#[must_use]
pub fn source(&self) -> Arc<Source> {
self.source.clone()
}
/// Get the current character.
#[must_use]
pub fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
}
/// Get the unparsed input.
#[must_use]
pub fn get_unparsed(&self) -> &str {
self.input.as_str()
}
/// Get the current position of the lexer.
#[inline]
#[must_use]
pub(crate) fn span(&self) -> Span {
Span::new(self.byte..self.byte, self.source.clone())
}
/// Get the nth character ahead of the current position without advancing; `peek(0)` is the current character.
fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n)
}
/// Advance the lexer by one character.
fn advance(&mut self) -> Option<char> {
let c = self.input.next()?;
self.byte += c.len_utf8();
Some(c)
}
/// Read a word from the input until a separator is reached.
fn read_word(&mut self) -> String {
let mut word = String::new();
while let Some(c) = self.current() {
if c.is_separator() {
break;
}
word.push(c);
self.advance();
}
word
}
/// Parse a value from the input or return an error.
fn parse_or<T>(&mut self, err: impl Fn(String) -> LexerErrorKind) -> Result<T, LexerError>
where
T: FromStr,
{
let span = self.span();
let word = self.read_word();
word.parse()
.map_err(|_| LexerError::new(err(word), span.join(&self.span())))
}
/// Read the next token from the input.
pub fn read(&mut self) -> Result<Option<Token>, LexerError> {
let c = loop {
match self.current() {
Some(c) if c.is_ascii_whitespace() || c == ',' => {
self.advance();
}
Some(c) => break c,
None => return Ok(None),
}
};
let mut span = self.span();
let kind = match c {
'(' => {
self.advance();
TokenKind::OpenParen
}
')' => {
self.advance();
TokenKind::CloseParen
}
'{' => {
self.advance();
TokenKind::OpenBrace
}
'}' => {
self.advance();
TokenKind::CloseBrace
}
'[' => {
self.advance();
TokenKind::OpenBracket
}
']' => {
self.advance();
TokenKind::CloseBracket
}
'0'..='9' => TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?),
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
TokenKind::Number(self.parse_or(LexerErrorKind::InvalidNumber)?)
}
';' => {
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
';' => continue,
'\n' => break,
c => {
comment.push(c);
}
}
}
TokenKind::LineComment(comment.trim().into())
}
'#' if matches!(self.peek(1), Some('|')) => {
self.advance(); // '#'
self.advance(); // '|'
let mut comment = String::new();
while let Some(c) = self.advance() {
match c {
'|' if matches!(self.peek(0), Some('#')) => {
// The '|' was already consumed by the loop above; only the '#' remains.
self.advance(); // '#'
break;
}
c => {
comment.push(c);
}
}
}
TokenKind::BlockComment(comment.trim().into())
}
':' => {
self.advance();
TokenKind::Keyword(Symbol(self.read_word()))
}
'"' => {
self.advance(); // '"'
let quote_span = span.clone().join(&self.span());
let mut string = String::new();
loop {
let ch_span = self.span();
string.push(match self.advance() {
Some('"') => break,
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
ch_span.join(&self.span()),
))
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedString,
quote_span,
))
}
},
Some(c) => c,
None => {
return Err(LexerError::new(LexerErrorKind::UnclosedString, quote_span))
}
});
}
if self.current().is_some_and(|c| !c.is_separator()) {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidString,
span.join(&self.span()),
));
}
TokenKind::String(string)
}
'\'' => {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('e') => '\x1b',
Some(c) => {
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
span.join(&self.span()),
));
}
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
));
}
},
Some(c) => c,
None => {
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
span.join(&self.span()),
))
}
};
if self.advance() != Some('\'') {
self.read_word();
return Err(LexerError::new(
LexerErrorKind::InvalidChar,
span.join(&self.span()),
));
}
TokenKind::Char(c)
}
_ => {
let word = self.read_word();
match word.as_str() {
"true" => TokenKind::Bool(true),
"false" => TokenKind::Bool(false),
"nil" => TokenKind::Nil,
_ => TokenKind::Symbol(Symbol::from(word)),
}
}
};
span.extend(&self.span());
Ok(Some(Token::new(kind, span)))
}
}
impl Iterator for Lexer<'_> {
type Item = Result<Token, LexerError>;
fn next(&mut self) -> Option<Self::Item> {
self.read().transpose()
}
}
#[cfg(test)]
mod tests {
use super::*;
macro_rules! test {
( $name:ident: $input:literal, $tokens:expr ) => {
#[test]
fn $name() {
let mut lexer = Lexer::new($input);
for token in $tokens {
let x = lexer.next().map(|r| match r {
Ok(t) => Ok(t.kind),
Err(e) => Err(e.kind),
});
assert_eq!(x, Some(token));
}
assert_eq!(lexer.next(), None);
}
};
}
test!(lex: "(+ 14 25.5 333 (* 2 5))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(newline: "(+ 14 25.5 333\n(* 2 5 5.x))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("+"))),
Ok(TokenKind::Number(14.0)),
Ok(TokenKind::Number(25.5)),
Ok(TokenKind::Number(333.0)),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("*"))),
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(5.0)),
Err(LexerErrorKind::InvalidNumber("5.x".into())),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
test!(negative_minus: "(- 1 -2 3)", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("-"))),
Ok(TokenKind::Number(1.0)),
Ok(TokenKind::Number(-2.0)),
Ok(TokenKind::Number(3.0)),
Ok(TokenKind::CloseParen),
]);
test!(line_comment: "; foo\n;; bar baz qux", [
Ok(TokenKind::LineComment("foo".into())),
Ok(TokenKind::LineComment("bar baz qux".into())),
]);
test!(block_comment: "#| foo\nbar |#", [
Ok(TokenKind::BlockComment("foo\nbar".into()))
]);
test!(error_parse_numbers: "2 55 3.144 0.0001 1.1.1", [
Ok(TokenKind::Number(2.0)),
Ok(TokenKind::Number(55.0)),
Ok(TokenKind::Number(3.144)),
Ok(TokenKind::Number(0.0001)),
Err(LexerErrorKind::InvalidNumber("1.1.1".into())),
]);
test!(error_unclosed_string: "\"hiii", [
Err(LexerErrorKind::UnclosedString),
]);
test!(error_invalid_string: "\"hiii\"222", [
Err(LexerErrorKind::InvalidString),
]);
}
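
A brief usage sketch, not part of this commit, of driving the lexer through its `Iterator` implementation; `token_kinds` is an illustrative helper written as crate-internal code since the modules are private.

use crate::lexer::{Lexer, TokenKind};

fn token_kinds(input: &str) -> Result<Vec<TokenKind>, String> {
    // Each item is a `Result<Token, LexerError>`; collecting into a
    // `Result<Vec<_>, _>` stops at the first lexing error.
    Lexer::new(input)
        .map(|item| item.map(|token| token.kind).map_err(|e| e.to_string()))
        .collect()
}

For example, token_kinds("(+ 1 2)") yields OpenParen, Symbol("+"), Number(1.0), Number(2.0), and CloseParen, mirroring the `lex` test above.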

View File

@@ -0,0 +1,19 @@
/// A symbol used to identify a function or variable.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol(pub String);
impl Symbol {
/// Create a new `Symbol` from a string.
pub fn from<S>(s: S) -> Self
where
S: Into<String>,
{
Self(s.into())
}
}
impl std::fmt::Display for Symbol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

View File

@@ -0,0 +1,56 @@
use super::Symbol;
use crate::span::Span;
/// The type of a [Token].
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
/// Opening parenthesis, e.g. '('
OpenParen,
/// Closing parenthesis, e.g. ')'
CloseParen,
/// Opening brace, e.g. '{'
OpenBrace,
/// Closing brace, e.g. '}'
CloseBrace,
/// Opening bracket, e.g. '['
OpenBracket,
/// Closing bracket, e.g. ']'
CloseBracket,
/// Block comment, e.g. '#| ... |#'
BlockComment(String),
/// Line comment, e.g. '; ...'
LineComment(String),
/// Boolean, e.g. 'true', 'false'
Bool(bool),
/// Character, e.g. 'c', '\n'
Char(char),
/// Number, e.g. '1', '2.0', '0.003'
Number(f64),
/// String, e.g. '"foo bar"'
String(String),
/// Keyword, e.g. ':baz'
Keyword(Symbol),
/// Symbol, e.g. 'qux', '+'
Symbol(Symbol),
/// Nil, e.g. 'nil'
Nil,
}
/// A token with a start and end location.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
/// The type of token.
pub kind: TokenKind,
/// The span in which the token occurs.
pub span: Span,
}
impl Token {
/// Construct a new instance of `Token`.
#[must_use]
pub const fn new(kind: TokenKind, span: Span) -> Self {
Self { kind, span }
}
}
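
As a small illustration, not part of this commit, of how a parser might branch on these kinds, a helper like the following could classify opening delimiters; `is_opening_delimiter` is a hypothetical name.

use crate::lexer::TokenKind;

fn is_opening_delimiter(kind: &TokenKind) -> bool {
    // OpenParen, OpenBrace, and OpenBracket all begin a nested form.
    matches!(
        kind,
        TokenKind::OpenParen | TokenKind::OpenBrace | TokenKind::OpenBracket
    )
}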

View File

@@ -1,3 +1,6 @@
//! Onihime programming language.
#![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
mod lexer;
mod span;

141
onihime/src/span.rs Normal file
View File

@@ -0,0 +1,141 @@
use std::{cmp::Ordering, iter, ops::Range, sync::Arc};
/// A location within some source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Location {
line: usize,
column: usize,
}
impl Location {
/// Construct a new instance of `Location`.
#[must_use]
pub const fn new(line: usize, column: usize) -> Self {
Self { line, column }
}
}
impl PartialOrd for Location {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
match self.line.partial_cmp(&other.line) {
Some(Ordering::Equal) => self.column.partial_cmp(&other.column),
ord => ord,
}
}
}
/// Some (optionally named) source text.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Source {
name: Option<String>,
contents: String,
lines: Vec<usize>,
}
impl Source {
/// Construct a new instance of `Source`.
#[must_use]
pub fn new(name: Option<String>, contents: String) -> Self {
let lines = contents
.match_indices('\n')
.map(|(i, _)| i)
.chain(iter::once(contents.len()))
.collect();
Self {
name,
contents,
lines,
}
}
/// Get the name of the source.
#[must_use]
pub fn name(&self) -> Option<&str> {
self.name.as_deref()
}
/// Set the name of the source.
pub fn set_name(&mut self, name: String) {
self.name = Some(name);
}
/// Get the [Location] of the specified byte in the source.
#[must_use]
pub fn location(&self, byte: usize) -> Location {
let line = self.lines.partition_point(|&x| x < byte);
let start = line.checked_sub(1).map_or(0, |n| self.lines[n] + 1);
let column = self.contents[start..byte].chars().count();
Location::new(line, column)
}
/// Get the full contents of the source.
#[must_use]
pub fn contents(&self) -> &str {
&self.contents
}
/// Get the specified line from the source.
#[must_use]
pub fn get_line(&self, line: usize) -> &str {
let end = self.lines[line];
let start = line.checked_sub(1).map_or(0, |n| self.lines[n] + 1);
&self.contents[start..end]
}
}
/// A contiguous sequence of bytes within some source.
#[derive(Debug, Default, Clone)]
pub struct Span {
bytes: Range<usize>,
source: Arc<Source>,
}
impl Span {
/// Construct a new instance of `Span`.
#[must_use]
pub fn new(bytes: Range<usize>, source: Arc<Source>) -> Self {
Self { bytes, source }
}
/// Join two spans, creating a new span.
#[must_use]
pub fn join(self, other: &Self) -> Self {
debug_assert!(self.same_source(other));
Self::new(self.bytes.start..other.bytes.end, self.source)
}
/// Extend one span to include another.
pub fn extend(&mut self, other: &Self) {
debug_assert!(self.same_source(other));
self.bytes.end = other.bytes.end;
}
/// The start location of a span within some source.
#[must_use]
pub fn location(&self) -> Location {
self.source.location(self.bytes.start)
}
/// The end location of a span within some source.
#[must_use]
pub fn end_location(&self) -> Location {
self.source.location(self.bytes.end)
}
/// Do two spans share the same source?
#[must_use]
pub fn same_source(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.source, &other.source)
}
}
impl PartialEq for Span {
fn eq(&self, other: &Self) -> bool {
self.same_source(other) && self.bytes == other.bytes
}
}
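
A short sketch, not part of this commit, of how `Source::location` and the `Span` helpers fit together; `spans_demo` is an illustrative name, and note that `Location` lines and columns are zero-indexed in this implementation.

use std::sync::Arc;

use crate::span::{Location, Source, Span};

fn spans_demo() {
    let source = Arc::new(Source::new(None, "abc\ndef".to_string()));
    // Byte 5 is the 'e' on the second line: line index 1, column index 1.
    assert_eq!(source.location(5), Location::new(1, 1));

    // Joining two spans over the same source yields one covering both.
    let open = Span::new(0..1, source.clone());
    let close = Span::new(6..7, source);
    let joined = open.join(&close);
    assert_eq!(joined.location(), Location::new(0, 0));
    assert_eq!(joined.end_location(), Location::new(1, 3));
}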