Make the lexer operate on graphemes instead of chars (still needs some work)

This commit is contained in:
Jesse Braham 2024-12-28 10:31:37 +01:00
parent 8dcdd34b65
commit e5fafd03ba
6 changed files with 108 additions and 80 deletions

View File

@ -8,6 +8,7 @@ repository.workspace = true
license.workspace = true
[dependencies]
unicode-segmentation = "1.12.0"
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }

View File

@ -4,7 +4,7 @@ use crate::span::Span;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LexerErrorKind {
/// An invalid escape sequence was encountered.
InvalidEscape(char),
InvalidEscape(String),
/// An invalid numeric literal was encountered.
InvalidNumber(String),
/// An invalid string literal was encountered.

View File

@ -1,4 +1,6 @@
use std::{str::Chars, sync::Arc};
use std::sync::Arc;
use unicode_segmentation::{Graphemes, UnicodeSegmentation as _};
pub(crate) use self::{
error::{LexerError, LexerErrorKind},
@ -11,18 +13,26 @@ mod error;
mod symbol;
mod token;
/// Determine if the current character is a separator, performing 1-character
/// lookahead as needed to handle multi-character separators.
fn is_separator(current: char, next: Option<char>) -> bool {
current.is_ascii_whitespace()
|| matches!(current, '(' | ')' | '[' | ']' | '{' | '}' | ';')
|| (current == '#' && next.is_some_and(|c| matches!(c, '|' | '{')))
/// Determine if the current grapheme is an ASCII digit.
///
/// A grapheme is a digit exactly when it is a single byte in `'0'..='9'`;
/// multi-byte or multi-character graphemes can never qualify.
fn is_ascii_digit(current: &str) -> bool {
    current.len() == 1 && current.as_bytes()[0].is_ascii_digit()
}
/// Determine if the current grapheme is a separator, performing lookahead as
/// needed to handle multi-character separators.
///
/// Separators are ASCII whitespace, the bracketing/semicolon punctuation
/// graphemes, and the two-grapheme sequences `#|` and `#{` (recognized by
/// peeking at `next`).
fn is_separator(current: &str, next: Option<&str>) -> bool {
    // A grapheme consisting solely of ASCII whitespace (including "\r\n",
    // which segments as a single grapheme) always separates tokens.
    if current.bytes().all(|b| b.is_ascii_whitespace()) {
        return true;
    }
    match current {
        "(" | ")" | "[" | "]" | "{" | "}" | ";" => true,
        // '#' only separates when it begins a block comment or hash-brace.
        "#" => matches!(next, Some("|") | Some("{")),
        _ => false,
    }
}
/// A lexer, used by the parser.
#[derive(Debug)]
pub(crate) struct Lexer<'lexer> {
input: Chars<'lexer>,
input: Graphemes<'lexer>,
byte: usize,
source: Arc<Source>,
}
@ -34,7 +44,7 @@ impl<'lexer> Lexer<'lexer> {
let source = Arc::new(Source::new(None, input.to_string()));
Self {
input: input.chars(),
input: input.graphemes(true),
byte: 0,
source,
}
@ -62,30 +72,30 @@ impl<'lexer> Lexer<'lexer> {
/// Returns `true` when at the end of the input.
#[must_use]
pub(crate) fn eof(&self) -> bool {
self.peek(0).is_none()
self.current().is_none()
}
/// Get the current character.
/// Get the current grapheme without advancing.
#[must_use]
fn current(&self) -> Option<char> {
self.input.as_str().chars().next()
fn current(&self) -> Option<&str> {
self.input.clone().next()
}
/// Get the nth character ahead of the current character without advancing.
/// Get the nth grapheme ahead of the current grapheme without advancing.
#[must_use]
fn peek(&self, n: usize) -> Option<char> {
self.input.as_str().chars().nth(n)
fn peek(&self) -> Option<&str> {
self.input.clone().take(2).last()
}
/// Advance the lexer by one character.
fn advance(&mut self) -> Option<char> {
/// Advance the lexer by one grapheme.
fn advance(&mut self) -> Option<&str> {
let c = self.input.next()?;
self.byte += c.len_utf8();
self.byte += c.len();
Some(c)
}
/// Advance the lexer by one character, and then return the specified
/// Advance the lexer by one grapheme, and then return the specified
/// `TokenKind`:
#[must_use]
fn advance_and(&mut self, kind: TokenKind) -> TokenKind {
@ -98,11 +108,11 @@ impl<'lexer> Lexer<'lexer> {
fn read_word(&mut self) -> String {
let mut word = String::new();
while let Some(c) = self.current() {
if is_separator(c, self.peek(1)) {
if is_separator(c, self.peek()) {
break;
}
word.push(c);
word.push_str(c);
self.advance();
}
@ -115,7 +125,7 @@ impl<'lexer> Lexer<'lexer> {
// we have reached the end of input and no additional characters can be read:
let c = loop {
match self.current() {
Some(c) if c.is_ascii_whitespace() => {
Some(c) if c.trim_ascii().is_empty() => {
self.advance();
}
Some(c) => break c,
@ -126,27 +136,27 @@ impl<'lexer> Lexer<'lexer> {
let mut span = self.span();
let kind = match c {
';' => self.line_comment(),
'#' if self.peek(1) == Some('|') => self.block_comment(),
";" => self.line_comment(),
"#" if self.peek() == Some("|") => self.block_comment(),
'(' => self.advance_and(TokenKind::OpenParen),
')' => self.advance_and(TokenKind::CloseParen),
'{' => self.advance_and(TokenKind::OpenBrace),
'}' => self.advance_and(TokenKind::CloseBrace),
'[' => self.advance_and(TokenKind::OpenBracket),
']' => self.advance_and(TokenKind::CloseBracket),
'#' if self.peek(1) == Some('{') => {
"(" => self.advance_and(TokenKind::OpenParen),
")" => self.advance_and(TokenKind::CloseParen),
"{" => self.advance_and(TokenKind::OpenBrace),
"}" => self.advance_and(TokenKind::CloseBrace),
"[" => self.advance_and(TokenKind::OpenBracket),
"]" => self.advance_and(TokenKind::CloseBracket),
"#" if self.peek() == Some("{") => {
self.advance(); // '#'
self.advance(); // '{'
TokenKind::OpenHashBrace
}
'0' if matches!(self.peek(1), Some('b') | Some('o') | Some('x')) => {
let radix = match self.peek(1) {
Some('b') => 2,
Some('o') => 8,
Some('x') => 16,
"0" if matches!(self.peek(), Some("b") | Some("o") | Some("x")) => {
let radix = match self.peek() {
Some("b") => 2,
Some("o") => 8,
Some("x") => 16,
_ => unreachable!(),
};
@ -155,15 +165,15 @@ impl<'lexer> Lexer<'lexer> {
self.integer_literal(word, span, radix)?
}
'0'..='9' => self.numeric_literal(span.clone())?,
'+' | '-' if matches!(self.peek(1), Some('0'..='9')) => {
c if is_ascii_digit(c) => self.numeric_literal(span.clone())?,
"+" | "-" if self.peek().is_some_and(|c| is_ascii_digit(c)) => {
self.numeric_literal(span.clone())?
}
'\'' => self.char_literal(span.clone())?,
'"' => self.string_literal(span.clone())?,
"'" => self.char_literal(span.clone())?,
"\"" => self.string_literal(span.clone())?,
':' => {
":" => {
self.advance(); // ':'
TokenKind::Keyword(Symbol::from(self.read_word()))
@ -189,18 +199,18 @@ impl<'lexer> Lexer<'lexer> {
fn line_comment(&mut self) -> TokenKind {
// Line comments may start with any number of semicolons, so consume however
// many are present at the beginning of the comment:
while self.current().is_some_and(|c| c == ';') {
while self.current().is_some_and(|c| c == ";") {
self.advance();
}
// Line comments continue until a newline character is encountered:
let mut comment = String::new();
while let Some(c) = self.advance() {
if c == '\n' {
if c == "\n" {
break;
}
comment.push(c);
comment.push_str(c);
}
TokenKind::LineComment(comment.trim().into())
@ -213,16 +223,20 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '|'
let mut comment = String::new();
let mut pipe_found = false;
while let Some(c) = self.advance() {
if c == '|' && matches!(self.peek(0), Some('#')) {
self.advance(); // '#'
if pipe_found && c == "#" {
break;
}
comment.push(c);
comment.push_str(c);
pipe_found = c == "|";
}
TokenKind::BlockComment(comment.trim().into())
let comment = comment.trim_end_matches('|').trim();
TokenKind::BlockComment(comment.into())
}
fn float_literal(&self, word: String, span: Span) -> Result<TokenKind, LexerError> {
@ -269,13 +283,15 @@ impl<'lexer> Lexer<'lexer> {
self.advance(); // '\''
let c = match self.advance() {
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
Some("\\") => match self.advance() {
Some(c @ ("\"" | "\\")) => c,
Some("n") => "\n",
Some("r") => "\r",
Some("t") => "\t",
Some("e") => "\x1b",
Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
@ -298,7 +314,9 @@ impl<'lexer> Lexer<'lexer> {
}
};
if self.advance() != Some('\'') {
let c = c.to_string();
if self.advance() != Some("'") {
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::UnclosedChar,
@ -317,15 +335,17 @@ impl<'lexer> Lexer<'lexer> {
loop {
let ch_span = self.span();
string.push(match self.advance() {
Some('"') => break,
Some('\\') => match self.advance() {
Some(c @ ('"' | '\\')) => c,
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('e') => '\x1b',
string.push_str(match self.advance() {
Some("\"") => break,
Some("\\") => match self.advance() {
Some(c @ ("\"" | "\\")) => c,
Some("n") => "\n",
Some("r") => "\r",
Some("t") => "\t",
Some("e") => "\x1b",
Some(c) => {
let c = c.to_string();
self.read_word(); // Recover from the error
return Err(LexerError::new(
LexerErrorKind::InvalidEscape(c),
@ -343,7 +363,7 @@ impl<'lexer> Lexer<'lexer> {
if self
.current()
.is_some_and(|c| !is_separator(c, self.peek(1)))
.is_some_and(|c| !is_separator(c, self.peek()))
{
self.read_word(); // Recover from the error
return Err(LexerError::new(
@ -452,16 +472,20 @@ mod tests {
]);
test!(char_literal: r"'x' '\n' '\r' '\t' '\e' '\\' '\q' 'b", [
Ok(TokenKind::Char('x')),
Ok(TokenKind::Char('\n')),
Ok(TokenKind::Char('\r')),
Ok(TokenKind::Char('\t')),
Ok(TokenKind::Char('\x1b')),
Ok(TokenKind::Char('\\')),
Err(LexerErrorKind::InvalidEscape('q')),
Ok(TokenKind::Char("x".into())),
Ok(TokenKind::Char("\n".into())),
Ok(TokenKind::Char("\r".into())),
Ok(TokenKind::Char("\t".into())),
Ok(TokenKind::Char("\x1b".into())),
Ok(TokenKind::Char("\\".into())),
Err(LexerErrorKind::InvalidEscape("q".into())),
Err(LexerErrorKind::UnclosedChar),
]);
test!(char_literal_with_unicode: "'y̆'", [
Ok(TokenKind::Char("y̆".into())),
]);
test!(error_unclosed_char_escape: r"'\", [
Err(LexerErrorKind::UnclosedChar),
]);
@ -482,7 +506,7 @@ mod tests {
]);
test!(error_invalid_escape_string: "\"oh no \\p\"", [
Err(LexerErrorKind::InvalidEscape('p')),
Err(LexerErrorKind::InvalidEscape("p".into())),
]);
test!(error_unclosed_string: "\"hiii", [
@ -513,11 +537,14 @@ mod tests {
Ok(TokenKind::CloseParen),
]);
test!(unicode_symbol: "(かわいい 🐕 😻)", [
test!(unicode_symbols: "(かわいい 🐕 😻 (föö))", [
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("かわいい"))),
Ok(TokenKind::Symbol(Symbol::from("🐕"))),
Ok(TokenKind::Symbol(Symbol::from("😻"))),
Ok(TokenKind::OpenParen),
Ok(TokenKind::Symbol(Symbol::from("föö"))),
Ok(TokenKind::CloseParen),
Ok(TokenKind::CloseParen),
]);
}

View File

@ -27,7 +27,7 @@ pub(crate) enum TokenKind {
/// Boolean, e.g. `true`, `false`
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`

View File

@ -117,7 +117,7 @@ pub(crate) enum Atom {
/// Boolean, e.g. `true`, `false`
Bool(bool),
/// Character, e.g. `'c'`, `'\n'`
Char(char),
Char(String),
/// Floating-point number, e.g. `-1.0`, `2.0`, `+0.003`
Float(f64),
/// Integer, e.g. `0`, `-1`, `+200`

View File

@ -258,9 +258,9 @@ mod tests {
test!(vector: "['a' 'b' 'c']", src => Ok(Ast::from(vec![
Node::new(
Expr::Vector(vec![
Node::new(Expr::Atom(Atom::Char('a')), Span::new(1..4, src.clone())),
Node::new(Expr::Atom(Atom::Char('b')), Span::new(5..8, src.clone())),
Node::new(Expr::Atom(Atom::Char('c')), Span::new(9..12, src.clone())),
Node::new(Expr::Atom(Atom::Char("a".into())), Span::new(1..4, src.clone())),
Node::new(Expr::Atom(Atom::Char("b".into())), Span::new(5..8, src.clone())),
Node::new(Expr::Atom(Atom::Char("c".into())), Span::new(9..12, src.clone())),
]),
Span::new(0..13, src),
)