More lexer refactoring; support named character literals (e.g. newline, tab)

2025-01-31 17:03:21 +01:00 · 2025-01-31 17:03:21 +01:00 · ece6645e50
commit ece6645e50
parent 2d95a58ce7
1 changed file with 47 additions and 31 deletions
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@ -108,6 +108,11 @@ impl<'a> Lexer<'a> {
        self.chars.peek()
    }

+    #[inline]
+    fn peek_some_not_separator(&mut self) -> bool {
+        self.peek().is_some_and(|c| !is_separator(c))
+    }
+
    fn advance(&mut self) -> Option<char> {
        self.chars.next().inspect(|c| {
            self.cursor += c.len_utf8();
@ -137,20 +142,29 @@ impl<'a> Lexer<'a> {
    fn read_char(&mut self) -> Result<TokenKind, LexerError> {
        // NOTE: We have already consumed the initial '\' when this function is invoked

-        let c = if self.peek().is_some_and(|c| !is_separator(c)) {
-            self.advance().unwrap() // SAFETY: This will never panic
-        } else {
+        if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) {
            return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
-        };
-
-        match c {
-            'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
-            'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
-            _ if self.peek().is_some_and(|c| !is_separator(c)) => {
-                Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()))
-            }
-            _ => Ok(TokenKind::Char),
        }
+
+        match self.advance() {
+            Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(),
+            Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(),
+            _ => {
+                if self.peek_some_not_separator() {
+                    // Consume the remainder of the literal:
+                    self.read_word();
+
+                    if !matches!(
+                        &self.slice()[1..], // Ignore the leading '\'
+                        b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab"
+                    ) {
+                        return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
+            }
+                }
+
+                Ok(TokenKind::Char)
+        }
+    }
    }

    fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
@ -174,7 +188,7 @@ impl<'a> Lexer<'a> {

        // We should be at the end of the literal now, i.e. next char should be a
        // separator:
-        if self.peek().is_some_and(|c| !is_separator(c)) {
+        if self.peek_some_not_separator() {
            self.read_word(); // Recover
            return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
        }
@ -187,7 +201,7 @@ impl<'a> Lexer<'a> {

        // Expect between 1 and 6 hexadecimal digits:
        let mut count = 0;
-        while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
+        while self.peek_some_not_separator() && count < 6 {
            match self.advance() {
                Some(c) if c.is_ascii_hexdigit() => count += 1,
                _ => {
@ -199,7 +213,7 @@ impl<'a> Lexer<'a> {

        // If no hexadecimal digits were found, or digits were found but we are still
        // not at the end of the literal, then the literal is invalid:
-        if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
+        if count == 0 || self.peek_some_not_separator() {
            self.read_word(); // Recover
            return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
        }
@ -210,7 +224,7 @@ impl<'a> Lexer<'a> {
    fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
        // NOTE: We have already consumed the initial ':' when this function is invoked

-        if self.peek().is_some_and(|c| !is_separator(c)) {
+        if self.peek_some_not_separator() {
            self.read_word();
            Ok(TokenKind::Keyword)
        } else {
@ -229,7 +243,7 @@ impl<'a> Lexer<'a> {
            }
        }

-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
            match self.advance() {
                Some(c) if c.is_ascii_digit() => {}
                Some('.') => return self.complete_decimal(),
@ -250,23 +264,20 @@ impl<'a> Lexer<'a> {
        self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')

        let mut digit_found = false;
-        while let Some(c) = self.peek() {
-            match c {
-                _ if is_separator(c) => break,
-                _ if c.is_digit(radix) => {
-                    self.advance();
-                    digit_found = true;
-                }
+        while self.peek_some_not_separator() {
+            match self.advance() {
+                Some(c) if c.is_digit(radix) => digit_found = true,
                _ => {
                    self.read_word(); // Recover
                    return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
                }
-            };
+            }
        }

        if !digit_found {
            self.read_word(); // Recover
-            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
+            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
+                .with_context(|| "no valid digits found following the integer base prefix"));
        }

        Ok(TokenKind::Integer)
@ -280,7 +291,7 @@ impl<'a> Lexer<'a> {
        let mut exp_found = false;
        let mut sign_found = false;

-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
            match self.advance() {
                Some(c) if c.is_ascii_digit() => digit_found = true,
                Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
@ -303,7 +314,7 @@ impl<'a> Lexer<'a> {
        let mut sign_found = false;
        let mut digit_found = false;

-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
            match self.advance() {
                Some(c) if c.is_ascii_digit() => digit_found = true,
                Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
@ -317,7 +328,8 @@ impl<'a> Lexer<'a> {

        if !digit_found {
            self.read_word(); // Recover
-            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
+            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
+                .with_context(|| "no digits found in the ratio's denominator"));
        }

        Ok(TokenKind::Ratio)
@ -467,7 +479,7 @@ mod tests {
        (Ok(TokenKind::Comment), 3..4, ";"),
    ]);

-    test!(char: r"\a \? \7 \λ \\ \u \x" => [
+    test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [
        (Ok(TokenKind::Char), 0..2, r"\a"),
        (Ok(TokenKind::Whitespace), 2..3, " "),
        (Ok(TokenKind::Char), 3..5, r"\?"),
@ -481,9 +493,11 @@ mod tests {
        (Ok(TokenKind::Char), 16..18, r"\u"),
        (Ok(TokenKind::Whitespace), 18..19, " "),
        (Ok(TokenKind::Char), 19..21, r"\x"),
+        (Ok(TokenKind::Whitespace), 21..22, " "),
+        (Ok(TokenKind::Char), 22..30, r"\newline"),
    ]);

-    test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
+    test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [
        (Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
        (Ok(TokenKind::Whitespace), 1..2, " "),
        (Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
@ -495,6 +509,8 @@ mod tests {
        (Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
        (Ok(TokenKind::Whitespace), 19..20, " "),
        (Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
+        (Ok(TokenKind::Whitespace), 29..30, " "),
+        (Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"),
    ]);

    test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [