diff --git a/onihime/src/lexer/mod.rs b/onihime/src/lexer/mod.rs
index 758d6a5..a4b814d 100644
--- a/onihime/src/lexer/mod.rs
+++ b/onihime/src/lexer/mod.rs
@@ -108,6 +108,11 @@ impl<'a> Lexer<'a> {
         self.chars.peek()
     }
 
+    #[inline]
+    fn peek_some_not_separator(&mut self) -> bool {
+        self.peek().is_some_and(|c| !is_separator(c))
+    }
+
     fn advance(&mut self) -> Option<char> {
         self.chars.next().inspect(|c| {
             self.cursor += c.len_utf8();
@@ -137,20 +142,29 @@ impl<'a> Lexer<'a> {
 
     fn read_char(&mut self) -> Result<TokenKind, LexerError> {
         // NOTE: We have already consumed the initial '\' when this function is invoked
-        let c = if self.peek().is_some_and(|c| !is_separator(c)) {
-            self.advance().unwrap() // SAFETY: This will never panic
-        } else {
+        if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) {
             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
-        };
-
-        match c {
-            'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
-            'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
-            _ if self.peek().is_some_and(|c| !is_separator(c)) => {
-                Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()))
-            }
-            _ => Ok(TokenKind::Char),
         }
+
+        match self.advance() {
+            Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(),
+            Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(),
+            _ => {
+                if self.peek_some_not_separator() {
+                    // Consume the remainder of the literal:
+                    self.read_word();
+
+                    if !matches!(
+                        &self.slice()[1..], // Ignore the leading '\'
+                        b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab"
+                    ) {
+                        return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
+                    }
+                }
+
+                Ok(TokenKind::Char)
+            }
+        }
     }
 
     fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
@@ -174,7 +188,7 @@ impl<'a> Lexer<'a> {
 
         // We should be at the end of the literal now, i.e. next char should be a
         // separator:
-        if self.peek().is_some_and(|c| !is_separator(c)) {
+        if self.peek_some_not_separator() {
             self.read_word(); // Recover
             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
         }
@@ -187,7 +201,7 @@ impl<'a> Lexer<'a> {
         // Expect between 1 and 6 hexadecimal digits:
         let mut count = 0;
 
-        while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
+        while self.peek_some_not_separator() && count < 6 {
             match self.advance() {
                 Some(c) if c.is_ascii_hexdigit() => count += 1,
                 _ => {
@@ -199,7 +213,7 @@ impl<'a> Lexer<'a> {
 
         // If no hexadecimal digits were found, or digits were found but we are still
         // not at the end of the literal, then the literal is invalid:
-        if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
+        if count == 0 || self.peek_some_not_separator() {
             self.read_word(); // Recover
             return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
         }
@@ -210,7 +224,7 @@ impl<'a> Lexer<'a> {
 
     fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
         // NOTE: We have already consumed the initial ':' when this function is invoked
-        if self.peek().is_some_and(|c| !is_separator(c)) {
+        if self.peek_some_not_separator() {
             self.read_word();
             Ok(TokenKind::Keyword)
         } else {
@@ -229,7 +243,7 @@ impl<'a> Lexer<'a> {
             }
         }
 
-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
             match self.advance() {
                 Some(c) if c.is_ascii_digit() => {}
                 Some('.') => return self.complete_decimal(),
@@ -250,23 +264,20 @@ impl<'a> Lexer<'a> {
         self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
         let mut digit_found = false;
 
-        while let Some(c) = self.peek() {
-            match c {
-                _ if is_separator(c) => break,
-                _ if c.is_digit(radix) => {
-                    self.advance();
-                    digit_found = true;
-                }
+        while self.peek_some_not_separator() {
+            match self.advance() {
+                Some(c) if c.is_digit(radix) => digit_found = true,
                 _ => {
                     self.read_word(); // Recover
                     return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
                 }
-            };
+            }
         }
 
         if !digit_found {
             self.read_word(); // Recover
-            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
+            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
+                .with_context(|| "no valid digits found following the integer base prefix"));
         }
 
         Ok(TokenKind::Integer)
@@ -280,7 +291,7 @@ impl<'a> Lexer<'a> {
         let mut exp_found = false;
         let mut sign_found = false;
 
-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
             match self.advance() {
                 Some(c) if c.is_ascii_digit() => digit_found = true,
                 Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
@@ -303,7 +314,7 @@ impl<'a> Lexer<'a> {
         let mut sign_found = false;
         let mut digit_found = false;
 
-        while self.peek().is_some_and(|c| !is_separator(c)) {
+        while self.peek_some_not_separator() {
             match self.advance() {
                 Some(c) if c.is_ascii_digit() => digit_found = true,
                 Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
@@ -317,7 +328,8 @@ impl<'a> Lexer<'a> {
 
         if !digit_found {
             self.read_word(); // Recover
-            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
+            return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
+                .with_context(|| "no digits found in the ratio's denominator"));
         }
 
         Ok(TokenKind::Ratio)
@@ -467,7 +479,7 @@ mod tests {
         (Ok(TokenKind::Comment), 3..4, ";"),
     ]);
 
-    test!(char: r"\a \? \7 \λ \\ \u \x" => [
+    test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [
         (Ok(TokenKind::Char), 0..2, r"\a"),
         (Ok(TokenKind::Whitespace), 2..3, " "),
         (Ok(TokenKind::Char), 3..5, r"\?"),
@@ -481,9 +493,11 @@ mod tests {
         (Ok(TokenKind::Char), 16..18, r"\u"),
         (Ok(TokenKind::Whitespace), 18..19, " "),
         (Ok(TokenKind::Char), 19..21, r"\x"),
+        (Ok(TokenKind::Whitespace), 21..22, " "),
+        (Ok(TokenKind::Char), 22..30, r"\newline"),
     ]);
 
-    test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
+    test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [
         (Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
         (Ok(TokenKind::Whitespace), 1..2, " "),
         (Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
@@ -495,6 +509,8 @@ mod tests {
         (Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
         (Ok(TokenKind::Whitespace), 19..20, " "),
         (Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
+        (Ok(TokenKind::Whitespace), 29..30, " "),
+        (Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"),
     ]);
 
     test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [