More lexer refactoring; support named character literals (e.g. newline, tab)

This commit is contained in:
Jesse Braham 2025-01-31 17:03:21 +01:00
parent 2d95a58ce7
commit ece6645e50

View File

@ -108,6 +108,11 @@ impl<'a> Lexer<'a> {
self.chars.peek()
}
/// Returns `true` when `self.peek()` yields a character and `is_separator`
/// rejects it; returns `false` when `peek()` yields `None` or the character
/// is a separator.
///
/// Only `self.peek()` is called here; `advance()` is not, so this helper does
/// not itself move the cursor.
#[inline]
fn peek_some_not_separator(&mut self) -> bool {
self.peek().is_some_and(|c| !is_separator(c))
}
fn advance(&mut self) -> Option<char> {
self.chars.next().inspect(|c| {
self.cursor += c.len_utf8();
@ -137,20 +142,29 @@ impl<'a> Lexer<'a> {
/// Lexes a character literal and returns `TokenKind::Char` on success, or a
/// `LexerError` of kind `LexerErrorKind::InvalidChar` (built from
/// `self.span()`) when the literal is malformed.
///
/// NOTE(review): this listing appears to come from a diff view whose +/-
/// markers were lost, so the statements the commit replaced and their
/// replacements both appear below; confirm against the repository before
/// treating the body as a single compilable function.
fn read_char(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial '\' when this function is invoked
let c = if self.peek().is_some_and(|c| !is_separator(c)) {
self.advance().unwrap() // SAFETY: This will never panic
} else {
// Nothing, or only a separator, follows the '\': there is no character to
// lex, so the literal is rejected.
if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) {
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
};
match c {
'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
_ if self.peek().is_some_and(|c| !is_separator(c)) => {
Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()))
}
_ => Ok(TokenKind::Char),
}
// Consume the character right after the '\' and dispatch on it.
match self.advance() {
// 'u' / 'x' with more non-separator text behind them are handed to the
// dedicated escape handlers.
Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(),
Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(),
// Every other case, including a 'u' or 'x' that is directly followed by a
// separator or by nothing.
_ => {
// More non-separator text follows, so the literal is only valid if the
// whole word is one of the names listed below.
if self.peek_some_not_separator() {
// Consume the remainder of the literal:
self.read_word();
if !matches!(
&self.slice()[1..], // Ignore the leading '\'
b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab"
) {
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
}
}
// Reached for a single character followed by a separator/no input, or
// for an accepted name.
Ok(TokenKind::Char)
}
}
}
fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
@ -174,7 +188,7 @@ impl<'a> Lexer<'a> {
// We should be at the end of the literal now, i.e. next char should be a
// separator:
if self.peek().is_some_and(|c| !is_separator(c)) {
if self.peek_some_not_separator() {
self.read_word(); // Recover
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
}
@ -187,7 +201,7 @@ impl<'a> Lexer<'a> {
// Expect between 1 and 6 hexadecimal digits:
let mut count = 0;
while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
while self.peek_some_not_separator() && count < 6 {
match self.advance() {
Some(c) if c.is_ascii_hexdigit() => count += 1,
_ => {
@ -199,7 +213,7 @@ impl<'a> Lexer<'a> {
// If no hexadecimal digits were found, or digits were found but we are still
// not at the end of the literal, then the literal is invalid:
if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
if count == 0 || self.peek_some_not_separator() {
self.read_word(); // Recover
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
}
@ -210,7 +224,7 @@ impl<'a> Lexer<'a> {
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
// NOTE: We have already consumed the initial ':' when this function is invoked
if self.peek().is_some_and(|c| !is_separator(c)) {
if self.peek_some_not_separator() {
self.read_word();
Ok(TokenKind::Keyword)
} else {
@ -229,7 +243,7 @@ impl<'a> Lexer<'a> {
}
}
while self.peek().is_some_and(|c| !is_separator(c)) {
while self.peek_some_not_separator() {
match self.advance() {
Some(c) if c.is_ascii_digit() => {}
Some('.') => return self.complete_decimal(),
@ -250,23 +264,20 @@ impl<'a> Lexer<'a> {
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
let mut digit_found = false;
while let Some(c) = self.peek() {
match c {
_ if is_separator(c) => break,
_ if c.is_digit(radix) => {
self.advance();
digit_found = true;
}
while self.peek_some_not_separator() {
match self.advance() {
Some(c) if c.is_digit(radix) => digit_found = true,
_ => {
self.read_word(); // Recover
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
}
};
}
}
if !digit_found {
self.read_word(); // Recover
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
.with_context(|| "no valid digits found following the integer base prefix"));
}
Ok(TokenKind::Integer)
@ -280,7 +291,7 @@ impl<'a> Lexer<'a> {
let mut exp_found = false;
let mut sign_found = false;
while self.peek().is_some_and(|c| !is_separator(c)) {
while self.peek_some_not_separator() {
match self.advance() {
Some(c) if c.is_ascii_digit() => digit_found = true,
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
@ -303,7 +314,7 @@ impl<'a> Lexer<'a> {
let mut sign_found = false;
let mut digit_found = false;
while self.peek().is_some_and(|c| !is_separator(c)) {
while self.peek_some_not_separator() {
match self.advance() {
Some(c) if c.is_ascii_digit() => digit_found = true,
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
@ -317,7 +328,8 @@ impl<'a> Lexer<'a> {
if !digit_found {
self.read_word(); // Recover
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
.with_context(|| "no digits found in the ratio's denominator"));
}
Ok(TokenKind::Ratio)
@ -467,7 +479,7 @@ mod tests {
(Ok(TokenKind::Comment), 3..4, ";"),
]);
test!(char: r"\a \? \7 \λ \\ \u \x" => [
test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [
(Ok(TokenKind::Char), 0..2, r"\a"),
(Ok(TokenKind::Whitespace), 2..3, " "),
(Ok(TokenKind::Char), 3..5, r"\?"),
@ -481,9 +493,11 @@ mod tests {
(Ok(TokenKind::Char), 16..18, r"\u"),
(Ok(TokenKind::Whitespace), 18..19, " "),
(Ok(TokenKind::Char), 19..21, r"\x"),
(Ok(TokenKind::Whitespace), 21..22, " "),
(Ok(TokenKind::Char), 22..30, r"\newline"),
]);
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [
(Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
(Ok(TokenKind::Whitespace), 1..2, " "),
(Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
@ -495,6 +509,8 @@ mod tests {
(Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
(Ok(TokenKind::Whitespace), 19..20, " "),
(Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
(Ok(TokenKind::Whitespace), 29..30, " "),
(Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"),
]);
test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [