More lexer refactoring; support named character literals (e.g. newline, tab).
This commit is contained in:
parent
2d95a58ce7
commit
ece6645e50
@ -108,6 +108,11 @@ impl<'a> Lexer<'a> {
|
||||
self.chars.peek()
|
||||
}
|
||||
|
||||
/// Returns `true` when `self.peek()` yields a character and `is_separator`
/// reports that character is not a separator; returns `false` when there is
/// no next character or the next character is a separator.
///
/// Calls only `self.peek()`, never `self.advance()`.
#[inline]
|
||||
fn peek_some_not_separator(&mut self) -> bool {
|
||||
self.peek().is_some_and(|c| !is_separator(c))
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> Option<char> {
|
||||
self.chars.next().inspect(|c| {
|
||||
self.cursor += c.len_utf8();
|
||||
@ -137,20 +142,29 @@ impl<'a> Lexer<'a> {
|
||||
fn read_char(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial '\' when this function is invoked
|
||||
|
||||
let c = if self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
self.advance().unwrap() // SAFETY: This will never panic
|
||||
} else {
|
||||
if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) {
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||
};
|
||||
|
||||
match c {
|
||||
'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
|
||||
'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
|
||||
_ if self.peek().is_some_and(|c| !is_separator(c)) => {
|
||||
Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()))
|
||||
}
|
||||
_ => Ok(TokenKind::Char),
|
||||
}
|
||||
|
||||
match self.advance() {
|
||||
Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(),
|
||||
Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(),
|
||||
_ => {
|
||||
if self.peek_some_not_separator() {
|
||||
// Consume the remainder of the literal:
|
||||
self.read_word();
|
||||
|
||||
if !matches!(
|
||||
&self.slice()[1..], // Ignore the leading '\'
|
||||
b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab"
|
||||
) {
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TokenKind::Char)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
|
||||
@ -174,7 +188,7 @@ impl<'a> Lexer<'a> {
|
||||
|
||||
// We should be at the end of the literal now, i.e. next char should be a
|
||||
// separator:
|
||||
if self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
if self.peek_some_not_separator() {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||
}
|
||||
@ -187,7 +201,7 @@ impl<'a> Lexer<'a> {
|
||||
|
||||
// Expect between 1 and 6 hexadecimal digits:
|
||||
let mut count = 0;
|
||||
while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
|
||||
while self.peek_some_not_separator() && count < 6 {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_hexdigit() => count += 1,
|
||||
_ => {
|
||||
@ -199,7 +213,7 @@ impl<'a> Lexer<'a> {
|
||||
|
||||
// If no hexadecimal digits were found, or digits were found but we are still
|
||||
// not at the end of the literal, then the literal is invalid:
|
||||
if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
if count == 0 || self.peek_some_not_separator() {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||
}
|
||||
@ -210,7 +224,7 @@ impl<'a> Lexer<'a> {
|
||||
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
|
||||
// NOTE: We have already consumed the initial ':' when this function is invoked
|
||||
|
||||
if self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
if self.peek_some_not_separator() {
|
||||
self.read_word();
|
||||
Ok(TokenKind::Keyword)
|
||||
} else {
|
||||
@ -229,7 +243,7 @@ impl<'a> Lexer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
while self.peek_some_not_separator() {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => {}
|
||||
Some('.') => return self.complete_decimal(),
|
||||
@ -250,23 +264,20 @@ impl<'a> Lexer<'a> {
|
||||
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
|
||||
|
||||
let mut digit_found = false;
|
||||
while let Some(c) = self.peek() {
|
||||
match c {
|
||||
_ if is_separator(c) => break,
|
||||
_ if c.is_digit(radix) => {
|
||||
self.advance();
|
||||
digit_found = true;
|
||||
}
|
||||
while self.peek_some_not_separator() {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_digit(radix) => digit_found = true,
|
||||
_ => {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if !digit_found {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
|
||||
.with_context(|| "no valid digits found following the integer base prefix"));
|
||||
}
|
||||
|
||||
Ok(TokenKind::Integer)
|
||||
@ -280,7 +291,7 @@ impl<'a> Lexer<'a> {
|
||||
let mut exp_found = false;
|
||||
let mut sign_found = false;
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
while self.peek_some_not_separator() {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
|
||||
@ -303,7 +314,7 @@ impl<'a> Lexer<'a> {
|
||||
let mut sign_found = false;
|
||||
let mut digit_found = false;
|
||||
|
||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
||||
while self.peek_some_not_separator() {
|
||||
match self.advance() {
|
||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
|
||||
@ -317,7 +328,8 @@ impl<'a> Lexer<'a> {
|
||||
|
||||
if !digit_found {
|
||||
self.read_word(); // Recover
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
|
||||
.with_context(|| "no digits found in the ratio's denominator"));
|
||||
}
|
||||
|
||||
Ok(TokenKind::Ratio)
|
||||
@ -467,7 +479,7 @@ mod tests {
|
||||
(Ok(TokenKind::Comment), 3..4, ";"),
|
||||
]);
|
||||
|
||||
test!(char: r"\a \? \7 \λ \\ \u \x" => [
|
||||
test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [
|
||||
(Ok(TokenKind::Char), 0..2, r"\a"),
|
||||
(Ok(TokenKind::Whitespace), 2..3, " "),
|
||||
(Ok(TokenKind::Char), 3..5, r"\?"),
|
||||
@ -481,9 +493,11 @@ mod tests {
|
||||
(Ok(TokenKind::Char), 16..18, r"\u"),
|
||||
(Ok(TokenKind::Whitespace), 18..19, " "),
|
||||
(Ok(TokenKind::Char), 19..21, r"\x"),
|
||||
(Ok(TokenKind::Whitespace), 21..22, " "),
|
||||
(Ok(TokenKind::Char), 22..30, r"\newline"),
|
||||
]);
|
||||
|
||||
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
|
||||
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [
|
||||
(Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
|
||||
(Ok(TokenKind::Whitespace), 1..2, " "),
|
||||
(Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
|
||||
@ -495,6 +509,8 @@ mod tests {
|
||||
(Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
|
||||
(Ok(TokenKind::Whitespace), 19..20, " "),
|
||||
(Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
|
||||
(Ok(TokenKind::Whitespace), 29..30, " "),
|
||||
(Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"),
|
||||
]);
|
||||
|
||||
test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [
|
||||
|
Loading…
Reference in New Issue
Block a user