More lexer refactoring; add support for named character literals (e.g. newline, tab).
This commit is contained in:
parent
2d95a58ce7
commit
ece6645e50
@ -108,6 +108,11 @@ impl<'a> Lexer<'a> {
|
|||||||
self.chars.peek()
|
self.chars.peek()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn peek_some_not_separator(&mut self) -> bool {
|
||||||
|
self.peek().is_some_and(|c| !is_separator(c))
|
||||||
|
}
|
||||||
|
|
||||||
fn advance(&mut self) -> Option<char> {
|
fn advance(&mut self) -> Option<char> {
|
||||||
self.chars.next().inspect(|c| {
|
self.chars.next().inspect(|c| {
|
||||||
self.cursor += c.len_utf8();
|
self.cursor += c.len_utf8();
|
||||||
@ -137,20 +142,29 @@ impl<'a> Lexer<'a> {
|
|||||||
fn read_char(&mut self) -> Result<TokenKind, LexerError> {
|
fn read_char(&mut self) -> Result<TokenKind, LexerError> {
|
||||||
// NOTE: We have already consumed the initial '\' when this function is invoked
|
// NOTE: We have already consumed the initial '\' when this function is invoked
|
||||||
|
|
||||||
let c = if self.peek().is_some_and(|c| !is_separator(c)) {
|
if self.peek().is_none() || self.peek().is_some_and(|c| is_separator(c)) {
|
||||||
self.advance().unwrap() // SAFETY: This will never panic
|
|
||||||
} else {
|
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||||
};
|
|
||||||
|
|
||||||
match c {
|
|
||||||
'u' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_unicode_escape(),
|
|
||||||
'x' if self.peek().is_some_and(|c| !is_separator(c)) => self.complete_ascii_escape(),
|
|
||||||
_ if self.peek().is_some_and(|c| !is_separator(c)) => {
|
|
||||||
Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()))
|
|
||||||
}
|
|
||||||
_ => Ok(TokenKind::Char),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
match self.advance() {
|
||||||
|
Some('u') if self.peek_some_not_separator() => self.complete_unicode_escape(),
|
||||||
|
Some('x') if self.peek_some_not_separator() => self.complete_ascii_escape(),
|
||||||
|
_ => {
|
||||||
|
if self.peek_some_not_separator() {
|
||||||
|
// Consume the remainder of the literal:
|
||||||
|
self.read_word();
|
||||||
|
|
||||||
|
if !matches!(
|
||||||
|
&self.slice()[1..], // Ignore the leading '\'
|
||||||
|
b"backspace" | b"formfeed" | b"newline" | b"return" | b"space" | b"tab"
|
||||||
|
) {
|
||||||
|
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(TokenKind::Char)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
|
fn complete_ascii_escape(&mut self) -> Result<TokenKind, LexerError> {
|
||||||
@ -174,7 +188,7 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
// We should be at the end of the literal now, i.e. next char should be a
|
// We should be at the end of the literal now, i.e. next char should be a
|
||||||
// separator:
|
// separator:
|
||||||
if self.peek().is_some_and(|c| !is_separator(c)) {
|
if self.peek_some_not_separator() {
|
||||||
self.read_word(); // Recover
|
self.read_word(); // Recover
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||||
}
|
}
|
||||||
@ -187,7 +201,7 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
// Expect between 1 and 6 hexadecimal digits:
|
// Expect between 1 and 6 hexadecimal digits:
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
while self.peek().is_some_and(|c| !is_separator(c)) && count < 6 {
|
while self.peek_some_not_separator() && count < 6 {
|
||||||
match self.advance() {
|
match self.advance() {
|
||||||
Some(c) if c.is_ascii_hexdigit() => count += 1,
|
Some(c) if c.is_ascii_hexdigit() => count += 1,
|
||||||
_ => {
|
_ => {
|
||||||
@ -199,7 +213,7 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
// If no hexadecimal digits were found, or digits were found but we are still
|
// If no hexadecimal digits were found, or digits were found but we are still
|
||||||
// not at the end of the literal, then the literal is invalid:
|
// not at the end of the literal, then the literal is invalid:
|
||||||
if count == 0 || self.peek().is_some_and(|c| !is_separator(c)) {
|
if count == 0 || self.peek_some_not_separator() {
|
||||||
self.read_word(); // Recover
|
self.read_word(); // Recover
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidChar, self.span()));
|
||||||
}
|
}
|
||||||
@ -210,7 +224,7 @@ impl<'a> Lexer<'a> {
|
|||||||
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
|
fn read_keyword(&mut self) -> Result<TokenKind, LexerError> {
|
||||||
// NOTE: We have already consumed the initial ':' when this function is invoked
|
// NOTE: We have already consumed the initial ':' when this function is invoked
|
||||||
|
|
||||||
if self.peek().is_some_and(|c| !is_separator(c)) {
|
if self.peek_some_not_separator() {
|
||||||
self.read_word();
|
self.read_word();
|
||||||
Ok(TokenKind::Keyword)
|
Ok(TokenKind::Keyword)
|
||||||
} else {
|
} else {
|
||||||
@ -229,7 +243,7 @@ impl<'a> Lexer<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
while self.peek_some_not_separator() {
|
||||||
match self.advance() {
|
match self.advance() {
|
||||||
Some(c) if c.is_ascii_digit() => {}
|
Some(c) if c.is_ascii_digit() => {}
|
||||||
Some('.') => return self.complete_decimal(),
|
Some('.') => return self.complete_decimal(),
|
||||||
@ -250,23 +264,20 @@ impl<'a> Lexer<'a> {
|
|||||||
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
|
self.advance(); // Base prefix (i.e. 'b'/'B', 'o'/'O', 'x'/'X')
|
||||||
|
|
||||||
let mut digit_found = false;
|
let mut digit_found = false;
|
||||||
while let Some(c) = self.peek() {
|
while self.peek_some_not_separator() {
|
||||||
match c {
|
match self.advance() {
|
||||||
_ if is_separator(c) => break,
|
Some(c) if c.is_digit(radix) => digit_found = true,
|
||||||
_ if c.is_digit(radix) => {
|
|
||||||
self.advance();
|
|
||||||
digit_found = true;
|
|
||||||
}
|
|
||||||
_ => {
|
_ => {
|
||||||
self.read_word(); // Recover
|
self.read_word(); // Recover
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !digit_found {
|
if !digit_found {
|
||||||
self.read_word(); // Recover
|
self.read_word(); // Recover
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
|
||||||
|
.with_context(|| "no valid digits found following the integer base prefix"));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(TokenKind::Integer)
|
Ok(TokenKind::Integer)
|
||||||
@ -280,7 +291,7 @@ impl<'a> Lexer<'a> {
|
|||||||
let mut exp_found = false;
|
let mut exp_found = false;
|
||||||
let mut sign_found = false;
|
let mut sign_found = false;
|
||||||
|
|
||||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
while self.peek_some_not_separator() {
|
||||||
match self.advance() {
|
match self.advance() {
|
||||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||||
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
|
Some('e') | Some('E') if digit_found && !exp_found => exp_found = true,
|
||||||
@ -303,7 +314,7 @@ impl<'a> Lexer<'a> {
|
|||||||
let mut sign_found = false;
|
let mut sign_found = false;
|
||||||
let mut digit_found = false;
|
let mut digit_found = false;
|
||||||
|
|
||||||
while self.peek().is_some_and(|c| !is_separator(c)) {
|
while self.peek_some_not_separator() {
|
||||||
match self.advance() {
|
match self.advance() {
|
||||||
Some(c) if c.is_ascii_digit() => digit_found = true,
|
Some(c) if c.is_ascii_digit() => digit_found = true,
|
||||||
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
|
Some('+') | Some('-') if !digit_found && !sign_found => sign_found = true,
|
||||||
@ -317,7 +328,8 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
if !digit_found {
|
if !digit_found {
|
||||||
self.read_word(); // Recover
|
self.read_word(); // Recover
|
||||||
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span()));
|
return Err(LexerError::new(LexerErrorKind::InvalidNumber, self.span())
|
||||||
|
.with_context(|| "no digits found in the ratio's denominator"));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(TokenKind::Ratio)
|
Ok(TokenKind::Ratio)
|
||||||
@ -467,7 +479,7 @@ mod tests {
|
|||||||
(Ok(TokenKind::Comment), 3..4, ";"),
|
(Ok(TokenKind::Comment), 3..4, ";"),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(char: r"\a \? \7 \λ \\ \u \x" => [
|
test!(char: r"\a \? \7 \λ \\ \u \x \newline" => [
|
||||||
(Ok(TokenKind::Char), 0..2, r"\a"),
|
(Ok(TokenKind::Char), 0..2, r"\a"),
|
||||||
(Ok(TokenKind::Whitespace), 2..3, " "),
|
(Ok(TokenKind::Whitespace), 2..3, " "),
|
||||||
(Ok(TokenKind::Char), 3..5, r"\?"),
|
(Ok(TokenKind::Char), 3..5, r"\?"),
|
||||||
@ -481,9 +493,11 @@ mod tests {
|
|||||||
(Ok(TokenKind::Char), 16..18, r"\u"),
|
(Ok(TokenKind::Char), 16..18, r"\u"),
|
||||||
(Ok(TokenKind::Whitespace), 18..19, " "),
|
(Ok(TokenKind::Whitespace), 18..19, " "),
|
||||||
(Ok(TokenKind::Char), 19..21, r"\x"),
|
(Ok(TokenKind::Char), 19..21, r"\x"),
|
||||||
|
(Ok(TokenKind::Whitespace), 21..22, " "),
|
||||||
|
(Ok(TokenKind::Char), 22..30, r"\newline"),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222" => [
|
test!(err_invalid_char: r"\ \xF \x0 \x111 \uG \u2222222 \foobar" => [
|
||||||
(Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
|
(Err(LexerErrorKind::InvalidChar), 0..1, r"\"),
|
||||||
(Ok(TokenKind::Whitespace), 1..2, " "),
|
(Ok(TokenKind::Whitespace), 1..2, " "),
|
||||||
(Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
|
(Err(LexerErrorKind::InvalidChar), 2..5, r"\xF"),
|
||||||
@ -495,6 +509,8 @@ mod tests {
|
|||||||
(Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
|
(Err(LexerErrorKind::InvalidChar), 16..19, r"\uG"),
|
||||||
(Ok(TokenKind::Whitespace), 19..20, " "),
|
(Ok(TokenKind::Whitespace), 19..20, " "),
|
||||||
(Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
|
(Err(LexerErrorKind::InvalidChar), 20..29, r"\u2222222"),
|
||||||
|
(Ok(TokenKind::Whitespace), 29..30, " "),
|
||||||
|
(Err(LexerErrorKind::InvalidChar), 30..37, r"\foobar"),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [
|
test!(err_invalid_integer: "0b012 0o8 0xFG 1N 0x" => [
|
||||||
|
Loading…
Reference in New Issue
Block a user