I'm working through Crafting Interpreters and trying to apply what I've learned in Rust. I've got the lexer working in what feels like a reasonably idiomatic way (hooray!), but I have a feeling it could be simplified.
The main part I'm unsure about is the code for reading strings, numbers and identifiers - it feels too complex, but I wasn't able to figure out a better way of doing it.
Any other feedback would be appreciated! The full code can be found on GitHub, but there's not much there other than the lexer yet.
pub mod tokens; use std::str::CharIndices; use self::tokens::Token; #[inline] fn is_id_start(ch: char) -> bool { ch == '_' || ch.is_ascii_alphabetic() } #[inline] fn is_id_continue(ch: char) -> bool { ch == '_' || ch.is_ascii_digit() } pub type Location = usize; #[derive(Debug, Fail, PartialEq)] pub enum LexicalError { #[fail(display = "Invalid character '{}' found at {}", ch, location)] InvalidCharacter { ch: char, location: Location }, #[fail(display = "String starting at {} was not terminated", location)] UnterminatedString { location: Location }, } pub type SpanResult<'input> = Result<(Location, Token<'input>, Location), LexicalError>; pub struct Lexer<'input> { source: &'input str, chars: CharIndices<'input>, lookahead: Option<(usize, char)>, lookahead2: Option<(usize, char)>, } impl<'input> Lexer<'input> { pub fn new(source: &'input str) -> Lexer<'input> { let mut chars = source.char_indices(); let lookahead = chars.next(); let lookahead2 = chars.next(); Lexer { source, chars, lookahead, lookahead2, } } fn bump(&mut self) -> Option<(usize, char)> { let next = self.lookahead; self.lookahead = self.lookahead2; self.lookahead2 = self.chars.next(); next } fn take_until<F>(&mut self, mut terminate: F) -> Option<usize> where F: FnMut(char) -> bool, { while let Some((i, ch)) = self.lookahead { if terminate(ch) { return Some(i); } else { self.bump(); } } None } fn take_while<F>(&mut self, mut condition: F) -> Option<usize> where F: FnMut(char) -> bool, { self.take_until(|ch| !condition(ch)) } fn skip_to_line_end(&mut self) { self.take_while(|ch| ch != '\n'); } fn skip_whitespace(&mut self) { self.take_while(|ch| ch.is_whitespace()); } fn read_string(&mut self, pos: usize) -> SpanResult<'input> { match self.take_until(|ch| ch == '"') { Some(i) => { self.bump(); Ok((pos, Token::String(&self.source[pos + 1..i]), i + 1)) } None => Err(LexicalError::UnterminatedString { location: pos }), } } fn read_number(&mut self, pos: usize) -> SpanResult<'input> { let mut end = self.take_while(|ch| ch.is_ascii_digit()); if let Some((_, '.')) = self.lookahead { // Check if it's a decimal or a field access if let Some((_, next_ch)) = self.lookahead2 { if next_ch.is_ascii_digit() { self.bump(); end = self.take_while(|ch| ch.is_ascii_digit()); } } } let end = end.unwrap_or_else(|| self.source.len()); Ok(( pos, Token::Number(self.source[pos..end].parse().expect("unparsable number")), end, )) } fn read_identifier(&mut self, pos: usize) -> SpanResult<'input> { let end = self.take_while(|ch| is_id_start(ch) || is_id_continue(ch)) .unwrap_or_else(|| self.source.len()); match &self.source[pos..end] { "else" => Ok((pos, Token::Else, end)), "false" => Ok((pos, Token::False, end)), "fn" => Ok((pos, Token::Fn, end)), "for" => Ok((pos, Token::For, end)), "if" => Ok((pos, Token::If, end)), "nil" => Ok((pos, Token::Nil, end)), "print" => Ok((pos, Token::Print, end)), "return" => Ok((pos, Token::Return, end)), "this" => Ok((pos, Token::This, end)), "true" => Ok((pos, Token::True, end)), "let" => Ok((pos, Token::Let, end)), "while" => Ok((pos, Token::While, end)), id => Ok((pos, Token::Identifier(id), end)), } } } impl<'input> Iterator for Lexer<'input> { type Item = SpanResult<'input>; fn next(&mut self) -> Option<SpanResult<'input>> { self.skip_whitespace(); if let Some((i, ch)) = self.bump() { match ch { '{' => Some(Ok((i, Token::OpenBrace, i + 1))), '}' => Some(Ok((i, Token::CloseBrace, i + 1))), '(' => Some(Ok((i, Token::OpenParen, i + 1))), ')' => Some(Ok((i, Token::CloseParen, i + 1))), '[' => Some(Ok((i, Token::OpenBracket, i + 1))), ']' => Some(Ok((i, Token::CloseBracket, i + 1))), ';' => Some(Ok((i, Token::Semicolon, i + 1))), ',' => Some(Ok((i, Token::Comma, i + 1))), '.' => Some(Ok((i, Token::Dot, i + 1))), '+' => Some(Ok((i, Token::Plus, i + 1))), '-' => Some(Ok((i, Token::Minus, i + 1))), '*' => Some(Ok((i, Token::Star, i + 1))), '/' => { if let Some((_, '/')) = self.lookahead { self.skip_to_line_end(); self.next() } else { Some(Ok((i, Token::Slash, i + 1))) } } '!' => { if let Some((_, '=')) = self.lookahead { self.bump(); Some(Ok((i, Token::NotEqual, i + 2))) } else { Some(Ok((i, Token::Not, i + 1))) } } '=' => { if let Some((_, '=')) = self.lookahead { self.bump(); Some(Ok((i, Token::EqualEqual, i + 2))) } else { Some(Ok((i, Token::Equal, i + 1))) } } '>' => { if let Some((_, '=')) = self.lookahead { self.bump(); Some(Ok((i, Token::GreaterEqual, i + 2))) } else { Some(Ok((i, Token::Greater, i + 1))) } } '<' => { if let Some((_, '=')) = self.lookahead { self.bump(); Some(Ok((i, Token::LessEqual, i + 2))) } else { Some(Ok((i, Token::Less, i + 1))) } } '&' => { if let Some((_, '&')) = self.lookahead { self.bump(); Some(Ok((i, Token::AmpAmp, i + 2))) } else { Some(Err(LexicalError::InvalidCharacter { ch, location: i })) } } '|' => { if let Some((_, '|')) = self.lookahead { self.bump(); Some(Ok((i, Token::PipePipe, i + 2))) } else { Some(Err(LexicalError::InvalidCharacter { ch, location: i })) } } '"' => Some(self.read_string(i)), ch if is_id_start(ch) => Some(self.read_identifier(i)), ch if ch.is_ascii_digit() => Some(self.read_number(i)), ch => Some(Err(LexicalError::InvalidCharacter { ch, location: i })), } } else { None } } }