//! This module takes care of lexing python source text. //! //! This means source code is translated into separate tokens. use std::{char, cmp::Ordering, num::IntErrorKind, str::FromStr}; use unic_emoji_char::is_emoji_presentation; use unic_ucd_ident::{is_xid_continue, is_xid_start}; pub use super::token::Tok; use crate::{ ast::{FileName, Location}, error::{LexicalError, LexicalErrorType}, }; #[derive(Clone, Copy, PartialEq, Debug, Default)] struct IndentationLevel { tabs: usize, spaces: usize, } impl IndentationLevel { fn compare_strict( &self, other: &IndentationLevel, location: Location, ) -> Result { // We only know for sure that we're smaller or bigger if tabs // and spaces both differ in the same direction. Otherwise we're // dependent on the size of tabs. match self.tabs.cmp(&other.tabs) { Ordering::Less => { if self.spaces <= other.spaces { Ok(Ordering::Less) } else { Err(LexicalError { location, error: LexicalErrorType::TabError }) } } Ordering::Greater => { if self.spaces >= other.spaces { Ok(Ordering::Greater) } else { Err(LexicalError { location, error: LexicalErrorType::TabError }) } } Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)), } } } pub struct Lexer> { chars: T, at_begin_of_line: bool, nesting: usize, // Amount of parenthesis indentation_stack: Vec, pending: Vec, chr0: Option, chr1: Option, chr2: Option, location: Location, config_comment_prefix: Option<&'static str>, } pub static KEYWORDS: phf::Map<&'static str, Tok> = phf::phf_map! { // Alphabetical keywords: "..." => Tok::Ellipsis, "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, "and" => Tok::And, "as" => Tok::As, "assert" => Tok::Assert, "async" => Tok::Async, "await" => Tok::Await, "break" => Tok::Break, "class" => Tok::Class, "continue" => Tok::Continue, "def" => Tok::Def, "del" => Tok::Del, "elif" => Tok::Elif, "else" => Tok::Else, "except" => Tok::Except, "finally" => Tok::Finally, "for" => Tok::For, "from" => Tok::From, "global" => Tok::Global, "if" => Tok::If, "import" => Tok::Import, "in" => Tok::In, "is" => Tok::Is, "lambda" => Tok::Lambda, "nonlocal" => Tok::Nonlocal, "not" => Tok::Not, "or" => Tok::Or, "pass" => Tok::Pass, "raise" => Tok::Raise, "return" => Tok::Return, "try" => Tok::Try, "while" => Tok::While, "with" => Tok::With, "yield" => Tok::Yield, }; pub type Spanned = (Location, Tok, Location); pub type LexResult = Result; #[inline] pub fn make_tokenizer(source: &str, file: FileName) -> impl Iterator + '_ { make_tokenizer_located(source, Location::new(0, 0, file)) } pub fn make_tokenizer_located( source: &str, start_location: Location, ) -> impl Iterator + '_ { let nlh = NewlineHandler::new(source.chars()); Lexer::new(nlh, start_location) } // The newline handler is an iterator which collapses different newline // types into \n always. pub struct NewlineHandler> { source: T, chr0: Option, chr1: Option, } impl NewlineHandler where T: Iterator, { pub fn new(source: T) -> Self { let mut nlh = NewlineHandler { source, chr0: None, chr1: None }; nlh.shift(); nlh.shift(); nlh } fn shift(&mut self) -> Option { let result = self.chr0; self.chr0 = self.chr1; self.chr1 = self.source.next(); result } } impl Iterator for NewlineHandler where T: Iterator, { type Item = char; fn next(&mut self) -> Option { // Collapse \r\n into \n loop { if self.chr0 == Some('\r') { if self.chr1 == Some('\n') { // Transform windows EOL into \n self.shift(); } else { // Transform MAC EOL into \n self.chr0 = Some('\n'); } } else { break; } } self.shift() } } impl Lexer where T: Iterator, { pub fn new(input: T, start: Location) -> Self { let mut lxr = Lexer { chars: input, at_begin_of_line: true, nesting: 0, indentation_stack: vec![IndentationLevel::default()], pending: Vec::new(), chr0: None, location: start, chr1: None, chr2: None, config_comment_prefix: Some(" nac3:"), }; lxr.next_char(); lxr.next_char(); lxr.next_char(); // Start at top row (=1) left column (=1) lxr.location.reset(); lxr } // Lexer helper functions: fn lex_identifier(&mut self) -> LexResult { let mut name = String::new(); let start_pos = self.get_pos(); // Detect potential string like rb'' b'' f'' u'' r'' let mut saw_b = false; let mut saw_r = false; let mut saw_u = false; let mut saw_f = false; loop { // Detect r"", f"", b"" and u"" if !(saw_b || saw_u || saw_f) && matches!(self.chr0, Some('b' | 'B')) { saw_b = true; } else if !(saw_b || saw_r || saw_u || saw_f) && matches!(self.chr0, Some('u' | 'U')) { saw_u = true; } else if !(saw_r || saw_u) && (self.chr0 == Some('r') || self.chr0 == Some('R')) { saw_r = true; } else if !(saw_b || saw_u || saw_f) && (self.chr0 == Some('f') || self.chr0 == Some('F')) { saw_f = true; } else { break; } // Take up char into name: name.push(self.next_char().unwrap()); // Check if we have a string: if self.chr0 == Some('"') || self.chr0 == Some('\'') { return self.lex_string(saw_b, saw_r, saw_u, saw_f); } } while self.is_identifier_continuation() { name.push(self.next_char().unwrap()); } let end_pos = self.get_pos(); if let Some(tok) = KEYWORDS.get(name.as_str()) { Ok((start_pos, tok.clone(), end_pos)) } else { Ok((start_pos, Tok::Name { name: name.into() }, end_pos)) } } /// Numeric lexing. The feast can start! fn lex_number(&mut self) -> LexResult { let start_pos = self.get_pos(); if self.chr0 == Some('0') { if self.chr1 == Some('x') || self.chr1 == Some('X') { // Hex! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 16) } else if self.chr1 == Some('o') || self.chr1 == Some('O') { // Octal style! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 8) } else if self.chr1 == Some('b') || self.chr1 == Some('B') { // Binary! self.next_char(); self.next_char(); self.lex_number_radix(start_pos, 2) } else { self.lex_normal_number() } } else { self.lex_normal_number() } } /// Lex a hex/octal/decimal/binary number without a decimal point. fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> LexResult { let value_text = self.radix_run(radix); let end_pos = self.get_pos(); let value = match i128::from_str_radix(&value_text, radix) { Ok(value) => value, Err(e) => match e.kind() { IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => i128::MAX, _ => { return Err(LexicalError { error: LexicalErrorType::OtherError(format!("{e:?}")), location: start_pos, }) } }, }; Ok((start_pos, Tok::Int { value }, end_pos)) } /// Lex a normal number, that is, no octal, hex or binary number. fn lex_normal_number(&mut self) -> LexResult { let start_pos = self.get_pos(); let start_is_zero = self.chr0 == Some('0'); // Normal number: let mut value_text = self.radix_run(10); // If float: if self.chr0 == Some('.') || self.at_exponent() { // Take '.': if self.chr0 == Some('.') { if self.chr1 == Some('_') { return Err(LexicalError { error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()), location: self.get_pos(), }); } value_text.push(self.next_char().unwrap()); value_text.push_str(&self.radix_run(10)); } // 1e6 for example: if self.chr0 == Some('e') || self.chr0 == Some('E') { value_text.push(self.next_char().unwrap().to_ascii_lowercase()); // Optional +/- if self.chr0 == Some('-') || self.chr0 == Some('+') { value_text.push(self.next_char().unwrap()); } value_text.push_str(&self.radix_run(10)); } let value = f64::from_str(&value_text).unwrap(); // Parse trailing 'j': if self.chr0 == Some('j') || self.chr0 == Some('J') { self.next_char(); let end_pos = self.get_pos(); Ok((start_pos, Tok::Complex { real: 0.0, imag: value }, end_pos)) } else { let end_pos = self.get_pos(); Ok((start_pos, Tok::Float { value }, end_pos)) } } else { // Parse trailing 'j': if self.chr0 == Some('j') || self.chr0 == Some('J') { self.next_char(); let end_pos = self.get_pos(); let imag = f64::from_str(&value_text).unwrap(); Ok((start_pos, Tok::Complex { real: 0.0, imag }, end_pos)) } else { let end_pos = self.get_pos(); // assumption: value_text contains a valid integer. // parse should only fail because of overflow. let value = value_text.parse::().ok(); let nonzero = match value { Some(value) => value != 0i128, None => true, }; if start_is_zero && nonzero { return Err(LexicalError { error: LexicalErrorType::OtherError("Invalid Token".to_owned()), location: self.get_pos(), }); } Ok((start_pos, Tok::Int { value: value.unwrap_or(i128::MAX) }, end_pos)) } } } /// Consume a sequence of numbers with the given radix, /// the digits can be decorated with underscores /// like this: `'1_2_3_4'` == `'1234'` fn radix_run(&mut self, radix: u32) -> String { let mut value_text = String::new(); loop { if let Some(c) = self.take_number(radix) { value_text.push(c); } else if self.chr0 == Some('_') && Lexer::::is_digit_of_radix(self.chr1, radix) { self.next_char(); } else { break; } } value_text } /// Consume a single character with the given radix. fn take_number(&mut self, radix: u32) -> Option { let take_char = Lexer::::is_digit_of_radix(self.chr0, radix); if take_char { Some(self.next_char().unwrap()) } else { None } } /// Test if a digit is of a certain radix. fn is_digit_of_radix(c: Option, radix: u32) -> bool { match radix { 2 => matches!(c, Some('0'..='1')), 8 => matches!(c, Some('0'..='7')), 10 => matches!(c, Some('0'..='9')), 16 => matches!(c, Some('0'..='9' | 'a'..='f' | 'A'..='F')), other => unimplemented!("Radix not implemented: {}", other), } } /// Test if we face '[eE][-+]?[0-9]+' fn at_exponent(&self) -> bool { match self.chr0 { Some('e' | 'E') => match self.chr1 { Some('+' | '-') => matches!(self.chr2, Some('0'..='9')), Some('0'..='9') => true, _ => false, }, _ => false, } } /// Skip everything until end of line, may produce nac3 pseudocomment fn lex_comment(&mut self) -> Option { self.next_char(); // if possibly nac3 pseudocomment, special handling for `# nac3:` let (mut prefix, mut is_comment) = self.config_comment_prefix.map_or_else(|| ("".chars(), false), |v| (v.chars(), true)); // for the correct location of config comment let mut start_loc = self.location; start_loc.go_left(); loop { match self.chr0 { Some('\n') | None => return None, Some(c) => { if let (true, Some(p)) = (is_comment, prefix.next()) { is_comment = is_comment && c == p; } else { // done checking prefix, if is comment then return the spanned if is_comment { let mut content = String::new(); loop { match self.chr0 { Some('\n') | None => break, Some(c) => content.push(c), } self.next_char(); } return Some(( start_loc, Tok::ConfigComment { content: content.trim().into() }, self.location, )); } } } } self.next_char(); } } fn unicode_literal(&mut self, literal_number: usize) -> Result { let mut p: u32 = 0u32; let unicode_error = LexicalError { error: LexicalErrorType::UnicodeError, location: self.get_pos() }; for i in 1..=literal_number { match self.next_char() { Some(c) => match c.to_digit(16) { Some(d) => p += d << ((literal_number - i) * 4), None => return Err(unicode_error), }, None => return Err(unicode_error), } } match p { 0xD800..=0xDFFF => Ok(char::REPLACEMENT_CHARACTER), _ => char::from_u32(p).ok_or(unicode_error), } } fn parse_octet(&mut self, first: char) -> char { let mut octet_content = String::new(); octet_content.push(first); while octet_content.len() < 3 { if let Some('0'..='7') = self.chr0 { octet_content.push(self.next_char().unwrap()); } else { break; } } let value = u32::from_str_radix(&octet_content, 8).unwrap(); char::from_u32(value).unwrap() } fn parse_unicode_name(&mut self) -> Result { let start_pos = self.get_pos(); match self.next_char() { Some('{') => {} _ => { return Err(LexicalError { error: LexicalErrorType::StringError, location: start_pos, }) } } let start_pos = self.get_pos(); let mut name = String::new(); loop { match self.next_char() { Some('}') => break, Some(c) => name.push(c), None => { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }) } } } unicode_names2::character(&name) .ok_or(LexicalError { error: LexicalErrorType::UnicodeError, location: start_pos }) } fn lex_string( &mut self, is_bytes: bool, is_raw: bool, _is_unicode: bool, is_fstring: bool, ) -> LexResult { let quote_char = self.next_char().unwrap(); let mut string_content = String::new(); let start_pos = self.get_pos(); // If the next two characters are also the quote character, then we have a triple-quoted // string; consume those two characters and ensure that we require a triple-quote to close let triple_quoted = if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) { self.next_char(); self.next_char(); true } else { false }; loop { match self.next_char() { Some('\\') => { if self.chr0 == Some(quote_char) && !is_raw { string_content.push(quote_char); self.next_char(); } else if is_raw { string_content.push('\\'); if let Some(c) = self.next_char() { string_content.push(c); } else { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }); } } else { match self.next_char() { Some('\\') => { string_content.push('\\'); } Some('\'') => string_content.push('\''), Some('\"') => string_content.push('\"'), Some('\n') => { // Ignore Unix EOL character } Some('a') => string_content.push('\x07'), Some('b') => string_content.push('\x08'), Some('f') => string_content.push('\x0c'), Some('n') => { string_content.push('\n'); } Some('r') => string_content.push('\r'), Some('t') => { string_content.push('\t'); } Some('v') => string_content.push('\x0b'), Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)), Some('x') => string_content.push(self.unicode_literal(2)?), Some('u') if !is_bytes => string_content.push(self.unicode_literal(4)?), Some('U') if !is_bytes => string_content.push(self.unicode_literal(8)?), Some('N') if !is_bytes => { string_content.push(self.parse_unicode_name()?); } Some(c) => { string_content.push('\\'); string_content.push(c); } None => { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }); } } } } Some(c) => { if c == quote_char { if triple_quoted { // Look ahead at the next two characters; if we have two more // quote_chars, it's the end of the string; consume the remaining // closing quotes and break the loop if self.chr0 == Some(quote_char) && self.chr1 == Some(quote_char) { self.next_char(); self.next_char(); break; } string_content.push(c); } else { break; } } else { if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }); } string_content.push(c); } } None => { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }); } } } let end_pos = self.get_pos(); let tok = if is_bytes { Tok::Bytes { value: string_content.chars().map(|c| c as u8).collect() } } else { Tok::String { value: string_content, is_fstring } }; Ok((start_pos, tok, end_pos)) } fn is_identifier_start(c: char) -> bool { match c { '_' | 'a'..='z' | 'A'..='Z' => true, '+' | '-' | '*' | '/' | '=' | ' ' | '<' | '>' => false, c => is_xid_start(c), } } fn is_identifier_continuation(&self) -> bool { if let Some(c) = self.chr0 { match c { '_' | '0'..='9' | 'a'..='z' | 'A'..='Z' => true, '+' | '-' | '*' | '/' | '=' | ' ' | '<' | '>' => false, c => is_xid_continue(c), } } else { false } } /// This is the main entry point. Call this function to retrieve the next token. /// This function is used by the iterator implementation. fn inner_next(&mut self) -> LexResult { // top loop, keep on processing, until we have something pending. while self.pending.is_empty() { // Detect indentation levels if self.at_begin_of_line { self.handle_indentations()?; } self.consume_normal()?; } Ok(self.pending.remove(0)) } /// Given we are at the start of a line, count the number of spaces and/or tabs until the first character. fn eat_indentation(&mut self) -> Result<(IndentationLevel, Option), LexicalError> { // Determine indentation: let mut spaces: usize = 0; let mut tabs: usize = 0; let mut nac3comment: Option = None; loop { match self.chr0 { Some(' ') => { /* if tabs != 0 { // Don't allow spaces after tabs as part of indentation. // This is technically stricter than python3 but spaces after // tabs is even more insane than mixing spaces and tabs. return Some(Err(LexicalError { error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()), location: self.get_pos(), })); } */ self.next_char(); spaces += 1; } Some('\t') => { if spaces != 0 { // Don't allow tabs after spaces as part of indentation. // This is technically stricter than python3 but spaces before // tabs is even more insane than mixing spaces and tabs. return Err(LexicalError { error: LexicalErrorType::TabsAfterSpaces, location: self.get_pos(), }); } self.next_char(); tabs += 1; } Some('#') => { nac3comment = self.lex_comment(); // if is nac3comment, we need to add newline, so it is not begin of line // and we should break from the loop, else in the next loop it will be // regarded as a empty line if nac3comment.is_some() { self.at_begin_of_line = false; break; } spaces = 0; tabs = 0; } Some('\x0C') => { // Form feed character! // Reset indentation for the Emacs user. self.next_char(); spaces = 0; tabs = 0; } Some('\n') => { // Empty line! self.next_char(); spaces = 0; tabs = 0; } None => { spaces = 0; tabs = 0; break; } _ => { self.at_begin_of_line = false; break; } } } Ok((IndentationLevel { tabs, spaces }, nac3comment)) } fn handle_indentations(&mut self) -> Result<(), LexicalError> { let eat_result = self.eat_indentation()?; let indentation_level = eat_result.0; if self.nesting == 0 { // Determine indent or dedent: let current_indentation = self.indentation_stack.last().unwrap(); let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?; match ordering { Ordering::Equal => { // Same same } Ordering::Greater => { // New indentation level: self.indentation_stack.push(indentation_level); let tok_pos = self.get_pos(); self.emit((tok_pos, Tok::Indent, tok_pos)); } Ordering::Less => { // One or more dedentations // Pop off other levels until col is found: loop { let current_indentation = self.indentation_stack.last().unwrap(); let ordering = indentation_level .compare_strict(current_indentation, self.get_pos())?; match ordering { Ordering::Less => { self.indentation_stack.pop(); let tok_pos = self.get_pos(); self.emit((tok_pos, Tok::Dedent, tok_pos)); } Ordering::Equal => { // We arrived at proper level of indentation. break; } Ordering::Greater => { return Err(LexicalError { error: LexicalErrorType::IndentationError, location: self.get_pos(), }); } } } } } }; if let Some(comment) = eat_result.1 { self.emit(comment); } Ok(()) } /// Take a look at the next character, if any, and decide upon the next steps. fn consume_normal(&mut self) -> Result<(), LexicalError> { // Check if we have some character: if let Some(c) = self.chr0 { // First check identifier: if Self::is_identifier_start(c) { let identifier = self.lex_identifier()?; self.emit(identifier); } else if is_emoji_presentation(c) { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::Name { name: c.to_string().into() }, tok_end)); } else { self.consume_character(c)?; } } else { // We reached end of file. let tok_pos = self.get_pos(); // First of all, we need all nestings to be finished. if self.nesting > 0 { return Err(LexicalError { error: LexicalErrorType::NestingError, location: tok_pos, }); } // Next, insert a trailing newline, if required. if !self.at_begin_of_line { self.at_begin_of_line = true; self.emit((tok_pos, Tok::Newline, tok_pos)); } // Next, flush the indentation stack to zero. while self.indentation_stack.len() > 1 { self.indentation_stack.pop(); self.emit((tok_pos, Tok::Dedent, tok_pos)); } self.emit((tok_pos, Tok::EndOfFile, tok_pos)); } Ok(()) } /// Okay, we are facing a weird character, what is it? Determine that. fn consume_character(&mut self, c: char) -> Result<(), LexicalError> { match c { '0'..='9' => { let number = self.lex_number()?; self.emit(number); } '#' => { if let Some(c) = self.lex_comment() { self.emit(c); }; } '"' | '\'' => { let string = self.lex_string(false, false, false, false)?; self.emit(string); } '=' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::EqEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Equal, tok_end)); } } '+' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::PlusEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Plus, tok_end)); } } '*' => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::StarEqual, tok_end)); } Some('*') => { self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::DoubleStarEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::DoubleStar, tok_end)); } } _ => { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Star, tok_end)); } } } '/' => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::SlashEqual, tok_end)); } Some('/') => { self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::DoubleSlashEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::DoubleSlash, tok_end)); } } _ => { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Slash, tok_end)); } } } '%' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::PercentEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Percent, tok_end)); } } '|' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::VbarEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Vbar, tok_end)); } } '^' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::CircumflexEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::CircumFlex, tok_end)); } } '&' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::AmperEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Amper, tok_end)); } } '-' => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('=') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::MinusEqual, tok_end)); } Some('>') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::Rarrow, tok_end)); } _ => { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Minus, tok_end)); } } } '@' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::AtEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::At, tok_end)); } } '!' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::NotEqual, tok_end)); } else { return Err(LexicalError { error: LexicalErrorType::UnrecognizedToken { tok: '!' }, location: tok_start, }); } } '~' => { self.eat_single_char(Tok::Tilde); } '(' => { self.eat_single_char(Tok::Lpar); self.nesting += 1; } ')' => { self.eat_single_char(Tok::Rpar); if self.nesting == 0 { return Err(LexicalError { error: LexicalErrorType::NestingError, location: self.get_pos(), }); } self.nesting -= 1; } '[' => { self.eat_single_char(Tok::Lsqb); self.nesting += 1; } ']' => { self.eat_single_char(Tok::Rsqb); if self.nesting == 0 { return Err(LexicalError { error: LexicalErrorType::NestingError, location: self.get_pos(), }); } self.nesting -= 1; } '{' => { self.eat_single_char(Tok::Lbrace); self.nesting += 1; } '}' => { self.eat_single_char(Tok::Rbrace); if self.nesting == 0 { return Err(LexicalError { error: LexicalErrorType::NestingError, location: self.get_pos(), }); } self.nesting -= 1; } ':' => { let tok_start = self.get_pos(); self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::ColonEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Colon, tok_end)); } } ';' => { self.eat_single_char(Tok::Semi); } '<' => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('<') => { self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::LeftShiftEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::LeftShift, tok_end)); } } Some('=') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::LessEqual, tok_end)); } _ => { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Less, tok_end)); } } } '>' => { let tok_start = self.get_pos(); self.next_char(); match self.chr0 { Some('>') => { self.next_char(); if let Some('=') = self.chr0 { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::RightShiftEqual, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::RightShift, tok_end)); } } Some('=') => { self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::GreaterEqual, tok_end)); } _ => { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Greater, tok_end)); } } } ',' => { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::Comma, tok_end)); } '.' => { if let Some('0'..='9') = self.chr1 { let number = self.lex_number()?; self.emit(number); } else { let tok_start = self.get_pos(); self.next_char(); if let (Some('.'), Some('.')) = (&self.chr0, &self.chr1) { self.next_char(); self.next_char(); let tok_end = self.get_pos(); self.emit((tok_start, Tok::Ellipsis, tok_end)); } else { let tok_end = self.get_pos(); self.emit((tok_start, Tok::Dot, tok_end)); } } } '\n' => { let tok_start = self.get_pos(); self.next_char(); let tok_end = self.get_pos(); // Depending on the nesting level, we emit newline or not: if self.nesting == 0 { self.at_begin_of_line = true; self.emit((tok_start, Tok::Newline, tok_end)); } } ' ' | '\t' | '\x0C' => { // Skip whitespaces self.next_char(); while self.chr0 == Some(' ') || self.chr0 == Some('\t') || self.chr0 == Some('\x0C') { self.next_char(); } } '\\' => { self.next_char(); if let Some('\n') = self.chr0 { self.next_char(); } else { return Err(LexicalError { error: LexicalErrorType::LineContinuationError, location: self.get_pos(), }); } if self.chr0.is_none() { return Err(LexicalError { error: LexicalErrorType::Eof, location: self.get_pos(), }); } } _ => { let c = self.next_char(); return Err(LexicalError { error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() }, location: self.get_pos(), }); } // Ignore all the rest.. } Ok(()) } fn eat_single_char(&mut self, ty: Tok) { let tok_start = self.get_pos(); self.next_char().unwrap(); let tok_end = self.get_pos(); self.emit((tok_start, ty, tok_end)); } /// Helper function to go to the next character coming up. fn next_char(&mut self) -> Option { let c = self.chr0; let nxt = self.chars.next(); self.chr0 = self.chr1; self.chr1 = self.chr2; self.chr2 = nxt; if c == Some('\n') { self.location.newline(); } else { self.location.go_right(); } c } /// Helper function to retrieve the current position. fn get_pos(&self) -> Location { self.location } /// Helper function to emit a lexed token to the queue of tokens. fn emit(&mut self, spanned: Spanned) { self.pending.push(spanned); } } /* Implement iterator pattern for the get_tok function. Calling the next element in the iterator will yield the next lexical token. */ impl Iterator for Lexer where T: Iterator, { type Item = LexResult; fn next(&mut self) -> Option { // Idea: create some sort of hash map for single char tokens: // let mut X = HashMap::new(); // X.insert('=', Tok::Equal); let token = self.inner_next(); trace!( "Lex token {:?}, nesting={:?}, indent stack: {:?}", token, self.nesting, self.indentation_stack ); match token { Ok((_, Tok::EndOfFile, _)) => None, r => Some(r), } } } #[cfg(test)] mod tests { use super::{make_tokenizer, NewlineHandler, Tok}; use nac3ast::FileName; const WINDOWS_EOL: &str = "\r\n"; const MAC_EOL: &str = "\r"; const UNIX_EOL: &str = "\n"; pub fn lex_source(source: &str) -> Vec { let lexer = make_tokenizer(source, FileName::default()); lexer.map(|x| x.unwrap().1).collect() } #[test] fn test_nac3comment() { let src = "\ a: int32 # nac3: b: int64"; let tokens = lex_source(src); assert_eq!( tokens, vec![ Tok::Name { name: "a".into() }, Tok::Colon, Tok::Name { name: "int32".into() }, Tok::Newline, Tok::ConfigComment { content: "".into() }, Tok::Newline, Tok::Name { name: "b".into() }, Tok::Colon, Tok::Name { name: "int64".into() }, Tok::Newline, ] ); } #[test] fn test_class_lex_with_nac3comment() { use Tok::*; let source = "\ class Foo(A, B): # normal comment # nac3: no indent # nac3: correct indent b: int32 a: int32 # nac3: no need indent def __init__(self): pass"; let tokens = lex_source(source); assert_eq!( tokens, vec![ Class, Name { name: "Foo".into() }, Lpar, Name { name: "A".into() }, Comma, Name { name: "B".into() }, Rpar, Colon, Newline, ConfigComment { content: "no indent".into() }, Newline, Indent, ConfigComment { content: "correct indent".into() }, Newline, Name { name: "b".into() }, Colon, Name { name: "int32".into() }, Newline, Name { name: "a".into() }, Colon, Name { name: "int32".into() }, ConfigComment { content: "no need indent".into() }, Newline, Def, Name { name: "__init__".into() }, Lpar, Name { name: "self".into() }, Rpar, Colon, Newline, Indent, Pass, Newline, Dedent, Dedent ] ); } #[test] fn test_newline_processor() { // Escape \ followed by \n (by removal): let src = "b\\\r\n"; assert_eq!(4, src.len()); let nlh = NewlineHandler::new(src.chars()); let x: Vec = nlh.collect(); assert_eq!(vec!['b', '\\', '\n'], x); } #[test] fn test_raw_string() { let source = "r\"\\\\\" \"\\\\\""; let tokens = lex_source(source); assert_eq!( tokens, vec![ Tok::String { value: "\\\\".to_owned(), is_fstring: false }, Tok::String { value: "\\".to_owned(), is_fstring: false }, Tok::Newline, ] ); } #[test] fn test_numbers() { let source = "0x2f 0b1101 0 123 0.2 2j 2.2j"; let tokens = lex_source(source); assert_eq!( tokens, vec![ Tok::Int { value: 47i128 }, Tok::Int { value: 13i128 }, Tok::Int { value: 0i128 }, Tok::Int { value: 123i128 }, Tok::Float { value: 0.2 }, Tok::Complex { real: 0.0, imag: 2.0 }, Tok::Complex { real: 0.0, imag: 2.2 }, Tok::Newline, ] ); } macro_rules! test_line_comment { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!(r"99232 # {}", $eol); let tokens = lex_source(&source); assert_eq!(tokens, vec![Tok::Int { value: 99232i128 }, Tok::Newline]); } )* } } test_line_comment! { test_line_comment_long: " foo", test_line_comment_whitespace: " ", test_line_comment_single_whitespace: " ", test_line_comment_empty: "", } macro_rules! test_comment_until_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("123 # Foo{}456", $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Int { value: 123i128 }, Tok::Newline, Tok::Int { value: 456i128 }, Tok::Newline, ] ) } )* } } test_comment_until_eol! { test_comment_until_windows_eol: WINDOWS_EOL, test_comment_until_mac_eol: MAC_EOL, test_comment_until_unix_eol: UNIX_EOL, } #[test] fn test_assignment() { let source = r"avariable = 99 + 2-0"; let tokens = lex_source(source); assert_eq!( tokens, vec![ Tok::Name { name: String::from("avariable").into() }, Tok::Equal, Tok::Int { value: 99i128 }, Tok::Plus, Tok::Int { value: 2i128 }, Tok::Minus, Tok::Int { value: 0i128 }, Tok::Newline, ] ); } macro_rules! test_indentation_with_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo").into(), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: 99i128 }, Tok::Newline, Tok::Dedent, ] ); } )* }; } test_indentation_with_eol! { test_indentation_windows_eol: WINDOWS_EOL, test_indentation_mac_eol: MAC_EOL, test_indentation_unix_eol: UNIX_EOL, } macro_rules! test_double_dedent_with_eol { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo").into(), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::If, Tok::Name { name: String::from("x").into(), }, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: 99i128 }, Tok::Newline, Tok::Dedent, Tok::Dedent, ] ); } )* } } macro_rules! test_double_dedent_with_tabs { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Def, Tok::Name { name: String::from("foo").into(), }, Tok::Lpar, Tok::Rpar, Tok::Colon, Tok::Newline, Tok::Indent, Tok::If, Tok::Name { name: String::from("x").into(), }, Tok::Colon, Tok::Newline, Tok::Indent, Tok::Return, Tok::Int { value: 99i128 }, Tok::Newline, Tok::Dedent, Tok::Dedent, ] ); } )* } } test_double_dedent_with_eol! { test_double_dedent_windows_eol: WINDOWS_EOL, test_double_dedent_mac_eol: MAC_EOL, test_double_dedent_unix_eol: UNIX_EOL, } test_double_dedent_with_tabs! { test_double_dedent_tabs_windows_eol: WINDOWS_EOL, test_double_dedent_tabs_mac_eol: MAC_EOL, test_double_dedent_tabs_unix_eol: UNIX_EOL, } macro_rules! test_newline_in_brackets { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("x = [{} 1,2{}]{}", $eol, $eol, $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::Name { name: String::from("x").into(), }, Tok::Equal, Tok::Lsqb, Tok::Int { value: 1i128 }, Tok::Comma, Tok::Int { value: 2i128 }, Tok::Rsqb, Tok::Newline, ] ); } )* }; } test_newline_in_brackets! { test_newline_in_brackets_windows_eol: WINDOWS_EOL, test_newline_in_brackets_mac_eol: MAC_EOL, test_newline_in_brackets_unix_eol: UNIX_EOL, } #[test] fn test_operators() { let source = "//////=/ /"; let tokens = lex_source(source); assert_eq!( tokens, vec![ Tok::DoubleSlash, Tok::DoubleSlash, Tok::DoubleSlashEqual, Tok::Slash, Tok::Slash, Tok::Newline, ] ); } #[test] fn test_string() { let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#; let tokens = lex_source(source); assert_eq!( tokens, vec![ Tok::String { value: String::from("double"), is_fstring: false }, Tok::String { value: String::from("single"), is_fstring: false }, Tok::String { value: String::from("can't"), is_fstring: false }, Tok::String { value: String::from("\\\""), is_fstring: false }, Tok::String { value: String::from("\t\r\n"), is_fstring: false }, Tok::String { value: String::from("\\g"), is_fstring: false }, Tok::String { value: String::from("raw\\'"), is_fstring: false }, Tok::String { value: String::from("Đ"), is_fstring: false }, Tok::String { value: String::from("\u{80}\u{0}a"), is_fstring: false }, Tok::Newline, ] ); } macro_rules! test_string_continuation { ($($name:ident: $eol:expr,)*) => { $( #[test] fn $name() { let source = format!("\"abc\\{}def\"", $eol); let tokens = lex_source(&source); assert_eq!( tokens, vec![ Tok::String { value: String::from("abcdef"), is_fstring: false, }, Tok::Newline, ] ) } )* } } test_string_continuation! { test_string_continuation_windows_eol: WINDOWS_EOL, test_string_continuation_mac_eol: MAC_EOL, test_string_continuation_unix_eol: UNIX_EOL, } #[test] fn test_single_quoted_byte() { // single quote let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; let tokens = lex_source(source); let res = (0..=255).collect::>(); assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); } #[test] fn test_double_quoted_byte() { // double quote let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; let tokens = lex_source(source); let res = (0..=255).collect::>(); assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); } #[test] fn test_escape_char_in_byte_literal() { // backslash does not escape let source = r#"b"omkmok\Xaa""#; let tokens = lex_source(source); let res = vec![111, 109, 107, 109, 111, 107, 92, 88, 97, 97]; assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); } #[test] fn test_raw_byte_literal() { let source = r"rb'\x1z'"; let tokens = lex_source(source); assert_eq!(tokens, vec![Tok::Bytes { value: b"\\x1z".to_vec() }, Tok::Newline]); let source = r"rb'\\'"; let tokens = lex_source(source); assert_eq!(tokens, vec![Tok::Bytes { value: b"\\\\".to_vec() }, Tok::Newline]); } #[test] fn test_escape_octet() { let source = r"b'\43a\4\1234'"; let tokens = lex_source(source); assert_eq!(tokens, vec![Tok::Bytes { value: b"#a\x04S4".to_vec() }, Tok::Newline]); } #[test] fn test_escape_unicode_name() { let source = r#""\N{EN SPACE}""#; let tokens = lex_source(source); assert_eq!( tokens, vec![Tok::String { value: "\u{2002}".to_owned(), is_fstring: false }, Tok::Newline] ); } }