From 3b1cc02d06ae4e534ebe3086f18660db07f7ec11 Mon Sep 17 00:00:00 2001 From: ychenfo Date: Thu, 4 Nov 2021 15:00:27 +0800 Subject: [PATCH] nac3parser, ast: add comment support core changes --- nac3ast/Python.asdl | 48 ++++---- nac3parser/src/config_comment_helper.rs | 85 +++++++++++++ nac3parser/src/lexer.rs | 146 ++++++++++++++++++++-- nac3parser/src/lib.rs | 1 + nac3parser/src/python.lalrpop | 154 ++++++++++++++++-------- nac3parser/src/token.rs | 2 + 6 files changed, 354 insertions(+), 82 deletions(-) create mode 100644 nac3parser/src/config_comment_helper.rs diff --git a/nac3ast/Python.asdl b/nac3ast/Python.asdl index b3abe162..0bc56c81 100644 --- a/nac3ast/Python.asdl +++ b/nac3ast/Python.asdl @@ -10,43 +10,45 @@ module Python stmt = FunctionDef(identifier name, arguments args, stmt* body, expr* decorator_list, expr? returns, - string? type_comment) + string? type_comment, identifier* config_comment) | AsyncFunctionDef(identifier name, arguments args, stmt* body, expr* decorator_list, expr? returns, - string? type_comment) + string? type_comment, identifier* config_comment) | ClassDef(identifier name, expr* bases, keyword* keywords, stmt* body, - expr* decorator_list) - | Return(expr? value) + expr* decorator_list, identifier* config_comment) + | Return(expr? value, identifier* config_comment) - | Delete(expr* targets) - | Assign(expr* targets, expr value, string? type_comment) - | AugAssign(expr target, operator op, expr value) + | Delete(expr* targets, identifier* config_comment) + | Assign(expr* targets, expr value, string? type_comment, identifier* config_comment) + | AugAssign(expr target, operator op, expr value, identifier* config_comment) -- 'simple' indicates that we annotate simple name without parens - | AnnAssign(expr target, expr annotation, expr? value, bool simple) + | AnnAssign(expr target, expr annotation, expr? 
value, bool simple, identifier* config_comment) -- use 'orelse' because else is a keyword in target languages - | For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - | AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - | While(expr test, stmt* body, stmt* orelse) - | If(expr test, stmt* body, stmt* orelse) - | With(withitem* items, stmt* body, string? type_comment) - | AsyncWith(withitem* items, stmt* body, string? type_comment) + | For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment, identifier* config_comment) + | AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment, identifier* config_comment) + | While(expr test, stmt* body, stmt* orelse, identifier* config_comment) + | If(expr test, stmt* body, stmt* orelse, identifier* config_comment) + | With(withitem* items, stmt* body, string? type_comment, identifier* config_comment) + | AsyncWith(withitem* items, stmt* body, string? type_comment, identifier* config_comment) - | Raise(expr? exc, expr? cause) - | Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody) - | Assert(expr test, expr? msg) + | Raise(expr? exc, expr? cause, identifier* config_comment) + | Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody, identifier* config_comment) + | Assert(expr test, expr? msg, identifier* config_comment) - | Import(alias* names) - | ImportFrom(identifier? module, alias* names, int level) + | Import(alias* names, identifier* config_comment) + | ImportFrom(identifier? 
module, alias* names, int level, identifier* config_comment) - | Global(identifier* names) - | Nonlocal(identifier* names) - | Expr(expr value) - | Pass | Break | Continue + | Global(identifier* names, identifier* config_comment) + | Nonlocal(identifier* names, identifier* config_comment) + | Expr(expr value, identifier* config_comment) + | Pass(identifier* config_comment) + | Break(identifier* config_comment) + | Continue(identifier* config_comment) -- col_offset is the byte offset in the utf8 string the parser uses attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset) diff --git a/nac3parser/src/config_comment_helper.rs b/nac3parser/src/config_comment_helper.rs new file mode 100644 index 00000000..588f3e38 --- /dev/null +++ b/nac3parser/src/config_comment_helper.rs @@ -0,0 +1,85 @@ +use lalrpop_util::ParseError; +use nac3ast::*; +use crate::ast::Ident; +use crate::ast::Location; +use crate::token::Tok; +use crate::error::*; + +pub fn make_config_comment( + com_loc: Location, + stmt_loc: Location, + nac3com_above: Vec<(Ident, Tok)>, + nac3com_end: Option +) -> Result, ParseError> { + if com_loc.column() != stmt_loc.column() { + return Err(ParseError::User { + error: LexicalError { + location: com_loc, + error: LexicalErrorType::OtherError( + format!( + "config comment at top must have the same indentation with what it applies, comment at {}, statement at {}", + com_loc, + stmt_loc, + ) + ) + } + }) + }; + Ok( + nac3com_above + .into_iter() + .map(|(com, _)| com) + .chain(nac3com_end.map_or_else(|| vec![].into_iter(), |com| vec![com].into_iter())) + .collect() + ) +} + +pub fn handle_small_stmt(stmts: &mut [Stmt], nac3com_above: Vec<(Ident, Tok)>, nac3com_end: Option, com_above_loc: Location) -> Result<(), ParseError> { + if com_above_loc.column() != stmts[0].location.column() { + return Err(ParseError::User { + error: LexicalError { + location: com_above_loc, + error: LexicalErrorType::OtherError( + format!( + "config comment at top must 
have the same indentation with what it applies, comment at {}, statement at {}", + com_above_loc, + stmts[0].location, + ) + ) + } + }) + } + apply_config_comments( + &mut stmts[0], + nac3com_above + .into_iter() + .map(|(com, _)| com).collect() + ); + apply_config_comments( + stmts.last_mut().unwrap(), + nac3com_end.map_or_else(Vec::new, |com| vec![com]) + ); + Ok(()) +} + +fn apply_config_comments(stmt: &mut Stmt, comments: Vec) { + match &mut stmt.node { + StmtKind::Pass { config_comment, .. } + | StmtKind::Delete { config_comment, .. } + | StmtKind::Expr { config_comment, .. } + | StmtKind::Assign { config_comment, .. } + | StmtKind::AugAssign { config_comment, .. } + | StmtKind::AnnAssign { config_comment, .. } + | StmtKind::Break { config_comment, .. } + | StmtKind::Continue { config_comment, .. } + | StmtKind::Return { config_comment, .. } + | StmtKind::Raise { config_comment, .. } + | StmtKind::Import { config_comment, .. } + | StmtKind::ImportFrom { config_comment, .. } + | StmtKind::Global { config_comment, .. } + | StmtKind::Nonlocal { config_comment, .. } + | StmtKind::Assert { config_comment, .. } => config_comment.extend(comments), + + _ => { unreachable!("only small statements should call this function") } + } +} diff --git a/nac3parser/src/lexer.rs b/nac3parser/src/lexer.rs index 1936eecf..9166abd4 100644 --- a/nac3parser/src/lexer.rs +++ b/nac3parser/src/lexer.rs @@ -65,6 +65,7 @@ pub struct Lexer> { chr1: Option, chr2: Option, location: Location, + config_comment_prefix: Option<&'static str> } pub static KEYWORDS: phf::Map<&'static str, Tok> = phf::phf_map! 
{ @@ -196,6 +197,7 @@ where location: start, chr1: None, chr2: None, + config_comment_prefix: Some(" nac3:") }; lxr.next_char(); lxr.next_char(); @@ -415,17 +417,45 @@ where } } - /// Skip everything until end of line - fn lex_comment(&mut self) { + /// Skip everything until the end of the line; may produce a nac3 pseudocomment token + fn lex_comment(&mut self) -> Option { self.next_char(); + // the comment may be a nac3 pseudocomment: check for the `# nac3:` prefix + let (mut prefix, mut is_comment) = self + .config_comment_prefix + .map_or_else(|| ("".chars(), false), |v| (v.chars(), true)); + // remember where the comment starts so the config comment gets the correct location + let mut start_loc = self.location; + start_loc.go_left(); loop { match self.chr0 { - Some('\n') => return, - Some(_) => {} - None => return, + Some('\n') => return None, + None => return None, + Some(c) => { + if let (true, Some(p)) = (is_comment, prefix.next()) { + is_comment = is_comment && c == p + } else { + // done checking the prefix; if this is a config comment, return the spanned token + if is_comment { + let mut content = String::new(); + loop { + match self.chr0 { + Some('\n') | None => break, + Some(c) => content.push(c), + } + self.next_char(); + } + return Some(( + start_loc, + Tok::ConfigComment { content: content.trim().into() }, + self.location + )); + } + } + } } self.next_char(); - } + }; } fn unicode_literal(&mut self, literal_number: usize) -> Result { @@ -658,10 +688,11 @@ where } /// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
- fn eat_indentation(&mut self) -> Result { + fn eat_indentation(&mut self) -> Result<(IndentationLevel, Option), LexicalError> { // Determine indentation: let mut spaces: usize = 0; let mut tabs: usize = 0; + let mut nac3comment: Option = None; loop { match self.chr0 { Some(' ') => { @@ -693,7 +724,14 @@ where tabs += 1; } Some('#') => { - self.lex_comment(); + nac3comment = self.lex_comment(); + // if this is a nac3 comment, a newline still has to be added after it, so we are + // no longer at the beginning of a line; we must also break out of the loop, + // otherwise the next iteration would treat this line as an empty line + if nac3comment.is_some() { + self.at_begin_of_line = false; + break; + } spaces = 0; tabs = 0; } @@ -722,11 +760,12 @@ where } } - Ok(IndentationLevel { tabs, spaces }) + Ok((IndentationLevel { tabs, spaces }, nac3comment)) } fn handle_indentations(&mut self) -> Result<(), LexicalError> { - let indentation_level = self.eat_indentation()?; + let eat_result = self.eat_indentation()?; + let indentation_level = eat_result.0; if self.nesting == 0 { // Determine indent or dedent: @@ -770,6 +809,10 @@ where } } } + }; + + if let Some(comment) = eat_result.1 { + self.emit(comment); } Ok(()) @@ -833,7 +876,9 @@ where self.emit(number); } '#' => { - self.lex_comment(); + if let Some(c) = self.lex_comment() { + self.emit(c); + }; } '"' | '\'' => { let string = self.lex_string(false, false, false, false)?; @@ -1287,6 +1332,85 @@ mod tests { lexer.map(|x| x.unwrap().1).collect() } + #[test] + fn test_nac3comment() { + let src = "\ +a: int32 +# nac3: +b: int64"; + let tokens = lex_source(src); + assert_eq!( + tokens, + vec![ + Tok::Name { name: "a".into() }, + Tok::Colon, + Tok::Name { name: "int32".into() }, + Tok::Newline, + Tok::ConfigComment { content: "".into() }, + Tok::Newline, + Tok::Name { name: "b".into() }, + Tok::Colon, + Tok::Name { name: "int64".into() }, + Tok::Newline, + ] + ); + } + + #[test] + fn test_class_lex_with_nac3comment() { + use Tok::*; + let source = "\ +class Foo(A, B): +# normal
comment +# nac3: no indent + # nac3: correct indent + b: int32 + a: int32 # nac3: no need indent + def __init__(self): + pass"; + let tokens = lex_source(source); + assert_eq!( + tokens, + vec![ + Class, + Name { name: "Foo".into() }, + Lpar, + Name { name: "A".into() }, + Comma, + Name { name: "B".into() }, + Rpar, + Colon, + Newline, + ConfigComment { content: "no indent".into() }, + Newline, + Indent, + ConfigComment { content: "correct indent".into() }, + Newline, + Name { name: "b".into() }, + Colon, + Name { name: "int32".into() }, + Newline, + Name { name: "a".into() }, + Colon, + Name { name: "int32".into() }, + ConfigComment { content: "no need indent".into() }, + Newline, + Def, + Name { name: "__init__".into() }, + Lpar, + Name { name: "self".into() }, + Rpar, + Colon, + Newline, + Indent, + Pass, + Newline, + Dedent, + Dedent + ] + ) + } + #[test] fn test_newline_processor() { // Escape \ followed by \n (by removal): diff --git a/nac3parser/src/lib.rs b/nac3parser/src/lib.rs index cab63529..5e253059 100644 --- a/nac3parser/src/lib.rs +++ b/nac3parser/src/lib.rs @@ -32,3 +32,4 @@ lalrpop_mod!( python ); pub mod token; +pub mod config_comment_helper; diff --git a/nac3parser/src/python.lalrpop b/nac3parser/src/python.lalrpop index 7a994790..5ca7a56b 100644 --- a/nac3parser/src/python.lalrpop +++ b/nac3parser/src/python.lalrpop @@ -9,8 +9,11 @@ use crate::ast; use crate::fstring::parse_located_fstring; use crate::function::{ArgumentList, parse_args, parse_params}; use crate::error::LexicalError; +use crate::error::LexicalErrorType; use crate::lexer; +use crate::config_comment_helper::*; +use lalrpop_util::ParseError; use num_bigint::BigInt; grammar; @@ -47,10 +50,11 @@ Statement: ast::Suite = { }; SimpleStatement: ast::Suite = { - ";"? "\n" => { + ";"? "\n" =>? 
{ let mut statements = vec![s1]; statements.extend(s2.into_iter().map(|e| e.1)); - statements + handle_small_stmt(&mut statements, nac3com_above, nac3com_end, com_loc)?; + Ok(statements) } }; @@ -70,7 +74,7 @@ PassStatement: ast::Stmt = { ast::Stmt { location, custom: (), - node: ast::StmtKind::Pass, + node: ast::StmtKind::Pass { config_comment: vec![] }, } }, }; @@ -80,7 +84,7 @@ DelStatement: ast::Stmt = { ast::Stmt { location, custom: (), - node: ast::StmtKind::Delete { targets }, + node: ast::StmtKind::Delete { targets, config_comment: vec![] }, } }, }; @@ -92,7 +96,7 @@ ExpressionStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Expr { value: Box::new(expression) } + node: ast::StmtKind::Expr { value: Box::new(expression), config_comment: vec![] } } } else { let mut targets = vec![expression]; @@ -107,7 +111,7 @@ ExpressionStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Assign { targets, value, type_comment: None }, + node: ast::StmtKind::Assign { targets, value, type_comment: None, config_comment: vec![] }, } } }, @@ -118,7 +122,8 @@ ExpressionStatement: ast::Stmt = { node: ast::StmtKind::AugAssign { target: Box::new(target), op, - value: Box::new(rhs) + value: Box::new(rhs), + config_comment: vec![], }, } }, @@ -132,6 +137,7 @@ ExpressionStatement: ast::Stmt = { annotation: Box::new(annotation), value: rhs.map(Box::new), simple, + config_comment: vec![], }, } }, @@ -187,28 +193,28 @@ FlowStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Break, + node: ast::StmtKind::Break { config_comment: vec![] }, } }, "continue" => { ast::Stmt { custom: (), location, - node: ast::StmtKind::Continue, + node: ast::StmtKind::Continue { config_comment: vec![] }, } }, "return" => { ast::Stmt { custom: (), location, - node: ast::StmtKind::Return { value: value.map(Box::new) }, + node: ast::StmtKind::Return { value: value.map(Box::new), config_comment: vec![] }, } }, => { ast::Stmt 
{ custom: (), location, - node: ast::StmtKind::Expr { value: Box::new(expression) }, + node: ast::StmtKind::Expr { value: Box::new(expression), config_comment: vec![] }, } }, RaiseStatement, @@ -219,14 +225,14 @@ RaiseStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Raise { exc: None, cause: None }, + node: ast::StmtKind::Raise { exc: None, cause: None, config_comment: vec![] }, } }, "raise" => { ast::Stmt { custom: (), location, - node: ast::StmtKind::Raise { exc: Some(Box::new(t)), cause: c.map(|x| Box::new(x.1)) }, + node: ast::StmtKind::Raise { exc: Some(Box::new(t)), cause: c.map(|x| Box::new(x.1)), config_comment: vec![] }, } }, }; @@ -236,7 +242,7 @@ ImportStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Import { names }, + node: ast::StmtKind::Import { names, config_comment: vec![] }, } }, "from" "import" => { @@ -247,7 +253,8 @@ ImportStatement: ast::Stmt = { node: ast::StmtKind::ImportFrom { level, module: module.map(|s| s.into()), - names + names, + config_comment: vec![] }, } }, @@ -301,7 +308,7 @@ GlobalStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Global { names } + node: ast::StmtKind::Global { names, config_comment: vec![] } } }, }; @@ -311,7 +318,7 @@ NonlocalStatement: ast::Stmt = { ast::Stmt { custom: (), location, - node: ast::StmtKind::Nonlocal { names } + node: ast::StmtKind::Nonlocal { names, config_comment: vec![] } } }, }; @@ -323,7 +330,8 @@ AssertStatement: ast::Stmt = { location, node: ast::StmtKind::Assert { test: Box::new(test), - msg: msg.map(|e| Box::new(e.1)) + msg: msg.map(|e| Box::new(e.1)), + config_comment: vec![], } } }, @@ -340,7 +348,7 @@ CompoundStatement: ast::Stmt = { }; IfStatement: ast::Stmt = { - "if" ":" => { + "if" ":" =>? 
{ // Determine last else: let mut last = s3.map(|s| s.2).unwrap_or_default(); @@ -349,54 +357,74 @@ IfStatement: ast::Stmt = { let x = ast::Stmt { custom: (), location: i.0, - node: ast::StmtKind::If { test: Box::new(i.2), body: i.4, orelse: last }, + node: ast::StmtKind::If { test: Box::new(i.2), body: i.4, orelse: last, config_comment: vec![] }, }; last = vec![x]; } - ast::Stmt { + Ok(ast::Stmt { custom: (), location, - node: ast::StmtKind::If { test: Box::new(test), body, orelse: last } - } + node: ast::StmtKind::If { + test: Box::new(test), + body, + orelse: last, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } + }) }, }; WhileStatement: ast::Stmt = { - "while" ":" => { + "while" ":" =>? { let orelse = s2.map(|s| s.2).unwrap_or_default(); - ast::Stmt { + Ok(ast::Stmt { custom: (), location, node: ast::StmtKind::While { test: Box::new(test), body, - orelse + orelse, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? }, - } + }) }, }; ForStatement: ast::Stmt = { - "for" "in" ":" => { + "for" "in" ":" =>? { let orelse = s2.map(|s| s.2).unwrap_or_default(); let target = Box::new(target); let iter = Box::new(iter); let type_comment = None; let node = if is_async.is_some() { - ast::StmtKind::AsyncFor { target, iter, body, orelse, type_comment } + ast::StmtKind::AsyncFor { + target, + iter, + body, + orelse, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } } else { - ast::StmtKind::For { target, iter, body, orelse, type_comment } + ast::StmtKind::For { + target, + iter, + body, + orelse, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } }; - ast::Stmt::new(location, node) + Ok(ast::Stmt::new(location, node)) }, }; TryStatement: ast::Stmt = { - "try" ":" => { + "try" ":" =>? 
{ let orelse = else_suite.map(|s| s.2).unwrap_or_default(); let finalbody = finally.map(|s| s.2).unwrap_or_default(); - ast::Stmt { + Ok(ast::Stmt { custom: (), location, node: ast::StmtKind::Try { @@ -404,14 +432,15 @@ TryStatement: ast::Stmt = { handlers, orelse, finalbody, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? }, - } + }) }, - "try" ":" => { + "try" ":" =>? { let handlers = vec![]; let orelse = vec![]; let finalbody = finally.2; - ast::Stmt { + Ok(ast::Stmt { custom: (), location, node: ast::StmtKind::Try { @@ -419,8 +448,9 @@ TryStatement: ast::Stmt = { handlers, orelse, finalbody, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? }, - } + }) }, }; @@ -448,14 +478,24 @@ ExceptClause: ast::Excepthandler = { }; WithStatement: ast::Stmt = { - "with" > ":" => { + "with" > ":" =>? { let type_comment = None; let node = if is_async.is_some() { - ast::StmtKind::AsyncWith { items, body, type_comment } + ast::StmtKind::AsyncWith { + items, + body, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } } else { - ast::StmtKind::With { items, body, type_comment } + ast::StmtKind::With { + items, + body, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } }; - ast::Stmt::new(location, node) + Ok(ast::Stmt::new(location, node)) }, }; @@ -468,16 +508,32 @@ WithItem: ast::Withitem = { }; FuncDef: ast::Stmt = { - "def" " Test)?> ":" => { + "def" " Test)?> ":" =>? { let args = Box::new(args); let returns = r.map(|x| Box::new(x.1)); let type_comment = None; let node = if is_async.is_some() { - ast::StmtKind::AsyncFunctionDef { name, args, body, decorator_list, returns, type_comment } + ast::StmtKind::AsyncFunctionDef { + name, + args, + body, + decorator_list, + returns, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? 
+ } } else { - ast::StmtKind::FunctionDef { name, args, body, decorator_list, returns, type_comment } + ast::StmtKind::FunctionDef { + name, + args, + body, + decorator_list, + returns, + type_comment, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? + } }; - ast::Stmt::new(location, node) + Ok(ast::Stmt::new(location, node)) }, }; @@ -615,12 +671,12 @@ KwargParameter: Option> = { }; ClassDef: ast::Stmt = { - "class" ":" => { + "class" ":" =>? { let (bases, keywords) = match a { Some((_, arg, _)) => (arg.args, arg.keywords), None => (vec![], vec![]), }; - ast::Stmt { + Ok(ast::Stmt { custom: (), location, node: ast::StmtKind::ClassDef { @@ -629,8 +685,9 @@ ClassDef: ast::Stmt = { keywords, body, decorator_list, + config_comment: make_config_comment(location, stmt_loc, nac3com_above, nac3com_end)? }, - } + }) }, }; @@ -1301,6 +1358,7 @@ extern { string => lexer::Tok::String { value: , is_fstring: }, bytes => lexer::Tok::Bytes { value: > }, name => lexer::Tok::Name { name: }, + config_comment => lexer::Tok::ConfigComment { content: }, "\n" => lexer::Tok::Newline, ";" => lexer::Tok::Semi, } diff --git a/nac3parser/src/token.rs b/nac3parser/src/token.rs index 73249db2..163412aa 100644 --- a/nac3parser/src/token.rs +++ b/nac3parser/src/token.rs @@ -13,6 +13,7 @@ pub enum Tok { Complex { real: f64, imag: f64 }, String { value: String, is_fstring: bool }, Bytes { value: Vec }, + ConfigComment { content: ast::StrRef }, Newline, Indent, Dedent, @@ -134,6 +135,7 @@ impl fmt::Display for Tok { } f.write_str("\"") } + ConfigComment { content } => write!(f, "ConfigComment: '{}'", ast::get_str_from_ref(&ast::get_str_ref_lock(), *content)), Newline => f.write_str("Newline"), Indent => f.write_str("Indent"), Dedent => f.write_str("Dedent"),