From 5608369abee0b50ec6260b017e1b6c64b1c6561b Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 12 Jun 2026 12:20:27 +0200 Subject: [PATCH 1/5] Extract trivia tokens from original parse tree --- .../src/extractor/mod.rs | 45 +++++++++++++ .../src/generator/mod.rs | 56 ++++++++++------ .../src/generator/ql_gen.rs | 64 +++++++++++++++++++ 3 files changed, 146 insertions(+), 19 deletions(-) diff --git a/shared/tree-sitter-extractor/src/extractor/mod.rs b/shared/tree-sitter-extractor/src/extractor/mod.rs index e8e608c32447..436ff9f65a15 100644 --- a/shared/tree-sitter-extractor/src/extractor/mod.rs +++ b/shared/tree-sitter-extractor/src/extractor/mod.rs @@ -333,6 +333,9 @@ pub fn extract( .run_from_tree(&tree, source) .unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}")); traverse_yeast(&ast, &mut visitor); + // Comments and other `extra` nodes are not represented in the desugared + // AST, so recover them directly from the original parse tree. + traverse_extras(&tree, &mut visitor); } else { traverse(&tree, &mut visitor); } @@ -365,6 +368,8 @@ struct Visitor<'a> { ast_node_parent_table_name: String, /// Language-specific name of the tokeninfo table tokeninfo_table_name: String, + /// Language-specific name of the trivia tokeninfo table + trivia_tokeninfo_table_name: String, /// A lookup table from type name to node types schema: &'a NodeTypeMap, /// A stack for gathering information from child nodes. Whenever a node is @@ -395,11 +400,33 @@ impl<'a> Visitor<'a> { ast_node_location_table_name: format!("{language_prefix}_ast_node_location"), ast_node_parent_table_name: format!("{language_prefix}_ast_node_parent"), tokeninfo_table_name: format!("{language_prefix}_tokeninfo"), + trivia_tokeninfo_table_name: format!("{language_prefix}_trivia_tokeninfo"), schema, stack: Vec::new(), } } + /// Emits a `TriviaToken` for the given `extra` node (e.g. a comment) from + /// the original parse tree. 
Trivia tokens carry a location and their source + /// text, but are not attached to a parent in the (possibly desugared) AST. + fn emit_trivia_token(&mut self, node: &Node) { + let id = self.trap_writer.fresh_id(); + let loc = location_for(self, self.file_label, node); + let loc_label = location_label(self.trap_writer, loc); + self.trap_writer.add_tuple( + &self.ast_node_location_table_name, + vec![trap::Arg::Label(id), trap::Arg::Label(loc_label)], + ); + self.trap_writer.add_tuple( + &self.trivia_tokeninfo_table_name, + vec![ + trap::Arg::Label(id), + trap::Arg::Int(node.kind_id() as usize), + sliced_source_arg(self.source, node), + ], + ); + } + fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) { self.diagnostics_writer.write(mesg); let id = self.trap_writer.fresh_id(); @@ -835,6 +862,24 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) { } } +/// Walks the original tree-sitter tree and emits a `TriviaToken` for every +/// `extra` node (e.g. a comment). Used to preserve comments that would +/// otherwise be lost after a desugaring pass rewrites the tree. 
+fn traverse_extras(tree: &Tree, visitor: &mut Visitor) { + emit_extras_in(visitor, tree.root_node()); +} + +fn emit_extras_in(visitor: &mut Visitor, node: Node<'_>) { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if child.is_extra() { + visitor.emit_trivia_token(&child); + } else { + emit_extras_in(visitor, child); + } + } +} + fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) { use yeast::Cursor; let mut cursor = tree.walk(); diff --git a/shared/tree-sitter-extractor/src/generator/mod.rs b/shared/tree-sitter-extractor/src/generator/mod.rs index da13322fe60c..d3880a74579f 100644 --- a/shared/tree-sitter-extractor/src/generator/mod.rs +++ b/shared/tree-sitter-extractor/src/generator/mod.rs @@ -68,7 +68,12 @@ pub fn generate( let node_parent_table_name = format!("{}_ast_node_parent", &prefix); let token_name = format!("{}_token", &prefix); let tokeninfo_name = format!("{}_tokeninfo", &prefix); + let trivia_token_name = format!("{}_trivia_token", &prefix); + let trivia_tokeninfo_name = format!("{}_trivia_tokeninfo", &prefix); let reserved_word_name = format!("{}_reserved_word", &prefix); + // When a desugaring is configured, comments and other `extra` nodes are + // preserved from the original parse tree as `TriviaToken`s. 
+ let has_trivia_tokens = language.desugar.is_some(); let effective_node_types: String = match language .desugar .as_ref() @@ -85,28 +90,35 @@ pub fn generate( let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?; let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes); ast_node_members.insert(&token_name); + if has_trivia_tokens { + ast_node_members.insert(&trivia_token_name); + } writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?; dbscheme::write(&mut dbscheme_writer, &dbscheme_entries)?; let token_case = create_token_case(&token_name, token_kinds); - dbscheme::write( - &mut dbscheme_writer, - &[ - dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)), - dbscheme::Entry::Case(token_case), - dbscheme::Entry::Union(dbscheme::Union { - name: &ast_node_name, - members: ast_node_members, - }), - dbscheme::Entry::Table(create_ast_node_location_table( - &node_location_table_name, - &ast_node_name, - )), - dbscheme::Entry::Table(create_ast_node_parent_table( - &node_parent_table_name, - &ast_node_name, - )), - ], - )?; + let mut dbscheme_tail = vec![ + dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)), + dbscheme::Entry::Case(token_case), + ]; + if has_trivia_tokens { + dbscheme_tail.push(dbscheme::Entry::Table(create_tokeninfo( + &trivia_tokeninfo_name, + &trivia_token_name, + ))); + } + dbscheme_tail.push(dbscheme::Entry::Union(dbscheme::Union { + name: &ast_node_name, + members: ast_node_members, + })); + dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_location_table( + &node_location_table_name, + &ast_node_name, + ))); + dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_parent_table( + &node_parent_table_name, + &ast_node_name, + ))); + dbscheme::write(&mut dbscheme_writer, &dbscheme_tail)?; let mut body = vec![ ql::TopLevel::Class(ql_gen::create_ast_node_class( @@ -116,6 +128,12 @@ pub fn generate( )), 
ql::TopLevel::Class(ql_gen::create_token_class(&token_name, &tokeninfo_name)), ]; + if has_trivia_tokens { + body.push(ql::TopLevel::Class(ql_gen::create_trivia_token_class( + &trivia_token_name, + &trivia_tokeninfo_name, + ))); + } // Only emit the ReservedWord class when there are actually unnamed token // types in the schema (i.e., @{prefix}_reserved_word exists in the dbscheme). // When converting from a YEAST YAML schema that has no unnamed tokens, this diff --git a/shared/tree-sitter-extractor/src/generator/ql_gen.rs b/shared/tree-sitter-extractor/src/generator/ql_gen.rs index bb990beacc8a..f827b12580e8 100644 --- a/shared/tree-sitter-extractor/src/generator/ql_gen.rs +++ b/shared/tree-sitter-extractor/src/generator/ql_gen.rs @@ -199,6 +199,70 @@ pub fn create_token_class<'a>(token_type: &'a str, tokeninfo: &'a str) -> ql::Cl } } +/// Creates the `TriviaToken` class. Trivia tokens (e.g. comments) are +/// `extra` nodes preserved from the original parse tree even when the tree has +/// been rewritten by a desugaring pass. They are not part of the regular +/// `Token` hierarchy because they do not appear in the (possibly desugared) +/// output schema. 
+pub fn create_trivia_token_class<'a>( + trivia_token_type: &'a str, + trivia_tokeninfo: &'a str, +) -> ql::Class<'a> { + let trivia_tokeninfo_arity = 3; // id, kind, value + let get_value = ql::Predicate { + qldoc: Some(String::from("Gets the source text of this trivia token.")), + name: "getValue", + overridden: false, + is_private: false, + is_final: true, + return_type: Some(ql::Type::String), + formal_parameters: vec![], + body: create_get_field_expr_for_column_storage( + "result", + trivia_tokeninfo, + 1, + trivia_tokeninfo_arity, + ), + overlay: None, + }; + let to_string = ql::Predicate { + qldoc: Some(String::from( + "Gets a string representation of this element.", + )), + name: "toString", + overridden: true, + is_private: false, + is_final: true, + return_type: Some(ql::Type::String), + formal_parameters: vec![], + body: ql::Expression::Equals( + Box::new(ql::Expression::Var("result")), + Box::new(ql::Expression::Dot( + Box::new(ql::Expression::Var("this")), + "getValue", + vec![], + )), + ), + overlay: None, + }; + ql::Class { + qldoc: Some(String::from( + "A trivia token, such as a comment, preserved from the original parse tree.", + )), + name: "TriviaToken", + is_abstract: false, + supertypes: vec![ql::Type::At(trivia_token_type), ql::Type::Normal("AstNode")] + .into_iter() + .collect(), + characteristic_predicate: None, + predicates: vec![ + get_value, + to_string, + create_get_a_primary_ql_class("TriviaToken", false), + ], + } +} + // Creates the `ReservedWord` class. 
pub fn create_reserved_word_class(db_name: &str) -> ql::Class<'_> { let class_name = "ReservedWord"; From f83adb55cec63191c9e063c3a83cf4c5af798e25 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 12 Jun 2026 16:33:51 +0200 Subject: [PATCH 2/5] Unified: regenerate AST --- unified/ql/lib/codeql/unified/Ast.qll | 12 ++++++++++++ unified/ql/lib/unified.dbscheme | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/unified/ql/lib/codeql/unified/Ast.qll b/unified/ql/lib/codeql/unified/Ast.qll index d9060c26f0f2..b6d6a76b5492 100644 --- a/unified/ql/lib/codeql/unified/Ast.qll +++ b/unified/ql/lib/codeql/unified/Ast.qll @@ -61,6 +61,18 @@ module Unified { override string getAPrimaryQlClass() { result = "Token" } } + /** A trivia token, such as a comment, preserved from the original parse tree. */ + class TriviaToken extends @unified_trivia_token, AstNode { + /** Gets the source text of this trivia token. */ + final string getValue() { unified_trivia_tokeninfo(this, _, result) } + + /** Gets a string representation of this element. */ + final override string toString() { result = this.getValue() } + + /** Gets the name of the primary QL class for this element. */ + override string getAPrimaryQlClass() { result = "TriviaToken" } + } + /** Gets the file containing the given `node`. 
*/ private @file getNodeFile(@unified_ast_node node) { exists(@location_default loc | unified_ast_node_location(node, loc) | diff --git a/unified/ql/lib/unified.dbscheme b/unified/ql/lib/unified.dbscheme index 28718d794236..31b3ec6c3ed5 100644 --- a/unified/ql/lib/unified.dbscheme +++ b/unified/ql/lib/unified.dbscheme @@ -334,7 +334,13 @@ case @unified_token.kind of ; -@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator +unified_trivia_tokeninfo( + unique int id: @unified_trivia_token, + int kind: int ref, + string value: string ref +); + +@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_trivia_token | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator unified_ast_node_location( unique int node: @unified_ast_node ref, From 7d6d5bfb4aa0e72771113ae74b887718c7bcae6b Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 12 Jun 2026 12:28:32 +0200 Subject: [PATCH 3/5] Unified: add test for comments --- unified/ql/lib/codeql/unified/Comments.qll | 13 +++++++++++++ unified/ql/lib/unified.qll | 4 ++++ .../test/library-tests/comments/comments.expected | 3 +++ 
unified/ql/test/library-tests/comments/comments.ql | 3 +++ .../ql/test/library-tests/comments/comments.swift | 11 +++++++++++ 5 files changed, 34 insertions(+) create mode 100644 unified/ql/lib/codeql/unified/Comments.qll create mode 100644 unified/ql/lib/unified.qll create mode 100644 unified/ql/test/library-tests/comments/comments.expected create mode 100644 unified/ql/test/library-tests/comments/comments.ql create mode 100644 unified/ql/test/library-tests/comments/comments.swift diff --git a/unified/ql/lib/codeql/unified/Comments.qll b/unified/ql/lib/codeql/unified/Comments.qll new file mode 100644 index 000000000000..c1b1bb9df901 --- /dev/null +++ b/unified/ql/lib/codeql/unified/Comments.qll @@ -0,0 +1,13 @@ +private import unified + +/** + * A comment appearing in the source code. + */ +class Comment extends TriviaToken { + // At the moment, comments are the only type trivia token we extract + string getCommentText() { + result = this.getValue().regexpCapture("//(.*)", 1) + or + result = this.getValue().regexpCapture("(?s)/\\*(.*)\\*/", 1) + } +} diff --git a/unified/ql/lib/unified.qll b/unified/ql/lib/unified.qll new file mode 100644 index 000000000000..5b073290acf8 --- /dev/null +++ b/unified/ql/lib/unified.qll @@ -0,0 +1,4 @@ +import codeql.Locations +import codeql.files.FileSystem +import codeql.unified.Ast::Unified +import codeql.unified.Comments diff --git a/unified/ql/test/library-tests/comments/comments.expected b/unified/ql/test/library-tests/comments/comments.expected new file mode 100644 index 000000000000..04e09d06e54c --- /dev/null +++ b/unified/ql/test/library-tests/comments/comments.expected @@ -0,0 +1,3 @@ +| comments.swift:1:1:1:22 | // Hello this is swift | Hello this is swift | +| comments.swift:3:1:6:3 | /*\n * This is a multi-line comment\n * It should be ignored by the parser\n */ | \n * This is a multi-line comment\n * It should be ignored by the parser\n | +| comments.swift:9:5:9:36 | // This is a single-line comment | This is a 
single-line comment | diff --git a/unified/ql/test/library-tests/comments/comments.ql b/unified/ql/test/library-tests/comments/comments.ql new file mode 100644 index 000000000000..db64ff737a71 --- /dev/null +++ b/unified/ql/test/library-tests/comments/comments.ql @@ -0,0 +1,3 @@ +import unified + +query predicate comments(Comment c, string text) { text = c.getCommentText() } diff --git a/unified/ql/test/library-tests/comments/comments.swift b/unified/ql/test/library-tests/comments/comments.swift new file mode 100644 index 000000000000..9f133142ef21 --- /dev/null +++ b/unified/ql/test/library-tests/comments/comments.swift @@ -0,0 +1,11 @@ +// Hello this is swift + +/* + * This is a multi-line comment + * It should be ignored by the parser + */ + +func hello() { + // This is a single-line comment + print("Hello, world!") +} From e81a3bcbc3deb87ac1bab2483e02045538ffa090 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 12 Jun 2026 16:47:00 +0200 Subject: [PATCH 4/5] Unified: Add QLDoc --- unified/ql/lib/codeql/unified/Comments.qll | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unified/ql/lib/codeql/unified/Comments.qll b/unified/ql/lib/codeql/unified/Comments.qll index c1b1bb9df901..e839af2dbee2 100644 --- a/unified/ql/lib/codeql/unified/Comments.qll +++ b/unified/ql/lib/codeql/unified/Comments.qll @@ -1,3 +1,5 @@ +/** Provides classes for working with comments. */ + private import unified /** @@ -5,6 +7,9 @@ private import unified */ class Comment extends TriviaToken { // At the moment, comments are the only type trivia token we extract + /** + * Gets the text inside this comment, not counting the delimiters. 
+ */ string getCommentText() { result = this.getValue().regexpCapture("//(.*)", 1) or From 6000c18c241abb6289d88670316f1668db9e4384 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 12 Jun 2026 16:48:25 +0200 Subject: [PATCH 5/5] Unified: also QLDoc for unified.qll --- unified/ql/lib/unified.qll | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unified/ql/lib/unified.qll b/unified/ql/lib/unified.qll index 5b073290acf8..4f7387ef8f1c 100644 --- a/unified/ql/lib/unified.qll +++ b/unified/ql/lib/unified.qll @@ -1,3 +1,7 @@ +/** + * Provides classes for working with the AST, as well as files and locations. + */ + import codeql.Locations import codeql.files.FileSystem import codeql.unified.Ast::Unified