Files
email-tracker/external/duckdb/extension/autocomplete/inline_grammar.py
2025-10-24 19:21:19 -05:00

168 lines
5.4 KiB
Python

import os
import argparse
from pathlib import Path
# --- Command-line interface ---------------------------------------------
arg_parser = argparse.ArgumentParser(description='Inline the auto-complete PEG grammar files')
arg_parser.add_argument(
    '--print',
    action='store_true',
    default=False,
    help='Print the grammar instead of writing to a file',
)
arg_parser.add_argument(
    '--grammar-file',
    action='store_true',
    default=False,
    help='Write the grammar to a .gram file instead of a C++ header',
)
args = arg_parser.parse_args()

# --- Input/output locations ---------------------------------------------
# The script lives in the autocomplete extension directory; grammar inputs
# and the generated header are resolved relative to it.
autocomplete_dir = Path(__file__).parent
statements_dir = os.path.join(autocomplete_dir, 'grammar', 'statements')
keywords_dir = os.path.join(autocomplete_dir, 'grammar', 'keywords')
target_file = os.path.join(autocomplete_dir, 'include', 'inlined_grammar.hpp')

# Accumulates the complete grammar text as it is assembled below.
contents = ""
# Maps keyword-list filenames to string categories.
# Note: func_name and type_name lists both fold into TYPE_FUNC_NAME_KEYWORD.
FILENAME_TO_CATEGORY = {
    "reserved_keyword.list": "RESERVED_KEYWORD",
    "unreserved_keyword.list": "UNRESERVED_KEYWORD",
    "column_name_keyword.list": "COL_NAME_KEYWORD",
    "func_name_keyword.list": "TYPE_FUNC_NAME_KEYWORD",
    "type_name_keyword.list": "TYPE_FUNC_NAME_KEYWORD",
}
# Maps category names to their C++ map variable names
CPP_MAP_NAMES = {
    "RESERVED_KEYWORD": "reserved_keyword_map",
    "UNRESERVED_KEYWORD": "unreserved_keyword_map",
    "COL_NAME_KEYWORD": "colname_keyword_map",
    "TYPE_FUNC_NAME_KEYWORD": "typefunc_keyword_map",
}
# Use a dictionary of sets to collect keywords for each category, preventing
# duplicates. (Iterating a dict yields its keys directly; no .keys() needed.)
keyword_sets = {category: set() for category in CPP_MAP_NAMES}
# --- Validation and Loading ---
# Track reserved/unreserved membership separately so a keyword classified in
# both lists can be rejected during loading.
reserved_set = set()
unreserved_set = set()
def load_keywords(filepath):
    """Read a keyword list file and return its non-empty lines, lowercased.

    Leading/trailing whitespace is stripped from every line; blank and
    whitespace-only lines are skipped.
    """
    # Pin the encoding: the default is platform/locale dependent, which could
    # make the generated sources differ across machines.
    with open(filepath, "r", encoding="utf-8") as f:
        return [line.strip().lower() for line in f if line.strip()]
# Load every recognized keyword list, validating RESERVED/UNRESERVED rules.
for filename in os.listdir(keywords_dir):
    category = FILENAME_TO_CATEGORY.get(filename)
    if category is None:
        # Not one of the recognized keyword list files; ignore it.
        continue
    for kw in load_keywords(os.path.join(keywords_dir, filename)):
        # A keyword already seen in either the reserved or unreserved list may
        # not appear again in a RESERVED/UNRESERVED classification.
        if category in ("RESERVED_KEYWORD", "UNRESERVED_KEYWORD"):
            if kw in reserved_set or kw in unreserved_set:
                print(f"Keyword '{kw}' has conflicting RESERVED/UNRESERVED categories")
                exit(1)
            tracking = reserved_set if category == "RESERVED_KEYWORD" else unreserved_set
            tracking.add(kw)
        # Record the keyword under its category; the set drops duplicates.
        keyword_sets[category].add(kw)
# --- C++ Code Generation ---
# Emit keyword_map.cpp, which populates one C++ set per keyword category.
output_path = os.path.join(autocomplete_dir, "keyword_map.cpp")

pieces = [
    "/* THIS FILE WAS AUTOMATICALLY GENERATED BY inline_grammar.py */\n",
    "#include \"keyword_helper.hpp\"\n\n",
    "namespace duckdb {\n",
    "void PEGKeywordHelper::InitializeKeywordMaps() { // Renamed for clarity\n",
    "\tif (initialized) {\n\t\treturn;\n\t};\n",
    "\tinitialized = true;\n\n",
]

# One text chunk per category; joined with a blank line between chunks
# (and none after the last one).
category_chunks = []
for category, keywords in keyword_sets.items():
    cpp_map_name = CPP_MAP_NAMES[category]
    chunk = [f"\t// Populating {cpp_map_name}\n"]
    # Keywords are sorted so the generated file is deterministic.
    chunk.extend(f'\t{cpp_map_name}.insert("{kw}");\n' for kw in sorted(keywords))
    category_chunks.append("".join(chunk))
pieces.append("\n".join(category_chunks))

pieces.append("}\n")
pieces.append("} // namespace duckdb\n")

with open(output_path, "w") as f:
    f.writelines(pieces)
print(f"Successfully generated {output_path}")
def filename_to_upper_camel(file):
    """Convert a snake_case filename into an UpperCamelCase rule name.

    The extension is dropped first, e.g.
    'column_name_keyword.list' -> 'ColumnNameKeyword'.
    """
    stem = os.path.splitext(file)[0]
    return ''.join(part.capitalize() for part in stem.split('_'))
# Emit one PEG rule per keyword list file, e.g.
#   ReservedKeyword <- 'all' /
#   'and' / ...
# The directory listing is sorted: os.listdir returns entries in arbitrary,
# filesystem-dependent order, which would otherwise make the generated
# grammar non-deterministic across machines.
for file in sorted(os.listdir(keywords_dir)):
    if not file.endswith('.list'):
        continue
    rule_name = filename_to_upper_camel(file)
    with open(os.path.join(keywords_dir, file), 'r') as f:
        # Each non-empty line becomes one quoted literal alternative.
        alternatives = [f"'{line.strip()}'" for line in f if line.strip()]
    contents += f"{rule_name} <- " + " /\n".join(alternatives) + "\n"
# Append every statement grammar file. Sorted for deterministic output:
# os.listdir order is filesystem-dependent, so without sorting the inlined
# grammar could differ from machine to machine.
for file in sorted(os.listdir(statements_dir)):
    if not file.endswith('.gram'):
        # Any stray file in the statements directory is a hard error.
        raise Exception(f"File {file} does not end with .gram")
    with open(os.path.join(statements_dir, file), 'r') as f:
        contents += f.read() + "\n"
# --print: dump the assembled grammar to stdout and stop.
if args.print:
    print(contents)
    exit(0)

# --grammar-file: write a raw .gram file alongside the header target and stop.
if args.grammar_file:
    with open(target_file.replace('.hpp', '.gram'), 'w+') as grammar_out:
        grammar_out.write(contents)
    exit(0)
def get_grammar_bytes(contents, add_null_terminator=True):
    """Render grammar text as C++ string-literal lines for a char[] initializer.

    Each non-empty input line becomes one tab-indented, double-quoted line
    with backslashes and double quotes escaped and a literal '\\n' escape
    appended, followed by a real newline. Empty lines are dropped.

    NOTE(review): add_null_terminator is accepted for interface compatibility
    but is currently unused -- presumably because a C++ string-literal
    initializer already supplies the trailing NUL; confirm before removing.
    """
    # Build via join instead of repeated += (which is quadratic in the worst
    # case); behavior is unchanged.
    escaped_lines = (
        '\t"' + line.replace('\\', '\\\\').replace('"', '\\"') + '\\n"\n'
        for line in contents.split('\n')
        if line
    )
    return "".join(escaped_lines)
# Write the C++ header that inlines the grammar as a char-array initializer.
HEADER_PREFIX = (
    "/* THIS FILE WAS AUTOMATICALLY GENERATED BY inline_grammar.py */\n"
    "#pragma once\n"
    "namespace duckdb {\n"
    "const char INLINED_PEG_GRAMMAR[] = {\n"
)
HEADER_SUFFIX = "\n};\n} // namespace duckdb\n"

with open(target_file, 'w+') as header_out:
    header_out.write(HEADER_PREFIX)
    header_out.write(get_grammar_bytes(contents))
    header_out.write(HEADER_SUFFIX)