should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,194 @@
#include "parser/peg_parser.hpp"
namespace duckdb {
void PEGParser::AddRule(string_t rule_name, PEGRule rule) {
auto entry = rules.find(rule_name.GetString());
if (entry != rules.end()) {
throw InternalException("Failed to parse grammar - duplicate rule name %s", rule_name.GetString());
}
rules.insert(make_pair(rule_name, std::move(rule)));
}
void PEGParser::ParseRules(const char *grammar) {
string_t rule_name;
PEGRule rule;
PEGParseState parse_state = PEGParseState::RULE_NAME;
idx_t bracket_count = 0;
bool in_or_clause = false;
// look for the rules
idx_t c = 0;
while (grammar[c]) {
if (grammar[c] == '#') {
// comment - ignore until EOL
while (grammar[c] && !StringUtil::CharacterIsNewline(grammar[c])) {
c++;
}
continue;
}
if (parse_state == PEGParseState::RULE_DEFINITION && StringUtil::CharacterIsNewline(grammar[c]) &&
bracket_count == 0 && !in_or_clause && !rule.tokens.empty()) {
// if we see a newline while we are parsing a rule definition we can complete the rule
AddRule(rule_name, std::move(rule));
rule_name = string_t();
rule.Clear();
// look for the subsequent rule
parse_state = PEGParseState::RULE_NAME;
c++;
continue;
}
if (StringUtil::CharacterIsSpace(grammar[c])) {
// skip whitespace
c++;
continue;
}
switch (parse_state) {
case PEGParseState::RULE_NAME: {
// look for alpha-numerics
idx_t start_pos = c;
if (grammar[c] == '%') {
// rules can start with % (%whitespace)
c++;
}
while (grammar[c] && StringUtil::CharacterIsAlphaNumeric(grammar[c])) {
c++;
}
if (c == start_pos) {
throw InternalException("Failed to parse grammar - expected an alpha-numeric rule name (pos %d)", c);
}
rule_name = string_t(grammar + start_pos, c - start_pos);
rule.Clear();
parse_state = PEGParseState::RULE_SEPARATOR;
break;
}
case PEGParseState::RULE_SEPARATOR: {
if (grammar[c] == '(') {
if (!rule.parameters.empty()) {
throw InternalException("Failed to parse grammar - multiple parameters at position %d", c);
}
// parameter
c++;
idx_t parameter_start = c;
while (grammar[c] && StringUtil::CharacterIsAlphaNumeric(grammar[c])) {
c++;
}
if (parameter_start == c) {
throw InternalException("Failed to parse grammar - expected a parameter at position %d", c);
}
rule.parameters.insert(
make_pair(string_t(grammar + parameter_start, c - parameter_start), rule.parameters.size()));
if (grammar[c] != ')') {
throw InternalException("Failed to parse grammar - expected closing bracket at position %d", c);
}
c++;
} else {
if (grammar[c] != '<' || grammar[c + 1] != '-') {
throw InternalException("Failed to parse grammar - expected a rule definition (<-) (pos %d)", c);
}
c += 2;
parse_state = PEGParseState::RULE_DEFINITION;
}
break;
}
case PEGParseState::RULE_DEFINITION: {
// we parse either:
// (1) a literal ('Keyword'i)
// (2) a rule reference (Rule)
// (3) an operator ( '(' '/' '?' '*' ')')
in_or_clause = false;
if (grammar[c] == '\'') {
// parse literal
c++;
idx_t literal_start = c;
while (grammar[c] && grammar[c] != '\'') {
if (grammar[c] == '\\') {
// escape
c++;
}
c++;
}
if (!grammar[c]) {
throw InternalException("Failed to parse grammar - did not find closing ' (pos %d)", c);
}
PEGToken token;
token.text = string_t(grammar + literal_start, c - literal_start);
token.type = PEGTokenType::LITERAL;
rule.tokens.push_back(token);
c++;
} else if (StringUtil::CharacterIsAlphaNumeric(grammar[c])) {
// alphanumeric character - this is a rule reference
idx_t rule_start = c;
while (grammar[c] && StringUtil::CharacterIsAlphaNumeric(grammar[c])) {
c++;
}
PEGToken token;
token.text = string_t(grammar + rule_start, c - rule_start);
if (grammar[c] == '(') {
// this is a function call
c++;
bracket_count++;
token.type = PEGTokenType::FUNCTION_CALL;
} else {
token.type = PEGTokenType::REFERENCE;
}
rule.tokens.push_back(token);
} else if (grammar[c] == '[' || grammar[c] == '<') {
// regular expression- [^"] or <...>
idx_t rule_start = c;
char final_char = grammar[c] == '[' ? ']' : '>';
while (grammar[c] && grammar[c] != final_char) {
if (grammar[c] == '\\') {
// handle escapes
c++;
}
if (grammar[c]) {
c++;
}
}
c++;
PEGToken token;
token.text = string_t(grammar + rule_start, c - rule_start);
token.type = PEGTokenType::REGEX;
rule.tokens.push_back(token);
} else if (IsPEGOperator(grammar[c])) {
if (grammar[c] == '(') {
bracket_count++;
} else if (grammar[c] == ')') {
if (bracket_count == 0) {
throw InternalException("Failed to parse grammar - unclosed bracket at position %d in rule %s",
c, rule_name.GetString());
}
bracket_count--;
} else if (grammar[c] == '/') {
in_or_clause = true;
}
// operator - operators are always length 1
PEGToken token;
token.text = string_t(grammar + c, 1);
token.type = PEGTokenType::OPERATOR;
rule.tokens.push_back(token);
c++;
} else {
throw InternalException("Unrecognized rule contents in rule %s (character %s)", rule_name.GetString(),
string(1, grammar[c]));
}
}
default:
break;
}
if (!grammar[c]) {
break;
}
}
if (parse_state == PEGParseState::RULE_SEPARATOR) {
throw InternalException("Failed to parse grammar - rule %s does not have a definition", rule_name.GetString());
}
if (parse_state == PEGParseState::RULE_DEFINITION) {
if (rule.tokens.empty()) {
throw InternalException("Failed to parse grammar - rule %s is empty", rule_name.GetString());
}
AddRule(rule_name, std::move(rule));
}
}
} // namespace duckdb