Files
email-tracker/external/duckdb/extension/autocomplete/tokenizer.cpp
2025-10-24 19:21:19 -05:00

395 lines
8.8 KiB
C++

#include "tokenizer.hpp"
#include "duckdb/common/printer.hpp"
#include "duckdb/common/string_util.hpp"
namespace duckdb {
// Binds the tokenizer to the SQL text it will scan and to the caller-owned vector that receives the tokens.
// No tokenization happens here - call TokenizeInput() to do the actual work.
BaseTokenizer::BaseTokenizer(const string &sql, vector<MatcherToken> &tokens)
    : sql(sql),      // text to tokenize
      tokens(tokens) // output: tokens are appended here (and cleared again by OnStatementEnd)
{
}
static bool OperatorEquals(const char *str, const char *op, idx_t len, idx_t &op_len) {
for (idx_t i = 0; i < len; i++) {
if (str[i] != op[i]) {
return false;
}
}
op_len = len;
return true;
}
// Checks whether one of the multi-byte "special" operators starts at offset `pos` of the SQL text.
//
// The recognized operators are the 3-byte "->>" and the 2-byte "::", ":=", "->", "**" and "//".
//
// pos:    offset into `sql` at which to look
// op_len: receives the length (in bytes) of the operator that was found; only written on success
//
// Returns true if a special operator starts at `pos`, false otherwise (including when too few bytes remain).
bool BaseTokenizer::IsSpecialOperator(idx_t pos, idx_t &op_len) const {
	const char *candidate = sql.c_str() + pos;
	const idx_t sql_len = sql.size();

	// the 3-byte operator has to be tried first: "->" is a prefix of "->>"
	if (pos + 2 < sql_len && OperatorEquals(candidate, "->>", 3, op_len)) {
		return true;
	}
	if (pos + 1 >= sql_len) {
		// fewer than two bytes left - no 2-byte operator can start here
		return false;
	}
	static constexpr const char *TWO_BYTE_OPERATORS[] = {"::", ":=", "->", "**", "//"};
	for (const char *op : TWO_BYTE_OPERATORS) {
		if (OperatorEquals(candidate, op, 2, op_len)) {
			return true;
		}
	}
	return false;
}
// Returns true if `c` is one of the characters that always forms a token on its own:
// brackets of all three kinds, ',', '?', '$', '+', '-' and '#'.
bool BaseTokenizer::IsSingleByteOperator(char c) {
	static constexpr char SINGLE_BYTE_OPERATORS[] = {'(', ')', '{', '}', '[', ']', ',', '?', '$', '+', '-', '#'};
	for (const char candidate : SINGLE_BYTE_OPERATORS) {
		if (candidate == c) {
			return true;
		}
	}
	return false;
}
// Returns true if `c` can start a numeric literal: an ASCII digit or a '.'.
// The same set of characters is what TokenizeInput requires a numeric literal to end with.
bool BaseTokenizer::CharacterIsInitialNumber(char c) {
	const bool is_digit = c >= '0' && c <= '9';
	return is_digit || c == '.';
}
// Returns true if `c` may appear inside a numeric literal once it has started.
// On top of the initial characters (digits and '.') this accepts:
//   'e' / 'E' - exponent marker
//   '-' / '+' - sign characters
//   '_'       - digit separator (as in 1_000)
bool BaseTokenizer::CharacterIsNumber(char c) {
	return CharacterIsInitialNumber(c) || c == 'e' || c == 'E' || c == '-' || c == '+' || c == '_';
}
// Returns true for the characters that terminate a keyword or operator token:
// the two quote characters (' and "), '-', ';' and '.'.
bool BaseTokenizer::CharacterIsControlFlow(char c) {
	const bool is_quote = c == '\'' || c == '"';
	return is_quote || c == '-' || c == ';' || c == '.';
}
// Returns true if `c` can be part of a keyword/identifier token.
// This is defined by exclusion: anything that is not a single-byte operator, not an operator character,
// not whitespace and not a control-flow character counts as a keyword character.
bool BaseTokenizer::CharacterIsKeyword(char c) {
	const bool ends_keyword = IsSingleByteOperator(c) || StringUtil::CharacterIsOperator(c) ||
	                          StringUtil::CharacterIsSpace(c) || CharacterIsControlFlow(c);
	return !ends_keyword;
}
// Returns true if `c` can extend a multi-character operator token.
// Single-byte operators and control-flow characters are excluded even if StringUtil considers them operator
// characters, because the tokenizer handles those separately.
bool BaseTokenizer::CharacterIsOperator(char c) {
	return !IsSingleByteOperator(c) && !CharacterIsControlFlow(c) && StringUtil::CharacterIsOperator(c);
}
// Appends the text in the half-open byte range [start, end) of `sql` to `tokens`, recording `start` alongside it.
// An empty or inverted range is silently ignored.
void BaseTokenizer::PushToken(idx_t start, idx_t end) {
	if (end <= start) {
		// nothing to push
		return;
	}
	const idx_t token_len = end - start;
	tokens.emplace_back(sql.substr(start, token_len), start);
}
// Returns true if `c` may appear in the tag of a dollar-quoted string (the part between the two '$' of "$tag$").
// Accepted are ASCII letters and bytes in the octal range \200-\377 (i.e. bytes with the high bit set).
// Digits and '_' are not accepted by this check.
bool BaseTokenizer::IsValidDollarTagCharacter(char c) {
	const bool is_upper = c >= 'A' && c <= 'Z';
	const bool is_lower = c >= 'a' && c <= 'z';
	// compared against char literals so the check holds regardless of whether `char` is signed
	const bool is_high_byte = c >= '\200' && c <= '\377';
	return is_upper || is_lower || is_high_byte;
}
// Splits `sql` into tokens with a single-pass state machine, appending completed tokens to `tokens`.
//
// Every byte is interpreted according to the current TokenizeState:
//   STANDARD                       - decide which kind of token (if any) starts at this byte
//   NUMERIC / OPERATOR / KEYWORD   - extend the current token; the first byte that does not belong to it ends the
//                                    token and is re-processed in STANDARD (that is what the `i--` is for)
//   STRING_LITERAL /
//   QUOTED_IDENTIFIER              - run to the closing quote; a doubled quote is an escape and is skipped
//   SINGLE_LINE_COMMENT /
//   MULTI_LINE_COMMENT             - skip bytes until the comment ends; comments never produce tokens
//   DOLLAR_QUOTED_STRING           - run to the closing $marker$; the content is emitted as a '...' literal
//
// `last_pos` is the offset at which the token currently being built starts. A ';' in STANDARD state triggers
// OnStatementEnd. Whatever text is left unfinished at the end of the input (from `last_pos` on) is NOT pushed as a
// token; it is handed to OnLastToken together with the final state.
//
// Returns false if the input ends inside a comment (OnLastToken is not called in that case), true otherwise.
// Throws InternalException if an unknown TokenizeState is encountered.
bool BaseTokenizer::TokenizeInput() {
	auto state = TokenizeState::STANDARD;
	idx_t last_pos = 0;
	// tag of the dollar-quoted string currently being scanned (empty for a plain "$$")
	string dollar_quote_marker;
	for (idx_t i = 0; i < sql.size(); i++) {
		auto c = sql[i];
		switch (state) {
		case TokenizeState::STANDARD:
			if (c == '\'') {
				state = TokenizeState::STRING_LITERAL;
				last_pos = i;
				break;
			}
			if (c == '"') {
				state = TokenizeState::QUOTED_IDENTIFIER;
				last_pos = i;
				break;
			}
			if (c == ';') {
				// end of statement
				OnStatementEnd(i);
				last_pos = i + 1;
				break;
			}
			if (c == '$') {
				// potential start of a dollar-quoted string
				// NOTE: in every early-out below the '$' itself is skipped without producing a token
				if (i + 1 >= sql.size()) {
					// we need more than a single dollar
					break;
				}
				if (sql[i + 1] >= '0' && sql[i + 1] <= '9') {
					// $[numeric] is a parameter, not a dollar-quoted string
					break;
				}
				last_pos = i;
				// scan until the next '$' - everything in between must be a valid tag character
				// 0 doubles as "not found": the scan starts at i + 1, so a real hit is always >= 1
				idx_t next_dollar = 0;
				for (idx_t idx = i + 1; idx < sql.size(); idx++) {
					if (sql[idx] == '$') {
						next_dollar = idx;
						break;
					}
					if (!IsValidDollarTagCharacter(sql[idx])) {
						break;
					}
				}
				if (next_dollar == 0) {
					// no complete opening marker - not a dollar-quoted string
					break;
				}
				// found a complete opening marker: remember the tag and continue after its closing '$'
				state = TokenizeState::DOLLAR_QUOTED_STRING;
				idx_t marker_start = last_pos + 1;
				dollar_quote_marker = sql.substr(marker_start, next_dollar - marker_start);
				i = next_dollar;
				break;
			}
			if (c == '-' && i + 1 < sql.size() && sql[i + 1] == '-') {
				i++;
				state = TokenizeState::SINGLE_LINE_COMMENT;
				break;
			}
			if (c == '/' && i + 1 < sql.size() && sql[i + 1] == '*') {
				i++;
				state = TokenizeState::MULTI_LINE_COMMENT;
				break;
			}
			if (StringUtil::CharacterIsSpace(c)) {
				// space character - skip
				last_pos = i + 1;
				break;
			}
			// deliberately left uninitialized: an initializer here would be jumped over by the case labels below
			idx_t op_len;
			if (IsSpecialOperator(i, op_len)) {
				// special operator - push the special operator
				tokens.emplace_back(sql.substr(i, op_len), last_pos);
				i += op_len - 1;
				last_pos = i + 1;
				break;
			}
			if (IsSingleByteOperator(c)) {
				// single-byte operator - directly push the token
				tokens.emplace_back(string(1, c), last_pos);
				last_pos = i + 1;
				break;
			}
			if (CharacterIsInitialNumber(c)) {
				// parse a numeric literal
				state = TokenizeState::NUMERIC;
				last_pos = i;
				break;
			}
			if (StringUtil::CharacterIsOperator(c)) {
				state = TokenizeState::OPERATOR;
				last_pos = i;
				break;
			}
			// anything else starts a keyword/identifier
			state = TokenizeState::KEYWORD;
			last_pos = i;
			break;
		case TokenizeState::NUMERIC:
			// numeric literal - check if this is still numeric
			if (!CharacterIsNumber(c)) {
				// not a number - return to standard state
				// a number must END with an initial number character (digit or '.')
				// i.e. we accept "_" in numbers (1_1), but "1_" is tokenized as the number "1" followed by the
				// keyword "_" - backtrack until the literal ends in an initial number character
				// this terminates at the latest at last_pos, which is an initial number character by construction
				while (!CharacterIsInitialNumber(sql[i - 1])) {
					i--;
				}
				PushToken(last_pos, i);
				state = TokenizeState::STANDARD;
				last_pos = i;
				i--;
			}
			break;
		case TokenizeState::OPERATOR:
			// operator literal - check if this is still an operator
			if (!CharacterIsOperator(c)) {
				// not an operator - return to standard state
				PushToken(last_pos, i);
				state = TokenizeState::STANDARD;
				last_pos = i;
				i--;
			}
			break;
		case TokenizeState::KEYWORD:
			// keyword - check if this is still a keyword
			if (!CharacterIsKeyword(c)) {
				// not a keyword - return to standard state
				PushToken(last_pos, i);
				state = TokenizeState::STANDARD;
				last_pos = i;
				i--;
			}
			break;
		case TokenizeState::STRING_LITERAL:
			if (c == '\'') {
				if (i + 1 < sql.size() && sql[i + 1] == '\'') {
					// escaped - skip escape
					i++;
				} else {
					// closing quote - the token includes both quotes
					PushToken(last_pos, i + 1);
					last_pos = i + 1;
					state = TokenizeState::STANDARD;
				}
			}
			break;
		case TokenizeState::QUOTED_IDENTIFIER:
			if (c == '"') {
				if (i + 1 < sql.size() && sql[i + 1] == '"') {
					// escaped - skip escape
					i++;
				} else {
					// closing quote - the token includes both quotes
					PushToken(last_pos, i + 1);
					last_pos = i + 1;
					state = TokenizeState::STANDARD;
				}
			}
			break;
		case TokenizeState::SINGLE_LINE_COMMENT:
			if (c == '\n' || c == '\r') {
				last_pos = i + 1;
				state = TokenizeState::STANDARD;
			}
			break;
		case TokenizeState::MULTI_LINE_COMMENT:
			if (c == '*' && i + 1 < sql.size() && sql[i + 1] == '/') {
				i++;
				last_pos = i + 1;
				state = TokenizeState::STANDARD;
			}
			break;
		case TokenizeState::DOLLAR_QUOTED_STRING: {
			// Dollar-quoted string -- all that will get us out is a $[marker]$
			if (c != '$') {
				break;
			}
			if (i + 1 >= sql.size()) {
				// No room for the final dollar
				break;
			}
			// [start, end) is the candidate tag between this '$' and the next one
			idx_t start = i + 1;
			idx_t end = start;
			while (end < sql.size() && sql[end] != '$') {
				end++;
			}
			if (end >= sql.size()) {
				// No final dollar, continue as normal
				break;
			}
			if (end - start != dollar_quote_marker.size()) {
				// Length mismatch, cannot match
				break;
			}
			if (sql.compare(start, dollar_quote_marker.size(), dollar_quote_marker) != 0) {
				// marker mismatch
				break;
			}
			// Marker found! Revert to standard state
			// `quoted` first holds the full "$tag$content$tag$" text, then the content re-wrapped in single quotes
			size_t full_marker_len = dollar_quote_marker.size() + 2;
			string quoted = sql.substr(last_pos, (start + dollar_quote_marker.size() + 1) - last_pos);
			quoted = "'" + quoted.substr(full_marker_len, quoted.size() - 2 * full_marker_len) + "'";
			// the token is located at the opening '$' (last_pos), like every other token pushed in this function;
			// this used to pass the marker length as the offset, which is not a position in the SQL text
			tokens.emplace_back(std::move(quoted), last_pos);
			dollar_quote_marker = string();
			state = TokenizeState::STANDARD;
			i = end;
			last_pos = i + 1;
			break;
		}
		default:
			throw InternalException("unrecognized tokenize state");
		}
	}
	// finished processing - check the final state
	switch (state) {
	case TokenizeState::STRING_LITERAL:
		// skip the opening quote so the last word only holds the literal's content
		last_pos++;
		break;
	case TokenizeState::SINGLE_LINE_COMMENT:
	case TokenizeState::MULTI_LINE_COMMENT:
		// no suggestions in comments
		return false;
	default:
		break;
	}
	// hand the unfinished trailing text to the subclass hook
	string last_word = sql.substr(last_pos, sql.size() - last_pos);
	OnLastToken(state, std::move(last_word), last_pos);
	return true;
}
// Hook invoked by TokenizeInput whenever a ';' is encountered in the STANDARD state.
// This implementation drops every token collected so far, so that only the tokens of the statement that follows
// remain; the position of the ';' is not used here.
void BaseTokenizer::OnStatementEnd(idx_t pos) {
	(void)pos; // only the fact that a statement ended matters here
	tokens.clear();
}
} // namespace duckdb