395 lines
8.8 KiB
C++
395 lines
8.8 KiB
C++
#include "tokenizer.hpp"
|
|
|
|
#include "duckdb/common/printer.hpp"
|
|
#include "duckdb/common/string_util.hpp"
|
|
|
|
namespace duckdb {
|
|
|
|
// Creates a tokenizer over `sql` that emits its tokens into `tokens`.
// Both members are initialized directly from the arguments; no scanning happens here.
// NOTE(review): the member declarations live in tokenizer.hpp (not visible here) - if they are
// references, both arguments must outlive the tokenizer; confirm.
BaseTokenizer::BaseTokenizer(const string &sql, vector<MatcherToken> &tokens)
    : sql(sql),
      tokens(tokens) {
}
|
|
|
|
static bool OperatorEquals(const char *str, const char *op, idx_t len, idx_t &op_len) {
|
|
for (idx_t i = 0; i < len; i++) {
|
|
if (str[i] != op[i]) {
|
|
return false;
|
|
}
|
|
}
|
|
op_len = len;
|
|
return true;
|
|
}
|
|
|
|
// Checks whether a multi-byte operator starts at byte offset `pos` of `sql`.
// Recognized operators are the three-byte "->>" and the two-byte "::", ":=", "->", "**" and "//".
// On a match the operator's length in bytes is written to `op_len` and true is returned;
// otherwise false is returned and `op_len` is not modified.
bool BaseTokenizer::IsSpecialOperator(idx_t pos, idx_t &op_len) const {
	static constexpr const char *TWO_BYTE_OPERATORS[] = {"::", ":=", "->", "**", "//"};

	const char *candidate = sql.c_str() + pos;
	const auto sql_size = sql.size();

	// the three-byte operator is tried first so that "->>" is not split into "->" and ">"
	if (pos + 2 < sql_size && OperatorEquals(candidate, "->>", 3, op_len)) {
		return true;
	}
	if (pos + 1 >= sql_size) {
		// 2-byte operators are out-of-bounds
		return false;
	}
	for (const char *two_byte_operator : TWO_BYTE_OPERATORS) {
		if (OperatorEquals(candidate, two_byte_operator, 2, op_len)) {
			return true;
		}
	}
	return false;
}
|
|
|
|
// Returns true when `c` is one of the characters that always forms a token on its own:
// ( ) { } [ ] , ? $ + - #
bool BaseTokenizer::IsSingleByteOperator(char c) {
	static constexpr char SINGLE_BYTE_OPERATORS[] = {'(', ')', '{', '}', '[', ']', ',', '?', '$', '+', '-', '#'};
	for (const char single_byte_operator : SINGLE_BYTE_OPERATORS) {
		if (single_byte_operator == c) {
			return true;
		}
	}
	return false;
}
|
|
|
|
// Returns true when `c` is a decimal digit or a '.'.
// TokenizeInput uses this both to detect the start of a numeric literal and to decide
// which character a numeric literal is allowed to end on.
bool BaseTokenizer::CharacterIsInitialNumber(char c) {
	const bool is_digit = c >= '0' && c <= '9';
	return is_digit || c == '.';
}
|
|
|
|
bool BaseTokenizer::CharacterIsNumber(char c) {
|
|
if (CharacterIsInitialNumber(c)) {
|
|
return true;
|
|
}
|
|
switch (c) {
|
|
case 'e': // exponents
|
|
case 'E':
|
|
case '-':
|
|
case '+':
|
|
case '_':
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Returns true for the characters ' - ; " and .
// CharacterIsKeyword and CharacterIsOperator both reject these characters, so reaching one
// always ends the keyword or operator token that is being scanned.
bool BaseTokenizer::CharacterIsControlFlow(char c) {
	return c == '\'' || c == '-' || c == ';' || c == '"' || c == '.';
}
|
|
|
|
bool BaseTokenizer::CharacterIsKeyword(char c) {
|
|
if (IsSingleByteOperator(c)) {
|
|
return false;
|
|
}
|
|
if (StringUtil::CharacterIsOperator(c)) {
|
|
return false;
|
|
}
|
|
if (StringUtil::CharacterIsSpace(c)) {
|
|
return false;
|
|
}
|
|
if (CharacterIsControlFlow(c)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool BaseTokenizer::CharacterIsOperator(char c) {
|
|
if (IsSingleByteOperator(c)) {
|
|
return false;
|
|
}
|
|
if (CharacterIsControlFlow(c)) {
|
|
return false;
|
|
}
|
|
return StringUtil::CharacterIsOperator(c);
|
|
}
|
|
|
|
// Appends the text sql[start, end) to `tokens`, recording `start` alongside it.
// Empty or inverted ranges (start >= end) are ignored.
void BaseTokenizer::PushToken(idx_t start, idx_t end) {
	if (start < end) {
		tokens.emplace_back(sql.substr(start, end - start), start);
	}
}
|
|
|
|
// Returns true when `c` may appear in the tag of a dollar-quoted string ($tag$ ... $tag$):
// an ASCII letter or any byte in the range \200-\377 (bytes with the high bit set).
// NOTE(review): PostgreSQL-style tags also allow '_' and, after the first character, digits;
// both are rejected here - confirm whether that is intentional.
bool BaseTokenizer::IsValidDollarTagCharacter(char c) {
	const bool is_upper = c >= 'A' && c <= 'Z';
	const bool is_lower = c >= 'a' && c <= 'z';
	const bool is_high_byte = c >= '\200' && c <= '\377';
	return is_upper || is_lower || is_high_byte;
}
|
|
|
|
bool BaseTokenizer::TokenizeInput() {
|
|
auto state = TokenizeState::STANDARD;
|
|
|
|
idx_t last_pos = 0;
|
|
string dollar_quote_marker;
|
|
for (idx_t i = 0; i < sql.size(); i++) {
|
|
auto c = sql[i];
|
|
switch (state) {
|
|
case TokenizeState::STANDARD:
|
|
if (c == '\'') {
|
|
state = TokenizeState::STRING_LITERAL;
|
|
last_pos = i;
|
|
break;
|
|
}
|
|
if (c == '"') {
|
|
state = TokenizeState::QUOTED_IDENTIFIER;
|
|
last_pos = i;
|
|
break;
|
|
}
|
|
if (c == ';') {
|
|
// end of statement
|
|
OnStatementEnd(i);
|
|
last_pos = i + 1;
|
|
break;
|
|
}
|
|
if (c == '$') {
|
|
// Dollar-quoted string statement
|
|
if (i + 1 >= sql.size()) {
|
|
// We need more than a single dollar
|
|
break;
|
|
}
|
|
if (sql[i + 1] >= '0' && sql[i + 1] <= '9') {
|
|
// $[numeric] is a parameter, not a dollar-quoted string
|
|
break;
|
|
}
|
|
// Dollar-quoted string
|
|
last_pos = i;
|
|
// Scan until next $
|
|
idx_t next_dollar = 0;
|
|
for (idx_t idx = i + 1; idx < sql.size(); idx++) {
|
|
if (sql[idx] == '$') {
|
|
next_dollar = idx;
|
|
break;
|
|
}
|
|
if (!IsValidDollarTagCharacter(sql[idx])) {
|
|
break;
|
|
}
|
|
}
|
|
if (next_dollar == 0) {
|
|
break;
|
|
}
|
|
state = TokenizeState::DOLLAR_QUOTED_STRING;
|
|
last_pos = i;
|
|
i = next_dollar;
|
|
if (i < sql.size()) {
|
|
// Found a complete marker, store it.
|
|
idx_t marker_start = last_pos + 1;
|
|
dollar_quote_marker = string(sql.begin() + marker_start, sql.begin() + i);
|
|
}
|
|
break;
|
|
}
|
|
if (c == '-' && i + 1 < sql.size() && sql[i + 1] == '-') {
|
|
i++;
|
|
state = TokenizeState::SINGLE_LINE_COMMENT;
|
|
break;
|
|
}
|
|
if (c == '/' && i + 1 < sql.size() && sql[i + 1] == '*') {
|
|
i++;
|
|
state = TokenizeState::MULTI_LINE_COMMENT;
|
|
break;
|
|
}
|
|
if (StringUtil::CharacterIsSpace(c)) {
|
|
// space character - skip
|
|
last_pos = i + 1;
|
|
break;
|
|
}
|
|
idx_t op_len;
|
|
if (IsSpecialOperator(i, op_len)) {
|
|
// special operator - push the special operator
|
|
tokens.emplace_back(sql.substr(i, op_len), last_pos);
|
|
i += op_len - 1;
|
|
last_pos = i + 1;
|
|
break;
|
|
}
|
|
if (IsSingleByteOperator(c)) {
|
|
// single-byte operator - directly push the token
|
|
tokens.emplace_back(string(1, c), last_pos);
|
|
last_pos = i + 1;
|
|
break;
|
|
}
|
|
if (CharacterIsInitialNumber(c)) {
|
|
// parse a numeric literal
|
|
state = TokenizeState::NUMERIC;
|
|
last_pos = i;
|
|
break;
|
|
}
|
|
if (StringUtil::CharacterIsOperator(c)) {
|
|
state = TokenizeState::OPERATOR;
|
|
last_pos = i;
|
|
break;
|
|
}
|
|
state = TokenizeState::KEYWORD;
|
|
last_pos = i;
|
|
break;
|
|
case TokenizeState::NUMERIC:
|
|
// numeric literal - check if this is still numeric
|
|
if (!CharacterIsNumber(c)) {
|
|
// not a number - return to standard state
|
|
// number must END with initial number
|
|
// i.e. we accept "_" in numbers (1_1), but "1_" is tokenized as the number "1" followed by the keyword
|
|
// "_" backtrack until it does
|
|
while (!CharacterIsInitialNumber(sql[i - 1])) {
|
|
i--;
|
|
}
|
|
PushToken(last_pos, i);
|
|
state = TokenizeState::STANDARD;
|
|
last_pos = i;
|
|
i--;
|
|
}
|
|
break;
|
|
case TokenizeState::OPERATOR:
|
|
// operator literal - check if this is still an operator
|
|
if (!CharacterIsOperator(c)) {
|
|
// not an operator - return to standard state
|
|
PushToken(last_pos, i);
|
|
state = TokenizeState::STANDARD;
|
|
last_pos = i;
|
|
i--;
|
|
}
|
|
break;
|
|
case TokenizeState::KEYWORD:
|
|
// keyword - check if this is still a keyword
|
|
if (!CharacterIsKeyword(c)) {
|
|
// not a keyword - return to standard state
|
|
PushToken(last_pos, i);
|
|
state = TokenizeState::STANDARD;
|
|
last_pos = i;
|
|
i--;
|
|
}
|
|
break;
|
|
case TokenizeState::STRING_LITERAL:
|
|
if (c == '\'') {
|
|
if (i + 1 < sql.size() && sql[i + 1] == '\'') {
|
|
// escaped - skip escape
|
|
i++;
|
|
} else {
|
|
PushToken(last_pos, i + 1);
|
|
last_pos = i + 1;
|
|
state = TokenizeState::STANDARD;
|
|
}
|
|
}
|
|
break;
|
|
case TokenizeState::QUOTED_IDENTIFIER:
|
|
if (c == '"') {
|
|
if (i + 1 < sql.size() && sql[i + 1] == '"') {
|
|
// escaped - skip escape
|
|
i++;
|
|
} else {
|
|
PushToken(last_pos, i + 1);
|
|
last_pos = i + 1;
|
|
state = TokenizeState::STANDARD;
|
|
}
|
|
}
|
|
break;
|
|
case TokenizeState::SINGLE_LINE_COMMENT:
|
|
if (c == '\n' || c == '\r') {
|
|
last_pos = i + 1;
|
|
state = TokenizeState::STANDARD;
|
|
}
|
|
break;
|
|
case TokenizeState::MULTI_LINE_COMMENT:
|
|
if (c == '*' && i + 1 < sql.size() && sql[i + 1] == '/') {
|
|
i++;
|
|
last_pos = i + 1;
|
|
state = TokenizeState::STANDARD;
|
|
}
|
|
break;
|
|
case TokenizeState::DOLLAR_QUOTED_STRING: {
|
|
// Dollar-quoted string -- all that will get us out is a $[marker]$
|
|
if (c != '$') {
|
|
break;
|
|
}
|
|
if (i + 1 >= sql.size()) {
|
|
// No room for the final dollar
|
|
break;
|
|
}
|
|
// Skip to the next dollar symbol
|
|
idx_t start = i + 1;
|
|
idx_t end = start;
|
|
while (end < sql.size() && sql[end] != '$') {
|
|
end++;
|
|
}
|
|
if (end >= sql.size()) {
|
|
// No final dollar, continue as normal
|
|
break;
|
|
}
|
|
if (end - start != dollar_quote_marker.size()) {
|
|
// Length mismatch, cannot match
|
|
break;
|
|
}
|
|
if (sql.compare(start, dollar_quote_marker.size(), dollar_quote_marker) != 0) {
|
|
// marker mismatch
|
|
break;
|
|
}
|
|
// Marker found! Revert to standard state
|
|
size_t full_marker_len = dollar_quote_marker.size() + 2;
|
|
string quoted = sql.substr(last_pos, (start + dollar_quote_marker.size() + 1) - last_pos);
|
|
quoted = "'" + quoted.substr(full_marker_len, quoted.size() - 2 * full_marker_len) + "'";
|
|
tokens.emplace_back(quoted, full_marker_len);
|
|
dollar_quote_marker = string();
|
|
state = TokenizeState::STANDARD;
|
|
i = end;
|
|
last_pos = i + 1;
|
|
break;
|
|
}
|
|
default:
|
|
throw InternalException("unrecognized tokenize state");
|
|
}
|
|
}
|
|
|
|
// finished processing - check the final state
|
|
switch (state) {
|
|
case TokenizeState::STRING_LITERAL:
|
|
last_pos++;
|
|
break;
|
|
case TokenizeState::SINGLE_LINE_COMMENT:
|
|
case TokenizeState::MULTI_LINE_COMMENT:
|
|
// no suggestions in comments
|
|
return false;
|
|
default:
|
|
break;
|
|
}
|
|
string last_word = sql.substr(last_pos, sql.size() - last_pos);
|
|
OnLastToken(state, std::move(last_word), last_pos);
|
|
return true;
|
|
}
|
|
|
|
// Invoked by TokenizeInput when it reaches a ';' outside of strings, identifiers and comments;
// `pos` is the position of that ';'. This implementation ignores `pos` and discards every
// token collected so far.
void BaseTokenizer::OnStatementEnd(idx_t pos) {
	tokens.clear();
}
|
|
|
|
} // namespace duckdb
|