should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

13
external/duckdb/third_party/re2/AUTHORS vendored Normal file
View File

@@ -0,0 +1,13 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Samsung Electronics
Stefano Rivera <stefano.rivera@gmail.com>

View File

@@ -0,0 +1,104 @@
# Copyright 2015 The RE2 Authors. All Rights Reserved. Use of this source code
# is governed by a BSD-style license that can be found in the LICENSE file.
cmake_minimum_required(VERSION 3.5...3.29)

# Opt in to the NEW behaviour of the policies below whenever the running
# CMake knows about them.
if(POLICY CMP0048)
  cmake_policy(SET CMP0048 NEW)
endif()
if(POLICY CMP0063)
  cmake_policy(SET CMP0063 NEW)
endif()

project(RE2 LANGUAGES CXX)

# Hide symbols by default.
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

include(CTest)

# CTest's BUILD_TESTING switch cannot be toggled per subproject, so RE2 gets
# its own, independent option that defaults to OFF.
option(RE2_BUILD_TESTING "enable testing for RE2" OFF)

# Start without any extra link libraries.
unset(EXTRA_TARGET_LINK_LIBRARIES)
# Compiler-specific configuration: warning suppression and source encoding
# for MSVC, and the C++11 language level for GCC-compatible compilers.
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
  if(MSVC_VERSION LESS 1900)
    message(FATAL_ERROR "you need Visual Studio 2015 or later")
  endif()
  if(BUILD_SHARED_LIBS)
    # See http://www.kitware.com/blog/home/post/939 for details.
    # CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS needs CMake 3.4, which the
    # cmake_minimum_required() call at the top of this file already
    # guarantees. A second cmake_minimum_required(VERSION 3.4) here would
    # reset the policy settings chosen above and is rejected outright by
    # CMake releases that dropped compatibility with versions below 3.5.
    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
  endif()
  # CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX, so we
  # disable various warnings that aren't particularly helpful.
  add_compile_options(/wd4100
                      /wd4201
                      /wd4456
                      /wd4457
                      /wd4702
                      /wd4815)
  # Without a byte order mark (BOM), Visual Studio assumes that the source file
  # is encoded using the current user code page, so we specify UTF-8.
  add_compile_options(/utf-8)
elseif(CYGWIN OR MINGW)
  # See https://stackoverflow.com/questions/38139631 for details.
  add_compile_options(-std=gnu++11)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
  add_compile_options(-std=c++11)
endif()
# Preprocessor definitions applied to everything compiled in this directory.
add_definitions(-DRE2_ON_VALGRIND)

if(WIN32)
  # Unicode Win32 API, strict type checking, no min/max macros, and no
  # "unsafe CRT/STL function" warnings.
  add_definitions(-DUNICODE
                  -D_UNICODE
                  -DSTRICT
                  -DNOMINMAX
                  -D_CRT_SECURE_NO_WARNINGS
                  -D_SCL_SECURE_NO_WARNINGS)
endif()
# Translation units that make up the library.
set(RE2_SOURCES
    re2/bitmap256.cc
    re2/compile.cc
    re2/bitstate.cc
    re2/dfa.cc
    re2/filtered_re2.cc
    re2/mimics_pcre.cc
    re2/nfa.cc
    re2/onepass.cc
    re2/parse.cc
    re2/perl_groups.cc
    re2/prefilter.cc
    re2/prefilter_tree.cc
    re2/prog.cc
    re2/re2.cc
    re2/regexp.cc
    re2/set.cc
    re2/simplify.cc
    re2/stringpiece.cc
    re2/tostring.cc
    re2/unicode_casefold.cc
    re2/unicode_groups.cc
    util/rune.cc
    util/strutil.cc
)

# The library is always built as a static archive.
add_library(duckdb_re2 STATIC ${RE2_SOURCES})

# Within the build tree, consumers include headers relative to this
# directory (e.g. "re2/re2.h", "util/util.h").
target_include_directories(
  duckdb_re2
  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

# Install the library as part of the export set named by the enclosing
# project.
install(TARGETS duckdb_re2
        EXPORT "${DUCKDB_EXPORT_SET}"
        LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
        ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")

# disable_target_warnings() is not defined in this file; it is expected to be
# provided by the enclosing project.
disable_target_warnings(duckdb_re2)

27
external/duckdb/third_party/re2/LICENSE vendored Normal file
View File

@@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,44 @@
// Copyright 2023 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/bitmap256.h"
#include <stdint.h>
#include "util/util.h"
#include "util/logging.h"
namespace duckdb_re2 {
int Bitmap256::FindNextSetBit(int c) const {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
// Check the word that contains the bit. Mask out any lower bits.
int i = c / 64;
uint64_t word = words_[i] & (~uint64_t{0} << (c % 64));
if (word != 0)
return (i * 64) + FindLSBSet(word);
// Check any following words.
i++;
switch (i) {
case 1:
if (words_[1] != 0)
return (1 * 64) + FindLSBSet(words_[1]);
FALLTHROUGH_INTENDED;
case 2:
if (words_[2] != 0)
return (2 * 64) + FindLSBSet(words_[2]);
FALLTHROUGH_INTENDED;
case 3:
if (words_[3] != 0)
return (3 * 64) + FindLSBSet(words_[3]);
FALLTHROUGH_INTENDED;
default:
return -1;
}
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,86 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_BITMAP256_H_
#define RE2_BITMAP256_H_
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <stdint.h>
#include <string.h>
#include "util/logging.h"
namespace duckdb_re2 {
// A set of 256 bits, stored as four 64-bit words.
class Bitmap256 {
 public:
  // Creates a bitmap with every bit cleared.
  Bitmap256() { Clear(); }

  // Resets every bit to zero.
  void Clear() {
    memset(words_, 0, sizeof(words_));
  }

  // Reports whether bit c is set. c must lie in [0, 255].
  bool Test(int c) const {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);
    const uint64_t mask = uint64_t{1} << (c % 64);
    return (words_[c / 64] & mask) != 0;
  }

  // Turns bit c on. c must lie in [0, 255].
  void Set(int c) {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);
    const uint64_t mask = uint64_t{1} << (c % 64);
    words_[c / 64] |= mask;
  }

  // Returns the index of the lowest set bit whose index is >= c,
  // or -1 when there is no such bit.
  int FindNextSetBit(int c) const;

 private:
  // Returns the index of the least significant set bit of n.
  // n must be non-zero.
  static int FindLSBSet(uint64_t n) {
    DCHECK_NE(n, 0);
#if defined(__GNUC__)
    return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
    unsigned long index;
    _BitScanForward64(&index, n);
    return static_cast<int>(index);
#elif defined(_MSC_VER) && defined(_M_IX86)
    // No 64-bit scan intrinsic is used here: look at the low half first and
    // fall back to the high half.
    unsigned long index;
    const uint32_t low = static_cast<uint32_t>(n);
    if (low != 0) {
      _BitScanForward(&index, low);
      return static_cast<int>(index);
    }
    _BitScanForward(&index, static_cast<uint32_t>(n >> 32));
    return static_cast<int>(index) + 32;
#else
    // Portable fallback: binary search for the lowest set bit by shifting
    // it towards the top of the word in steps of 32, 16, 8, 4, 2 and 1.
    int index = 63;
    int shift = 1 << 5;
    while (shift != 0) {
      const uint64_t shifted = n << shift;
      if (shifted != 0) {
        n = shifted;
        index -= shift;
      }
      shift >>= 1;
    }
    return index;
#endif
  }

  uint64_t words_[4];
};
} // namespace duckdb_re2
#endif // RE2_BITMAP256_H_

View File

@@ -0,0 +1,385 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Similarly to
// testing/backtrack.cc, it allocates a bitmap with (count of
// lists) * (length of text) bits to make sure it never explores the
// same (instruction list, character position) multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <limits>
#include <utility>
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace duckdb_re2 {
// One entry of the BitState work stack.
// When id >= 0, the entry stands for instruction id at the text positions
// p, p+1, ..., p+rle (see BitState::Push and BitState::TrySearch).
// When id < 0, the entry asks TrySearch to restore the capture register
// prog_->inst(-id)->cap() to p.
struct Job {
int id;
int rle; // run length encoding
const char* p;
};
// Backtracking search over a Prog that uses an explicit job stack and a
// visited bitmap so that each (list ID, text position) pair is explored at
// most once.
class BitState {
public:
explicit BitState(Prog* prog);
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Returns true the first time it is asked about the list containing id at
// position p, and marks that pair as visited; returns false afterwards.
inline bool ShouldVisit(int id, const char* p);
// Pushes (id, p) onto job_, growing the stack if it is full.
void Push(int id, const char* p);
// Doubles the capacity of job_, keeping the first njob_ entries.
void GrowStack();
// Runs the search from instruction id at position p.
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece* submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
static constexpr int kVisitedBits = 64;
PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
PODArray<const char*> cap_; // capture registers
PODArray<Job> job_; // stack of text positions to explore
int njob_; // stack size
// Not copyable.
BitState(const BitState&) = delete;
BitState& operator=(const BitState&) = delete;
};
// Binds the searcher to prog. All search parameters start out cleared;
// they are filled in by Search().
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false),
      longest_(false),
      endmatch_(false),
      submatch_(nullptr),
      nsubmatch_(0),
      njob_(0) {}
// id *must* be a list head, which lets us look up the ID of its list.
// Reports whether the (list ID, p) pair still needs to be explored and,
// if it does, marks it as visited so that a later call for the same pair
// returns false.
bool BitState::ShouldVisit(int id, const char* p) {
  // Position of this pair's bit in the visited_ bitmap.
  const int bit = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
                  static_cast<int>(p-text_.data());
  const int word = bit / kVisitedBits;
  const uint64_t mask = uint64_t{1} << (bit & (kVisitedBits-1));
  if ((visited_[word] & mask) != 0)
    return false;
  visited_[word] |= mask;
  return true;
}
// Grow the stack.
void BitState::GrowStack() {
PODArray<Job> tmp(2*job_.size());
memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
job_ = std::move(tmp);
}
// Push (id, p) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p) {
if (njob_ >= job_.size()) {
GrowStack();
if (njob_ >= job_.size()) {
LOG(DFATAL) << "GrowStack() failed: "
<< "njob_ = " << njob_ << ", "
<< "job_.size() = " << job_.size();
return;
}
}
// If id < 0, it's undoing a Capture,
// so we mustn't interfere with that.
if (id >= 0 && njob_ > 0) {
Job* top = &job_[njob_-1];
if (id == top->id &&
p == top->p + top->rle + 1 &&
top->rle < std::numeric_limits<int>::max()) {
++top->rle;
return;
}
}
Job* top = &job_[njob_++];
top->id = id;
top->rle = 0;
top->p = p;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
//
// The search is driven by the explicit stack job_: each popped job is
// followed through the program until it fails, matches, or reaches a
// (list, position) pair that ShouldVisit() reports as already explored.
// Returns true immediately when a match is found and the caller wants no
// submatches, is not looking for the longest match, or the match already
// reaches the end of the text; otherwise keeps exploring and finally
// returns whether any match was recorded in submatch_.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
const char* end = text_.data() + text_.size();
njob_ = 0;
// Push() no longer checks ShouldVisit(),
// so we must perform the check ourselves.
if (ShouldVisit(id0, p0))
Push(id0, p0);
while (njob_ > 0) {
// Pop job off stack.
--njob_;
int id = job_[njob_].id;
// rle refers to the stack slot itself; it is only touched below,
// before any Push() call can reallocate job_.
int& rle = job_[njob_].rle;
const char* p = job_[njob_].p;
if (id < 0) {
// Undo the Capture.
// (Pushed by kInstCapture as Push(-id, old register value).)
cap_[prog_->inst(-id)->cap()] = p;
continue;
}
if (rle > 0) {
// The job covers positions p .. p+rle: handle the last one now and
// leave the job on the stack for the remaining positions.
p += rle;
// Revivify job on stack.
--rle;
++njob_;
}
Loop:
// Visit id, p.
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
return false;
case kInstFail:
break;
case kInstAltMatch:
if (ip->greedy(prog_)) {
// out1 is the Match instruction.
id = ip->out1();
p = end;
goto Loop;
}
if (longest_) {
// ip must be non-greedy...
// out is the Match instruction.
id = ip->out();
p = end;
goto Loop;
}
goto Next;
case kInstByteRange: {
// c stays -1 at the end of the text.
int c = -1;
if (p < end)
c = *p & 0xFF;
if (!ip->Matches(c))
goto Next;
if (ip->hint() != 0)
Push(id+ip->hint(), p); // try the next when we're done
id = ip->out();
p++;
goto CheckAndLoop;
}
case kInstCapture:
if (!ip->last())
Push(id+1, p); // try the next when we're done
if (0 <= ip->cap() && ip->cap() < cap_.size()) {
// Capture p to register, but save old value first.
Push(-id, cap_[ip->cap()]); // undo when we're done
cap_[ip->cap()] = p;
}
id = ip->out();
goto CheckAndLoop;
case kInstEmptyWidth:
// Fail if the instruction requires an empty-width flag that does not
// hold at p.
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
goto Next;
if (!ip->last())
Push(id+1, p); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstNop:
if (!ip->last())
Push(id+1, p); // try the next when we're done
id = ip->out();
CheckAndLoop:
// Sanity check: id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
DCHECK(id == 0 || prog_->inst(id-1)->last());
if (ShouldVisit(id, p))
goto Loop;
break;
case kInstMatch: {
if (endmatch_ && p != end)
goto Next;
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
return true;
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
matched = true;
// cap_[1] is the end of the overall match; Search() stored its start in
// cap_[0].
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] =
StringPiece(cap_[2 * i],
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
// If going for first match, we're done.
if (!longest_)
return true;
// If we used the entire text, no longer match is possible.
if (p == end)
return true;
// Otherwise, continue on in hope of a longer match.
// Note the absence of the ShouldVisit() check here
// due to execution remaining in the same list.
Next:
if (!ip->last()) {
id++;
goto Loop;
}
break;
}
}
}
return matched;
}
// Search text (within context) for prog_.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
// Search parameters.
text_ = text;
context_ = context;
if (context_.data() == NULL)
context_ = text;
if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
return false;
if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
return false;
anchored_ = anchored || prog_->anchor_start();
longest_ = longest || prog_->anchor_end();
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece();
// Allocate scratch space.
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
visited_ = PODArray<uint64_t>(nvisited);
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
int ncap = 2*nsubmatch;
if (ncap < 2)
ncap = 2;
cap_ = PODArray<const char*>(ncap);
memset(cap_.data(), 0, ncap*sizeof cap_[0]);
// When sizeof(Job) == 16, we start with a nice round 1KiB. :)
job_ = PODArray<Job>(64);
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.data();
return TrySearch(prog_->start(), text.data());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
const char* etext = text.data() + text.size();
for (const char* p = text.data(); p <= etext; p++) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
if (p < etext && prog_->can_prefix_accel()) {
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
if (p == NULL)
p = etext;
}
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
if (p == NULL)
break;
}
return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
return false;
return true;
}
} // namespace duckdb_re2

File diff suppressed because it is too large Load Diff

2072
external/duckdb/third_party/re2/re2/dfa.cc vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,137 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/filtered_re2.h"
#include <stddef.h>
#include <string>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace duckdb_re2 {
// Creates an empty, not yet compiled filter backed by a default-constructed
// PrefilterTree.
FilteredRE2::FilteredRE2()
    : compiled_(false),
      prefilter_tree_(new PrefilterTree()) {}
// Creates an empty, not yet compiled filter whose PrefilterTree is
// constructed with min_atom_len.
FilteredRE2::FilteredRE2(int min_atom_len)
    : compiled_(false),
      prefilter_tree_(new PrefilterTree(min_atom_len)) {}
// Deletes every RE2 object held in re2_vec_.
FilteredRE2::~FilteredRE2() {
  for (RE2* re : re2_vec_) {
    delete re;
  }
}
// Move constructor: takes over other's regexps, compiled flag and prefilter
// tree, and resets other to an empty, uncompiled filter with a fresh tree.
FilteredRE2::FilteredRE2(FilteredRE2&& other)
    : re2_vec_(std::move(other.re2_vec_)),
      compiled_(other.compiled_),
      prefilter_tree_(std::move(other.prefilter_tree_)) {
  // Leave the source in a well-defined, reusable state.
  other.compiled_ = false;
  other.prefilter_tree_.reset(new PrefilterTree());
  other.re2_vec_.clear();
  other.re2_vec_.shrink_to_fit();
}
// Move assignment: releases everything this object owns and then takes over
// other's state through the move constructor, which leaves other as an
// empty, uncompiled filter. Self-assignment is a no-op.
FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
  // Destroying *this and then move-constructing from that same, already
  // destroyed object would be undefined behaviour, so self-assignment must
  // be filtered out before the destroy-and-reconstruct below.
  if (this == &other)
    return *this;
  this->~FilteredRE2();
  (void) new (this) FilteredRE2(std::move(other));
  return *this;
}
// Compiles pattern with options and returns the resulting error code.
// On success the regexp is appended to re2_vec_ and *id receives its index.
// On failure the regexp is discarded, *id is left untouched, and the error
// is logged when options.log_errors() is set.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  const RE2::ErrorCode code = re->error_code();

  if (re->ok()) {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
    return code;
  }

  if (options.log_errors()) {
    LOG(ERROR) << "Couldn't compile regular expression, skipping: "
               << pattern << " due to error " << re->error();
  }
  delete re;
  return code;
}
void FilteredRE2::Compile(std::vector<std::string>* atoms) {
if (compiled_) {
LOG(ERROR) << "Compile called already.";
return;
}
if (re2_vec_.empty()) {
LOG(ERROR) << "Compile called before Add.";
return;
}
for (size_t i = 0; i < re2_vec_.size(); i++) {
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
prefilter_tree_->Add(prefilter);
}
atoms->clear();
prefilter_tree_->Compile(atoms);
compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return static_cast<int>(i);
return -1;
}
// Returns the index of the first regexp that passes the prefilter for atoms
// and partially matches text, or -1 when there is none. Logs a DFATAL
// error and returns -1 when Compile() has not been called.
int FilteredRE2::FirstMatch(const StringPiece& text,
                            const std::vector<int>& atoms) const {
  if (!compiled_) {
    LOG(DFATAL) << "FirstMatch called before Compile.";
    return -1;
  }

  // Only the regexps that survive the prefilter are actually run.
  std::vector<int> candidates;
  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
  for (int regexp_id : candidates) {
    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
      return regexp_id;
  }
  return -1;
}
// Clears *matching_regexps, then appends the index of every regexp that
// passes the prefilter for atoms and partially matches text. Returns
// whether at least one regexp matched.
bool FilteredRE2::AllMatches(
    const StringPiece& text,
    const std::vector<int>& atoms,
    std::vector<int>* matching_regexps) const {
  matching_regexps->clear();

  std::vector<int> candidates;
  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
  for (int regexp_id : candidates) {
    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
      matching_regexps->push_back(regexp_id);
  }
  return !matching_regexps->empty();
}
// Asks the prefilter tree which regexps pass the filter for atoms and
// stores their indices in *potential_regexps. No regexp is actually run.
void FilteredRE2::AllPotentials(
const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
// Forwards to PrefilterTree::RegexpsGivenStrings with the same arguments.
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
// Forwards to PrefilterTree::PrintPrefilter for the regexp with index
// regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,120 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string matching
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. Compile returns strings that need to be
// matched. Note that the returned strings are lowercased and distinct.
// For applying regexps to a search text, the caller does the string
// matching using the returned strings. When doing the string match,
// note that the caller has to do that in a case-insensitive way or
// on a lowercased version of the search text. Then call FirstMatch
// or AllMatches with a vector of indices of strings that were found
// in the text to get the actual regexp matches.
#include <memory>
#include <string>
#include <vector>
#include "re2/re2.h"
// Fallback definition of duckdb_base_std::unique_ptr as std::unique_ptr,
// used when DUCKDB_BASE_STD is not defined.
// NOTE(review): presumably DUCKDB_BASE_STD is defined by a DuckDB header
// that provides its own duckdb_base_std namespace -- confirm.
#ifndef DUCKDB_BASE_STD
namespace duckdb_base_std {
using ::std::unique_ptr;
} // namespace duckdb_base_std
#endif
namespace duckdb_re2 {
class PrefilterTree;
// Owns a collection of RE2 objects together with a PrefilterTree that is
// used to narrow down which of them need to be run against a text.
class FilteredRE2 {
public:
FilteredRE2();
// Passes min_atom_len on to the PrefilterTree constructor.
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Not copyable.
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
// Movable.
FilteredRE2(FilteredRE2&& other);
FilteredRE2& operator=(FilteredRE2&& other);
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
// On success, *id is set to the index of the new regexp; on failure,
// *id is left unchanged.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int* id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased and distinct. When doing
// string matching, it should be performed in a case-insensitive
// way or the search text should be lowercased first. Call after
// all Add calls are done.
void Compile(std::vector<std::string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matching_regexps.
// The return value is true if at least one regexp matched.
bool AllMatches(const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
// Get the individual RE2 objects.
// regexpid is used as an index without any bounds check.
const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
private:
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
// The pointers are owned by this object and deleted in the destructor.
std::vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
duckdb_base_std::unique_ptr<PrefilterTree> prefilter_tree_;
};
} // namespace duckdb_re2
#endif // RE2_FILTERED_RE2_H_

View File

@@ -0,0 +1,197 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace duckdb_re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker that decides whether this library treats a regexp exactly the way
// PCRE would. The conditions are listed in the comment at the top of the
// file.
class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}

  virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                         bool* child_args, int nchild_args);

  // Not expected to run, because the walk is started with Walk() rather
  // than WalkExponential(). Logs (outside fuzzing builds) and hands back a.
  virtual bool ShortVisit(Regexp* re, bool a) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    LOG(DFATAL) << "PCREWalker::ShortVisit called";
#endif
    return a;
  }

 private:
  // Not copyable.
  PCREWalker(const PCREWalker&) = delete;
  PCREWalker& operator=(const PCREWalker&) = delete;
};
// Runs once all of re's children have been visited; child_args holds, for
// each child, whether this library mimics PCRE on that subexpression.
// Returns whether the library mimics PCRE on re itself.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  // A single failing child makes the whole expression fail.
  for (int i = 0; i < nchild_args; i++) {
    if (!child_args[i])
      return false;
  }

  // The children are fine; check the operator itself.
  switch (re->op()) {
    // Repetition of something that can match the empty string.
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      return !CanBeEmptyString(re->sub()[0]);

    // Same, for an unbounded counted repetition.
    case kRegexpRepeat:
      return !(re->max() == -1 && CanBeEmptyString(re->sub()[0]));

    // \v
    case kRegexpLiteral:
      return re->rune() != '\v';

    // $ in single-line mode.
    case kRegexpEndText:
    case kRegexpEmptyMatch:
      return !(re->parse_flags() & Regexp::WasDollar);

    // ^ in multi-line mode. There is nothing to test here: in single-line
    // mode ^ becomes kRegexpBeginText.
    case kRegexpBeginLine:
      return false;

    // Not proven guilty.
    default:
      return true;
  }
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker that decides whether a Regexp can match an empty string.
// Overestimating is acceptable: \b\B, for instance, can never match an
// empty string because \b and \B exclude each other, yet this walker is
// not that smart and answers "yes". Such spurious answers only reduce the
// number of regexps that get sanity checked against PCRE; they do not
// break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
 public:
  EmptyStringWalker() {}

  virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                         bool* child_args, int nchild_args);

  // Not expected to run, because the walk is started with Walk() rather
  // than WalkExponential(). Logs (outside fuzzing builds) and hands back a.
  virtual bool ShortVisit(Regexp* re, bool a) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
#endif
    return a;
  }

 private:
  // Not copyable.
  EmptyStringWalker(const EmptyStringWalker&) = delete;
  EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
};
// Runs once re's children have been visited; child_args holds each child's
// own PostVisit result, i.e. whether that child can match an empty string.
// Returns whether re can match an empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  switch (re->op()) {
    // Never empty.
    case kRegexpNoMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      return false;

    // Always empty.
    case kRegexpEmptyMatch:
    // Always empty, when they match.
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    // Can always be empty.
    case kRegexpStar:
    case kRegexpQuest:
    case kRegexpHaveMatch:
      return true;

    // Empty only if every child can be empty.
    case kRegexpConcat: {
      bool all_empty = true;
      for (int i = 0; i < nchild_args && all_empty; i++)
        all_empty = child_args[i];
      return all_empty;
    }

    // Empty as soon as one child can be empty.
    case kRegexpAlternate: {
      bool any_empty = false;
      for (int i = 0; i < nchild_args && !any_empty; i++)
        any_empty = child_args[i];
      return any_empty;
    }

    // Empty exactly when the child can be empty.
    case kRegexpPlus:
    case kRegexpCapture:
      return child_args[0];

    // Empty if the child can be empty or the minimum count is zero.
    case kRegexpRepeat:
      return child_args[0] || re->min() == 0;
  }
  return false;
}
// Reports whether re can match an empty string, as determined by an
// EmptyStringWalker walk over the expression tree.
static bool CanBeEmptyString(Regexp* re) {
  EmptyStringWalker walker;
  const bool can_be_empty = walker.Walk(re, true);
  return can_be_empty;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,674 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <deque>
#include <string>
#include <utility>
#include <vector>
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
namespace duckdb_re2 {
// Executes a compiled Prog as an NFA over an input text, tracking
// submatch (capture) boundaries for every live thread.
// An instance is constructed from a Prog and then driven via Search().
// Copying and assignment are disabled (see the deleted members below).
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds leftmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// One NFA thread: a reference count plus its capture registers.
// Threads live in arena_ and are recycled through freelist_; while a
// thread sits on the free list, the storage of ref is reused as the
// free-list link.
struct Thread {
union {
int ref;
Thread* next; // when on free list
};
// Array of ncapture_ pointers, allocated in AllocThread() and released
// in ~NFA().
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
Thread* t; // if not null, set t0 = t before processing id
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
// Returns a thread with ref == 1, taken from freelist_ if possible and
// otherwise newly created in arena_.
inline Thread* AllocThread();
// Increments t's reference count and returns t.
inline Thread* Incref(Thread* t);
// Decrements t's reference count; at zero, t is pushed onto freelist_.
inline void Decref(Thread* t);
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p);
// Returns text version of capture information, for debugging.
std::string FormatCapture(const char** capture);
// Copies ncapture_ capture pointers from src to dst.
void CopyCapture(const char** dst, const char** src) {
memmove(dst, src, ncapture_*sizeof src[0]);
}
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text (for FormatCapture)
const char* etext_; // end of text (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
PODArray<AddState> stack_; // pre-allocated for AddToThreadq
std::deque<Thread> arena_; // thread arena
Thread* freelist_; // thread freelist
const char** match_; // best match so far
bool matched_; // any match so far?
NFA(const NFA&) = delete;
NFA& operator=(const NFA&) = delete;
};
// Prepares an NFA executor for prog: scalar state is reset to its
// "no search yet" values, both thread queues are sized to the program,
// and the explicit stack used by AddToThreadq() is pre-allocated.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      freelist_(NULL),
      match_(NULL),
      matched_(false) {
  // One queue slot per program instruction.
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // See NFA::AddToThreadq() for why this is so:
  // two pushes per Capture, one per EmptyWidth, one per Nop,
  // plus one slot for the start instruction.
  const int capture_slots = 2*prog_->inst_count(kInstCapture);
  const int empty_slots = prog_->inst_count(kInstEmptyWidth);
  const int nop_slots = prog_->inst_count(kInstNop);
  const int stack_slots = capture_slots + empty_slots + nop_slots + 1;
  stack_ = PODArray<AddState>(stack_slots);
}
// Releases the capture array owned by every thread in the arena, and the
// best-match array.  The Thread objects themselves are owned by arena_.
NFA::~NFA() {
  for (std::deque<Thread>::iterator it = arena_.begin();
       it != arena_.end(); ++it) {
    delete[] it->capture;
  }
  delete[] match_;
}
// Hands out a thread whose reference count is 1.
// A recycled thread from freelist_ is preferred; only when the free list
// is empty is a new Thread appended to arena_ and given a fresh capture
// array of ncapture_ entries.
NFA::Thread* NFA::AllocThread() {
  Thread* t;
  if (freelist_ == NULL) {
    arena_.emplace_back();
    t = &arena_.back();
    t->capture = new const char*[ncapture_];
  } else {
    // Recycled threads keep their old capture array; it is left untouched
    // because the caller will immediately overwrite it.
    t = freelist_;
    freelist_ = t->next;
  }
  t->ref = 1;
  return t;
}
// Takes an additional reference on t and returns t so that the call can
// be used inline in an assignment.  t must not be NULL.
NFA::Thread* NFA::Incref(Thread* t) {
  DCHECK(t != NULL);
  t->ref += 1;
  return t;
}
// Drops one reference on t.  When the last reference goes away the thread
// is pushed onto the front of freelist_ for reuse.  t must not be NULL.
void NFA::Decref(Thread* t) {
  DCHECK(t != NULL);
  t->ref -= 1;
  if (t->ref <= 0) {
    DCHECK_EQ(t->ref, 0);
    // ref and next share storage, so the count is dead from here on.
    t->next = freelist_;
    freelist_ = t;
  }
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
//
// The traversal is iterative: pending work is kept on the pre-allocated
// stack_ and the "Loop" label re-enters the body for the next instruction
// without pushing.  Every visited instruction id gets an entry in q (possibly
// NULL), which doubles as the visited set.
void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0) {
// Instruction id 0 means there is nothing to add.
if (id0 == 0)
return;
// Use stack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// two entries per Capture;
// one entry per EmptyWidth; and
// one entry per Nop.
// This reflects the maximum number of stack pushes that each can
// perform. (Each instruction can be processed at most once.)
AddState* stk = stack_.data();
int nstk = 0;
stk[nstk++] = {id0, NULL};
while (nstk > 0) {
DCHECK_LE(nstk, stack_.size());
AddState a = stk[--nstk];
Loop:
if (a.t != NULL) {
// t0 was a thread that we allocated and copied in order to
// record the capture, so we must now decref it.
Decref(t0);
t0 = a.t;
}
int id = a.id;
// id 0 marks a dummy entry (see kInstCapture) or an absent target.
if (id == 0)
continue;
// Already visited during this call (or an earlier one on the same q).
if (q->has_index(id)) {
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id0 during the recursion.
q->set_new(id, NULL);
Thread** tp = &q->get_existing(id);
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
// Then keep exploring at the following instruction.
DCHECK(!ip->last());
a = {id+1, NULL};
goto Loop;
case kInstNop:
// Remember the next instruction in this list, if any, for later.
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Continue on.
a = {ip->out(), NULL};
goto Loop;
case kInstCapture:
// Remember the next instruction in this list, if any, for later.
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Capture registers at or beyond ncapture_ are not tracked.
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore t0
// once we finish exploring this possibility.
stk[nstk++] = {0, t0};
// Record capture.
t = AllocThread();
CopyCapture(t->capture, t0->capture);
t->capture[j] = p;
t0 = t;
}
a = {ip->out(), NULL};
goto Loop;
case kInstByteRange:
// A non-matching byte leaves the q entry NULL, but the next
// instruction in the list is still tried (see Next below).
if (!ip->Matches(c))
goto Next;
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
// With hint() == 0 exploration of this list stops here;
// otherwise it resumes at id + hint().
if (ip->hint() == 0)
break;
a = {id+ip->hint(), NULL};
goto Loop;
case kInstMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
Next:
// Move on to the following instruction unless this was the last one.
if (ip->last())
break;
a = {id+1, NULL};
goto Loop;
case kInstEmptyWidth:
// Remember the next instruction in this list, if any, for later.
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Continue on if we have all the right flag bits.
if (ip->empty() & ~Prog::EmptyFlags(context, p))
break;
a = {ip->out(), NULL};
goto Loop;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
//
// Returns 0 when there is no shortcut; otherwise the returned value is an
// instruction id taken from a kInstAltMatch instruction.
// On every return path runq has been cleared and each non-NULL thread on it
// has been Decref'd exactly once.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
// NULL entries are placeholders created by AddToThreadq.
if (t == NULL)
continue;
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
Decref(t);
continue;
}
}
int id = i->index();
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
// Should only see the values handled below.
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
// AddToThreadq already checked that this instruction matches the byte,
// so simply follow its out arrow for the next input position.
AddToThreadq(nextq, ip->out(), c, context, p, t);
break;
case kInstAltMatch:
// Only acted upon when this thread is the first entry in runq.
if (i != runq->begin())
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture(match_, t->capture);
matched_ = true;
// Release this thread and every remaining thread on runq.
Decref(t);
for (++i; i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
runq->clear();
// Hand the caller the instruction to resume from (the shortcut).
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch: {
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by storing p instead of p-1. (What would the latter even mean?!)
// This complements the special case in NFA::Search().
if (p == NULL) {
CopyCapture(match_, t->capture);
match_[1] = p;
matched_ = true;
break;
}
// When the match must end at the end of the text, ignore matches
// that end anywhere else.
if (endmatch_ && p-1 != etext_)
break;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
}
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
Decref(t);
for (++i; i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
runq->clear();
return 0;
}
break;
}
}
// Done with this thread for the current byte.
Decref(t);
}
runq->clear();
return 0;
}
std::string NFA::FormatCapture(const char** capture) {
std::string s;
for (int i = 0; i < ncapture_; i+=2) {
if (capture[i] == NULL)
s += "(?,?)";
else if (capture[i+1] == NULL)
s += StringPrintf("(%td,?)",
capture[i] - btext_);
else
s += StringPrintf("(%td,%td)",
capture[i] - btext_,
capture[i+1] - btext_);
}
return s;
}
// Runs the NFA over text and reports whether a match was found.
//
// text          - the bytes to search.
// const_context - the surrounding text used for empty-width assertions;
//                 if its data() is NULL, text is used as the context.
// anchored      - if true, only a match beginning at text.data() is
//                 considered (forced on when the program is anchored at
//                 the start).
// longest       - if true, leftmost-longest semantics are used (forced on
//                 when the program is anchored at the end).
// submatch, nsubmatch - on success, submatch[0..nsubmatch-1] are filled
//                 from the recorded capture positions.
//
// Returns false without searching when the program has no start
// instruction, when text does not lie within the context, when an anchor
// requirement cannot be met, or when nsubmatch is negative.
//
// NOTE(review): match_ is allocated below without freeing any previous
// allocation, so calling Search() twice on one NFA would appear to leak the
// first array; the only caller visible in this file (Prog::SearchNFA) calls
// it once per NFA -- confirm before reusing an instance.
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.data() == NULL)
context = text;
// Sanity check: make sure that text lies within context.
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
LOG(DFATAL) << "context does not contain text";
return false;
}
// An anchored program can only match if text touches the corresponding
// end of the context.
if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
return false;
if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
memset(match_, 0, ncapture_*sizeof match_[0]);
matched_ = false;
// For debugging prints.
btext_ = context.data();
// For convenience.
etext_ = text.data() + text.size();
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
// Loop over the text, stepping the machine.
// The byte passed to Step/AddToThreadq is -1 once p has reached etext_.
for (const char* p = text.data();; p++) {
// This is a no-op the first time around the loop because runq is empty.
int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
// Follow Capture/Nop instructions from the shortcut until the Match,
// recording every tracked capture at the end of the text.
p = etext_;
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
}
break;
}
break;
}
// p has moved past the end of the text: nothing more to consume.
if (p > etext_)
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.data())) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
// The search must be unanchored and there must be zero
// possible matches already.
if (!anchored && runq->size() == 0 &&
p < etext_ && prog_->can_prefix_accel()) {
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
if (p == NULL)
p = etext_;
}
// Seed a thread at p; AddToThreadq takes its own references, so the
// local reference is dropped right afterwards.
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
t);
Decref(t);
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
break;
}
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
// This complements the special case in NFA::Step().
if (p == NULL) {
(void) Step(runq, nextq, -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
break;
}
}
// Release any threads still queued when the loop ended.
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
if (matched_) {
// Convert the (begin, end) pointer pairs into StringPieces.
for (int i = 0; i < nsubmatch; i++)
submatch[i] =
StringPiece(match_[2 * i],
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
return true;
}
return false;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
return false;
return true;
}
// Computes, for every instruction reachable from the start instruction,
// how many ByteRange instructions can be reached from it by following
// only empty transitions, and stores that count in fanout under the
// instruction's id.
//
// fanout serves both as the result and as the outer work queue: the
// target of every ByteRange encountered is appended to it (with count 0)
// so that it is processed in a later outer iteration.  reachable is the
// per-instruction work queue for the inner flood.
void Prog::Fanout(SparseArray<int>* fanout) {
  DCHECK_EQ(fanout->max_size(), size());
  SparseSet reachable(size());
  fanout->clear();
  fanout->set_new(start(), 0);

  typedef SparseArray<int>::iterator FanoutIter;
  for (FanoutIter entry = fanout->begin(); entry != fanout->end(); ++entry) {
    int& nbyterange = entry->value();
    reachable.clear();
    reachable.insert(entry->index());

    for (SparseSet::iterator r = reachable.begin(); r != reachable.end();
         ++r) {
      const int id = *r;
      const int following = id+1;
      Prog::Inst* ip = inst(id);
      switch (ip->opcode()) {
        case kInstFail:
          break;

        case kInstMatch:
          if (!ip->last())
            reachable.insert(following);
          break;

        case kInstAltMatch:
          DCHECK(!ip->last());
          reachable.insert(following);
          break;

        case kInstCapture:
        case kInstEmptyWidth:
        case kInstNop:
          // Empty transition: both the rest of the list and the target
          // are reachable without consuming input.
          if (!ip->last())
            reachable.insert(following);
          reachable.insert(ip->out());
          break;

        case kInstByteRange: {
          if (!ip->last())
            reachable.insert(following);
          nbyterange++;
          // The target consumes input, so it starts its own flood later.
          const int target = ip->out();
          if (!fanout->has_index(target)) {
            fanout->set_new(target, 0);
          }
          break;
        }

        default:
          LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
          break;
      }
    }
  }
}
} // namespace re2

View File

@@ -0,0 +1,577 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/sparse_set.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
namespace duckdb_re2 {
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// the set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
//
// States are not stored as an array of OneState: they are packed into a
// byte buffer at a stride of
//   sizeof(uint32_t) + bytemap_range()*sizeof(uint32_t)
// and located with IndexToNode(), so only the first bytemap_range()
// entries of action[] are backed by storage for a packed state.
struct OneState {
uint32_t matchcond; // conditions to match right now.
uint32_t action[256];
};
// The uint32_t conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
// First bit used for a capture register: 6 + 1 == 7 (bit 6 is kMatchWins).
static const int kRealCapShift = kEmptyShift + 1;
// Number of capture-register bits, rounded down to an even count:
// (16 - 7) / 2 * 2 == 8.
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
// kCapShift == 5, so that register i (i >= 2) maps to bit 5 + i.
static const int kCapShift = kRealCapShift - 2;
// kMaxCap == 10 capture pointers, i.e. 5 (begin, end) pairs.
static const int kMaxCap = kRealMaxCap + 2;
// Bit 6.
static const uint32_t kMatchWins = 1 << kEmptyShift;
// Bits 7 through 14 (0x7F80).
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
// Sentinel meaning "no action / no match possible".
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Compile-time consistency checks between the bit-layout constants in
// this file and the definitions in prog.h.
// The function exists only to host the static_asserts; it is never called.
void OnePass_Checks() {
  // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
  static_assert(Prog::kMaxOnePassCapture*2 == kMaxCap,
                "kMaxCap disagrees with kMaxOnePassCapture");
  // The empty-width flags must occupy exactly the low kEmptyShift bits.
  static_assert(kEmptyAllFlags == (1<<kEmptyShift)-1,
                "kEmptyShift disagrees with kEmptyAllFlags");
}
// Reports whether every empty-width flag requested in cond holds at
// position p within context.  Bits of cond outside kEmptyAllFlags are
// ignored.
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
  const uint32_t present = Prog::EmptyFlags(context, p);
  const uint32_t missing = cond & kEmptyAllFlags & ~present;
  return missing == 0;
}
// Stores p into every capture register whose bit is set in cond.
// Registers 0 and 1 are never touched here; register i (2 <= i < ncap)
// corresponds to bit kCapShift + i of cond.
static void ApplyCaptures(uint32_t cond, const char* p,
                          const char** cap, int ncap) {
  for (int reg = 2; reg < ncap; ++reg) {
    const uint32_t bit = 1 << kCapShift << reg;
    if ((cond & bit) != 0)
      cap[reg] = p;
  }
}
// Locates state number nodeindex inside the packed byte buffer nodes,
// in which consecutive states are statesize bytes apart.
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
                                    int nodeindex) {
  uint8_t* const raw = nodes + statesize*nodeindex;
  return reinterpret_cast<OneState*>(raw);
}
// Runs the one-pass automaton stored in onepass_nodes_ over text.
//
// text          - the bytes to match; matching always starts at text.data().
// const_context - surrounding text for empty-width assertions; if its
//                 data() is NULL, text is used as the context.
// anchor, kind  - the search must be anchored or a full match; any other
//                 combination is rejected with a DFATAL log and false.
// match, nmatch - on success, match[0..nmatch-1] receive the submatches.
//
// Returns true iff a match was found.
//
// NOTE(review): cap[] and matchcap[] hold kMaxCap entries and are indexed up
// to 2*nmatch-1; nothing here checks nmatch against that bound, so callers
// are presumably expected to keep nmatch within Prog::kMaxOnePassCapture --
// confirm against the call sites.
bool Prog::SearchOnePass(const StringPiece& text,
const StringPiece& const_context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
}
// Make sure we have at least cap[1],
// because we use it to tell if we matched.
int ncap = 2*nmatch;
if (ncap < 2)
ncap = 2;
// cap holds the capture registers of the running thread; matchcap holds
// a snapshot taken at the most recent accepted match.
const char* cap[kMaxCap];
for (int i = 0; i < ncap; i++)
cap[i] = NULL;
const char* matchcap[kMaxCap];
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.data() == NULL)
context = text;
// An anchored program can only match if text touches the corresponding
// end of the context.
if (anchor_start() && BeginPtr(context) != BeginPtr(text))
return false;
if (anchor_end() && EndPtr(context) != EndPtr(text))
return false;
if (anchor_end())
kind = kFullMatch;
uint8_t* nodes = onepass_nodes_.data();
int statesize = sizeof(uint32_t) + bytemap_range()*sizeof(uint32_t);
// start() is always mapped to the zeroth OneState.
OneState* state = IndexToNode(nodes, statesize, 0);
uint8_t* bytemap = bytemap_;
const char* bp = text.data();
const char* ep = text.data() + text.size();
const char* p;
bool matched = false;
matchcap[0] = bp;
cap[0] = bp;
uint32_t nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
// c is the byte's equivalence class, not the raw byte.
int c = bytemap[*p & 0xFF];
uint32_t matchcond = nextmatchcond;
uint32_t cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32_t nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
state = NULL;
nextmatchcond = kImpossible;
}
// This code section is carefully tuned.
// The goto sequence is about 10% faster than the
// obvious rewrite as a large if statement in the
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
// Saving the match capture registers is expensive.
// Is this intermediate match worth thinking about?
// Not if we want a full match.
if (kind == kFullMatch)
goto skipmatch;
// Not if it's impossible.
if (matchcond == kImpossible)
goto skipmatch;
// Not if the possible match is beaten by the certain
// match at the next byte. When this test is useless
// (e.g., HTTPPartialMatchRE2) it slows the loop by
// about 10%, but when it avoids work (e.g., DotMatchRE2),
// it cuts the loop execution by about 45%.
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
goto skipmatch;
// Finally, the match conditions must be satisfied.
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
for (int i = 2; i < 2*nmatch; i++)
matchcap[i] = cap[i];
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, matchcap, ncap);
matchcap[1] = p;
matched = true;
// If we're in longest match mode, we have to keep
// going and see if we find a longer match.
// In first match mode, we can stop if the match
// takes priority over the next state for this input byte.
// That bit is per-input byte and thus in cond, not matchcond.
if (kind == kFirstMatch && (cond & kMatchWins))
goto done;
}
skipmatch:
// No transition for this byte: stop with whatever match was recorded.
if (state == NULL)
goto done;
if ((cond & kCapMask) && nmatch > 1)
ApplyCaptures(cond, p, cap, ncap);
}
// Look for match at end of input.
// (state is non-NULL here: a NULL state leaves the loop via "done".)
{
uint32_t matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, cap, ncap);
for (int i = 2; i < ncap; i++)
matchcap[i] = cap[i];
matchcap[1] = p;
matched = true;
}
}
done:
if (!matched)
return false;
// Convert the (begin, end) pointer pairs into StringPieces.
for (int i = 0; i < nmatch; i++)
match[i] =
StringPiece(matchcap[2 * i],
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
return true;
}
// Analysis to determine whether a given regexp program is one-pass.
// Queue of instruction ids used by the one-pass analysis.
typedef SparseSet Instq;

// Adds instruction id to q unless it is already there.
// Returns false only when id was already on q.
// An id of 0 is never inserted; the call returns true for it
// (pretends to add it).
static bool AddQ(Instq *q, int id) {
  if (id != 0) {
    if (q->contains(id))
      return false;
    q->insert(id);
  }
  return true;
}
// Entry of the explicit stack used by Prog::IsOnePass(): an instruction
// still to be explored together with the condition bits (empty-width flags
// and capture bits) accumulated on the path leading to it.
struct InstCond {
int id;
uint32_t cond;
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByteRange instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
//
// The analysis runs at most once per Prog: did_onepass_ is set on the first
// call, and later calls report whether onepass_nodes_ was populated.
// On success the memory used by the saved nodes is subtracted from dfa_mem_.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_nodes_.data() != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + inst_count(kInstByteRange);
int statesize = sizeof(uint32_t) + bytemap_range()*sizeof(uint32_t);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
PODArray<InstCond> stack(stacksize);
int size = this->size();
PODArray<int> nodebyid(size); // indexed by ip
// Filling with 0xFF bytes makes every entry -1, meaning "no node yet".
memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
std::vector<uint8_t> nodes;
// tovisit: instructions that begin a node (the start instruction and every
// ByteRange target).  workq: instructions seen while flooding one node.
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
int nalloc = 1;
nodes.insert(nodes.end(), statesize, 0);
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
// matched records whether a kInstMatch has been seen for this node.
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
uint32_t cond = stack[nstack].cond;
Loop:
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
DCHECK(!ip->last());
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstByteRange: {
// Find or allocate the node for the instruction after this byte.
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
goto fail;
}
nextindex = nalloc;
AddQ(&tovisit, ip->out());
nodebyid[ip->out()] = nalloc;
nalloc++;
nodes.insert(nodes.end(), statesize, 0);
// Update node because it might have been invalidated.
node = IndexToNode(nodes.data(), statesize, nodeindex);
}
// Record the action for every byte class covered by [lo, hi].
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
// A second, different action for the same byte class violates (2).
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
goto fail;
}
}
// For case-folding ranges, repeat for the upper-case counterparts of
// the part of [lo, hi] that lies within 'a'..'z'.
if (ip->foldcase()) {
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
goto fail;
}
}
}
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
}
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
if (!ip->last()) {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
// Explore the rest of the list later, with the current cond.
stack[nstack].id = id+1;
stack[nstack++].cond = cond;
}
// Fold this instruction's requirement into the path condition.
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
if (ip->opcode() == kInstEmptyWidth)
cond |= ip->empty();
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
goto fail;
}
id = ip->out();
goto Loop;
case kInstMatch:
if (matched) {
// (3) is violated
goto fail;
}
matched = true;
node->matchcond = cond;
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstFail:
break;
}
}
}
// Success: charge the DFA budget and keep a right-sized copy of the nodes.
dfa_mem_ -= nalloc*statesize;
onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
return true;
fail:
return false;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace duckdb_re2 {
// Range tables for the Perl character classes. Each entry is a pair of
// code points giving the first and last member of a range; all of the
// ranges in this section lie in ASCII.
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
// Lookup table for the Perl classes. Every class is listed twice: the
// lower-case escape with +1 and the upper-case escape with -1, both
// referring to the same range table followed by that table's entry count.
// NOTE(review): the trailing "0, 0" is presumably an unused second range
// table and its length -- confirm against UGroup in re2/unicode_groups.h.
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1, 0, 0 },
{ "\\D", -1, code1, 1, 0, 0 },
{ "\\s", +1, code2, 3, 0, 0 },
{ "\\S", -1, code2, 3, 0, 0 },
{ "\\w", +1, code3, 4, 0, 0 },
{ "\\W", -1, code3, 4, 0, 0 },
};
// Number of entries in perl_groups.
const int num_perl_groups = 6;
// Range tables for the POSIX bracket classes; same layout as the Perl
// tables: one first/last code point pair per range, all within ASCII.
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
// Lookup table for the POSIX classes. Every class is listed twice: the
// plain name with +1 and the "^" (negated) name with -1, both referring to
// the same range table followed by that table's entry count.
// NOTE(review): the trailing "0, 0" is presumably an unused second range
// table and its length -- confirm against UGroup in re2/unicode_groups.h.
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3, 0, 0 },
{ "[:^alnum:]", -1, code4, 3, 0, 0 },
{ "[:alpha:]", +1, code5, 2, 0, 0 },
{ "[:^alpha:]", -1, code5, 2, 0, 0 },
{ "[:ascii:]", +1, code6, 1, 0, 0 },
{ "[:^ascii:]", -1, code6, 1, 0, 0 },
{ "[:blank:]", +1, code7, 2, 0, 0 },
{ "[:^blank:]", -1, code7, 2, 0, 0 },
{ "[:cntrl:]", +1, code8, 2, 0, 0 },
{ "[:^cntrl:]", -1, code8, 2, 0, 0 },
{ "[:digit:]", +1, code9, 1, 0, 0 },
{ "[:^digit:]", -1, code9, 1, 0, 0 },
{ "[:graph:]", +1, code10, 1, 0, 0 },
{ "[:^graph:]", -1, code10, 1, 0, 0 },
{ "[:lower:]", +1, code11, 1, 0, 0 },
{ "[:^lower:]", -1, code11, 1, 0, 0 },
{ "[:print:]", +1, code12, 1, 0, 0 },
{ "[:^print:]", -1, code12, 1, 0, 0 },
{ "[:punct:]", +1, code13, 4, 0, 0 },
{ "[:^punct:]", -1, code13, 4, 0, 0 },
{ "[:space:]", +1, code14, 2, 0, 0 },
{ "[:^space:]", -1, code14, 2, 0, 0 },
{ "[:upper:]", +1, code15, 1, 0, 0 },
{ "[:^upper:]", -1, code15, 1, 0, 0 },
{ "[:word:]", +1, code16, 4, 0, 0 },
{ "[:^word:]", -1, code16, 4, 0, 0 },
{ "[:xdigit:]", +1, code17, 3, 0, 0 },
{ "[:^xdigit:]", -1, code17, 3, 0, 0 },
};
// Number of entries in posix_groups.
const int num_posix_groups = 28;
} // namespace duckdb_re2

View File

@@ -0,0 +1,61 @@
// Copyright 2018 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_POD_ARRAY_H_
#define RE2_POD_ARRAY_H_
#include <memory>
#include <type_traits>
#ifndef DUCKDB_BASE_STD
// Fallback used only when DUCKDB_BASE_STD is not defined: exposes
// std::unique_ptr under the name duckdb_base_std::unique_ptr, which is the
// spelling PODArray uses for its owning pointer.
namespace duckdb_base_std {
using ::std::unique_ptr;
} // namespace duckdb_base_std
#endif
namespace duckdb_re2 {
// Fixed-length array of POD elements that owns its storage.
//
// Storage is obtained from std::allocator<T>::allocate(), which does not
// construct any elements. The element count is remembered inside the deleter
// so that the same count can be handed back to deallocate().
template <typename T>
class PODArray {
 private:
  // Deleter for the owning pointer. Carries the number of elements that was
  // allocated, because std::allocator<T>::deallocate() needs it.
  struct Deleter {
    Deleter() : len_(0) {}
    explicit Deleter(int len) : len_(len) {}

    void operator()(T* ptr) const {
      std::allocator<T> alloc;
      alloc.deallocate(ptr, len_);
    }

    int len_;
  };

 public:
  static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value,
                "T must be POD");

  // Creates an empty array: data() is null and size() is 0.
  PODArray() : ptr_() {}

  // Allocates storage for len elements.
  explicit PODArray(int len)
      : ptr_(std::allocator<T>().allocate(len), Deleter(len)) {}

  // Pointer to the first element (null for a default-constructed array).
  T* data() const { return ptr_.get(); }

  // Number of elements that were requested at construction time.
  int size() const { return ptr_.get_deleter().len_; }

  // Unchecked element access.
  T& operator[](int pos) const { return data()[pos]; }

 private:
  duckdb_base_std::unique_ptr<T[], Deleter> ptr_;
};
} // namespace duckdb_re2
#endif // RE2_POD_ARRAY_H_

View File

@@ -0,0 +1,692 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter.h"
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <utility>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace duckdb_re2 {
// Initializes a Prefilter of kind op.
//
// AND and OR nodes get a freshly allocated, empty child vector; every other
// kind leaves subs_ null. unique_id_ starts at -1 so that unique_id() never
// returns an indeterminate value before set_unique_id() has been called
// (previously the member was left uninitialized).
Prefilter::Prefilter(Op op)
    : op_(op),
      subs_(NULL),
      unique_id_(-1) {
  if (op_ == AND || op_ == OR)
    subs_ = new std::vector<Prefilter*>;
}
// Destroys a Prefilter, deleting every child it still owns together with
// the vector that holds them.
Prefilter::~Prefilter() {
  if (subs_ == NULL)
    return;

  for (Prefilter* child : *subs_)
    delete child;
  delete subs_;
  subs_ = NULL;
}
// Collapses degenerate AND/OR nodes and returns the node to use instead.
//   - Nodes that are not AND/OR are returned unchanged.
//   - An AND/OR with no children turns into ALL/NONE respectively.
//   - An AND/OR with exactly one child deletes itself and returns that child
//     (simplified in turn); callers must use the returned pointer.
Prefilter* Prefilter::Simplify() {
  const bool is_composite = (op_ == AND || op_ == OR);
  if (!is_composite)
    return this;

  switch (subs_->size()) {
    case 0:
      // AND of nothing is true; OR of nothing is false.
      op_ = (op_ == AND) ? ALL : NONE;
      return this;

    case 1: {
      // Detach the only child first so the destructor does not delete it.
      Prefilter* only_child = subs_->front();
      subs_->clear();
      delete this;
      return only_child->Simplify();
    }

    default:
      return this;
  }
}
// Builds the Prefilter for "a op b", where op is AND or OR.
// Both arguments are consumed: each one either becomes part of the returned
// tree or is deleted. Existing nodes are reused wherever possible so that the
// result does not grow needlessly.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
  a = a->Simplify();
  b = b->Simplify();

  // Canonical order: a carries the smaller opcode.
  if (a->op() > b->op())
    std::swap(a, b);

  // ALL and NONE are the two smallest opcodes, so after the swap only a has
  // to be inspected:
  //   ALL AND b = b      NONE OR b  = b
  //   ALL OR b  = ALL    NONE AND b = NONE
  const Op lowest = a->op();
  if (lowest == ALL || lowest == NONE) {
    const bool a_is_identity =
        (lowest == ALL && op == AND) || (lowest == NONE && op == OR);
    Prefilter* kept = a_is_identity ? b : a;
    Prefilter* dropped = a_is_identity ? a : b;
    delete dropped;
    return kept;
  }

  // Both sides already have the requested op: splice b's children into a.
  if (a->op() == op && b->op() == op) {
    std::vector<Prefilter*>* donor = b->subs();
    a->subs()->insert(a->subs()->end(), donor->begin(), donor->end());
    donor->clear();  // ownership moved to a; keep b's destructor away
    delete b;
    return a;
  }

  // Exactly one side has the requested op: make it a and append the other.
  if (b->op() == op)
    std::swap(a, b);
  if (a->op() == op) {
    a->subs()->push_back(b);
    return a;
  }

  // Neither side has the requested op: wrap both in a new node.
  Prefilter* combined = new Prefilter(op);
  combined->subs()->push_back(a);
  combined->subs()->push_back(b);
  return combined;
}
// Returns the conjunction of a and b by forwarding to AndOr(AND, a, b).
// As with AndOr, the arguments end up inside the result or are deleted.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
// Returns the disjunction of a and b by forwarding to AndOr(OR, a, b).
// As with AndOr, the arguments end up inside the result or are deleted.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
// Removes redundant strings from *ss in place.
//
// A string is redundant when a shorter member of the set occurs inside it:
// if "ab" is already required, also requiring "abc" adds nothing, because a
// search hit for "ab" already makes the regexp a candidate.
//
// The empty string is skipped as a needle: find() locates it at the start of
// every string, so using it would erase the whole set.
//
// SSet orders by length and then lexicographically, and its members are
// unique, so only entries after the current one can contain it, and only
// strictly longer entries need the substring search.
void Prefilter::SimplifyStringSet(SSet* ss) {
  SSIter needle = ss->begin();
  if (needle != ss->end() && needle->empty())
    ++needle;

  while (needle != ss->end()) {
    SSIter candidate = needle;
    ++candidate;
    while (candidate != ss->end()) {
      const bool redundant =
          candidate->size() > needle->size() &&
          candidate->find(*needle) != std::string::npos;
      if (redundant)
        candidate = ss->erase(candidate);
      else
        ++candidate;
    }
    ++needle;
  }
}
// Returns a Prefilter that is the OR of one ATOM per string in *ss.
// The set is first reduced with SimplifyStringSet(), so *ss is modified.
// An empty set yields a NONE node.
Prefilter* Prefilter::OrStrings(SSet* ss) {
  Prefilter* result = new Prefilter(NONE);
  SimplifyStringSet(ss);
  for (const std::string& atom : *ss)
    result = Or(result, FromString(atom));
  return result;
}
// Lower-cases a rune. Runes below Runeself are handled directly (only
// 'A'..'Z' change); all others go through the unicode_tolower fold table and
// are returned unchanged when no fold entry covers them.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold* fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    const bool covered = (fold != NULL && r >= fold->lo);
    return covered ? ApplyFold(fold, r) : r;
  }

  const bool is_upper = ('A' <= r && r <= 'Z');
  return is_upper ? r + ('a' - 'A') : r;
}
// Lower-cases a rune for Latin-1 input: only 'A'..'Z' are mapped, every
// other value is returned unchanged.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_upper = (r >= 'A' && r <= 'Z');
  return is_upper ? r + ('a' - 'A') : r;
}
// Returns a newly allocated ATOM node whose atom is a copy of str.
Prefilter* Prefilter::FromString(const std::string& str) {
  Prefilter* atom_node = new Prefilter(Prefilter::ATOM);
  atom_node->atom_ = str;
  return atom_node;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
//
// An Info is in one of two states: "exact" (is_exact_ is true and exact_
// lists the strings) or "inexact" (is_exact_ is false and match_ holds a
// Prefilter). TakeMatch() converts the exact form into a Prefilter on demand.
class Prefilter::Info {
public:
// Creates an inexact Info with no match_ set.
Info();
// Deletes match_ if it is still owned.
~Info();
// More constructors. They delete their Info* arguments.
// (Concat and And return the other argument as-is when one side is NULL.)
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
// Leaf constructors: these allocate a fresh Info and take no Info input.
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyCharOrAnyByte();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
std::string ToString();
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
// Mutable access to the exact string set.
SSet& exact() { return exact_; }
// True while exact_ (rather than match_) describes the regexp.
bool is_exact() const { return is_exact_; }
// Regexp walker that computes an Info bottom-up; defined further below.
class Walker;
private:
SSet exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
Prefilter* match_;
};
// Creates an Info that is not exact and holds no Prefilter yet; exact_
// starts out as an empty set.
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
// Deletes match_. After TakeMatch() match_ is NULL, so a Prefilter that was
// handed to a caller is not deleted here.
Prefilter::Info::~Info() {
delete match_;
}
// Hands the accumulated Prefilter to the caller, who takes ownership.
// An exact Info is first converted: its strings are OR-ed into a Prefilter
// and the Info becomes inexact. Afterwards match_ is NULL.
Prefilter* Prefilter::Info::TakeMatch() {
  Prefilter* taken;
  if (is_exact_) {
    taken = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  } else {
    taken = match_;
  }
  match_ = NULL;
  return taken;
}
// Formats this Info for debugging: the exact strings joined by commas when
// exact, otherwise match_'s DebugString(), or "" when there is no match_.
std::string Prefilter::Info::ToString() {
  if (!is_exact_)
    return match_ ? match_->DebugString() : "";

  std::string joined;
  bool first = true;
  for (const std::string& item : exact_) {
    if (!first)
      joined += ",";
    first = false;
    joined += item;
  }
  return joined;
}
void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Concatenates two exact Infos into a new exact Info whose string set is the
// cross product of the inputs. Both inputs are deleted.
// A NULL a means "nothing accumulated yet": b is returned untouched.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
  if (a == NULL)
    return b;

  DCHECK(a->is_exact_);
  DCHECK(b && b->is_exact_);

  Info* product = new Info();
  product->is_exact_ = true;
  CrossProduct(a->exact_, b->exact_, &product->exact_);

  delete a;
  delete b;
  return product;
}
// Builds an inexact Info for the concatenation ab by AND-ing the Prefilters
// of both sides. Used when a or b is not exact, or when the exact cross
// product would be too large. If either side is NULL the other is returned
// unchanged; otherwise both inputs are deleted.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
  if (a == NULL)
    return b;
  if (b == NULL)
    return a;

  Info* both = new Info();
  both->is_exact_ = false;
  both->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());

  delete a;
  delete b;
  return both;
}
// Builds the Info for the alternation a|b. Both inputs are deleted.
//   - Both exact: the result is exact and holds the union of the two sets.
//   - Otherwise: the result is inexact and holds the OR of both Prefilters
//     (TakeMatch() converts an exact side into a Prefilter first).
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
  Info* merged = new Info();
  const bool both_exact = a->is_exact_ && b->is_exact_;

  if (!both_exact) {
    merged->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    merged->is_exact_ = false;
  } else {
    // Move the larger set wholesale and copy only the smaller one.
    Info* larger = a;
    Info* smaller = b;
    if (larger->exact_.size() < smaller->exact_.size())
      std::swap(larger, smaller);
    merged->exact_ = std::move(larger->exact_);
    merged->exact_.insert(smaller->exact_.begin(), smaller->exact_.end());
    merged->is_exact_ = true;
  }

  delete a;
  delete b;
  return merged;
}
// Builds the Info for a?. Since a may be absent, nothing can be required:
// the result is an inexact Info matching ALL. The input is deleted.
Prefilter::Info* Prefilter::Info::Quest(Info* a) {
  Info* optional = new Info();
  optional->match_ = new Prefilter(ALL);
  optional->is_exact_ = false;
  delete a;
  return optional;
}
// Constructs Info for a* given a.
// Same as a? -- not much to do.
// Forwards to Quest(), so the result is an inexact ALL Info and a is deleted.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
// Builds the Info for a+. Whatever a requires is still required, but the
// result is never exact, even if a was. The input is deleted.
Prefilter::Info* Prefilter::Info::Plus(Info* a) {
  Info* repeated = new Info();
  repeated->is_exact_ = false;
  repeated->match_ = a->TakeMatch();
  delete a;
  return repeated;
}
// Returns the bytes that runetochar() produces for r, as a std::string.
static std::string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int length = runetochar(encoded, &r);
  return std::string(encoded, length);
}
// Returns a one-byte string holding the low 8 bits of r.
static std::string RuneToStringLatin1(Rune r) {
  const char byte = r & 0xff;
  return std::string(1, byte);
}
// Builds an exact Info for a single literal rune. The rune is lower-cased
// before it is stored.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
  const Rune lowered = ToLowerRune(r);
  Info* literal = new Info();
  literal->is_exact_ = true;
  literal->exact_.insert(RuneToString(lowered));
  return literal;
}
// Builds an exact Info for a single literal rune of a Latin-1 encoded
// regexp. The rune is lower-cased ('A'..'Z' only) and stored as one byte.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
  const Rune lowered = ToLowerRuneLatin1(r);
  Info* literal = new Info();
  literal->is_exact_ = true;
  literal->exact_.insert(RuneToStringLatin1(lowered));
  return literal;
}
// Builds the Info for dot (any character) or \C (any byte): an inexact Info
// whose Prefilter is ALL, i.e. no string is required.
Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
  Info* any = new Info();
  any->match_ = new Prefilter(ALL);
  return any;
}
// Builds the Info for a regexp that can never match: an inexact Info whose
// Prefilter is NONE.
Prefilter::Info* Prefilter::Info::NoMatch() {
  Info* none = new Info();
  none->match_ = new Prefilter(NONE);
  return none;
}
// Builds an Info that makes no assertion at all about the matched strings
// (an inexact Info whose Prefilter is ALL). Because it claims nothing, it is
// a valid answer for any regular expression.
Prefilter::Info* Prefilter::Info::AnyMatch() {
  Info* anything = new Info();
  anything->match_ = new Prefilter(ALL);
  return anything;
}
// Builds an exact Info whose only string is the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
  Info* empty = new Info();
  empty->exact_.insert("");
  empty->is_exact_ = true;
  return empty;
}
// Builds the Info for a character class.
// Classes with more than 10 members are over-approximated as "any
// character". Smaller classes become an exact Info with one lower-cased,
// encoded string per member; latin1 selects the one-byte encoding.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass* cc, bool latin1) {
  // If the class is too large, it's okay to overestimate.
  if (cc->size() > 10)
    return AnyCharOrAnyByte();

  Info* members = new Info();
  for (CCIter range = cc->begin(); range != cc->end(); ++range) {
    for (Rune r = range->lo; r <= range->hi; r++) {
      members->exact_.insert(latin1
                                 ? RuneToStringLatin1(ToLowerRuneLatin1(r))
                                 : RuneToString(ToLowerRune(r)));
    }
  }
  members->is_exact_ = true;
  return members;
}
// Regexp walker that computes a Prefilter::Info for every node of a regexp.
// The latin1 flag chosen at construction selects between the Latin-1 and the
// default literal / character-class helpers in PostVisit.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
// Combines the Infos of a node's children into the Info for the node.
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
// Returns the result to use for a node that is not walked in full; the
// implementation below answers with AnyMatch().
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
// Whether the regexp being walked is Latin-1 encoded.
bool latin1() { return latin1_; }
private:
bool latin1_;
// Not copyable.
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
// Walks re and returns the Info computed for it; the caller owns the result.
// The walk is given a budget of 100000; if the walker reports that it
// stopped early, the partial result is discarded and NULL is returned.
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  const bool is_latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(is_latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);
  if (!walker.stopped_early())
    return result;

  delete result;
  return NULL;
}
// Shortcut result for a node: AnyMatch(), which asserts nothing about the
// matched strings and is therefore valid for any regexp. Neither argument
// is read.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
//
// child_args holds the nchild_args Info objects already computed for the
// children of re. The combining helpers used below (Concat, And, Alt, Star,
// Quest, Plus) delete the Infos they are given, and the capture/alternate
// branches forward a child Info as the result, so child Infos are consumed
// here. parent_arg and pre_arg are not read.
// NOTE(review): the default/kRegexpRepeat branch does not touch child_args;
// confirm whether the walker releases them in that case.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
// Unknown ops and kRegexpRepeat are reported as errors (presumably repeats
// are rewritten away by simplification -- TODO confirm) and treated as
// matching the empty string.
default:
case kRegexpRepeat:
info = EmptyString();
LOG(DFATAL) << "Bad regexp op " << re->op();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
// Build the exact string one rune at a time: each Concat appends the next
// (lower-cased) rune to the string accumulated so far.
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
// A run of exact children ends at an inexact child, or when the cross
// product with the next exact child would exceed 16 strings.
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
// Fold in whatever exact run is still pending after the last child.
info = And(info, exact);
}
break;
case kRegexpAlternate:
// Fold the alternatives left to right; Alt deletes both of its inputs.
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
case kRegexpAnyByte:
// Claim nothing, except that it's not empty.
info = AnyCharOrAnyByte();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
return info;
}
// Computes the Prefilter for re; the caller owns the result.
// Returns NULL when re is NULL, when simplification fails, or when no Info
// could be built for the simplified regexp.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simplified = re->Simplify();
  if (simplified == NULL)
    return NULL;

  // The simplified regexp is only needed while the Info is being built.
  Prefilter::Info* info = BuildInfo(simplified);
  simplified->Decref();
  if (info == NULL)
    return NULL;

  Prefilter* prefilter = info->TakeMatch();
  delete info;
  return prefilter;
}
std::string Prefilter::DebugString() const {
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
std::string s = "";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
return s;
}
case OR: {
std::string s = "(";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
s += ")";
return s;
}
}
}
// Computes the Prefilter for a compiled RE2; the caller owns the result.
// Returns NULL when re2 is NULL, when it has no parsed regexp, or when
// FromRegexp() cannot form a Prefilter.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  Regexp* parsed = (re2 != NULL) ? re2->Regexp() : NULL;
  return (parsed != NULL) ? FromRegexp(parsed) : NULL;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,130 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#include <set>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
namespace duckdb_re2 {
class RE2;
class Regexp;
// A node in a boolean tree of string requirements (atoms combined with
// AND/OR, plus the constants ALL and NONE) derived from a regexp.
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
// Node kinds. The numeric order matters: AndOr() relies on ALL and NONE
// being the two smallest values.
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
// Kind of this node.
Op op() { return op_; }
// The required string; meaningful for ATOM nodes.
const std::string& atom() const { return atom_; }
// Accessors for the id assigned by PrefilterTree (see unique_id_ below).
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
std::vector<Prefilter*>* subs() {
DCHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
// Note: this only overwrites the pointer; a vector held previously is not
// deleted here.
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
std::string DebugString() const;
private:
// A comparator used to store exact strings. We compare by length,
// then lexicographically. This ordering makes it easier to reduce the
// set of strings in SimplifyStringSet.
struct LengthThenLex {
bool operator()(const std::string& a, const std::string& b) const {
return (a.size() < b.size()) || (a.size() == b.size() && a < b);
}
};
// Per-regexp bookkeeping used while building a Prefilter; defined in
// prefilter.cc.
class Info;
using SSet = std::set<std::string, LengthThenLex>;
using SSIter = SSet::iterator;
using ConstSSIter = SSet::const_iterator;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
// Builds a Prefilter from a parsed regexp; returns NULL on failure.
static Prefilter* FromRegexp(Regexp* a);
// Returns a new ATOM node for str.
static Prefilter* FromString(const std::string& str);
// Returns the OR of one ATOM per string in *ss (the set is simplified
// in place first).
static Prefilter* OrStrings(SSet* ss);
// Walks re and returns its Info, or NULL if the walk stopped early.
static Info* BuildInfo(Regexp* re);
// Collapses an empty or single-child AND/OR node. May delete this; callers
// must continue with the returned pointer.
Prefilter* Simplify();
// Removes redundant strings from the set. A string is redundant if
// any of the other strings appear as a substring. The empty string
// is a special case, which is ignored.
static void SimplifyStringSet(SSet* ss);
// Adds the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const SSet& a, const SSet& b, SSet* dst);
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
std::vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
std::string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. This field is populated with a unique id for every node,
// and -1 for duplicate nodes.
// NOTE(review): PrefilterTree::AssignUniqueIds appears to give a duplicate
// node the id of its canonical node rather than leaving it at -1; confirm
// which statement is current.
int unique_id_;
// Not copyable.
Prefilter(const Prefilter&) = delete;
Prefilter& operator=(const Prefilter&) = delete;
};
} // namespace duckdb_re2
#endif // RE2_PREFILTER_H_

View File

@@ -0,0 +1,388 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter_tree.h"
#include <stddef.h>
#include <algorithm>
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
namespace duckdb_re2 {
// Creates an uncompiled tree with the default minimum atom length of 3.
PrefilterTree::PrefilterTree() : PrefilterTree(3) {}
// Creates an uncompiled tree. min_atom_len is the shortest atom that
// KeepNode() will keep; shorter atoms cause their node to be dropped.
PrefilterTree::PrefilterTree(int min_atom_len)
: compiled_(false),
min_atom_len_(min_atom_len) {
}
// Deletes every top-level Prefilter that was handed over via Add().
// Entries may be NULL, which delete tolerates.
PrefilterTree::~PrefilterTree() {
  for (Prefilter* top_level : prefilter_vec_)
    delete top_level;
}
// Registers the prefilter of the next regexp and takes ownership of it.
// A prefilter that KeepNode() rejects is deleted and recorded as NULL, so the
// position in prefilter_vec_ still corresponds to the regexp. Calling Add
// after Compile is an error and has no effect.
void PrefilterTree::Add(Prefilter* prefilter) {
  if (compiled_) {
    LOG(DFATAL) << "Add called after Compile.";
    return;
  }

  Prefilter* kept = NULL;
  if (prefilter != NULL) {
    if (KeepNode(prefilter))
      kept = prefilter;
    else
      delete prefilter;
  }
  prefilter_vec_.push_back(kept);
}
void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile called already.";
return;
}
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
}
// Looks up the node registered in *nodes under node's NodeString() key.
// Returns NULL when no node with that key has been registered.
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
  const std::string key = NodeString(node);
  NodeMap::iterator found = nodes->find(key);
  return (found != nodes->end()) ? found->second : NULL;
}
// Builds the key that identifies structurally equal nodes:
// "<op>:" followed by the atom for ATOM nodes, or by the comma-separated
// unique ids of the children otherwise. The children must therefore have
// their unique ids assigned before this is called on the parent.
std::string PrefilterTree::NodeString(Prefilter* node) const {
  // Adding the operation disambiguates AND/OR/atom nodes.
  std::string key = StringPrintf("%d", node->op()) + ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  const std::vector<Prefilter*>& children = *node->subs();
  for (size_t i = 0; i < children.size(); i++) {
    if (i != 0)
      key += ',';
    key += StringPrintf("%d", children[i]->unique_id());
  }
  return key;
}
// Decides whether node is useful for filtering.
//   NULL, ALL, NONE -> false.
//   ATOM            -> true iff the atom has at least min_atom_len_ bytes.
//   AND             -> children that are not kept are deleted and removed;
//                      true iff at least one child remains.
//   OR              -> true iff every child is kept (evaluation stops at the
//                      first child that is not).
bool PrefilterTree::KeepNode(Prefilter* node) const {
  if (node == NULL)
    return false;

  switch (node->op()) {
    case Prefilter::ALL:
    case Prefilter::NONE:
      return false;

    case Prefilter::ATOM:
      return node->atom().size() >= static_cast<size_t>(min_atom_len_);

    case Prefilter::AND: {
      // Compact the surviving children towards the front of the vector.
      std::vector<Prefilter*>& children = *node->subs();
      int kept = 0;
      for (size_t i = 0; i < children.size(); i++) {
        Prefilter* child = children[i];
        if (KeepNode(child))
          children[kept++] = child;
        else
          delete child;
      }
      children.resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      for (size_t i = 0; i < node->subs()->size(); i++) {
        if (!KeepNode((*node->subs())[i]))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
      return false;
  }
}
// Assigns a unique id to every structurally distinct node reachable from
// prefilter_vec_ and builds the lookup structures used during matching:
//   - *nodes maps each NodeString() key to its canonical node;
//   - *atom_vec (cleared first) receives the atom of each unique ATOM node,
//     with atom_index_to_id_ recording the matching unique id;
//   - unfiltered_ receives the index of every NULL top-level prefilter;
//   - entries_ gets, per unique id, the parent ids, the regexps triggered
//     and the count needed before a match propagates upward.
// Finally some child-to-parent edges of AND nodes are pruned.
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
std::vector<std::string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
std::vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(static_cast<int>(i));
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
// v grows while it is being scanned, so children appended here are
// themselves visited later in the same loop.
for (size_t i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const std::vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
// Walking v backwards visits children before their parents, which
// NodeString() needs because it embeds the children's unique ids.
int unique_id = 0;
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
nodes->emplace(NodeString(node), node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
// Duplicate of an already registered node: share its id.
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(unique_id);
// Fill the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
// Only the canonical representative of each node fills an entry.
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
int id = prefilter->unique_id();
switch (prefilter->op()) {
default:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entries_[id].propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
// For each child, we append our id to the child's list of
// parent ids... unless we happen to have done so already.
// The number of appends is the number of unique children,
// which allows correct upward propagation from AND nodes.
int up_count = 0;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
int child_id = (*prefilter->subs())[j]->unique_id();
std::vector<int>& parents = entries_[child_id].parents;
if (parents.empty() || parents.back() != id) {
parents.push_back(id);
up_count++;
}
}
// An AND fires only once all unique children have fired; an OR fires
// as soon as one child has.
entries_[id].propagate_up_at_count =
prefilter->op() == Prefilter::AND ? up_count : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
}
// Lastly, using probability-based heuristics, we identify nodes
// that trigger too many parents and then we try to prune edges.
// We use logarithms below to avoid the likelihood of underflow.
double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
// Hoisted this above the loop so that we don't thrash the heap.
std::vector<std::pair<size_t, int>> entries_by_num_edges;
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
// Pruning applies only to AND nodes because it "just" reduces
// precision; applied to OR nodes, it would break correctness.
if (prefilter == NULL || prefilter->op() != Prefilter::AND)
continue;
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
int id = prefilter->unique_id();
// Sort the current node's children by the numbers of parents.
entries_by_num_edges.clear();
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
int child_id = (*prefilter->subs())[j]->unique_id();
const std::vector<int>& parents = entries_[child_id].parents;
entries_by_num_edges.emplace_back(parents.size(), child_id);
}
std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
// A running estimate of how many regexps will be triggered by
// pruning the remaining children's edges to the current node.
// Our nominal target is one, so the threshold is log(1) == 0;
// pruning occurs iff the child has more than nine edges left.
double log_num_triggered = log_num_regexps;
for (const auto& pair : entries_by_num_edges) {
int child_id = pair.second;
std::vector<int>& parents = entries_[child_id].parents;
if (log_num_triggered > 0.) {
log_num_triggered += std::log(parents.size());
log_num_triggered -= log_num_regexps;
} else if (parents.size() > 9) {
// Drop the edge child -> this node and require one fewer child to fire.
auto it = std::find(parents.begin(), parents.end(), id);
if (it != parents.end()) {
parents.erase(it);
entries_[id].propagate_up_at_count--;
}
}
}
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
// This kludge is a counterpart to that kludge.
if (prefilter_vec_.empty())
return;
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
for (size_t i = 0; i < prefilter_vec_.size(); i++)
regexps->push_back(static_cast<int>(i));
} else {
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
std::vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++)
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
std::sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(static_cast<int>(entries_.size()));
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
regexps->set(entry.regexps[i], 1);
int c;
// Pass trigger up to parents.
for (int j : entry.parents) {
const Entry& parent = entries_[j];
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
// Dumps the compiled tree to the error log: the atom and node counts, then
// for every entry its id, its parent count and regexp count followed by
// each parent id, and finally the unique id and key of every node in
// *nodes.
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
  LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
  LOG(ERROR) << "#Unique Nodes: " << entries_.size();

  for (size_t entry_id = 0; entry_id < entries_.size(); entry_id++) {
    const Entry& entry = entries_[entry_id];
    LOG(ERROR) << "EntryId: " << entry_id
               << " N: " << entry.parents.size()
               << " R: " << entry.regexps.size();
    for (int parent_id : entry.parents)
      LOG(ERROR) << parent_id;
  }

  LOG(ERROR) << "Map:";
  for (const auto& keyed_node : *nodes) {
    LOG(ERROR) << "NodeId: " << keyed_node.second->unique_id()
               << " Str: " << keyed_node.first;
  }
}
// Builds a human-readable description of `node`, recursing into children.
//
// An ATOM node is rendered as its atom text. Any other node is rendered as
// "AND(...)" or "OR(...)", where the parentheses hold a comma-separated
// list of "<child unique id>:<child description>" items.
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
  std::string text;

  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    text += node->atom();
    return text;
  }

  // Adding the operation disambiguates AND and OR nodes.
  text += node->op() == Prefilter::AND ? "AND" : "OR";
  text += "(";
  const auto& children = *node->subs();
  for (size_t k = 0; k < children.size(); k++) {
    if (k != 0)
      text += ',';
    text += StringPrintf("%d", children[k]->unique_id());
    text += ":";
    text += DebugNodeString(children[k]);
  }
  text += ")";
  return text;
}
} // namespace re2

View File

@@ -0,0 +1,140 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
namespace duckdb_re2 {
class PrefilterTree {
public:
PrefilterTree();
explicit PrefilterTree(int min_atom_len);
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// and passed to RegexpsGivenStrings below.
void Compile(std::vector<std::string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
private:
typedef SparseArray<int> IntMap;
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
// It should be trivial to get rid of the stringification...
typedef std::map<std::string, Prefilter*> NodeMap;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
std::vector<int> parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
std::vector<int> regexps;
};
// Returns true if the prefilter node should be kept.
bool KeepNode(Prefilter* node) const;
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node has already been assigned unique ids.
std::string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo(NodeMap* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
std::vector<Entry> entries_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
std::vector<int> unfiltered_;
// vector of Prefilter for all regexps.
std::vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
std::vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
// Strings less than this length are not stored as atoms.
const int min_atom_len_;
PrefilterTree(const PrefilterTree&) = delete;
PrefilterTree& operator=(const PrefilterTree&) = delete;
};
} // namespace duckdb_re2
#endif // RE2_PREFILTER_TREE_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,467 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PROG_H_
#define RE2_PROG_H_
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#include <stdint.h>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include <type_traits>
#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/re2.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
namespace duckdb_re2 {
// Opcodes for Inst
enum InstOp {
  // Choose between out_ and out1_.
  kInstAlt = 0,
  // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
  kInstAltMatch = 1,
  // Next (possibly case-folded) byte must be in [lo_, hi_].
  kInstByteRange = 2,
  // Capturing parenthesis number cap_.
  kInstCapture = 3,
  // Empty-width special (^ $ ...); bit(s) set in empty_.
  kInstEmptyWidth = 4,
  // Found a match!
  kInstMatch = 5,
  // No-op; occasionally unavoidable.
  kInstNop = 6,
  // Never match; occasionally unavoidable.
  kInstFail = 7,
  // Number of opcodes listed above.
  kNumInst,
};
// Bit flags for empty-width specials
enum EmptyOp {
  // ^ - beginning of line
  kEmptyBeginLine = 1 << 0,
  // $ - end of line
  kEmptyEndLine = 1 << 1,
  // \A - beginning of text
  kEmptyBeginText = 1 << 2,
  // \z - end of text
  kEmptyEndText = 1 << 3,
  // \b - word boundary
  kEmptyWordBoundary = 1 << 4,
  // \B - not \b
  kEmptyNonWordBoundary = 1 << 5,
  // Every flag above, OR'ed together.
  kEmptyAllFlags = kEmptyBeginLine | kEmptyEndLine |
                   kEmptyBeginText | kEmptyEndText |
                   kEmptyWordBoundary | kEmptyNonWordBoundary,
};
class DFA;
class Regexp;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
class Inst {
public:
// See the assertion below for why this is so.
Inst() = default;
// Copyable.
Inst(const Inst&) = default;
Inst& operator=(const Inst&) = default;
// Constructors per opcode
void InitAlt(uint32_t out, uint32_t out1);
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
void InitCapture(int cap, uint32_t out);
void InitEmptyWidth(EmptyOp empty, uint32_t out);
void InitMatch(int id);
void InitNop(uint32_t out);
void InitFail();
// Getters
// id() is this instruction's offset from the start of p's instruction
// array. opcode(), last() and out() unpack out_opcode_: the opcode lives
// in the low 3 bits, last in bit 3, and out in the bits above that.
// The remaining getters read the union member that belongs to a specific
// opcode and DCHECK that the instruction has that opcode.
int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return byte_range.lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return byte_range.hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return byte_range.hint_foldcase_&1; }
int hint() { DCHECK_EQ(opcode(), kInstByteRange); return byte_range.hint_foldcase_>>1; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
// For a kInstAltMatch: true when out() is a ByteRange instruction, or a
// Nop whose own out() is a ByteRange instruction.
bool greedy(Prog* p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange ||
(p->inst(out())->opcode() == kInstNop &&
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
}
// Does this inst (an kInstByteRange) match c?
// When foldcase() is set, an ASCII uppercase c is lowered before the
// range comparison.
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase() && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return byte_range.lo_ <= c && c <= byte_range.hi_;
}
// Returns string representation for debugging.
std::string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_. PatchList/last steal another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
// Each setter below rebuilds out_opcode_ from its three packed fields,
// replacing only the field(s) named and re-reading the others through
// the getters.
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
}
void set_last() {
out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
void set_out(int out) {
out_opcode_ = (out<<4) | (last()<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<4) | (last()<<3) | opcode;
}
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction
int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for duckdb_re2::Set).
struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
// hint to execution engines: the delta to the
// next instruction (in the current list) worth
// exploring iff this instruction matched; 0
// means there are no remaining possibilities,
// which is most likely for character classes.
// foldcase: A-Z -> a-z before checking range.
} byte_range;
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
};
// Inst must be trivial so that we can freely clear it with memset(3).
// Arrays of Inst are initialised by copying the initial elements with
// memmove(3) and then clearing any remaining elements with memset(3).
static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
// Simple accessors and mutators for the private fields declared at the
// bottom of the class. inst(id) returns a pointer into inst_ and does no
// bounds checking of its own.
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
void set_start(int start) { start_ = start; }
int start_unanchored() { return start_unanchored_; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
uint16_t* list_heads() { return list_heads_.data(); }
size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
int64_t dfa_mem() { return dfa_mem_; }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
// Prefix acceleration is available exactly when a non-empty prefix has
// been configured (prefix_size_ != 0).
bool can_prefix_accel() { return prefix_size_ != 0; }
// Accelerates to the first likely occurrence of the prefix.
// Returns a pointer to the first byte or NULL if not found.
// Dispatch: a case-folded prefix uses the shift DFA; otherwise a prefix
// whose size is not 1 uses the front-and-back scan; a one-byte prefix
// falls through to memchr(3).
const void* PrefixAccel(const void* data, size_t size) {
DCHECK(can_prefix_accel());
if (prefix_foldcase_) {
return PrefixAccel_ShiftDFA(data, size);
} else if (prefix_size_ != 1) {
return PrefixAccel_FrontAndBack(data, size);
} else {
return memchr(data, prefix_front_back.prefix_front_, size);
}
}
// Configures prefix accel using the analysis performed during compilation.
void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
// An implementation of prefix accel that uses prefix_dfa_ to perform
// case-insensitive search.
const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
// An implementation of prefix accel that looks for prefix_front_ and
// prefix_back_ to return fewer false positives than memchr(3) alone.
const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
// Returns string representation of program for debugging.
std::string Dump();
std::string DumpUnanchored();
std::string DumpByteMap();
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8_t c) {
return ('A' <= c && c <= 'Z') ||
('a' <= c && c <= 'z') ||
('0' <= c && c <= '9') ||
c == '_';
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches);
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
// will halt. Otherwise, the state has been built and next points to an array
// of bytemap_range()+1 slots holding the next states as per the bytemap and
// kByteEndText. The number of the state is implied by the callback sequence:
// the first callback is for state 0, the second callback is for state 1, ...
// match indicates whether the state is a matching state.
using DFAStateCallback = std::function<void(const int* next, bool match)>;
// Build the entire DFA for the given match kind.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
// Compute bytemap.
void ComputeByteMap();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the list count and the text size.
// CanBitState() reports whether list_heads_ has backing storage; per the
// field comment below, it is left unpopulated when size_ is overly large.
bool CanBitState() { return list_heads_.data() != NULL; }
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
void Flatten();
// Walks the Prog; the "successor roots" or predecessors of the reachable
// instructions are marked in rootmap or predmap/predvec, respectively.
// reachable and stk are preallocated scratch structures.
void MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the "dominator root"
// of the reachable instructions (if such exists) is marked in rootmap.
// reachable and stk are preallocated scratch structures.
void MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the reachable
// instructions are emitted in "list" form and appended to flat.
// reachable and stk are preallocated scratch structures.
void EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk);
// Computes hints for ByteRange instructions in [begin, end).
void ComputeHints(std::vector<Inst>* flat, int begin, int end);
// Controls whether the DFA should bail out early if the NFA would be faster.
// FOR TESTING ONLY.
static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
void DeleteDFA(DFA* dfa);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
bool prefix_foldcase_; // whether prefix is case-insensitive
size_t prefix_size_; // size of prefix (0 if no prefix)
// PrefixAccel() reads prefix_front_back only on its non-foldcase path;
// prefix_dfa_ is documented above as the storage for the foldcase path.
union {
uint64_t* prefix_dfa_; // "Shift DFA" for prefix
struct {
int prefix_front_; // first byte of prefix
int prefix_back_; // last byte of prefix
} prefix_front_back;
};
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
// not populated if size_ is overly large
size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
PODArray<Inst> inst_; // pointer to instruction array
PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
uint8_t bytemap_[256]; // map from input bytes to byte classes
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
// Not copyable.
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
};
// std::string_view in MSVC has iterators that aren't just pointers and
// that don't allow comparisons between different objects - not even if
// those objects are views into the same string! Thus, we provide these
// conversion functions for convenience.
// Returns a raw pointer to the first byte viewed by s.
static inline const char* BeginPtr(const StringPiece& s) {
  const char* const first = s.data();
  return first;
}
// Returns a raw pointer one past the last byte viewed by s.
static inline const char* EndPtr(const StringPiece& s) {
  return BeginPtr(s) + s.size();
}
} // namespace duckdb_re2
#endif // RE2_PROG_H_

1360
external/duckdb/third_party/re2/re2/re2.cc vendored Normal file

File diff suppressed because it is too large Load Diff

1031
external/duckdb/third_party/re2/re2/re2.h vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,665 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <set>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
namespace duckdb_re2 {
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
  kRegexpNoMatch = 1,          // Matches no strings.
  kRegexpEmptyMatch = 2,       // Matches empty string.
  kRegexpLiteral = 3,          // Matches rune_.
  kRegexpLiteralString = 4,    // Matches runes_.
  kRegexpConcat = 5,           // Matches concatenation of sub_[0..nsub-1].
  kRegexpAlternate = 6,        // Matches union of sub_[0..nsub-1].
  kRegexpStar = 7,             // Matches sub_[0] zero or more times.
  kRegexpPlus = 8,             // Matches sub_[0] one or more times.
  kRegexpQuest = 9,            // Matches sub_[0] zero or one times.

  // Matches sub_[0] at least min_ times, at most max_ times.
  // max_ == -1 means no upper limit.
  kRegexpRepeat = 10,

  // Parenthesized (capturing) subexpression. Index is cap_.
  // Optionally, capturing name is name_.
  kRegexpCapture = 11,

  kRegexpAnyChar = 12,         // Matches any character.
  kRegexpAnyByte = 13,         // Matches any byte [sic].
  kRegexpBeginLine = 14,       // Matches empty string at beginning of line.
  kRegexpEndLine = 15,         // Matches empty string at end of line.
  kRegexpWordBoundary = 16,    // Matches word boundary "\b".
  kRegexpNoWordBoundary = 17,  // Matches not-a-word boundary "\B".
  kRegexpBeginText = 18,       // Matches empty string at beginning of text.
  kRegexpEndText = 19,         // Matches empty string at end of text.
  kRegexpCharClass = 20,       // Matches character class given by cc_.

  // Forces match of entire expression right now,
  // with match ID match_id_ (used by RE2::Set).
  kRegexpHaveMatch = 21,

  // Alias for the largest opcode value.
  kMaxRegexpOp = kRegexpHaveMatch,
};
// Status codes reported through RegexpStatus.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
  // No error.
  kRegexpSuccess = 0,

  // Unexpected (internal) error.
  kRegexpInternalError,

  // Parse errors.
  kRegexpBadEscape,          // bad escape sequence
  kRegexpBadCharClass,       // bad character class
  kRegexpBadCharRange,       // bad character class range
  kRegexpMissingBracket,     // missing closing ]
  kRegexpMissingParen,       // missing closing )
  kRegexpUnexpectedParen,    // unexpected closing )
  kRegexpTrailingBackslash,  // backslash at end of regexp
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
  kRegexpRepeatSize,         // bad repetition argument
  kRegexpRepeatOp,           // bad repetition operator
  kRegexpBadPerlOp,          // bad perl operator
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
  kRegexpBadNamedCapture,    // bad named capture
};
// Error status for certain operations.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static std::string CodeText(RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
std::string Text() const;
private:
RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
std::string* tmp_; // Temporary storage, possibly where error_arg_ is.
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
};
// Compiled form; see prog.h
class Prog;
// A range of runes, stored as its two endpoints lo and hi.
struct RuneRange {
  // Default range has both endpoints set to 0.
  RuneRange() : lo(0), hi(0) {}

  // Range with endpoints l (low) and h (high).
  RuneRange(int l, int h) : lo(l), hi(h) {}

  Rune lo;
  Rune hi;
};
// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess {
bool operator()(const RuneRange& a, const RuneRange& b) const {
return a.hi < b.lo;
}
};
class CharClassBuilder;
// A character class held as an array of RuneRanges plus a rune count.
//
// Instances cannot be constructed or destroyed directly (the constructor and
// destructor are private and not implemented); they are created through the
// private New() factory, which the friend CharClassBuilder can reach, and
// released with Delete().
//
// All read-only accessors are const-qualified so that a const CharClass
// (or a const CharClass*) can be inspected, matching Contains().
class CharClass {
 public:
  // Releases this object.
  void Delete();

  // Iteration over the stored ranges.
  typedef RuneRange* iterator;
  iterator begin() const { return ranges_; }
  iterator end() const { return ranges_ + nranges_; }

  // Number of runes in the class.
  int size() const { return nrunes_; }
  // True when the class contains no runes.
  bool empty() const { return nrunes_ == 0; }
  // True when the class contains every rune (Runemax+1 of them).
  bool full() const { return nrunes_ == Runemax+1; }
  // Returns the stored folds_ascii_ flag.
  bool FoldsASCII() const { return folds_ascii_; }

  // Returns whether r is a member of the class.
  bool Contains(Rune r) const;

  // Returns the negation of this class as a separate CharClass.
  CharClass* Negate();

 private:
  CharClass();   // not implemented
  ~CharClass();  // not implemented
  static CharClass* New(size_t maxranges);

  friend class CharClassBuilder;

  bool folds_ascii_;
  int nrunes_;
  RuneRange *ranges_;
  int nranges_;

  // Not copyable.
  CharClass(const CharClass&) = delete;
  CharClass& operator=(const CharClass&) = delete;
};
// A node of a parsed regular expression.
//
// Regexps are reference counted: obtain extra references with Incref() and
// release them with Decref(). The destructor is private, so a stray
// `delete` is rejected at compile time.
class Regexp {
 public:
  // Parsing flags; individual flags can be ORed together.
  enum ParseFlags {
    NoParseFlags  = 0,
    FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
    Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
    ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
                            // and [[:space:]] to match newline.
    DotNL         = 1<<3,   // Allow . to match newline.
    MatchNL       = ClassNL | DotNL,
    OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
                            // end of text, not around embedded newlines.
                            // (Perl's default)
    Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
    NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
    PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
    PerlB         = 1<<8,   // Allow Perl's \b and \B.
    PerlX         = 1<<9,   // Perl extensions:
                            //   non-capturing parens - (?: )
                            //   non-greedy operators - *? +? ?? {}?
                            //   flag edits - (?i) (?-i) (?i: )
                            //     i - FoldCase
                            //     m - !OneLine
                            //     s - DotNL
                            //     U - NonGreedy
                            //   line ends: \A \z
                            //   \Q and \E to disable/enable metacharacters
                            //   (?P<name>expr) for named captures
                            //   \C to match any single byte
    UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
                            // and \P{Han} for its negation.
    NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
                            // it explicitly.
    NeverCapture  = 1<<12,  // Parse all parens as non-capturing.

    // As close to Perl as we can get.
    LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
                    UnicodeGroups,

    // Internal use only.
    WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
    AllParseFlags = (1<<14)-1,
  };

  // Accessors only: there are no setters because Regexps are logically
  // immutable once created.
  RegexpOp op() {
    return static_cast<RegexpOp>(op_);
  }
  int nsub() {
    return nsub_;
  }
  bool simple() {
    return simple_ != 0;
  }
  ParseFlags parse_flags() {
    return static_cast<ParseFlags>(parse_flags_);
  }
  int Ref();  // For testing.

  // Returns the array of subexpressions. With at most one subexpression the
  // array is the in-object slot subone_; otherwise it is submany_.
  Regexp** sub() {
    return (nsub_ <= 1) ? &subone_ : submany_;
  }

  // Operator-specific accessors. Each one DCHECKs that the node has the
  // operator the requested field belongs to.
  int min() {
    DCHECK_EQ(op_, kRegexpRepeat);
    return arguments.repeat.min_;
  }
  int max() {
    DCHECK_EQ(op_, kRegexpRepeat);
    return arguments.repeat.max_;
  }
  Rune rune() {
    DCHECK_EQ(op_, kRegexpLiteral);
    return arguments.rune_;
  }
  CharClass* cc() {
    DCHECK_EQ(op_, kRegexpCharClass);
    return arguments.char_class.cc_;
  }
  int cap() {
    DCHECK_EQ(op_, kRegexpCapture);
    return arguments.capture.cap_;
  }
  const std::string* name() {
    DCHECK_EQ(op_, kRegexpCapture);
    return arguments.capture.name_;
  }
  Rune* runes() {
    DCHECK_EQ(op_, kRegexpLiteralString);
    return arguments.literal_string.runes_;
  }
  int nrunes() {
    DCHECK_EQ(op_, kRegexpLiteralString);
    return arguments.literal_string.nrunes_;
  }
  int match_id() {
    DCHECK_EQ(op_, kRegexpHaveMatch);
    return arguments.match_id_;
  }

  // Increments the reference count and returns this object for convenience.
  Regexp* Incref();

  // Decrements the reference count, deleting this object when it reaches 0.
  void Decref();

  // Parses string s into a regular expression and returns it.
  // The caller must release the result with re->Decref().
  // On failure, sets *status (if status != NULL) and returns NULL.
  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
                       RegexpStatus* status);

  // Returns a _new_ simplified version of the current regexp without
  // editing the current regexp.
  // The caller must release the result with re->Decref().
  // Simplified means that counted repetition has been rewritten
  // into simpler terms and all Perl/POSIX features have been
  // removed. The result will capture exactly the same
  // subexpressions the original did, unless formatted with ToString.
  Regexp* Simplify();
  friend class CoalesceWalker;
  friend class SimplifyWalker;

  // Parses the regexp src, simplifies it and stores the string form of the
  // simplified regexp in *dst. Returns true on success.
  // Returns false and sets *status (if status != NULL) on parse error.
  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
                             std::string* dst, RegexpStatus* status);

  // Returns the number of capturing groups in the regexp.
  int NumCaptures();
  friend class NumCapturesWalker;

  // Returns a map from names to capturing group indices,
  // or NULL if the regexp contains no named capture groups.
  // The caller is responsible for deleting the map.
  std::map<std::string, int>* NamedCaptures();

  // Returns a map from capturing group indices to capturing group
  // names, or NULL if the regexp contains no named capture groups.
  // The caller is responsible for deleting the map.
  std::map<int, std::string>* CaptureNames();

  // Returns a string representation of the current regexp,
  // using as few parentheses as possible.
  std::string ToString();

  // Convenience constructors. They consume the passed reference,
  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
  // They do not consume allocated arrays like subs or runes.
  static Regexp* Plus(Regexp* sub, ParseFlags flags);
  static Regexp* Star(Regexp* sub, ParseFlags flags);
  static Regexp* Quest(Regexp* sub, ParseFlags flags);
  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
  static Regexp* HaveMatch(int match_id, ParseFlags flags);

  // Like Alternate but does not factor out common prefixes.
  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);

  // Debugging function. Returns a string format for the regexp
  // that makes its structure clear. Does NOT use regexp syntax.
  std::string Dump();

  // Helper traversal class, defined fully in walker-inl.h.
  template<typename T> class Walker;

  // Compile to Prog. See prog.h
  // Reverse prog expects to be run over text backward.
  // Construction and execution of prog will
  // stay within approximately max_mem bytes of memory.
  // If max_mem <= 0, a reasonable default is used.
  Prog* CompileToProg(int64_t max_mem);
  Prog* CompileToReverseProg(int64_t max_mem);

  // Whether to expect this library to find exactly the same answer as PCRE
  // when running this regexp. Most regexps do mimic PCRE exactly, but a few
  // obscure cases behave differently. Technically this is more a property
  // of the Prog than the Regexp, but the computation is much easier to do
  // on the Regexp. See mimics_pcre.cc for the exact conditions.
  bool MimicsPCRE();

  // Benchmarking function.
  void NullWalk();

  // Whether every match of this regexp must be anchored and
  // begin with a non-empty fixed string (perhaps after ASCII
  // case-folding). If so, returns the prefix and the sub-regexp that
  // follows it.
  // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
  // regardless of the return value.
  bool RequiredPrefix(std::string* prefix, bool* foldcase,
                      Regexp** suffix);

  // Whether every match of this regexp must be unanchored and
  // begin with a non-empty fixed string (perhaps after ASCII
  // case-folding). If so, returns the prefix.
  // Callers should expect *prefix and *foldcase to be "zeroed"
  // regardless of the return value.
  bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);

  // Controls the maximum repeat count permitted by the parser.
  // FOR FUZZING ONLY.
  static void FUZZING_ONLY_set_maximum_repeat_count(int i);

 private:
  // Constructor allocates vectors as appropriate for operator.
  explicit Regexp(RegexpOp op, ParseFlags parse_flags);

  // Use Decref() instead of delete to release Regexps.
  // The destructor is private to catch deletes at compile time.
  ~Regexp();
  void Destroy();
  bool QuickDestroy();

  // Helpers for Parse. Listed here so they can edit Regexps.
  class ParseState;

  friend class ParseState;
  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
                             RegexpStatus* status);

  // Helper for testing [sic].
  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);

  // Computes whether Regexp is already simple.
  bool ComputeSimple();

  // Constructor that generates a Star, Plus or Quest,
  // squashing the pair if sub is also a Star, Plus or Quest.
  static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);

  // Constructor that generates a concatenation or alternation,
  // enforcing the limit on the number of subexpressions for
  // a particular Regexp.
  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
                                   ParseFlags flags, bool can_factor);

  // Returns the leading string that re starts with.
  // The returned Rune* points into a piece of re,
  // so it must not be used after the caller calls re->Decref().
  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);

  // Removes the first n leading runes from the beginning of re.
  // Edits re in place.
  static void RemoveLeadingString(Regexp* re, int n);

  // Returns the leading regexp in re's top-level concatenation.
  // The returned Regexp* points at re or a sub-expression of re,
  // so it must not be used after the caller calls re->Decref().
  static Regexp* LeadingRegexp(Regexp* re);

  // Removes LeadingRegexp(re) from re and returns the remainder.
  // Might edit re in place.
  static Regexp* RemoveLeadingRegexp(Regexp* re);

  // Simplifies an alternation of literal strings by factoring out
  // common prefixes.
  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
  friend class FactorAlternationImpl;

  // Is a == b? Only efficient on regexps that have not been through
  // Simplify yet - the expansion of a kRegexpRepeat will make this
  // take a long time. Do not call on such regexps, hence private.
  static bool Equal(Regexp* a, Regexp* b);

  // Allocates space for n sub-regexps. A separate array is only needed
  // for more than one subexpression; n must fit in a uint16_t.
  void AllocSub(int n) {
    DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
    if (n > 1) {
      submany_ = new Regexp*[n];
    }
    nsub_ = static_cast<uint16_t>(n);
  }

  // Add Rune to LiteralString
  void AddRuneToString(Rune r);

  // Swaps this with that, in place.
  void Swap(Regexp *that);

  // Operator. See description of operators above.
  // uint8_t instead of RegexpOp to control space usage.
  uint8_t op_;

  // Is this regexp structure already simple
  // (has it been returned by Simplify)?
  // uint8_t instead of bool to control space usage.
  uint8_t simple_;

  // Flags saved from parsing and used during execution.
  // (Only FoldCase is used.)
  // uint16_t instead of ParseFlags to control space usage.
  uint16_t parse_flags_;

  // Reference count. Exists so that SimplifyRegexp can build
  // regexp structures that are dags rather than trees to avoid
  // exponential blowup in space requirements.
  // uint16_t to control space usage.
  // The standard regexp routines will never generate a
  // ref greater than the maximum repeat count (kMaxRepeat),
  // but even so, Incref and Decref consult an overflow map
  // when ref_ reaches kMaxRef.
  uint16_t ref_;
  static const uint16_t kMaxRef = 0xffff;

  // Subexpressions.
  // uint16_t to control space usage.
  // Concat and Alternate handle larger numbers of subexpressions
  // by building concatenation or alternation trees.
  // Other routines should call Concat or Alternate instead of
  // filling in sub() by hand.
  uint16_t nsub_;
  static const uint16_t kMaxNsub = 0xffff;
  union {
    Regexp** submany_;  // if nsub_ > 1
    Regexp* subone_;    // if nsub_ == 1
  };

  // Extra space for parse and teardown stacks.
  Regexp* down_;

  // Arguments to operator. See description of operators above.
  union {
    struct {  // Repeat
      int max_;
      int min_;
    } repeat;
    struct {  // Capture
      int cap_;
      std::string* name_;
    } capture;
    struct {  // LiteralString
      int nrunes_;
      Rune* runes_;
    } literal_string;
    struct {  // CharClass
      // These two could be in separate union members,
      // but it wouldn't save any space (there are other two-word structs)
      // and keeping them separate avoids confusion during parsing.
      CharClass* cc_;
      CharClassBuilder* ccb_;
    } char_class;
    Rune rune_;           // Literal
    int match_id_;        // HaveMatch
    void *the_union_[2];  // as big as any other element, for memset
  } arguments;

  // Not copyable.
  Regexp(const Regexp&) = delete;
  Regexp& operator=(const Regexp&) = delete;
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;

// Mutable builder for character classes, backed by a RuneRangeSet.
// GetCharClass() produces a CharClass from the builder.
class CharClassBuilder {
 public:
  CharClassBuilder();

  // Iteration over the ranges collected so far.
  typedef RuneRangeSet::iterator iterator;
  iterator begin() {
    return ranges_.begin();
  }
  iterator end() {
    return ranges_.end();
  }

  // Number of runes in the class.
  int size() {
    return nrunes_;
  }
  // True when the class contains no runes.
  bool empty() {
    return nrunes_ == 0;
  }
  // True when the class contains every rune (Runemax+1 of them).
  bool full() {
    return nrunes_ == Runemax+1;
  }

  bool Contains(Rune r);
  bool FoldsASCII();
  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
  CharClassBuilder* Copy();
  void AddCharClass(CharClassBuilder* cc);
  void Negate();
  void RemoveAbove(Rune r);
  CharClass* GetCharClass();
  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);

 private:
  static const uint32_t AlphaMask = (1<<26) - 1;
  uint32_t upper_;  // bitmap of A-Z
  uint32_t lower_;  // bitmap of a-z
  int nrunes_;
  RuneRangeSet ranges_;

  // Not copyable.
  CharClassBuilder(const CharClassBuilder&) = delete;
  CharClassBuilder& operator=(const CharClassBuilder&) = delete;
};
// Bitwise ops on ParseFlags produce ParseFlags.
// Bitwise OR: combines the bits of a and b.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int combined_bits = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(combined_bits);
}
// Bitwise XOR of two ParseFlags values, yielding ParseFlags.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int toggled_bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(toggled_bits);
}
// Bitwise AND of two ParseFlags values, yielding ParseFlags.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int common_bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(common_bits);
}
// Bitwise complement of a ParseFlags value, restricted to AllParseFlags.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  // Attempting to produce a value out of enum's range has undefined behaviour,
  // so the complement is masked down to the defined flag bits.
  const int inverted_bits = ~static_cast<int>(a);
  const int valid_bits = static_cast<int>(Regexp::AllParseFlags);
  return static_cast<Regexp::ParseFlags>(inverted_bits & valid_bits);
}
} // namespace duckdb_re2
#endif // RE2_REGEXP_H_

View File

@@ -0,0 +1,176 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
namespace duckdb_re2 {
// Creates an empty, uncompiled set that will use a copy of `options` and the
// given anchoring mode. The copied options always have never_capture set.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : options_(options),
      anchor_(anchor),
      compiled_(false),
      size_(0) {
  // might unblock some optimisations
  options_.set_never_capture(true);
}
// Drops the reference held on every regexp still stored in elem_.
RE2::Set::~Set() {
  for (const Elem& entry : elem_)
    entry.second->Decref();
}
// Move constructor: copies the options and anchor, takes over the pending
// elements and the compiled program, and leaves `other` empty and uncompiled.
RE2::Set::Set(Set&& other)
    : options_(other.options_),
      anchor_(other.anchor_),
      elem_(std::move(other.elem_)),
      compiled_(other.compiled_),
      size_(other.size_),
      prog_(std::move(other.prog_)) {
  // Reset the moved-from set explicitly rather than relying on the
  // moved-from state of its members.
  other.prog_.reset();
  other.size_ = 0;
  other.compiled_ = false;
  other.elem_.clear();
  other.elem_.shrink_to_fit();
}
// Move assignment: destroys the current contents of this set and then
// move-constructs it in place from `other`, which is left empty and
// uncompiled by the move constructor.
RE2::Set& RE2::Set::operator=(Set&& other) {
  // Self-assignment must be a no-op: destroying *this and then
  // move-constructing from the already-destroyed object would be
  // undefined behaviour.
  if (this == &other)
    return *this;
  this->~Set();
  (void) new (this) Set(std::move(other));
  return *this;
}
int RE2::Set::Add(const StringPiece& pattern, std::string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
duckdb_re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = static_cast<int>(elem_.size());
duckdb_re2::Regexp* m = duckdb_re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
PODArray<duckdb_re2::Regexp*> sub(nsub + 1);
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = duckdb_re2::Regexp::Concat(sub.data(), nsub + 1, pf);
} else {
duckdb_re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = duckdb_re2::Regexp::Concat(sub, 2, pf);
}
elem_.emplace_back(std::string(pattern), re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
return false;
}
compiled_ = true;
size_ = static_cast<int>(elem_.size());
// Sort the elements by their patterns. This is good enough for now
// until we have a Regexp comparison function. (Maybe someday...)
std::sort(elem_.begin(), elem_.end(),
[](const Elem& a, const Elem& b) -> bool {
return a.first < b.first;
});
PODArray<duckdb_re2::Regexp*> sub(size_);
for (int i = 0; i < size_; i++)
sub[i] = elem_[i].second;
elem_.clear();
elem_.shrink_to_fit();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
duckdb_re2::Regexp* re = duckdb_re2::Regexp::Alternate(sub.data(), size_, pf);
prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem()));
re->Decref();
return prog_ != nullptr;
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
return Match(text, v, NULL);
}
// Runs the compiled set over `text`.
//
// Returns true if at least one regexp matched; when v != NULL it is cleared
// and then filled with the indices of the matching regexps. Returns false
// otherwise. When error_info != NULL, error_info->kind is set on every path:
//   kNotCompiled  - Compile() has not been called, or it failed and there
//                   is no program to run;
//   kOutOfMemory  - the DFA ran out of memory;
//   kInconsistent - the DFA reported a match but returned no indices;
//   kNoError      - the search completed (with or without a match).
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
                     ErrorInfo* error_info) const {
  if (!compiled_) {
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    LOG(DFATAL) << "RE2::Set::Match() called before compiling";
    return false;
  }
  if (prog_ == nullptr) {
    // Compile() sets compiled_ before building the program and returns
    // false when no program could be built. Without this check the
    // SearchDFA() call below would dereference a null pointer.
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    LOG(DFATAL) << "RE2::Set::Match() called after Compile() failed";
    return false;
  }
#ifdef RE2_HAVE_THREAD_LOCAL
  hooks::context = NULL;
#endif
  bool dfa_failed = false;
  // The match set is only allocated when the caller wants the indices.
  duckdb_base_std::unique_ptr<SparseSet> matches;
  if (v != NULL) {
    matches.reset(new SparseSet(size_));
    v->clear();
  }
  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
                              NULL, &dfa_failed, matches.get());
  if (dfa_failed) {
    if (options_.log_errors())
      LOG(ERROR) << "DFA out of memory: "
                 << "program size " << prog_->size() << ", "
                 << "list count " << prog_->list_count() << ", "
                 << "bytemap range " << prog_->bytemap_range();
    if (error_info != NULL)
      error_info->kind = kOutOfMemory;
    return false;
  }
  if (ret == false) {
    // A clean "no match" is not an error.
    if (error_info != NULL)
      error_info->kind = kNoError;
    return false;
  }
  if (v != NULL) {
    if (matches->empty()) {
      if (error_info != NULL)
        error_info->kind = kInconsistent;
      LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
      return false;
    }
    v->assign(matches->begin(), matches->end());
  }
  if (error_info != NULL)
    error_info->kind = kNoError;
  return true;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,91 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H_
#define RE2_SET_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"
// When DUCKDB_BASE_STD is not defined, make duckdb_base_std::unique_ptr
// refer to std::unique_ptr so the declarations below can name it.
#ifndef DUCKDB_BASE_STD
namespace duckdb_base_std {
using ::std::unique_ptr;
} // namespace duckdb_base_std
#endif
// Forward declarations of the types RE2::Set stores by pointer.
namespace duckdb_re2 {
class Prog;
class Regexp;
} // namespace duckdb_re2
namespace duckdb_re2 {
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
//
// Usage: Add() every pattern, call Compile() once, then call Match().
class RE2::Set {
 public:
  // Outcome categories reported through ErrorInfo.
  enum ErrorKind {
    kNoError = 0,
    kNotCompiled,   // The set is not compiled.
    kOutOfMemory,   // The DFA ran out of memory.
    kInconsistent,  // The result is inconsistent. This should never happen.
  };

  // Extra detail filled in by the three-argument Match().
  struct ErrorInfo {
    ErrorKind kind;
  };

  Set(const RE2::Options& options, RE2::Anchor anchor);
  ~Set();

  // Not copyable.
  Set(const Set&) = delete;
  Set& operator=(const Set&) = delete;

  // Movable.
  Set(Set&& other);
  Set& operator=(Set&& other);

  // Adds pattern to the set using the options passed to the constructor.
  // Returns the index that will identify the regexp in the output of Match(),
  // or -1 if the regexp cannot be parsed.
  // Indices are assigned in sequential order starting from 0.
  // Errors do not increment the index; if error is not NULL, *error will hold
  // the error message from the parser.
  int Add(const StringPiece& pattern, std::string* error);

  // Compiles the set in preparation for matching.
  // Returns false if the compiler runs out of memory.
  // Add() must not be called again after Compile().
  // Compile() must be called before Match().
  bool Compile();

  // Returns true if text matches at least one of the regexps in the set.
  // Fills v (if not NULL) with the indices of the matching regexps.
  // Callers must not expect v to be sorted.
  bool Match(const StringPiece& text, std::vector<int>* v) const;

  // As above, but populates error_info (if not NULL) when none of the regexps
  // in the set matched. This can inform callers when DFA execution fails, for
  // example, because they might wish to handle that case differently.
  bool Match(const StringPiece& text, std::vector<int>* v,
             ErrorInfo* error_info) const;

 private:
  // A pattern's source text paired with its parsed form.
  typedef std::pair<std::string, duckdb_re2::Regexp*> Elem;

  RE2::Options options_;
  RE2::Anchor anchor_;
  std::vector<Elem> elem_;
  bool compiled_;
  int size_;
  duckdb_base_std::unique_ptr<duckdb_re2::Prog> prog_;
};
} // namespace duckdb_re2
#endif // RE2_SET_H_

View File

@@ -0,0 +1,665 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace duckdb_re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
std::string* dst, RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
//
// Returns true for leaf operators, for Concat/Alternate/Capture whose
// children are all simple, for a CharClass that is neither empty nor full,
// and for Star/Plus/Quest of a simple child that is not itself a
// Star/Plus/Quest, EmptyMatch or NoMatch. Repeat is never simple.
bool Regexp::ComputeSimple() {
  Regexp** subs;
  switch (op_) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      return true;
    case kRegexpConcat:
    case kRegexpAlternate:
      // These are simple as long as the subpieces are simple.
      subs = sub();
      for (int i = 0; i < nsub_; i++)
        if (!subs[i]->simple())
          return false;
      return true;
    case kRegexpCharClass:
      // Simple as long as the char class is not empty, not full.
      // The builder, when present, takes precedence over the built class.
      if (arguments.char_class.ccb_ != NULL)
        return !arguments.char_class.ccb_->empty() && !arguments.char_class.ccb_->full();
      return !arguments.char_class.cc_->empty() && !arguments.char_class.cc_->full();
    case kRegexpCapture:
      subs = sub();
      return subs[0]->simple();
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      subs = sub();
      if (!subs[0]->simple())
        return false;
      // A repetition applied to one of these operators is not simple.
      switch (subs[0]->op_) {
        case kRegexpStar:
        case kRegexpPlus:
        case kRegexpQuest:
        case kRegexpEmptyMatch:
        case kRegexpNoMatch:
          return false;
        default:
          break;
      }
      return true;
    case kRegexpRepeat:
      return false;
  }
  // op_ is stored as a uint8_t, which a stream would print as a raw
  // character; cast it so the log shows the numeric opcode.
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << static_cast<int>(op_);
  return false;
}
// Walker subclass used by Simplify.
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
// PostVisit creates the coalesced result, which should then be simplified.
class CoalesceWalker : public Regexp::Walker<Regexp*> {
 public:
  CoalesceWalker() {}

  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
                            Regexp** child_args, int nchild_args);
  virtual Regexp* Copy(Regexp* re);
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

 private:
  // The helpers below are members of CoalesceWalker so that
  // they can edit the private fields of the Regexps they construct.

  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
  // the parse flags are consistent. (They will not be checked again later.)
  static bool CanCoalesce(Regexp* r1, Regexp* r2);

  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
  // will be empty match and the coalesced op. In other cases, where part of a
  // literal string was removed to be coalesced, the array elements afterwards
  // will be the coalesced op and the remainder of the literal string.
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);

  // Not copyable.
  CoalesceWalker(const CoalesceWalker&) = delete;
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
};
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
 public:
  SimplifyWalker() {}

  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
                            Regexp** child_args, int nchild_args);
  virtual Regexp* Copy(Regexp* re);
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

 private:
  // The helpers below are members of SimplifyWalker so that
  // they can edit the private fields of the Regexps they construct.

  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
  // Caller must Decref return value when done with it.
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

  // Simplifies the expression re{min,max} in terms of *, +, and ?.
  // Returns a new regexp. Does not edit re. Does not consume reference to re.
  // Caller must Decref return value when done with it.
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
                                Regexp::ParseFlags parse_flags);

  // Simplifies a character class by expanding any named classes
  // into rune ranges. Does not edit re. Does not consume ref to re.
  // Caller must Decref return value when done with it.
  static Regexp* SimplifyCharClass(Regexp* re);

  // Not copyable.
  SimplifyWalker(const SimplifyWalker&) = delete;
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
};
// Simplifies a regular expression, returning a new regexp.
// The new regexp uses traditional Unix egrep features only,
// plus the Perl (?:) non-capturing parentheses.
// Otherwise, no POSIX or Perl additions. The new regexp
// captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit current object.
// Caller must Decref() return value when done with it.
// Returns NULL if either walk yields no result or stops early.
Regexp* Regexp::Simplify() {
  // First pass: coalesce.
  CoalesceWalker coalescer;
  Regexp* coalesced = coalescer.Walk(this, NULL);
  if (coalesced == NULL)
    return NULL;
  if (coalescer.stopped_early()) {
    coalesced->Decref();
    return NULL;
  }

  // Second pass: simplify the coalesced regexp, which is no longer needed
  // once the walk has finished.
  SimplifyWalker simplifier;
  Regexp* simplified = simplifier.Walk(coalesced, NULL);
  coalesced->Decref();
  if (simplified != NULL && simplifier.stopped_early()) {
    simplified->Decref();
    simplified = NULL;
  }
  return simplified;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Utility function for PostVisit implementations that compares re->sub() with
// child_args to determine whether any child_args changed. In the common case,
// where nothing changed, calls Decref() for all child_args and returns false,
// so PostVisit must return re->Incref(). Otherwise, returns true and leaves
// the references in child_args untouched.
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
  const int nsub = re->nsub();
  Regexp** original_subs = re->sub();

  for (int i = 0; i < nsub; i++) {
    if (child_args[i] != original_subs[i])
      return true;
  }

  // Every child is unchanged: release the references held in child_args.
  for (int i = 0; i < nsub; i++)
    child_args[i]->Decref();
  return false;
}
// Copying a regexp just takes another reference to it.
Regexp* CoalesceWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is called anyway, it logs (unless built for fuzzing) and returns a
// new reference to re.
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
#endif
  Regexp* unchanged = re->Incref();
  return unchanged;
}
// Builds the coalesced version of re from its already-visited children.
//
// Nodes without children are returned as they are (with a new reference).
// For anything other than a concatenation the node is reused when no child
// changed and rebuilt around the new children otherwise. For a concatenation,
// adjacent children that CanCoalesce() accepts are merged with DoCoalesce()
// and the empty matches left behind are dropped from the rebuilt node.
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  const int nsub = re->nsub();
  if (nsub == 0)
    return re->Incref();

  if (re->op() != kRegexpConcat) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed. Build a new op.
    Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
    rebuilt->AllocSub(nsub);
    Regexp** rebuilt_subs = rebuilt->sub();
    for (int i = 0; i < nsub; i++)
      rebuilt_subs[i] = child_args[i];

    // Repeats and Captures have additional data that must be copied.
    switch (re->op()) {
      case kRegexpRepeat:
        rebuilt->arguments.repeat.min_ = re->min();
        rebuilt->arguments.repeat.max_ = re->max();
        break;
      case kRegexpCapture:
        rebuilt->arguments.capture.cap_ = re->cap();
        break;
      default:
        break;
    }
    return rebuilt;
  }

  // Concatenation: look for the first adjacent pair that can be coalesced.
  bool can_coalesce = false;
  for (int i = 0; i + 1 < nsub && !can_coalesce; i++)
    can_coalesce = CanCoalesce(child_args[i], child_args[i+1]);

  if (!can_coalesce) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed. Build a new op.
    Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
    rebuilt->AllocSub(nsub);
    Regexp** rebuilt_subs = rebuilt->sub();
    for (int i = 0; i < nsub; i++)
      rebuilt_subs[i] = child_args[i];
    return rebuilt;
  }

  // Coalesce left to right; DoCoalesce edits child_args in place, so each
  // pair is re-tested against the current contents.
  for (int i = 0; i + 1 < nsub; i++) {
    if (CanCoalesce(child_args[i], child_args[i+1]))
      DoCoalesce(&child_args[i], &child_args[i+1]);
  }

  // Determine how many empty matches were left by DoCoalesce.
  int nempty = 0;
  for (int i = 0; i < nsub; i++) {
    if (child_args[i]->op() == kRegexpEmptyMatch)
      nempty++;
  }

  // Build a new op from the children that are not empty matches.
  Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
  rebuilt->AllocSub(nsub - nempty);
  Regexp** rebuilt_subs = rebuilt->sub();
  int next = 0;
  for (int i = 0; i < nsub; i++) {
    if (child_args[i]->op() != kRegexpEmptyMatch) {
      rebuilt_subs[next] = child_args[i];
      next++;
    } else {
      child_args[i]->Decref();
    }
  }
  return rebuilt;
}
// Reports whether the adjacent concatenation children r1 and r2 can be merged
// into a single repeat by DoCoalesce().
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  // r1 has to be a star/plus/quest/repeat ...
  switch (r1->op()) {
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      break;
    default:
      return false;
  }

  // ... whose operand is a literal, char class, any char or any byte.
  Regexp* atom = r1->sub()[0];
  switch (atom->op()) {
    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      break;
    default:
      return false;
  }

  // Option 1: r2 is a star/plus/quest/repeat of the same operand, with the
  // same greediness.
  const bool r2_is_repetition = r2->op() == kRegexpStar ||
                                r2->op() == kRegexpPlus ||
                                r2->op() == kRegexpQuest ||
                                r2->op() == kRegexpRepeat;
  if (r2_is_repetition &&
      Regexp::Equal(atom, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy)))
    return true;

  // Option 2: r2 is a single occurrence of that same operand.
  if (Regexp::Equal(atom, r2))
    return true;

  // Option 3: the operand is a literal and r2 is a literal string that starts
  // with it, with the same case-folding setting.
  return atom->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == atom->rune() &&
         ((atom->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
// Merges the adjacent pair *r1ptr, *r2ptr into one repeat node and rewrites
// both slots in place.
//
// A new kRegexpRepeat node (nre) is created over r1's operand with r1's parse
// flags; its min/max start from r1's operator and are then widened by r2.
// A max of -1 is kept as -1 (never incremented) throughout.
//
// On success the two slots become either
//   (empty match, nre)               -- r2 was absorbed completely, or
//   (nre, remaining literal string)  -- only a prefix of r2 was absorbed,
// and the references to the original r1 and r2 are released.
// If r1 or r2 has an unexpected op, nre is released, a DFATAL message is
// logged, and both slots (and their references) are left untouched.
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
Regexp* r1 = *r1ptr;
Regexp* r2 = *r2ptr;
// Placeholder bounds (0, 0); the real values are filled in below.
Regexp* nre = Regexp::Repeat(
r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
// Seed min/max from r1's operator.
switch (r1->op()) {
case kRegexpStar:
nre->arguments.repeat.min_ = 0;
nre->arguments.repeat.max_ = -1;
break;
case kRegexpPlus:
nre->arguments.repeat.min_ = 1;
nre->arguments.repeat.max_ = -1;
break;
case kRegexpQuest:
nre->arguments.repeat.min_ = 0;
nre->arguments.repeat.max_ = 1;
break;
case kRegexpRepeat:
nre->arguments.repeat.min_ = r1->min();
nre->arguments.repeat.max_ = r1->max();
break;
default:
nre->Decref();
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
return;
}
// Fold r2's contribution into min/max.
switch (r2->op()) {
case kRegexpStar:
nre->arguments.repeat.max_ = -1;
goto LeaveEmpty;
case kRegexpPlus:
nre->arguments.repeat.min_++;
nre->arguments.repeat.max_ = -1;
goto LeaveEmpty;
case kRegexpQuest:
if (nre->max() != -1)
nre->arguments.repeat.max_++;
goto LeaveEmpty;
case kRegexpRepeat:
nre->arguments.repeat.min_ += r2->min();
if (r2->max() == -1)
nre->arguments.repeat.max_ = -1;
else if (nre->max() != -1)
nre->arguments.repeat.max_ += r2->max();
goto LeaveEmpty;
case kRegexpLiteral:
case kRegexpCharClass:
case kRegexpAnyChar:
case kRegexpAnyByte:
// A single occurrence adds exactly one to both bounds.
nre->arguments.repeat.min_++;
if (nre->max() != -1)
nre->arguments.repeat.max_++;
goto LeaveEmpty;
// Shared exit for every case in which r2 is absorbed completely: the first
// slot becomes an empty match and the second slot receives the merged repeat.
LeaveEmpty:
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
*r2ptr = nre;
break;
case kRegexpLiteralString: {
Rune r = r1->sub()[0]->rune();
// Determine how much of the literal string is removed.
// We know that we have at least one rune. :)
int n = 1;
while (n < r2->nrunes() && r2->runes()[n] == r)
n++;
nre->arguments.repeat.min_ += n;
if (nre->max() != -1)
nre->arguments.repeat.max_ += n;
// The whole string consisted of r: nothing of r2 remains.
if (n == r2->nrunes())
goto LeaveEmpty;
// Otherwise keep the repeat first, followed by the unabsorbed suffix.
*r1ptr = nre;
*r2ptr = Regexp::LiteralString(
&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
break;
}
default:
nre->Decref();
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
return;
}
// The slots now hold replacement nodes; drop the references to the originals.
r1->Decref();
r2->Decref();
}
// Returns re itself with one additional reference (Incref) instead of
// building a deep copy.
// NOTE(review): presumably invoked by the Walker base class when it needs a
// duplicate of an already-computed result -- confirm against the walker code.
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
// Fallback visit that is not expected to run for this walker.
// Logs a DFATAL message (unless FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is
// defined) and returns re with one additional reference.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
#endif
return re->Incref();
}
// Pre-order hook. A regexp already flagged as simple needs no work: sets
// *stop to true and returns re with an extra reference. Otherwise returns
// NULL and leaves *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple())
    return NULL;
  *stop = true;
  return re->Incref();
}
// Post-order hook: produces the simplified form of re from its children in
// child_args and returns a regexp whose reference the caller owns.
//
// Every reference in child_args is consumed on each path: it is stored in the
// new node, released with Decref(), or returned directly.
// Most branches set simple_ on the node they return; the branches that return
// a child directly (newsub) do not touch its simple_ flag.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// ChildArgsChanged() has already released the child references when it
// returns false, so re can be reused as-is.
if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
// Some child was rewritten: build a same-op node over the new children.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
// Child unchanged: release its extra reference and reuse re.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// Child rewritten: new capture node with the same capture index.
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->arguments.capture.cap_ = re->cap();
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
// (The child already has the same operator and flags as re, so the child
// itself is returned instead of wrapping it again.)
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// SimplifyRepeat() does not consume the reference to newsub, so it is
// released here once the expansion has been built.
Regexp* nre = SimplifyRepeat(newsub, re->arguments.repeat.min_, re->arguments.repeat.max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
// Reached only for an op that none of the cases above handles.
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Builds the two-element concatenation "re1 re2" with the given parse flags.
// Takes over the caller's references to re1 and re2; the caller owns the
// reference to the returned node.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                Regexp::ParseFlags parse_flags) {
  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
  pair->AllocSub(2);
  pair->sub()[0] = re1;
  pair->sub()[1] = re2;
  return pair;
}
// Rewrites re{min,max} using only concatenation, *, + and ?.
// Returns a new regexp whose reference the caller must release with Decref().
// re itself is neither modified nor is the caller's reference to it consumed.
// A max of -1 means "no upper bound".
// Note that ToString() followed by a re-parse does not necessarily round-trip
// the capture numbering: (x){2} becomes (x)(x), yet in the Regexp*
// representation both copies of (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  // Unbounded repetition: x{n,}.
  if (max == -1) {
    switch (min) {
      case 0:  // x{0,} is x*
        return Regexp::Star(re->Incref(), f);
      case 1:  // x{1,} is x+
        return Regexp::Plus(re->Incref(), f);
      default:
        break;
    }
    // In general x{n,} is n-1 copies of x followed by x+, e.g. x{4,} = xxxx+.
    PODArray<Regexp*> pieces(min);
    int k = 0;
    while (k < min - 1)
      pieces[k++] = re->Incref();
    pieces[min - 1] = Regexp::Plus(re->Incref(), f);
    return Regexp::Concat(pieces.data(), min, f);
  }

  // x{0} matches only the empty string.
  if (min == 0 && max == 0)
    return new Regexp(kRegexpEmptyMatch, f);

  // x{1} is x itself.
  if (min == 1 && max == 1)
    return re->Incref();

  // Bounded repetition x{n,m}: n mandatory copies followed by m-n optional
  // ones. The optional copies are nested, x{2,5} = xx(x(x(x)?)?)?, so that the
  // machine does less work.
  Regexp* result = NULL;

  // Mandatory prefix: xx
  if (min > 0) {
    PODArray<Regexp*> pieces(min);
    for (int k = 0; k < min; k++)
      pieces[k] = re->Incref();
    result = Regexp::Concat(pieces.data(), min, f);
  }

  // Optional suffix, built from the inside out: (x(x(x)?)?)?
  if (max > min) {
    Regexp* tail = Regexp::Quest(re->Incref(), f);
    for (int k = min + 1; k < max; k++)
      tail = Regexp::Quest(Concat2(re->Incref(), tail, f), f);
    result = (result == NULL) ? tail : Concat2(result, tail, f);
  }

  if (result != NULL)
    return result;

  // Some degenerate case, like min > max, or min < max < 0.
  // This shouldn't happen, because the parser rejects such regexps.
  LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
  return new Regexp(kRegexpNoMatch, f);
}
// Replaces a degenerate character class with a cheaper equivalent: an empty
// class becomes "no match" and a full class becomes "any char". Any other
// class is returned as re with an extra reference.
// The caller must Decref() the returned regexp when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
  CharClass* klass = re->cc();
  if (klass->empty()) {
    return new Regexp(kRegexpNoMatch, re->parse_flags());
  } else if (klass->full()) {
    return new Regexp(kRegexpAnyChar, re->parse_flags());
  }
  return re->Incref();
}
} // namespace re2

View File

@@ -0,0 +1,392 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SPARSE_ARRAY_H_
#define RE2_SPARSE_ARRAY_H_
// DESCRIPTION
//
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
// fast iteration through the elements in the array and fast clearing
// of the array. The array has a concept of certain elements being
// uninitialized (having no value).
//
// Insertion and deletion are constant time operations.
//
// Allocating the array is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the array is a constant time operation (unusual!).
//
// Iterating through the array is an O(n) operation, where n
// is the number of items in the array (not O(m)).
//
// The array iterator visits entries in the order they were first
// inserted into the array. It is safe to add items to the array while
// using an iterator: the iterator will visit indices added to the array
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseArray can be a convenient
// implementation of a work queue.
//
// The SparseArray implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the array. (Typically
// these arrays are temporary values and used in situations where speed is
// important.)
//
// The SparseArray interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// Briggs & Torczon popularized this technique, but it had been known
// long before their paper. They point out that Aho, Hopcroft, and
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
// 1986 Programming Pearls both hint at the technique in exercises to the
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
// exercise 8).
//
// Briggs & Torczon describe a sparse set implementation. I have
// trivially generalized it to create a sparse array (actually the original
// target of the AHU and Bentley exercises).
// IMPLEMENTATION
//
// SparseArray is an array dense_ and an array sparse_ of identical size.
// At any point, the number of elements in the sparse array is size_.
//
// The array dense_ contains the size_ elements in the sparse array (with
// their indices),
// in the order that the elements were first inserted. This array is dense:
// the size_ pairs are dense_[0] through dense_[size_-1].
//
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
// For indices present in the array, dense_[sparse_[i]].index_ == i.
// For indices not present in the array, sparse_ can contain any value at all,
// perhaps outside the range [0, size_) but perhaps not.
//
// The lax requirement on sparse_ values makes clearing the array very easy:
// set size_ to 0. Lookups are slightly more complicated.
// An index i has a value in the array if and only if:
// sparse_[i] is in [0, size_) AND
// dense_[sparse_[i]].index_ == i.
// If both these properties hold, only then it is safe to refer to
// dense_[sparse_[i]].value_
// as the value associated with index i.
//
// To insert a new entry, set sparse_[i] to size_,
// initialize dense_[size_], and then increment size_.
//
// To make the sparse array as efficient as possible for non-primitive types,
// elements may or may not be destroyed when they are deleted from the sparse
// array through a call to resize(). They immediately become inaccessible, but
// they are only guaranteed to be destroyed when the SparseArray destructor is
// called.
//
// A moved-from SparseArray will be empty.
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include <assert.h>
#include <stdint.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <utility>
#include "re2/pod_array.h"
namespace duckdb_re2 {
// SparseArray<Value> maps int indices in [0, max_size()) to Value objects.
// Storage is two PODArrays of equal length: sparse_ (index -> position in
// dense_) and dense_ (IndexValue pairs in insertion order); size_ is the
// number of live entries. clear() only resets size_.
template<typename Value>
class SparseArray {
public:
SparseArray();
explicit SparseArray(int max_size);
~SparseArray();
// IndexValue pairs: exposed in SparseArray::iterator.
class IndexValue;
// Iterators are plain pointers into dense_.
typedef IndexValue* iterator;
typedef const IndexValue* const_iterator;
SparseArray(const SparseArray& src);
SparseArray(SparseArray&& src);
SparseArray& operator=(const SparseArray& src);
SparseArray& operator=(SparseArray&& src);
// Return the number of entries in the array.
int size() const {
return size_;
}
// Indicate whether the array is empty.
// Note: the result is returned as int (1 or 0), not bool.
int empty() const {
return size_ == 0;
}
// Iterate over the array.
// The range [begin(), end()) covers dense_[0] through dense_[size_-1].
iterator begin() {
return dense_.data();
}
iterator end() {
return dense_.data() + size_;
}
const_iterator begin() const {
return dense_.data();
}
const_iterator end() const {
return dense_.data() + size_;
}
// Change the maximum size of the array.
// Invalidates all iterators.
void resize(int new_max_size);
// Return the maximum size of the array.
// Indices can be in the range [0, max_size).
// Returns 0 when dense_ has no storage.
int max_size() const {
if (dense_.data() != NULL)
return dense_.size();
else
return 0;
}
// Clear the array.
// Only resets the entry count; the storage is left as it is.
void clear() {
size_ = 0;
}
// Check whether index i is in the array.
bool has_index(int i) const;
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(const IndexValue& a, const IndexValue& b);
public:
// Set the value at index i to v.
// Creates the entry first if i is not present yet.
iterator set(int i, const Value& v) {
return SetInternal(true, i, v);
}
// Set the value at new index i to v.
// Fast but unsafe: only use if has_index(i) is false.
iterator set_new(int i, const Value& v) {
return SetInternal(false, i, v);
}
// Set the value at index i to v.
// Fast but unsafe: only use if has_index(i) is true.
iterator set_existing(int i, const Value& v) {
return SetExistingInternal(i, v);
}
// Get the value at index i.
// Fast but unsafe: only use if has_index(i) is true.
Value& get_existing(int i) {
assert(has_index(i));
return dense_[sparse_[i]].value_;
}
const Value& get_existing(int i) const {
assert(has_index(i));
return dense_[sparse_[i]].value_;
}
private:
// Shared implementation of set() and set_new().
// allow_existing == true: i may already be present (set()).
// allow_existing == false: i must be absent (set_new()); this is only
// checked by an assert.
// Returns an iterator to the entry for i, or begin() if i is out of range.
iterator SetInternal(bool allow_existing, int i, const Value& v) {
DebugCheckInvariants();
// A single unsigned comparison rejects both negative and too-large indices.
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!allow_existing) {
assert(!has_index(i));
create_index(i);
} else {
if (!has_index(i))
create_index(i);
}
return SetExistingInternal(i, v);
}
// Overwrites the value of an entry that must already exist and returns an
// iterator to it.
iterator SetExistingInternal(int i, const Value& v) {
DebugCheckInvariants();
assert(has_index(i));
dense_[sparse_[i]].value_ = v;
DebugCheckInvariants();
return dense_.data() + sparse_[i];
}
// Add the index i to the array.
// Only use if has_index(i) is known to be false.
// Since it doesn't set the value associated with i,
// this function is private, only intended as a helper
// for other methods.
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
// Initializes memory for elements [min, max).
// Under MemorySanitizer, sparse_[min, max) is unpoisoned; when
// RE2_ON_VALGRIND is defined it is filled with 0xabababab; otherwise this
// function does nothing.
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
// Number of live entries; the live pairs are dense_[0] .. dense_[size_-1].
int size_ = 0;
// sparse_[i] is the position in dense_ of index i when i is present.
PODArray<int> sparse_;
// The (index, value) pairs, in the order the indices were first inserted.
PODArray<IndexValue> dense_;
};
// Default constructor: compiler-generated; size_ takes its in-class
// initializer (0) and both arrays are default-constructed.
template<typename Value>
SparseArray<Value>::SparseArray() = default;
// Copy constructor: allocates arrays with src's capacity and copies all
// src.max_size() slots of both arrays (not only the first size_ entries),
// so the copy has the same sparse_/dense_ contents as src.
template<typename Value>
SparseArray<Value>::SparseArray(const SparseArray& src)
: size_(src.size_),
sparse_(src.max_size()),
dense_(src.max_size()) {
std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data());
std::copy_n(src.dense_.data(), src.max_size(), dense_.data());
}
// Move constructor: takes over src's arrays and entry count, then resets
// src.size_ to 0 so that the moved-from array is empty.
template<typename Value>
SparseArray<Value>::SparseArray(SparseArray&& src)
: size_(src.size_),
sparse_(std::move(src.sparse_)),
dense_(std::move(src.dense_)) {
src.size_ = 0;
}
// Copy assignment: makes *this a copy of src, including src's capacity.
// All src.max_size() slots of both arrays are copied, not only the first
// size_ entries.
//
// The copies are completed in temporaries before anything in *this is
// modified. This keeps *this untouched if an allocation throws, and makes
// self-assignment safe: when &src == this, src's arrays are still intact
// while they are being read.
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
  const int n = src.max_size();
  // Construct and fill these first for exception and self-assignment safety.
  PODArray<int> a(n);
  PODArray<IndexValue> b(n);
  std::copy_n(src.sparse_.data(), n, a.data());
  std::copy_n(src.dense_.data(), n, b.data());
  // Commit.
  size_ = src.size_;
  sparse_ = std::move(a);
  dense_ = std::move(b);
  return *this;
}
// Move assignment: takes over src's arrays and entry count, then resets
// src.size_ to 0 so that the moved-from array is empty.
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) {
size_ = src.size_;
sparse_ = std::move(src.sparse_);
dense_ = std::move(src.dense_);
src.size_ = 0;
return *this;
}
// IndexValue pairs: exposed in SparseArray::iterator.
// Each pair holds an index together with the value stored for it. The index
// is read-only from outside (only SparseArray, a friend, writes index_); the
// value can be modified through the non-const value() accessor.
// Neither member has an initializer.
template<typename Value>
class SparseArray<Value>::IndexValue {
public:
int index() const { return index_; }
Value& value() { return value_; }
const Value& value() const { return value_; }
private:
friend class SparseArray;
int index_;
Value value_;
};
// Sets the maximum size of the array to at least new_max_size.
// When new_max_size exceeds the current capacity, both arrays are reallocated
// and their old contents carried over; otherwise the storage is kept.
// In either case the entry count is capped at new_max_size.
// Invalidates all iterators.
template<typename Value>
void SparseArray<Value>::resize(int new_max_size) {
  DebugCheckInvariants();
  const int old_max_size = max_size();
  if (old_max_size < new_max_size) {
    // Allocate both replacements before touching the members, so that a
    // failed allocation leaves the array unchanged.
    PODArray<int> grown_sparse(new_max_size);
    PODArray<IndexValue> grown_dense(new_max_size);
    std::copy_n(sparse_.data(), old_max_size, grown_sparse.data());
    std::copy_n(dense_.data(), old_max_size, grown_dense.data());
    sparse_ = std::move(grown_sparse);
    dense_ = std::move(grown_dense);
    MaybeInitializeMemory(old_max_size, new_max_size);
  }
  size_ = std::min(size_, new_max_size);
  DebugCheckInvariants();
}
// Reports whether index i currently has an entry.
// Out-of-range indices trip an assert in debug builds and yield false
// otherwise.
template<typename Value>
bool SparseArray<Value>::has_index(int i) const {
  assert(i >= 0);
  assert(i < max_size());
  // One unsigned comparison rejects both negative and too-large indices.
  const uint32_t capacity = static_cast<uint32_t>(max_size());
  if (static_cast<uint32_t>(i) >= capacity)
    return false;
  // sparse_[i] may hold an arbitrary value for an absent index, so it is only
  // trusted if it points at a live dense_ slot that refers back to i.
  // The unsigned comparison also covers sparse_[i] < 0.
  const int slot = sparse_[i];
  if (static_cast<uint32_t>(slot) >= static_cast<uint32_t>(size_))
    return false;
  return dense_[slot].index_ == i;
}
// Appends an entry for index i at the end of dense_ and points sparse_[i] at
// it. The entry's value is not set here. Requires that i is absent and that
// there is spare capacity (both only checked by asserts).
template<typename Value>
void SparseArray<Value>::create_index(int i) {
  assert(!has_index(i));
  assert(size_ < max_size());
  const int slot = size_++;
  sparse_[i] = slot;
  dense_[slot].index_ = i;
}
// Constructs an empty array that accepts indices in [0, max_size).
// size_ is 0 here (in-class initializer), so MaybeInitializeMemory() covers
// the whole of sparse_.
template<typename Value> SparseArray<Value>::SparseArray(int max_size) :
sparse_(max_size), dense_(max_size) {
MaybeInitializeMemory(size_, max_size);
DebugCheckInvariants();
}
// Destructor: only re-checks the invariants; the PODArray members release
// their own storage.
template<typename Value> SparseArray<Value>::~SparseArray() {
DebugCheckInvariants();
}
// Asserts that the entry count lies in [0, max_size()].
// Consists solely of assert() calls, so it does nothing when NDEBUG is
// defined.
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
assert(0 <= size_);
assert(size_ <= max_size());
}
// Comparison function for sorting.
// Orders IndexValue pairs by ascending index; the values are not compared.
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
const IndexValue& b) {
return a.index_ < b.index_;
}
} // namespace duckdb_re2
#endif // RE2_SPARSE_ARRAY_H_

View File

@@ -0,0 +1,264 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SPARSE_SET_H_
#define RE2_SPARSE_SET_H_
// DESCRIPTION
//
// SparseSet(m) is a set of integers in [0, m).
// It requires sizeof(int)*m memory, but it provides
// fast iteration through the elements in the set and fast clearing
// of the set.
//
// Insertion and deletion are constant time operations.
//
// Allocating the set is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the set is a constant time operation (unusual!).
//
// Iterating through the set is an O(n) operation, where n
// is the number of items in the set (not O(m)).
//
// The set iterator visits entries in the order they were first
// inserted into the set. It is safe to add items to the set while
// using an iterator: the iterator will visit indices added to the set
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseSet can be a convenient
// implementation of a work queue.
//
// The SparseSet implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the set. (Typically
// these sets are temporary values and used in situations where speed is
// important.)
//
// The SparseSet interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// This is a specialization of sparse array; see sparse_array.h.
// IMPLEMENTATION
//
// See sparse_array.h for implementation details.
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include <assert.h>
#include <stdint.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <utility>
#include "re2/pod_array.h"
namespace duckdb_re2 {
// SparseSetT is a set of int indices in [0, max_size()).
// Storage is two PODArray<int> of equal length: sparse_ (index -> position in
// dense_) and dense_ (the member indices in insertion order); size_ is the
// number of members. clear() only resets size_.
// The template parameter Value is not referenced by any member declared in
// this class body.
template<typename Value>
class SparseSetT {
public:
SparseSetT();
explicit SparseSetT(int max_size);
~SparseSetT();
// Iterators are plain pointers into dense_.
typedef int* iterator;
typedef const int* const_iterator;
// Return the number of entries in the set.
int size() const {
return size_;
}
// Indicate whether the set is empty.
// Note: the result is returned as int (1 or 0), not bool.
int empty() const {
return size_ == 0;
}
// Iterate over the set.
// The range [begin(), end()) covers dense_[0] through dense_[size_-1].
iterator begin() {
return dense_.data();
}
iterator end() {
return dense_.data() + size_;
}
const_iterator begin() const {
return dense_.data();
}
const_iterator end() const {
return dense_.data() + size_;
}
// Change the maximum size of the set.
// Invalidates all iterators.
void resize(int new_max_size);
// Return the maximum size of the set.
// Indices can be in the range [0, max_size).
// Returns 0 when dense_ has no storage.
int max_size() const {
if (dense_.data() != NULL)
return dense_.size();
else
return 0;
}
// Clear the set.
// Only resets the entry count; the storage is left as it is.
void clear() {
size_ = 0;
}
// Check whether index i is in the set.
bool contains(int i) const;
// Comparison function for sorting.
// Can sort the sparse set so that future iterations
// will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(int a, int b);
public:
// Insert index i into the set.
// Does nothing if i is already a member.
iterator insert(int i) {
return InsertInternal(true, i);
}
// Insert index i into the set.
// Fast but unsafe: only use if contains(i) is false.
iterator insert_new(int i) {
return InsertInternal(false, i);
}
private:
// Shared implementation of insert() and insert_new().
// allow_existing == true: i may already be a member (insert()).
// allow_existing == false: i must be absent (insert_new()); this is only
// checked by an assert.
// Returns an iterator to the entry for i, or begin() if i is out of range.
iterator InsertInternal(bool allow_existing, int i) {
DebugCheckInvariants();
// A single unsigned comparison rejects both negative and too-large indices.
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!allow_existing) {
assert(!contains(i));
create_index(i);
} else {
if (!contains(i))
create_index(i);
}
DebugCheckInvariants();
return dense_.data() + sparse_[i];
}
// Add the index i to the set.
// Only use if contains(i) is known to be false.
// This function is private, only intended as a helper
// for other methods.
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
// Initializes memory for elements [min, max).
// Under MemorySanitizer, sparse_[min, max) is unpoisoned; when
// RE2_ON_VALGRIND is defined it is filled with 0xabababab; otherwise this
// function does nothing.
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
// Number of members; the members are dense_[0] .. dense_[size_-1].
int size_ = 0;
// sparse_[i] is the position in dense_ of index i when i is a member.
PODArray<int> sparse_;
// The member indices, in the order they were first inserted.
PODArray<int> dense_;
};
// Default constructor: compiler-generated; size_ takes its in-class
// initializer (0) and both arrays are default-constructed.
template<typename Value>
SparseSetT<Value>::SparseSetT() = default;
// Sets the maximum size of the set to at least new_max_size.
// When new_max_size exceeds the current capacity, both arrays are reallocated
// and their old contents carried over; otherwise the storage is kept.
// In either case the entry count is capped at new_max_size.
// Invalidates all iterators.
template<typename Value>
void SparseSetT<Value>::resize(int new_max_size) {
  DebugCheckInvariants();
  const int old_max_size = max_size();
  if (old_max_size < new_max_size) {
    // Allocate both replacements before touching the members, so that a
    // failed allocation leaves the set unchanged.
    PODArray<int> grown_sparse(new_max_size);
    PODArray<int> grown_dense(new_max_size);
    std::copy_n(sparse_.data(), old_max_size, grown_sparse.data());
    std::copy_n(dense_.data(), old_max_size, grown_dense.data());
    sparse_ = std::move(grown_sparse);
    dense_ = std::move(grown_dense);
    MaybeInitializeMemory(old_max_size, new_max_size);
  }
  size_ = std::min(size_, new_max_size);
  DebugCheckInvariants();
}
// Reports whether index i is currently a member of the set.
// Out-of-range indices trip an assert in debug builds and yield false
// otherwise.
template<typename Value>
bool SparseSetT<Value>::contains(int i) const {
  assert(i >= 0);
  assert(i < max_size());
  // One unsigned comparison rejects both negative and too-large indices.
  const uint32_t capacity = static_cast<uint32_t>(max_size());
  if (static_cast<uint32_t>(i) >= capacity)
    return false;
  // sparse_[i] is only trusted if it points at a live dense_ slot that refers
  // back to i. The unsigned comparison also covers sparse_[i] < 0.
  const int slot = sparse_[i];
  if (static_cast<uint32_t>(slot) >= static_cast<uint32_t>(size_))
    return false;
  return dense_[slot] == i;
}
// Appends index i at the end of dense_ and points sparse_[i] at it.
// Requires that i is absent and that there is spare capacity (both only
// checked by asserts).
template<typename Value>
void SparseSetT<Value>::create_index(int i) {
  assert(!contains(i));
  assert(size_ < max_size());
  const int slot = size_++;
  sparse_[i] = slot;
  dense_[slot] = i;
}
// Constructs an empty set that accepts indices in [0, max_size).
// size_ is 0 here (in-class initializer), so MaybeInitializeMemory() covers
// the whole of sparse_.
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) :
sparse_(max_size), dense_(max_size) {
MaybeInitializeMemory(size_, max_size);
DebugCheckInvariants();
}
// Destructor: only re-checks the invariants; the PODArray members release
// their own storage.
template<typename Value> SparseSetT<Value>::~SparseSetT() {
DebugCheckInvariants();
}
// Asserts that the entry count lies in [0, max_size()].
// Consists solely of assert() calls, so it does nothing when NDEBUG is
// defined.
template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
assert(0 <= size_);
assert(size_ <= max_size());
}
// Comparison function for sorting.
// Plain ascending order on the stored indices.
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
return a < b;
}
typedef SparseSetT<void> SparseSet;
} // namespace duckdb_re2
#endif // RE2_SPARSE_SET_H_

View File

@@ -0,0 +1,65 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include <ostream>
#include "util/util.h"
namespace duckdb_re2 {
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
// Copies up to n characters, starting at offset pos, into buf.
// Returns the number of characters copied, which is min(n, size() - pos).
// No terminating NUL is written.
//
// If pos is at or beyond the end of the piece, nothing is copied and 0 is
// returned. Without this check, size_ - pos would wrap around for pos > size_
// (size_type is unsigned) and memcpy would read far outside the piece.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  if (pos >= size_)
    return 0;
  size_type ret = std::min(size_ - pos, n);
  // Skip the call for a zero-length copy so that memcpy never sees a
  // possibly-null buf.
  if (ret != 0)
    memcpy(buf, data_ + pos, ret);
  return ret;
}
// Returns the sub-piece that starts at offset pos and is at most n characters
// long. Both values are clamped to the piece: a pos beyond the end yields an
// empty piece positioned at the end, and n is cut to what remains after pos.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, size_);
  const size_type count = std::min(n, size_ - start);
  return StringPiece(data_ + start, count);
}
// Returns the offset of the first occurrence of s at or after pos, or npos if
// there is none or if pos lies beyond the end of the piece.
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  if (pos > size_)
    return npos;
  const_pointer haystack_end = data_ + size_;
  const_pointer hit =
      std::search(data_ + pos, haystack_end, s.data_, s.data_ + s.size_);
  const size_type offset = hit - data_;
  // std::search returns haystack_end when nothing matched; in that case the
  // match would not fit inside the piece.
  if (offset + s.size_ > size_)
    return npos;
  return offset;
}
// Returns the offset of the first occurrence of c at or after pos, or npos if
// there is none.
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  // size_type is unsigned, so this also covers the empty piece.
  if (pos >= size_)
    return npos;
  const_pointer stop = data_ + size_;
  const_pointer hit = std::find(data_ + pos, stop, c);
  if (hit == stop)
    return npos;
  return hit - data_;
}
// Returns the offset of the last occurrence of s that starts at or before
// pos, or npos if there is none. An empty s matches at min(size(), pos).
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  if (size_ < s.size_)
    return npos;
  if (s.size_ == 0)
    return std::min(size_, pos);
  // The last admissible start is min(size_ - s.size_, pos); the search window
  // ends where a match starting there would end.
  const size_type last_start = std::min(size_ - s.size_, pos);
  const_pointer window_end = data_ + last_start + s.size_;
  const_pointer hit =
      std::find_end(data_, window_end, s.data_, s.data_ + s.size_);
  return hit == window_end ? npos : static_cast<size_type>(hit - data_);
}
// Returns the offset of the last occurrence of c at or before pos, or npos if
// there is none. A pos at or beyond the end (including npos) searches the
// whole piece.
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (size_ == 0)
    return npos;
  // Clamp pos to the last valid offset *before* adding one. Computing
  // pos + 1 first wraps around to 0 when pos == npos, which would skip the
  // loop entirely and report "not found" for every search from the end.
  for (size_type i = std::min(pos, size_ - 1) + 1; i != 0;) {
    if (data_[--i] == c)
      return i;
  }
  return npos;
}
// Writes the characters of p to o and returns o.
// Uses the unformatted ostream::write(), so exactly p.size() characters are
// emitted, including any embedded NUL characters.
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
o.write(p.data(), p.size());
return o;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,217 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_STRINGPIECE_H_
#define RE2_STRINGPIECE_H_
#ifdef min
#undef min
#endif
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#include <stddef.h>
#include <string.h>
#include <algorithm>
#include <iosfwd>
#include <iterator>
#include <string>
#ifdef __cpp_lib_string_view
#include <string_view>
#endif
namespace duckdb_re2 {
// A non-owning reference to a sized run of chars (pointer + length).
//
// StringPiece never copies or frees the bytes it refers to, so the buffer it
// was built from must remain valid for as long as the piece is used.  The
// referenced bytes need not be NUL-terminated.  A default-constructed piece,
// or one built from a NULL "const char*", has a NULL data() and size() == 0.
class StringPiece {
 public:
  typedef std::char_traits<char> traits_type;
  typedef char value_type;
  typedef char* pointer;
  typedef const char* const_pointer;
  typedef char& reference;
  typedef const char& const_reference;
  typedef const char* const_iterator;
  typedef const_iterator iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef const_reverse_iterator reverse_iterator;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;

  // Sentinel meaning "no position" / "until the end".
  static const size_type npos = static_cast<size_type>(-1);

  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece()
      : data_(NULL), size_(0) {}
#ifdef __cpp_lib_string_view
  StringPiece(const std::string_view& str)
      : data_(str.data()), size_(str.size()) {}
#endif
  StringPiece(const std::string& str)
      : data_(str.data()), size_(str.size()) {}
  // A NULL pointer yields an empty piece; otherwise the length is strlen(str).
  StringPiece(const char* str)
      : data_(str), size_(str == NULL ? 0 : strlen(str)) {}
  // Refers to exactly `len` bytes starting at `str`; no terminator is needed.
  StringPiece(const char* str, size_type len)
      : data_(str), size_(len) {}

  // Iteration over the referenced bytes.
  const_iterator begin() const { return data_; }
  const_iterator end() const { return data_ + size_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(data_ + size_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(data_);
  }

  size_type size() const { return size_; }
  size_type length() const { return size_; }
  bool empty() const { return size_ == 0; }

  // Unchecked element access: `i` is not compared against size().
  const_reference operator[](size_type i) const { return data_[i]; }
  const_pointer data() const { return data_; }

  // Drops the first `n` bytes.  `n` is not checked against size().
  void remove_prefix(size_type n) {
    data_ += n;
    size_ -= n;
  }

  // Drops the last `n` bytes.  `n` is not checked against size().
  void remove_suffix(size_type n) {
    size_ -= n;
  }

  // Re-points this piece at a NUL-terminated string (NULL gives size 0).
  void set(const char* str) {
    data_ = str;
    size_ = str == NULL ? 0 : strlen(str);
  }

  // Re-points this piece at `len` bytes starting at `str`.
  void set(const char* str, size_type len) {
    data_ = str;
    size_ = len;
  }

#ifdef __cpp_lib_string_view
  // Converts to `std::basic_string_view`.
  operator std::basic_string_view<char, traits_type>() const {
    if (!data_) return {};
    return std::basic_string_view<char, traits_type>(data_, size_);
  }
#endif

  // Converts to `std::basic_string`.
  template <typename A>
  explicit operator std::basic_string<char, traits_type, A>() const {
    if (!data_) return {};
    return std::basic_string<char, traits_type, A>(data_, size_);
  }

  // Returns a std::string holding a copy of the referenced bytes.
  // A piece whose data() is NULL yields an empty string rather than handing
  // a null pointer to the std::string constructor, matching the conversion
  // operators above.
  std::string as_string() const {
    if (!data_) return std::string();
    return std::string(data_, size_);
  }

  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return as_string();
  }

  // Replaces the contents of *target with a copy of the referenced bytes.
  // A piece whose data() is NULL clears *target.
  void CopyToString(std::string* target) const {
    if (!data_) {
      target->clear();
      return;
    }
    target->assign(data_, size_);
  }

  // Appends a copy of the referenced bytes to *target.
  // A piece whose data() is NULL appends nothing.
  void AppendToString(std::string* target) const {
    if (!data_) return;
    target->append(data_, size_);
  }

  // Defined out of line.
  size_type copy(char* buf, size_type n, size_type pos = 0) const;
  StringPiece substr(size_type pos = 0, size_type n = npos) const;

  // Three-way bytewise comparison: returns -1, 0 or 1.  The common prefix is
  // compared with memcmp; if it is equal the shorter piece orders first.
  int compare(const StringPiece& x) const {
    size_type min_size = std::min(size(), x.size());
    if (min_size > 0) {
      int r = memcmp(data(), x.data(), min_size);
      if (r < 0) return -1;
      if (r > 0) return 1;
    }
    if (size() < x.size()) return -1;
    if (size() > x.size()) return 1;
    return 0;
  }

  // Does "this" start with "x"?
  // The empty check comes first so memcmp is never reached for an empty x.
  bool starts_with(const StringPiece& x) const {
    return x.empty() ||
           (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
  }

  // Does "this" end with "x"?
  bool ends_with(const StringPiece& x) const {
    return x.empty() ||
           (size() >= x.size() &&
            memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
  }

  // True when find(s) reports a position other than npos.
  bool contains(const StringPiece& s) const {
    return find(s) != npos;
  }

  // Search functions, defined out of line.
  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

 private:
  const_pointer data_;  // First referenced byte; may be NULL.  Not owned.
  size_type size_;      // Number of referenced bytes.
};
// Two pieces are equal when they have the same length and the same bytes.
// memcmp is skipped for empty pieces and for pieces sharing a data pointer.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  const StringPiece::size_type n = x.size();
  if (n != y.size())
    return false;
  if (n == 0 || x.data() == y.data())
    return true;
  return memcmp(x.data(), y.data(), n) == 0;
}
// Negation of operator==.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  const bool same = (x == y);
  return !same;
}
// Bytewise lexicographic ordering: the common prefix decides; when it is
// equal (or empty) the shorter piece orders first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const StringPiece::size_type common = std::min(x.size(), y.size());
  if (common != 0) {
    const int order = memcmp(x.data(), y.data(), common);
    if (order != 0)
      return order < 0;
  }
  return x.size() < y.size();
}
// x > y exactly when y orders before x.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  const bool y_first = (y < x);
  return y_first;
}
// x <= y exactly when x is not greater than y.
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  if (x > y)
    return false;
  return true;
}
// x >= y exactly when x does not order before y.
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  if (x < y)
    return false;
  return true;
}
// Allow StringPiece to be logged.
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
} // namespace duckdb_re2
#endif // RE2_STRINGPIECE_H_

View File

@@ -0,0 +1,351 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include <string.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace duckdb_re2 {
// Precedence contexts used while rendering a Regexp as a string, ordered
// from the tightest-binding context (PrecAtom) to the loosest
// (PrecToplevel).  The values are compared with '<', so the numeric order
// is significant.
enum {
  PrecAtom = 0,
  PrecUnary = 1,
  PrecConcat = 2,
  PrecAlternate = 3,
  PrecEmpty = 4,
  PrecParen = 5,
  PrecToplevel = 6,
};
// Helper function. See description below.
static void AppendCCRange(std::string* t, Rune lo, Rune hi);
// Walker that renders the visited Regexp tree as regular expression syntax,
// appending the text to the string supplied at construction.
// The int threaded through the walk is the precedence context of the
// enclosing node; the values returned by PostVisit and ShortVisit are
// always 0 and carry no information.
class ToStringWalker : public Regexp::Walker<int> {
 public:
  explicit ToStringWalker(std::string* t) : t_(t) {}

  int PreVisit(Regexp* re, int parent_arg, bool* stop) override;
  int PostVisit(Regexp* re, int parent_arg, int pre_arg,
                int* child_args, int nchild_args) override;

  // Appends nothing for `re` and returns 0.
  int ShortVisit(Regexp* re, int parent_arg) override {
    return 0;
  }

 private:
  // Not copyable.
  ToStringWalker(const ToStringWalker&) = delete;
  ToStringWalker& operator=(const ToStringWalker&) = delete;

  std::string* t_;  // The string the walker appends to.
};
std::string Regexp::ToString() {
std::string t;
ToStringWalker w(&t);
w.WalkExponential(this, PrecToplevel, 100000);
if (w.stopped_early())
t += " [truncated]";
return t;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Called on re before its children are processed.
// Emits the opening delimiter re needs in the precedence context given by
// parent_arg, and returns the precedence context for re's children.
// *stop is never written.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
  const int context = parent_arg;

  switch (re->op()) {
    case kRegexpConcat:
    case kRegexpLiteralString:
      if (context < PrecConcat)
        t_->append("(?:");
      return PrecConcat;

    case kRegexpAlternate:
      if (context < PrecAlternate)
        t_->append("(?:");
      return PrecAlternate;

    case kRegexpCapture:
      t_->append("(");
      if (re->cap() == 0)
        LOG(DFATAL) << "kRegexpCapture cap() == 0";
      if (re->name()) {
        t_->append("?P<");
        t_->append(*re->name());
        t_->append(">");
      }
      return PrecParen;

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      if (context < PrecUnary)
        t_->append("(?:");
      // The subprecedence here is PrecAtom instead of PrecUnary
      // because PCRE treats two unary ops in a row as a parse error.
      return PrecAtom;

    // Childless nodes: nothing to emit before the (non-existent) children.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpCharClass:
    case kRegexpHaveMatch:
      return PrecAtom;
  }

  // Any op not listed above is treated like an atom.
  return PrecAtom;
}
// Appends the literal rune r to *t in a form usable outside a character
// class.  Regexp metacharacters are backslash-escaped; when foldcase is set
// an ASCII lower-case letter is written as the two-letter class "[Xx]";
// everything else is delegated to AppendCCRange(t, r, r).
static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
  const bool needs_escape =
      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r) != NULL;
  if (needs_escape) {
    t->push_back('\\');
    t->push_back(static_cast<char>(r));
    return;
  }

  if (foldcase && r >= 'a' && r <= 'z') {
    const char lower = static_cast<char>(r);
    const char upper = static_cast<char>(r - ('a' - 'A'));
    t->push_back('[');
    t->push_back(upper);
    t->push_back(lower);
    t->push_back(']');
    return;
  }

  AppendCCRange(t, r, r);
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
//
// parent_arg is the precedence context re appears in.  pre_arg, child_args
// and nchild_args are not used.  Always returns 0.
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
                              int* child_args, int nchild_args) {
  int prec = parent_arg;
  switch (re->op()) {
    case kRegexpNoMatch:
      // There's no simple symbol for "no match", but
      // [^0-Runemax] excludes everything.
      t_->append("[^\\x00-\\x{10ffff}]");
      break;

    case kRegexpEmptyMatch:
      // Append (?:) to make empty string visible,
      // unless this is already being parenthesized.
      if (prec < PrecEmpty)
        t_->append("(?:)");
      break;

    case kRegexpLiteral:
      AppendLiteral(t_, re->rune(),
                    (re->parse_flags() & Regexp::FoldCase) != 0);
      break;

    case kRegexpLiteralString:
      for (int i = 0; i < re->nrunes(); i++)
        AppendLiteral(t_, re->runes()[i],
                      (re->parse_flags() & Regexp::FoldCase) != 0);
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpConcat:
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpAlternate:
      // Clumsy but workable: the children all appended |
      // at the end of their strings, so just remove the last one.
      // Check for an empty string first so that size()-1 cannot wrap
      // around, and log the string itself rather than the pointer to it.
      if (!t_->empty() && t_->back() == '|')
        t_->pop_back();
      else
        LOG(DFATAL) << "Bad final char: " << *t_;
      if (prec < PrecAlternate)
        t_->append(")");
      break;

    case kRegexpStar:
      t_->append("*");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpPlus:
      t_->append("+");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpQuest:
      t_->append("?");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpRepeat:
      // max() == -1 is rendered as an open-ended repeat.
      if (re->max() == -1)
        t_->append(StringPrintf("{%d,}", re->min()));
      else if (re->min() == re->max())
        t_->append(StringPrintf("{%d}", re->min()));
      else
        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpAnyChar:
      t_->append(".");
      break;

    case kRegexpAnyByte:
      t_->append("\\C");
      break;

    case kRegexpBeginLine:
      t_->append("^");
      break;

    case kRegexpEndLine:
      t_->append("$");
      break;

    case kRegexpBeginText:
      t_->append("(?-m:^)");
      break;

    case kRegexpEndText:
      if (re->parse_flags() & Regexp::WasDollar)
        t_->append("(?-m:$)");
      else
        t_->append("\\z");
      break;

    case kRegexpWordBoundary:
      t_->append("\\b");
      break;

    case kRegexpNoWordBoundary:
      t_->append("\\B");
      break;

    case kRegexpCharClass: {
      if (re->cc()->size() == 0) {
        t_->append("[^\\x00-\\x{10ffff}]");
        break;
      }
      t_->append("[");
      // Heuristic: show class as negated if it contains the
      // non-character 0xFFFE and yet somehow isn't full.
      CharClass* cc = re->cc();
      if (cc->Contains(0xFFFE) && !cc->full()) {
        cc = cc->Negate();
        t_->append("^");
      }
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
        AppendCCRange(t_, i->lo, i->hi);
      // Only the class obtained from Negate() above is released here.
      if (cc != re->cc())
        cc->Delete();
      t_->append("]");
      break;
    }

    case kRegexpCapture:
      t_->append(")");
      break;

    case kRegexpHaveMatch:
      // There's no syntax accepted by the parser to generate
      // this node (it is generated by RE2::Set) so make something
      // up that is readable but won't compile.
      t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
      break;
  }

  // If the parent is an alternation, append the | for it.
  if (prec == PrecAlternate)
    t_->append("|");

  return 0;
}
// Appends rune r to *t in a form usable inside a character class.
// Printable ASCII is written as-is (with a backslash before any of
// [ ] ^ - \), CR/TAB/LF/FF use their C-style escapes, and every other
// value is written as a hex escape: \xNN below 0x100, \x{N...} otherwise.
static void AppendCCChar(std::string* t, Rune r) {
  const bool printable_ascii = r >= 0x20 && r <= 0x7E;
  if (printable_ascii) {
    if (strchr("[]^-\\", r) != NULL)
      t->push_back('\\');
    t->push_back(static_cast<char>(r));
    return;
  }

  const char* escape = NULL;
  if (r == '\r')
    escape = "\\r";
  else if (r == '\t')
    escape = "\\t";
  else if (r == '\n')
    escape = "\\n";
  else if (r == '\f')
    escape = "\\f";
  if (escape != NULL) {
    t->append(escape);
    return;
  }

  const int code = static_cast<int>(r);
  if (r < 0x100)
    t->append(StringPrintf("\\x%02x", code));
  else
    t->append(StringPrintf("\\x{%x}", code));
}
// Appends the character-class range lo-hi to *t using AppendCCChar for each
// endpoint.  A single-rune range (lo == hi) is written as just that rune,
// and an inverted range (lo > hi) appends nothing.
static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
  if (hi < lo)
    return;
  AppendCCChar(t, lo);
  if (lo == hi)
    return;
  t->push_back('-');
  AppendCCChar(t, hi);
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,596 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace duckdb_re2 {
// 1424 groups, 2878 pairs, 367 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 604, 604, 42319 },
{ 608, 608, -205 },
{ 609, 609, 42315 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 618, 618, 42308 },
{ 619, 619, 10743 },
{ 620, 620, 42305 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 642, 642, 42307 },
{ 643, 643, -218 },
{ 647, 647, 42282 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 669, 669, 42261 },
{ 670, 670, 42258 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1011, 1011, -116 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1073, -32 },
{ 1074, 1074, 6222 },
{ 1075, 1075, -32 },
{ 1076, 1076, 6221 },
{ 1077, 1085, -32 },
{ 1086, 1086, 6212 },
{ 1087, 1088, -32 },
{ 1089, 1090, 6210 },
{ 1091, 1097, -32 },
{ 1098, 1098, 6204 },
{ 1099, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1122, EvenOdd },
{ 1123, 1123, 6180 },
{ 1124, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1327, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 4304, 4346, 3008 },
{ 4349, 4351, 3008 },
{ 5024, 5103, 38864 },
{ 5104, 5109, 8 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6254 },
{ 7297, 7297, -6253 },
{ 7298, 7298, -6244 },
{ 7299, 7299, -6242 },
{ 7300, 7300, EvenOdd },
{ 7301, 7301, -6243 },
{ 7302, 7302, -6236 },
{ 7303, 7303, -6181 },
{ 7304, 7304, 35266 },
{ 7312, 7354, -3008 },
{ 7357, 7359, -3008 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7566, 7566, 35384 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11311, 48 },
{ 11312, 11359, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11506, 11507, EvenOdd },
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42570, EvenOdd },
{ 42571, 42571, -35267 },
{ 42572, 42605, EvenOdd },
{ 42624, 42651, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42900, 42900, 48 },
{ 42902, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42947, EvenOdd },
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 42951, 42954, OddEven },
{ 42960, 42961, EvenOdd },
{ 42966, 42969, EvenOdd },
{ 42997, 42998, OddEven },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
{ 66736, 66771, 40 },
{ 66776, 66811, -40 },
{ 66928, 66938, 39 },
{ 66940, 66954, 39 },
{ 66956, 66962, 39 },
{ 66964, 66965, 39 },
{ 66967, 66977, -39 },
{ 66979, 66993, -39 },
{ 66995, 67001, -39 },
{ 67003, 67004, -39 },
{ 68736, 68786, 64 },
{ 68800, 68850, -64 },
{ 71840, 71871, 32 },
{ 71872, 71903, -32 },
{ 93760, 93791, 32 },
{ 93792, 93823, -32 },
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
const int num_unicode_casefold = 367;
// 1424 groups, 1454 pairs, 205 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1326, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6222 },
{ 7297, 7297, -6221 },
{ 7298, 7298, -6212 },
{ 7299, 7300, -6210 },
{ 7301, 7301, -6211 },
{ 7302, 7302, -6204 },
{ 7303, 7303, -6180 },
{ 7304, 7304, 35267 },
{ 7312, 7354, -3008 },
{ 7357, 7359, -3008 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11311, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42650, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42902, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42946, EvenOddSkip },
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 42951, 42953, OddEvenSkip },
{ 42960, 42960, EvenOdd },
{ 42966, 42968, EvenOddSkip },
{ 42997, 42997, OddEven },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
{ 66736, 66771, 40 },
{ 66928, 66938, 39 },
{ 66940, 66954, 39 },
{ 66956, 66962, 39 },
{ 66964, 66965, 39 },
{ 68736, 68786, 64 },
{ 71840, 71871, 32 },
{ 93760, 93791, 32 },
{ 125184, 125217, 34 },
};
const int num_unicode_tolower = 205;
} // namespace re2

View File

@@ -0,0 +1,78 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_CASEFOLD_H_
#define RE2_UNICODE_CASEFOLD_H_
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding. The largest
// point wraps back to the first. For example, the tables map:
//
// 'A' -> 'a'
// 'a' -> 'A'
//
// 'K' -> 'k'
// 'k' -> U+212A (Kelvin symbol)
// U+212A -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB. If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace duckdb_re2 {
// Special values for CaseFold::delta.
// EvenOdd marks even/odd pairs (if even, add 1; if odd, subtract 1) and
// OddEven marks odd/even pairs.
// NOTE(review): the *Skip variants presumably apply the same rule to every
// other code point in the range - confirm against ApplyFold.
enum {
  EvenOdd = 1,
  OddEven = -1,
  EvenOddSkip = 1 << 30,
  OddEvenSkip = (1 << 30) + 1,
};
// One entry of a case-folding table: a range of code points together with
// the delta that maps each of them to its folded counterpart.
struct CaseFold {
  Rune lo;        // First code point of the range.
  Rune hi;        // Last code point of the range (e.g. { 'A', 'Z', +32 }).
  int32_t delta;  // Offset to add, or one of the EvenOdd/OddEven markers.
};
extern const CaseFold unicode_casefold[];
extern const int num_unicode_casefold;
extern const CaseFold unicode_tolower[];
extern const int num_unicode_tolower;
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace duckdb_re2
#endif // RE2_UNICODE_CASEFOLD_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,67 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_GROUPS_H_
#define RE2_UNICODE_GROUPS_H_
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges gives 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace duckdb_re2 {
// A range of code points whose endpoints fit in 16 bits.
struct URange16 {
  uint16_t lo;  // Low end of the range.
  uint16_t hi;  // High end of the range.
};
// A range of code points stored as full Runes.
struct URange32 {
  Rune lo;  // Low end of the range.
  Rune hi;  // High end of the range.
};
// A named character group, described by a table of 16-bit ranges plus a
// table of 32-bit ranges.
struct UGroup {
  const char *name;     // Group name used for lookup.
  int sign;             // +1 for [abc], -1 for [^abc]
  const URange16 *r16;  // 16-bit ranges.
  int nr16;             // Presumably the number of entries in r16.
  const URange32 *r32;  // 32-bit ranges.
  int nr32;             // Presumably the number of entries in r32.
};
// Named by property or script name (e.g., "Nd", "N", "Han").
// Negated groups are not included.
extern const UGroup unicode_groups[];
extern const int num_unicode_groups;
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
// Negated groups are included.
extern const UGroup posix_groups[];
extern const int num_posix_groups;
// Named by Perl name (e.g., "\\d", "\\D").
// Negated groups are included.
extern const UGroup perl_groups[];
extern const int num_perl_groups;
} // namespace duckdb_re2
#endif // RE2_UNICODE_GROUPS_H_

View File

@@ -0,0 +1,247 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_WALKER_INL_H_
#define RE2_WALKER_INL_H_
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
// and after visiting the subexpressions.
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#include <stack>
#include "util/logging.h"
#include "re2/regexp.h"
namespace duckdb_re2 {
template<typename T> struct WalkState;
// Traverses a Regexp tree using an explicit stack instead of recursion.
// Subclasses override PreVisit/PostVisit (and must provide ShortVisit);
// T is the type of the value threaded through the visits.
template<typename T> class Regexp::Walker {
 public:
  Walker();
  virtual ~Walker();

  // Hook called on re before its children are visited.
  // The returned value is owned by the caller; it is handed to PostVisit
  // as pre_arg and to the children's PreVisit/PostVisit as parent_arg.
  // For the top-most Regexp, parent_arg is the arg passed to the walk.
  // Setting *stop to true keeps the walk from descending into re's
  // children; the value returned here is then used as if it were the
  // return value of PostVisit.
  // The default implementation returns parent_arg.
  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);

  // Hook called on re after its children have been visited.
  // pre_arg is the T that PreVisit returned; child_args holds the Ts that
  // the children's PostVisits returned.  PostVisit takes ownership of
  // pre_arg and of the Ts in *child_args, but not of the array itself,
  // and passes ownership of its return value to its caller.
  // The default implementation returns pre_arg.
  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
                      T* child_args, int nchild_args);

  // Hook called to copy a T when Walk notices that more than one child
  // is the same re.
  virtual T Copy(T arg);

  // Hook called to do a "quick visit" of re without visiting its children.
  // Only used once the visit budget has been used up and the walk is being
  // aborted as quickly as possible.  The result should make sense to the
  // parent PostVisits that are still to be run.
  // This is (hopefully) only reached from WalkExponential, but every
  // client must implement it, just in case.
  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;

  // Walks over a regular expression.
  // top_arg is passed as parent_arg to PreVisit and PostVisit of re.
  // Returns the T returned by PostVisit on re.
  T Walk(Regexp* re, T top_arg);

  // Like Walk, but does not use Copy.  On cross-linked Regexps such as
  // those generated by Simplify this can take exponential time, so at most
  // max_visits nodes are visited before the walk is cut off.
  // When the walk *is* cut off, ShortVisit(re) is called on the regexps
  // that cannot be fully visited, instead of PreVisit/PostVisit.
  T WalkExponential(Regexp* re, T top_arg, int max_visits);

  // Clears the stack.  Should never be necessary, since
  // Walk always enters and exits with an empty stack.
  // Logs DFATAL if stack is not already clear.
  void Reset();

  // Returns whether walk was cut off.
  bool stopped_early() { return stopped_early_; }

 private:
  // Walk state for the entire traversal.
  std::stack<WalkState<T>> stack_;
  bool stopped_early_;
  int max_visits_;

  T WalkInternal(Regexp* re, T top_arg, bool use_copy);

  // Not copyable.
  Walker(const Walker&) = delete;
  Walker& operator=(const Walker&) = delete;
};
// Default pre-order hook: does no work of its own and simply forwards the
// argument received from the parent. The node and the stop flag are not
// consulted, so *stop keeps whatever value the caller stored in it.
template<typename T>
T Regexp::Walker<T>::PreVisit(Regexp* /*re*/, T parent_arg, bool* /*stop*/) {
  return parent_arg;
}
// Default post-order hook: ignores the node, the parent argument and the
// children's results, and hands back the value PreVisit produced.
template<typename T>
T Regexp::Walker<T>::PostVisit(Regexp* /*re*/, T /*parent_arg*/, T pre_arg,
                               T* /*child_args*/, int /*nchild_args*/) {
  return pre_arg;
}
// Default Copy hook: returns its by-value argument unchanged.
// WalkInternal calls this (when use_copy is set) to reuse the previous
// child's result for an adjacent child that is the same Regexp pointer.
template<typename T> T Regexp::Walker<T>::Copy(T arg) {
return arg;
}
// State about a single level in the traversal.
//
// One WalkState is pushed on the walker's stack for every Regexp node that
// is currently being visited. It is copied by value into the stack, so every
// member is given a definite value here: pre_arg and child_arg are
// value-initialized (zero for scalar T, default-constructed for class T)
// instead of being left indeterminate until the walk assigns them.
template<typename T> struct WalkState {
  WalkState(Regexp* re, T parent)
    : re(re),
      n(-1),
      parent_arg(parent),
      pre_arg(),
      child_arg(),
      child_args(NULL) { }

  Regexp* re;      // The regexp
  int n;           // The index of the next child to process; -1 means need to PreVisit
  T parent_arg;    // Argument handed down from the parent level.
  T pre_arg;       // Result of PreVisit for this node.
  T child_arg;     // One-element buffer for child_args.
  T* child_args;   // NULL, &child_arg (one child), or a new[] array (several).
};
// Constructs an idle walker: nothing has been cut short yet and the visit
// budget is zero until Walk()/WalkExponential() set it. Both scalar members
// are initialized here so that neither is ever read while indeterminate.
template<typename T> Regexp::Walker<T>::Walker()
    : stopped_early_(false),
      max_visits_(0) {
}
// Destructor: delegates to Reset(), which pops anything still on stack_
// and frees the child_args arrays of multi-child entries.
// NOTE(review): Reset() issues LOG(DFATAL) when the stack is not empty; if
// the fatal logger in use throws (as the one in util/logging.h appears to in
// debug builds), that exception would escape a destructor — confirm.
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
//
// The leftover entries are released *before* the problem is reported:
// LOG(DFATAL) is not guaranteed to return normally (in this tree the fatal
// logger throws in debug builds), and reporting first would leave the stack
// populated and the child_args arrays allocated.
template<typename T> void Regexp::Walker<T>::Reset() {
  if (stack_.empty())
    return;

  while (!stack_.empty()) {
    // Only nodes with more than one child own a heap-allocated array;
    // single-child nodes point at their inline child_arg buffer.
    if (stack_.top().re->nsub_ > 1)
      delete[] stack_.top().child_args;
    stack_.pop();
  }
  LOG(DFATAL) << "Stack not empty.";
}
// Shared implementation of Walk() and WalkExponential().
//
// Performs an iterative (explicit-stack) depth-first traversal of re:
//   - PreVisit is called when a node is first reached,
//   - each child is then walked in order,
//   - PostVisit is called once all children have produced a result.
// top_arg is the parent_arg given to the root. When use_copy is true, a child
// that is the same pointer as the immediately preceding child is not walked
// again; Copy() of the previous child's result is used instead.
// Each PreVisit attempt consumes one unit of max_visits_; once the budget is
// exhausted stopped_early_ is set and ShortVisit replaces the full visit.
// Returns the value produced for the root node, or top_arg if re is NULL.
template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
bool use_copy) {
// Start from a clean stack (Reset complains if it was not already empty).
Reset();
if (re == NULL) {
LOG(DFATAL) << "Walk NULL";
return top_arg;
}
stack_.push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
// t receives the finished result for the node on top of the stack.
T t;
s = &stack_.top();
re = s->re;
switch (s->n) {
case -1: {
// First time at this node: charge the visit budget.
if (--max_visits_ < 0) {
stopped_early_ = true;
t = ShortVisit(re, s->parent_arg);
break;
}
bool stop = false;
s->pre_arg = PreVisit(re, s->parent_arg, &stop);
if (stop) {
// PreVisit asked to skip the children; its result stands for the node.
t = s->pre_arg;
break;
}
s->n = 0;
s->child_args = NULL;
// One child: use the inline one-element buffer. Several: heap array,
// released below after PostVisit (or by Reset()).
if (re->nsub_ == 1)
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
FALLTHROUGH_INTENDED;
}
default: {
if (re->nsub_ > 0) {
Regexp** sub = re->sub();
if (s->n < re->nsub_) {
if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
// Same subexpression as the previous child: reuse its result.
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
// Descend into the next child; its result is stored after the pop below.
stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
}
// All children done (or there were none): finish this node.
t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
if (re->nsub_ > 1)
delete[] s->child_args;
break;
}
}
// We've finished stack_.top().
// Update next guy down.
stack_.pop();
if (stack_.empty())
return t;
s = &stack_.top();
// Record the child's result in the parent's slot and advance to the next child.
if (s->child_args != NULL)
s->child_args[s->n] = t;
else
s->child_arg = t;
s->n++;
}
}
// Standard walk: repeated adjacent children are handled through Copy(),
// so the traversal does not blow up on cross-linked expressions.
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
  // With copying enabled this budget is far more than any regexp should
  // need, while still bounding the CPU time a walk can consume.
  const int kVisitBudget = 1000000;
  const bool kUseCopy = true;

  max_visits_ = kVisitBudget;
  return WalkInternal(re, top_arg, kUseCopy);
}
// Walk without the Copy() shortcut. The caller supplies the visit budget;
// once it is used up the remaining nodes get ShortVisit instead.
template<typename T>
T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, int max_visits) {
  const bool kUseCopy = false;

  max_visits_ = max_visits;
  return WalkInternal(re, top_arg, kUseCopy);
}
} // namespace re2
#endif // RE2_WALKER_INL_H_

View File

@@ -0,0 +1,111 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_LOGGING_H_
#define UTIL_LOGGING_H_
// Simplified version of Google's logging.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <ostream>
#include <sstream>
#include <stdexcept>
#include "util/util.h"
// Debug-only checking.
// Each DCHECK* forwards to assert(), so it compiles away under NDEBUG.
#define DCHECK(condition) assert(condition)
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
// Always-on checking
// When the condition is false a LogMessageFatal is constructed; the
// "if(x){}else" shape lets callers append "<< extra" after the macro.
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK_LT(x, y) CHECK((x) < (y))
#define CHECK_GT(x, y) CHECK((x) > (y))
#define CHECK_LE(x, y) CHECK((x) <= (y))
#define CHECK_GE(x, y) CHECK((x) >= (y))
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
// Severity -> logger object. INFO, WARNING and ERROR all map to the same
// LogMessage type; FATAL and QFATAL map to LogMessageFatal.
#define RE2_LOG_INFO LogMessage(__FILE__, __LINE__)
#define RE2_LOG_WARNING LogMessage(__FILE__, __LINE__)
#define RE2_LOG_ERROR LogMessage(__FILE__, __LINE__)
#define RE2_LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define RE2_LOG_QFATAL RE2_LOG_FATAL
// It seems that one of the Windows header files defines ERROR as 0.
// NOTE(review): LOG() below pastes onto the RE2_LOG_ prefix, so this LOG_0
// alias does not match anything LOG() can produce — confirm whether it was
// meant to be RE2_LOG_0 or can be dropped.
#ifdef _WIN32
#define LOG_0 RE2_LOG_INFO
#endif
// DFATAL: fatal in debug builds, an ordinary error message under NDEBUG.
#ifdef NDEBUG
#define RE2_LOG_DFATAL RE2_LOG_ERROR
#else
#define RE2_LOG_DFATAL RE2_LOG_FATAL
#endif
// LOG(severity) yields the std::ostream of a temporary logger object.
#define LOG(severity) RE2_LOG_ ## severity.stream()
// VLOG(x) only reaches the stream when x <= 0.
#define VLOG(x) if((x)>0){}else RE2_LOG_INFO.stream()
// Message sink behind the LOG()/VLOG() macros.
//
// Text streamed into stream() is collected in an in-memory buffer. In this
// copy the output path is switched off: the constructor does not write a
// "file:line: " prefix and Flush() does not write the buffer to stderr, so
// collected text is simply dropped when the object is destroyed.
class LogMessage {
 public:
  LogMessage(const char* /*file*/, int /*line*/)
      : flushed_(false) {
  }

  // Intentionally empty (see class comment); flushed_ is left untouched.
  void Flush() {
  }

  ~LogMessage() {
    if (flushed_)
      return;
    Flush();
  }

  // Stream that callers append their message to.
  std::ostream& stream() {
    return str_;
  }

 private:
  bool flushed_;
  std::ostringstream str_;

  LogMessage(const LogMessage&) = delete;
  LogMessage& operator=(const LogMessage&) = delete;
};
// Silence "destructor never returns" warning for ~LogMessageFatal().
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4722)
#endif
// Logger used for FATAL severity and failed CHECKs.
// Instead of printing and aborting, the constructor throws
// std::runtime_error("RE2 Fatal Error") right after the LogMessage base is
// built. Because the throw happens inside the constructor, anything the
// caller would have streamed after construction is never evaluated.
class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) {
throw std::runtime_error("RE2 Fatal Error");
}
// Only runs for a fully constructed object; the constructor above always
// throws, so this is not reached through normal construction.
~LogMessageFatal() {
Flush();
}
private:
LogMessageFatal(const LogMessageFatal&) = delete;
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#endif // UTIL_LOGGING_H_

View File

@@ -0,0 +1,41 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MIX_H_
#define UTIL_MIX_H_
#include <stddef.h>
#include <limits>
namespace duckdb_re2 {
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4309)
#endif
// Incremental hash combiner over size_t values.
//
// Starts from 1 (or from val + 83 when seeded) and folds each new value in
// with a multiply, a 19-bit left rotation, and an add.
class HashMix {
 public:
  HashMix() : hash_(1) {}
  explicit HashMix(size_t val) : hash_(val + 83) {}

  // Folds val into the running hash.
  void Mix(size_t val) {
    static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
    const int kRotate = 19;
    const int kWidth = std::numeric_limits<size_t>::digits;

    const size_t scaled = hash_ * kMul;
    const size_t rotated = (scaled << kRotate) | (scaled >> (kWidth - kRotate));
    hash_ = rotated + val;
  }

  // Current hash value.
  size_t get() const { return hash_; }

 private:
  size_t hash_;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
} // namespace duckdb_re2
#endif // UTIL_MIX_H_

View File

@@ -0,0 +1,165 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MUTEX_H_
#define UTIL_MUTEX_H_
/*
* A simple mutex wrapper, supporting locks and read-write locks.
* You should assume the locks are *not* re-entrant.
*/
// Backend selection. Exactly one of the MUTEX_IS_* macros is defined, or
// none, in which case the std::shared_mutex fallback below is used.
#ifdef RE2_NO_THREADS
// Threading disabled: the "mutex" is a plain counter checked with assert().
#include <assert.h>
#define MUTEX_IS_LOCK_COUNTER
#else
#ifdef _WIN32
// Requires Windows Vista or Windows Server 2008 at minimum.
#include <windows.h>
#if defined(WINVER) && WINVER >= 0x0600
#define MUTEX_IS_WIN32_SRWLOCK
#endif
#else
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif
#include <unistd.h>
// <unistd.h> advertises reader-writer lock support through this macro.
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
#define MUTEX_IS_PTHREAD_RWLOCK
#endif
#endif
#endif
// MutexType is the native lock object wrapped by class Mutex.
#if defined(MUTEX_IS_LOCK_COUNTER)
typedef int MutexType;
#elif defined(MUTEX_IS_WIN32_SRWLOCK)
typedef SRWLOCK MutexType;
#elif defined(MUTEX_IS_PTHREAD_RWLOCK)
#include <stdexcept>
#include <pthread.h>
#include <stdlib.h>
typedef pthread_rwlock_t MutexType;
#else
// Fallback when none of the backends above was selected.
#include <shared_mutex>
typedef std::shared_mutex MutexType;
#endif
namespace duckdb_re2 {
// Thin wrapper around the platform lock selected as MutexType.
// Offers an exclusive lock (Lock/Unlock, WriterLock/WriterUnlock) and a
// shared lock (ReaderLock/ReaderUnlock). Not copyable.
class Mutex {
public:
inline Mutex();
inline ~Mutex();
inline void Lock(); // Block if needed until free then acquire exclusively
inline void Unlock(); // Release a lock acquired via Lock()
// Note that on systems that don't support read-write locks, these may
// be implemented as synonyms to Lock() and Unlock(). So you can use
// these for efficiency, but don't use them anyplace where being able
// to do shared reads is necessary to avoid deadlock.
inline void ReaderLock(); // Block until free or shared then acquire a share
inline void ReaderUnlock(); // Release a read share of this Mutex
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
private:
// The underlying native lock object.
MutexType mutex_;
// Catch the error of writing Mutex when intending MutexLock.
// (Private and only declared here, so "Mutex m(&other);" is rejected.)
Mutex(Mutex *ignored);
Mutex(const Mutex&) = delete;
Mutex& operator=(const Mutex&) = delete;
};
// Per-backend definitions of the Mutex member functions declared above.
#if defined(MUTEX_IS_LOCK_COUNTER)

// Counter backend: 0 = free, -1 = held exclusively, >0 = number of readers.
// All state changes live inside assert(), so under NDEBUG every operation
// (including the counter updates) compiles away.
Mutex::Mutex()             : mutex_(0) { }
Mutex::~Mutex()            { assert(mutex_ == 0); }
void Mutex::Lock()         { assert(--mutex_ == -1); }
void Mutex::Unlock()       { assert(mutex_++ == -1); }
void Mutex::ReaderLock()   { assert(++mutex_ > 0); }
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }

#elif defined(MUTEX_IS_WIN32_SRWLOCK)

// Windows slim reader/writer lock backend.
Mutex::Mutex()             : mutex_(SRWLOCK_INIT) { }
Mutex::~Mutex()            { }
void Mutex::Lock()         { AcquireSRWLockExclusive(&mutex_); }
void Mutex::Unlock()       { ReleaseSRWLockExclusive(&mutex_); }
void Mutex::ReaderLock()   { AcquireSRWLockShared(&mutex_); }
void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); }

#elif defined(MUTEX_IS_PTHREAD_RWLOCK)

// pthread rwlock backend. SAFE_PTHREAD turns a non-zero pthread return code
// into std::runtime_error. The macro body deliberately does NOT end in a
// semicolon: the call site supplies it, which keeps the do/while(0) wrapper
// usable as a single statement (e.g. inside an unbraced if/else).
#define SAFE_PTHREAD(fncall)                                              \
  do {                                                                    \
    if ((fncall) != 0) throw std::runtime_error("RE2 pthread failure");   \
  } while (0)

Mutex::Mutex()             { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
// Destructors must not throw, so the return code is not checked here.
Mutex::~Mutex()            { pthread_rwlock_destroy(&mutex_); }
void Mutex::Lock()         { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
void Mutex::Unlock()       { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
void Mutex::ReaderLock()   { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }

#undef SAFE_PTHREAD

#else

// std::shared_mutex backend.
Mutex::Mutex()             { }
Mutex::~Mutex()            { }
void Mutex::Lock()         { mutex_.lock(); }
void Mutex::Unlock()       { mutex_.unlock(); }
void Mutex::ReaderLock()   { mutex_.lock_shared(); }
void Mutex::ReaderUnlock() { mutex_.unlock_shared(); }

#endif
// --------------------------------------------------------------------------
// Some helper classes
// Scoped exclusive lock: takes mu->Lock() on construction and calls
// mu->Unlock() on destruction. The Mutex is held by pointer and must
// outlive the guard.
class MutexLock {
 public:
  explicit MutexLock(Mutex *mu) : mu_(mu) {
    mu_->Lock();
  }

  ~MutexLock() {
    mu_->Unlock();
  }

 private:
  Mutex * const mu_;

  // Guards are neither copyable nor assignable.
  MutexLock(const MutexLock&) = delete;
  MutexLock& operator=(const MutexLock&) = delete;
};
// Scoped shared lock: takes mu->ReaderLock() on construction and calls
// mu->ReaderUnlock() on destruction.
class ReaderMutexLock {
 public:
  explicit ReaderMutexLock(Mutex *mu) : mu_(mu) {
    mu_->ReaderLock();
  }

  ~ReaderMutexLock() {
    mu_->ReaderUnlock();
  }

 private:
  Mutex * const mu_;

  // Guards are neither copyable nor assignable.
  ReaderMutexLock(const ReaderMutexLock&) = delete;
  ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
};
// Scoped writer lock: takes mu->WriterLock() on construction and calls
// mu->WriterUnlock() on destruction.
class WriterMutexLock {
 public:
  explicit WriterMutexLock(Mutex *mu) : mu_(mu) {
    mu_->WriterLock();
  }

  ~WriterMutexLock() {
    mu_->WriterUnlock();
  }

 private:
  Mutex * const mu_;

  // Guards are neither copyable nor assignable.
  WriterMutexLock(const WriterMutexLock&) = delete;
  WriterMutexLock& operator=(const WriterMutexLock&) = delete;
};
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
// These function-like macros only fire when the class name is followed
// directly by "(", which is what a nameless temporary looks like; a normal
// declaration such as "MutexLock l(&mu);" is not affected.
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
} // namespace duckdb_re2
#endif // UTIL_MUTEX_H_

View File

@@ -0,0 +1,260 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "util/utf.h"
namespace duckdb_re2 {
// Bit-layout constants for the UTF-8 encoder/decoder below.
//   BitN / Bitx : number of payload bits in the lead byte of an N-byte
//                 sequence / in a continuation byte.
//   TN / Tx     : tag (high-bit) pattern of that lead / continuation byte.
//   RuneN       : largest value representable in an N-byte sequence.
//   Maskx/Testx : payload mask and tag mask of a continuation byte.
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
/* value stored by chartorune when decoding fails */
Bad = Runeerror,
};
/*
 * Decodes one UTF-8 sequence starting at str into *rune and returns the
 * number of bytes consumed (1 to 4).
 *
 * On a malformed, overlong, or 5+-byte sequence, *rune is set to Bad and 1
 * is returned. Bytes are read lazily: byte k+1 is only examined once bytes
 * 0..k have been accepted, so a NUL terminator always stops the decoder.
 */
int
chartorune(Rune *rune, const char *str)
{
	const unsigned char *bytes = reinterpret_cast<const unsigned char*>(str);
	const int lead = bytes[0];

	/* one byte: 00000-0007F => T1 */
	if (lead < Tx) {
		*rune = lead;
		return 1;
	}

	/* Multi-byte forms; any "break" below lands on the error exit. */
	do {
		/* two bytes: 0080-07FF => T2 Tx */
		const int tail1 = bytes[1] ^ Tx;
		if (tail1 & Testx)
			break;
		if (lead < T3) {
			if (lead < T2)
				break;		/* stray continuation byte */
			const Rune value = ((lead << Bitx) | tail1) & Rune2;
			if (value <= Rune1)
				break;		/* overlong */
			*rune = value;
			return 2;
		}

		/* three bytes: 0800-FFFF => T3 Tx Tx */
		const int tail2 = bytes[2] ^ Tx;
		if (tail2 & Testx)
			break;
		if (lead < T4) {
			const Rune value =
				((((lead << Bitx) | tail1) << Bitx) | tail2) & Rune3;
			if (value <= Rune2)
				break;		/* overlong */
			*rune = value;
			return 3;
		}

		/* four bytes (21-bit value): 10000-1FFFFF => T4 Tx Tx Tx */
		const int tail3 = bytes[3] ^ Tx;
		if (tail3 & Testx)
			break;
		if (lead < T5) {
			const Rune value =
				((((((lead << Bitx) | tail1) << Bitx) | tail2) << Bitx) | tail3)
				& Rune4;
			if (value <= Rune3)
				break;		/* overlong */
			*rune = value;
			return 4;
		}

		/* 5-byte or longer UTF-8 is not supported: treated as bad. */
	} while (false);

	/* bad decoding */
	*rune = Bad;
	return 1;
}
/*
 * Encodes *rune as UTF-8 into str and returns the number of bytes written
 * (1 to 4). No terminating NUL is written; str needs room for 4 bytes.
 * Values above Runemax (including negative Runes, which wrap to large
 * unsigned values) are encoded as Runeerror.
 */
int
runetochar(char *str, const Rune *rune)
{
	/* Runes are signed; compare as unsigned so negatives are out of range. */
	unsigned int cp = static_cast<unsigned int>(*rune);

	/* one byte: 00000-0007F => 00-7F */
	if (cp <= Rune1) {
		str[0] = static_cast<char>(cp);
		return 1;
	}

	/* two bytes: 0080-07FF => T2 Tx */
	if (cp <= Rune2) {
		str[0] = static_cast<char>(T2 | (cp >> 1*Bitx));
		str[1] = static_cast<char>(Tx | (cp & Maskx));
		return 2;
	}

	/*
	 * Range check is done only now: the error rune needs three bytes, and an
	 * out-of-range value could not have matched the 1- or 2-byte cases.
	 */
	if (cp > Runemax)
		cp = Runeerror;

	/* three bytes: 0800-FFFF => T3 Tx Tx */
	if (cp <= Rune3) {
		str[0] = static_cast<char>(T3 | (cp >> 2*Bitx));
		str[1] = static_cast<char>(Tx | ((cp >> 1*Bitx) & Maskx));
		str[2] = static_cast<char>(Tx | (cp & Maskx));
		return 3;
	}

	/* four bytes (21-bit value): 10000-1FFFFF => T4 Tx Tx Tx */
	str[0] = static_cast<char>(T4 | (cp >> 3*Bitx));
	str[1] = static_cast<char>(Tx | ((cp >> 2*Bitx) & Maskx));
	str[2] = static_cast<char>(Tx | ((cp >> 1*Bitx) & Maskx));
	str[3] = static_cast<char>(Tx | (cp & Maskx));
	return 4;
}
/*
 * Returns how many bytes runetochar() produces for rune, by encoding it
 * into a throw-away buffer.
 */
int
runelen(Rune rune)
{
	char scratch[10];

	const int width = runetochar(scratch, &rune);
	return width;
}
/*
 * Returns 1 if the first n bytes of str are enough to hold one complete
 * encoded rune, judged from the lead byte alone, and 0 otherwise
 * (including when n <= 0).
 */
int
fullrune(const char *str, int n)
{
	if (n <= 0)
		return 0;

	const int lead = *reinterpret_cast<const unsigned char*>(str);

	if (lead < Tx)		/* single byte */
		return 1;
	if (n < 2)
		return 0;
	if (lead < T3)		/* two-byte form */
		return 1;
	if (n < 3)
		return 0;
	/* three-byte form, or anything longer with at least four bytes */
	return (lead < T4 || n > 3) ? 1 : 0;
}
int
utflen(const char *s)
{
int c;
int n;
Rune rune;
n = 0;
for(;;) {
c = *(unsigned char*)s;
if(c < Runeself) {
if(c == 0)
return n;
s++;
} else
s += chartorune(&rune, s);
n++;
}
return 0;
}
char*
utfrune(const char *s, Rune c)
{
int c1;
Rune r;
int n;
if(c < Runesync) /* not part of utf sequence */
return strchr((char*)s, c);
for(;;) {
c1 = *(unsigned char*)s;
if(c1 < Runeself) { /* one byte rune */
if(c1 == 0)
return 0;
if(c1 == c)
return (char*)s;
s++;
continue;
}
n = chartorune(&r, s);
if(r == c)
return (char*)s;
s += n;
}
return 0;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,149 @@
// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdarg.h>
#include <stdio.h>
#include "util/strutil.h"
#ifdef _WIN32
#define snprintf _snprintf
#define vsnprintf _vsnprintf
#endif
namespace duckdb_re2 {
// ----------------------------------------------------------------------
// CEscapeString()
//    Writes a C-escaped copy of the src_len bytes at 'src' into 'dest'
//    (capacity 'dest_len') and NUL-terminates it. Newline, carriage return,
//    tab, both quote characters and backslash become two-character escapes;
//    every other byte outside ' '..'~' becomes a three-digit octal escape.
//    'src' and 'dest' should not overlap.
//    Returns the number of bytes written (not counting the \0), or
//    (size_t)-1 if 'dest' is too small.
// ----------------------------------------------------------------------
static size_t CEscapeString(const char* src, size_t src_len,
                            char* dest, size_t dest_len) {
  const size_t kNoSpace = (size_t)-1;
  size_t out = 0;

  for (size_t i = 0; i < src_len; i++) {
    // Every iteration may emit a two-character escape.
    if (dest_len - out < 2)
      return kNoSpace;

    const unsigned char ch = src[i];

    // Letter following the backslash for two-character escapes; 0 if none.
    char shorthand = '\0';
    switch (ch) {
      case '\n': shorthand = 'n';  break;
      case '\r': shorthand = 'r';  break;
      case '\t': shorthand = 't';  break;
      case '\"': shorthand = '\"'; break;
      case '\'': shorthand = '\''; break;
      case '\\': shorthand = '\\'; break;
      default:                     break;
    }

    if (shorthand != '\0') {
      dest[out++] = '\\';
      dest[out++] = shorthand;
    } else if (ch < ' ' || ch > '~') {
      // Backslash + three octal digits, plus the \0 that snprintf appends.
      if (dest_len - out < 5)
        return kNoSpace;
      snprintf(dest + out, 5, "\\%03o", ch);
      out += 4;
    } else {
      dest[out++] = ch;
    }
  }

  // Room for the terminator, which is not part of the returned count.
  if (dest_len - out < 1)
    return kNoSpace;
  dest[out] = '\0';
  return out;
}
// ----------------------------------------------------------------------
// CEscape()
//    Returns a copy of 'src' with dangerous characters replaced by
//    C-style escape sequences (see CEscapeString).
//
//    The scratch buffer is a std::string rather than a raw new[] array, so
//    it is released even if building the result throws.
// ----------------------------------------------------------------------
std::string CEscape(const StringPiece& src) {
  // Worst case every byte becomes a four-character octal escape, plus \0.
  const size_t dest_len = src.size() * 4 + 1;
  std::string scratch(dest_len, '\0');
  const size_t used = CEscapeString(src.data(), src.size(),
                                    &scratch[0], dest_len);
  return std::string(scratch.data(), used);
}
// Rewrites *prefix in place into the smallest string that is greater than
// every string starting with the original prefix: trailing 0xff bytes are
// dropped and the last remaining byte is incremented. An empty string, or
// one made only of 0xff bytes, becomes the empty string.
void PrefixSuccessor(std::string* prefix) {
  std::string& text = *prefix;

  // Bytes already at the maximum value cannot be incremented; skip them.
  size_t keep = text.size();
  while (keep > 0 && text[keep - 1] == '\xff')  // char literal avoids signed/unsigned.
    --keep;

  text.resize(keep);
  if (keep > 0)
    ++text[keep - 1];
}
// Appends the printf-style expansion of 'format' with arguments 'ap' to *dst.
//
// A 1 KiB stack buffer handles the common case. Longer results are
// formatted into a heap buffer, sized exactly when vsnprintf reports the
// required length and doubled on each retry when it only reports failure
// (negative return).
//
// The buffer size is tracked as size_t, so neither "result + 1" nor the
// doubling can overflow a signed int, and the heap buffer is a std::string,
// so it is released even if appending to *dst throws.
static void StringAppendV(std::string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer
  char space[1024];

  // A va_list may be invalidated by use, so every formatting attempt works
  // on a fresh copy and 'ap' itself is never consumed here.
  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
    // It fit
    dst->append(space, static_cast<size_t>(result));
    return;
  }

  // Repeatedly increase buffer size until it fits
  size_t length = sizeof(space);
  std::string buf;
  while (true) {
    if (result < 0) {
      // Older behavior: just try doubling the buffer size
      length *= 2;
    } else {
      // We need exactly "result+1" characters
      length = static_cast<size_t>(result) + 1;
    }
    buf.assign(length, '\0');

    // Restore the va_list before we use it again
    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], length, format, backup_ap);
    va_end(backup_ap);

    if ((result >= 0) && (static_cast<size_t>(result) < length)) {
      // It fit
      dst->append(buf.data(), static_cast<size_t>(result));
      return;
    }
  }
}
// printf-style formatting that returns the result as a std::string.
// The variadic arguments are forwarded to StringAppendV.
std::string StringPrintf(const char* format, ...) {
  std::string formatted;

  va_list args;
  va_start(args, format);
  StringAppendV(&formatted, format, args);
  va_end(args);

  return formatted;
}
} // namespace duckdb_re2

View File

@@ -0,0 +1,21 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_STRUTIL_H_
#define UTIL_STRUTIL_H_
#include <string>
#include "re2/stringpiece.h"
#include "util/util.h"
namespace duckdb_re2 {
// Returns a copy of src with special characters written as C-style escapes.
std::string CEscape(const StringPiece& src);
// Modifies *prefix in place: drops trailing 0xff bytes and increments the
// last remaining byte (the result may be empty).
void PrefixSuccessor(std::string* prefix);
// printf-style formatting that returns the formatted text as a std::string.
std::string StringPrintf(const char* format, ...);
} // namespace duckdb_re2
#endif // UTIL_STRUTIL_H_

View File

@@ -0,0 +1,44 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*
* This file and rune.cc have been converted to compile as C++ code
* in namespace duckdb_re2.
*/
#ifndef UTIL_UTF_H_
#define UTIL_UTF_H_
#include <stdint.h>
namespace duckdb_re2 {
// A Rune holds a single Unicode code point.
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
// Limits and sentinel values used by the rune/UTF-8 routines declared below.
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
};
// Encodes *r as UTF-8 into s; returns the number of bytes written.
int runetochar(char* s, const Rune* r);
// Decodes one UTF-8 sequence from s into *r; returns the bytes consumed.
int chartorune(Rune* r, const char* s);
// Returns nonzero if the first n bytes of s hold a complete encoded rune.
int fullrune(const char* s, int n);
// Returns the number of runes in the NUL-terminated string s.
int utflen(const char* s);
// Returns a pointer to the first occurrence of the rune in the string,
// or a null pointer if it is not found.
char* utfrune(const char*, Rune);
} // namespace duckdb_re2
#endif // UTIL_UTF_H_

View File

@@ -0,0 +1,42 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_UTIL_H_
#define UTIL_UTIL_H_
// Number of elements in a built-in array (sizeof-based; only meaningful when
// the argument is an actual array, not a pointer).
#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
// ATTRIBUTE_NORETURN: compiler-specific "function does not return" marker;
// expands to nothing on compilers other than GCC-compatible ones and MSVC.
#ifndef ATTRIBUTE_NORETURN
#if defined(__GNUC__)
#define ATTRIBUTE_NORETURN __attribute__((noreturn))
#elif defined(_MSC_VER)
#define ATTRIBUTE_NORETURN __declspec(noreturn)
#else
#define ATTRIBUTE_NORETURN
#endif
#endif
// ATTRIBUTE_UNUSED: suppresses unused-entity warnings on GCC-compatible
// compilers; expands to nothing elsewhere.
#ifndef ATTRIBUTE_UNUSED
#if defined(__GNUC__)
#define ATTRIBUTE_UNUSED __attribute__((unused))
#else
#define ATTRIBUTE_UNUSED
#endif
#endif
// FALLTHROUGH_INTENDED: marks a deliberate switch-case fallthrough. Uses the
// compiler's fallthrough attribute on Clang and GCC >= 7, and an empty
// statement elsewhere. Must be followed by a semicolon at the use site.
#ifndef FALLTHROUGH_INTENDED
#if defined(__clang__)
#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
#elif defined(__GNUC__) && __GNUC__ >= 7
#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
#else
#define FALLTHROUGH_INTENDED do {} while (0)
#endif
#endif
// NO_THREAD_SAFETY_ANALYSIS: defined as empty here unless already provided.
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
#endif // UTIL_UTIL_H_