should be it
This commit is contained in:
20
external/duckdb/third_party/jaro_winkler/LICENSE
vendored
Normal file
20
external/duckdb/third_party/jaro_winkler/LICENSE
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
Copyright © 2022 Max Bachmann
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
270
external/duckdb/third_party/jaro_winkler/details/common.hpp
vendored
Normal file
270
external/duckdb/third_party/jaro_winkler/details/common.hpp
vendored
Normal file
@@ -0,0 +1,270 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
namespace duckdb_jaro_winkler {
|
||||
|
||||
namespace common {
|
||||
|
||||
/**
|
||||
* @defgroup Common Common
|
||||
* Common utilities shared among multiple functions
|
||||
* @{
|
||||
*/
|
||||
|
||||
/* taken from https://stackoverflow.com/a/30766365/11335032 */
/**
 * Compile-time trait: `value` is true when std::iterator_traits<T> exposes
 * difference_type, pointer, reference, value_type and iterator_category.
 */
template <typename T>
struct is_iterator {
    /* catch-all overload, selected whenever the constrained overload is not viable */
    static char test(...);

    /* only participates in overload resolution if all five member types exist */
    template <typename U,
              typename = typename std::iterator_traits<U>::difference_type,
              typename = typename std::iterator_traits<U>::pointer,
              typename = typename std::iterator_traits<U>::reference,
              typename = typename std::iterator_traits<U>::value_type,
              typename = typename std::iterator_traits<U>::iterator_category>
    static long test(U&&);

    /* the two overloads are told apart by the size of their return type */
    constexpr static bool value = sizeof(test(std::declval<T>())) == sizeof(long);
};
|
||||
|
||||
/**
 * Applies a score threshold.
 *
 * @return `result` when it is at least `score_cutoff`, otherwise 0
 */
constexpr double result_cutoff(double result, double score_cutoff)
{
    /* negated >= so that an unordered comparison still yields 0 */
    return !(result >= score_cutoff) ? 0.0 : result;
}
|
||||
|
||||
/**
 * Integer division of `a` by `divisor`, adding one to the truncated quotient
 * whenever the remainder is non-zero.
 */
template <typename T, typename U>
T ceildiv(T a, U divisor)
{
    const T quotient = static_cast<T>(a / divisor);
    const T round_up = static_cast<T>((a % divisor) != 0);
    return quotient + round_up;
}
|
||||
|
||||
/**
 * Removes the common prefix of the sequences [first1, last1) and [first2, last2).
 *
 * `first1` and `first2` are advanced in place past the shared prefix.
 *
 * @return number of leading elements the two sequences have in common
 */
template <typename InputIt1, typename InputIt2>
int64_t remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2)
{
    // Hand-written scan instead of std::mismatch: DuckDB passes a raw pointer,
    // which gave compile errors with the std:: algorithm.
    const int64_t max_comparisons =
        std::min<int64_t>(std::distance(first1, last1), std::distance(first2, last2));

    int64_t prefix = 0;
    while (prefix < max_comparisons && !(*first1 != *first2)) {
        ++first1;
        ++first2;
        ++prefix;
    }
    return prefix;
}
|
||||
|
||||
/**
 * Open-addressing hash map with 128 slots that maps a key (widened to
 * uint64_t) to a 64-bit mask. A slot whose `value` is 0 counts as unused.
 */
struct BitvectorHashmap {
    struct MapElem {
        uint64_t key = 0;
        uint64_t value = 0;
    };

    BitvectorHashmap() : m_map()
    {}

    /** Sets bit `pos` in the mask stored for `key`. */
    template <typename CharT>
    void insert(CharT key, int64_t pos)
    {
        const uint64_t bit = 1ull << pos;
        insert_mask(key, bit);
    }

    /** ORs `mask` into the mask stored for `key`, claiming a slot if needed. */
    template <typename CharT>
    void insert_mask(CharT key, uint64_t mask)
    {
        const uint64_t wide_key = static_cast<uint64_t>(key);
        MapElem& slot = m_map[lookup(wide_key)];
        slot.key = wide_key;
        slot.value |= mask;
    }

    /** Returns the mask stored for `key`; the slot found for an absent key holds 0. */
    template <typename CharT>
    uint64_t get(CharT key) const
    {
        const uint64_t slot = lookup(static_cast<uint64_t>(key));
        return m_map[slot].value;
    }

private:
    /**
     * lookup key inside the hashmap using a similar collision resolution
     * strategy to CPython and Ruby
     *
     * @return index of the slot holding `key`, or of the first unused slot
     *         on its probe sequence
     */
    uint64_t lookup(uint64_t key) const
    {
        uint64_t slot = key % 128;
        uint64_t perturb = key;

        /* probe until the slot is unused or already belongs to key */
        while (m_map[slot].value != 0 && m_map[slot].key != key) {
            slot = ((slot * 5) + perturb + 1) % 128;
            perturb >>= 5;
        }
        return slot;
    }

    std::array<MapElem, 128> m_map;
};
|
||||
|
||||
struct PatternMatchVector {
|
||||
struct MapElem {
|
||||
uint64_t key = 0;
|
||||
uint64_t value = 0;
|
||||
};
|
||||
|
||||
PatternMatchVector() : m_map(), m_extendedAscii()
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
PatternMatchVector(InputIt1 first, InputIt1 last) : m_map(), m_extendedAscii()
|
||||
{
|
||||
insert(first, last);
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first, InputIt1 last)
|
||||
{
|
||||
uint64_t mask = 1;
|
||||
for (int64_t i = 0; i < std::distance(first, last); ++i) {
|
||||
auto key = first[i];
|
||||
if (key >= 0 && key <= 255) {
|
||||
m_extendedAscii[static_cast<size_t>(key)] |= mask;
|
||||
}
|
||||
else {
|
||||
m_map.insert_mask(key, mask);
|
||||
}
|
||||
mask <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void insert(CharT key, int64_t pos)
|
||||
{
|
||||
uint64_t mask = 1ull << pos;
|
||||
if (key >= 0 && key <= 255) {
|
||||
m_extendedAscii[key] |= mask;
|
||||
}
|
||||
else {
|
||||
m_map.insert_mask(key, mask);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
uint64_t get(CharT key) const
|
||||
{
|
||||
if (key >= 0 && key <= 255) {
|
||||
return m_extendedAscii[static_cast<size_t>(key)];
|
||||
}
|
||||
else {
|
||||
return m_map.get(key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* combat func for BlockPatternMatchVector
|
||||
*/
|
||||
template <typename CharT>
|
||||
uint64_t get(int64_t block, CharT key) const
|
||||
{
|
||||
(void)block;
|
||||
assert(block == 0);
|
||||
return get(key);
|
||||
}
|
||||
|
||||
private:
|
||||
BitvectorHashmap m_map;
|
||||
std::array<uint64_t, 256> m_extendedAscii;
|
||||
};
|
||||
|
||||
struct BlockPatternMatchVector {
|
||||
BlockPatternMatchVector() : m_block_count(0)
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
BlockPatternMatchVector(InputIt1 first, InputIt1 last) : m_block_count(0)
|
||||
{
|
||||
insert(first, last);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void insert(int64_t block, CharT key, int pos)
|
||||
{
|
||||
uint64_t mask = 1ull << pos;
|
||||
|
||||
assert(block < m_block_count);
|
||||
if (key >= 0 && key <= 255) {
|
||||
m_extendedAscii[static_cast<size_t>(key * m_block_count + block)] |= mask;
|
||||
}
|
||||
else {
|
||||
m_map[static_cast<size_t>(block)].insert_mask(key, mask);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first, InputIt1 last)
|
||||
{
|
||||
int64_t len = std::distance(first, last);
|
||||
m_block_count = ceildiv(len, 64);
|
||||
m_map.resize(static_cast<size_t>(m_block_count));
|
||||
m_extendedAscii.resize(static_cast<size_t>(m_block_count * 256));
|
||||
|
||||
for (int64_t i = 0; i < len; ++i) {
|
||||
int64_t block = i / 64;
|
||||
int64_t pos = i % 64;
|
||||
insert(block, first[i], static_cast<int>(pos));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* combat func for PatternMatchVector
|
||||
*/
|
||||
template <typename CharT>
|
||||
uint64_t get(CharT key) const
|
||||
{
|
||||
return get(0, key);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
uint64_t get(int64_t block, CharT key) const
|
||||
{
|
||||
assert(block < m_block_count);
|
||||
if (key >= 0 && key <= 255) {
|
||||
return m_extendedAscii[static_cast<size_t>(key * m_block_count + block)];
|
||||
}
|
||||
else {
|
||||
return m_map[static_cast<size_t>(block)].get(key);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<BitvectorHashmap> m_map;
|
||||
std::vector<uint64_t> m_extendedAscii;
|
||||
int64_t m_block_count;
|
||||
};
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace common
|
||||
} // namespace duckdb_jaro_winkler
|
||||
110
external/duckdb/third_party/jaro_winkler/details/intrinsics.hpp
vendored
Normal file
110
external/duckdb/third_party/jaro_winkler/details/intrinsics.hpp
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
namespace duckdb_jaro_winkler {
|
||||
namespace intrinsics {
|
||||
|
||||
/**
 * Builds a mask with the `n` least significant bits set. When `n` is at
 * least the bit width of T, every bit is set.
 */
template <typename T>
T bit_mask_lsb(int n)
{
    const int bit_width = static_cast<int>(sizeof(T) * 8);
    const T all_ones = static_cast<T>(-1);

    if (n >= bit_width) {
        return all_ones;
    }
    /* all_ones + 2^n equals 2^n - 1 (modulo 2^width for unsigned T) */
    return static_cast<T>(all_ones + (static_cast<T>(1) << n));
}
|
||||
|
||||
/**
 * Tests whether bit number `bit` (0 = least significant) of `a` is set.
 */
template <typename T>
bool bittest(T a, int bit)
{
    return ((a >> bit) & 1) != 0;
}
|
||||
|
||||
/**
 * Counts the set bits of `x` with bit-parallel partial sums (no compiler
 * intrinsic is used).
 */
static inline int64_t popcount(uint64_t x)
{
    const uint64_t every_other_bit = 0x5555555555555555;
    const uint64_t low_pair_bits = 0x3333333333333333;
    const uint64_t low_nibbles = 0x0f0f0f0f0f0f0f0f;
    const uint64_t byte_ones = 0x0101010101010101;

    /* per 2-bit field: number of set bits in that field */
    uint64_t sums = x - ((x >> 1) & every_other_bit);
    /* per 4-bit field */
    sums = (sums & low_pair_bits) + ((sums >> 2) & low_pair_bits);
    /* per byte */
    sums = (sums + (sums >> 4)) & low_nibbles;
    /* the multiplication accumulates all byte counts in the top byte */
    return static_cast<int64_t>((sums * byte_ones) >> 56);
}
|
||||
|
||||
/**
 * Extract the lowest set bit from a. If no bits are set in a returns 0.
 *
 * @tparam T integer type; presumably unsigned, since the MSVC warning
 *           silenced below concerns unary minus on an unsigned operand
 */
template <typename T>
T blsi(T a)
{
#if defined(_MSC_VER) && !defined(__clang__)
#    pragma warning(push)
/* unary minus operator applied to unsigned type, result still unsigned */
#    pragma warning(disable: 4146)
#endif
    /* negation flips every bit above the lowest set bit, so the AND keeps only that bit */
    return a & -a;
#if defined(_MSC_VER) && !defined(__clang__)
#    pragma warning(pop)
#endif
}
|
||||
|
||||
/**
 * Clear the lowest set bit in x.
 *
 * @return x with its lowest set bit reset; for an unsigned x of 0 the result is 0
 */
template <typename T>
T blsr(T x)
{
    /* subtracting 1 turns the lowest set bit off and every bit below it on */
    const T cleared = static_cast<T>(x & (x - 1));
    return cleared;
}
|
||||
|
||||
/*
 * tzcnt(x): number of trailing zero bits of x.
 *
 * NOTE(review): the intrinsics/builtins used below do not define a result
 * for x == 0, so callers are expected to pass a non-zero value.
 */
#if defined(_MSC_VER) && !defined(__clang__)
static inline int tzcnt(uint32_t x)
{
    unsigned long trailing_zero = 0;
    _BitScanForward(&trailing_zero, x);
    return static_cast<int>(trailing_zero);
}

/* _BitScanForward64 is documented for the 64-bit targets (x64 / ARM64) only */
#    if defined(_M_ARM64) || defined(_M_X64)
static inline int tzcnt(uint64_t x)
{
    unsigned long trailing_zero = 0;
    _BitScanForward64(&trailing_zero, x);
    return static_cast<int>(trailing_zero);
}
#    else
/* 32-bit fallback: scan the low half first, then the high half */
static inline int tzcnt(uint64_t x)
{
    const uint32_t msh = static_cast<uint32_t>(x >> 32);
    const uint32_t lsh = static_cast<uint32_t>(x & 0xFFFFFFFF);
    if (lsh != 0) {
        return tzcnt(lsh);
    }
    return 32 + tzcnt(msh);
}
#    endif

#else /* gcc / clang */
/* the 32-bit overload is disabled in this copy; only the 64-bit one is provided */
//static inline int tzcnt(uint32_t x)
//{
//    return __builtin_ctz(x);
//}

static inline int tzcnt(uint64_t x)
{
    return __builtin_ctzll(x);
}
#endif
|
||||
|
||||
} // namespace intrinsics
|
||||
} // namespace duckdb_jaro_winkler
|
||||
509
external/duckdb/third_party/jaro_winkler/details/jaro_impl.hpp
vendored
Normal file
509
external/duckdb/third_party/jaro_winkler/details/jaro_impl.hpp
vendored
Normal file
@@ -0,0 +1,509 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.hpp"
|
||||
#include "intrinsics.hpp"
|
||||
|
||||
namespace duckdb_jaro_winkler {
|
||||
namespace detail {
|
||||
|
||||
/**
 * Match flags for the single-word case: one 64-bit flag word for the
 * pattern (P) and one for the text (T).
 */
struct FlaggedCharsWord {
    uint64_t P_flag; /* one bit per flagged pattern position */
    uint64_t T_flag; /* one bit per flagged text position */
};
|
||||
|
||||
/**
 * Match flags for the multi-word case: a vector of 64-bit flag words for
 * the pattern (P) and one for the text (T).
 */
struct FlaggedCharsMultiword {
    std::vector<uint64_t> P_flag; /* flag words of the pattern */
    std::vector<uint64_t> T_flag; /* flag words of the text */
};
|
||||
|
||||
/**
 * Describes the range of pattern words searched for one text character in
 * the multi-word case. All members start out as 0.
 */
struct SearchBoundMask {
    int64_t words{0};        /* number of pattern words inside the range */
    int64_t empty_words{0};  /* number of leading pattern words that are skipped */
    uint64_t last_mask{0};   /* bits searched in the last word of the range */
    uint64_t first_mask{0};  /* bits searched in the first word of the range */
};
|
||||
|
||||
/**
 * A position given as a word index plus a position inside that word.
 */
struct TextPosition {
    int64_t Word;
    int64_t WordPos;

    TextPosition(int64_t Word_, int64_t WordPos_)
        : Word(Word_),
          WordPos(WordPos_)
    {}
};
|
||||
|
||||
/**
 * @brief combines the match statistics of two sequences into the Jaro similarity
 *
 * @param P_len          length of the pattern
 * @param T_len          length of the text
 * @param CommonChars    number of characters flagged as common
 * @param Transpositions transposition count; halved (integer division) before use
 *
 * @return (CommonChars / P_len + CommonChars / T_len +
 *          (CommonChars - Transpositions / 2) / CommonChars) / 3,
 *         or 0 when there is no common character
 */
static inline double jaro_calculate_similarity(int64_t P_len, int64_t T_len, int64_t CommonChars,
                                               int64_t Transpositions)
{
    /* without common characters the last term would be 0 / 0 (NaN) */
    if (CommonChars <= 0) {
        return 0.0;
    }

    Transpositions /= 2;
    double Sim = 0;
    Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
    Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
    Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
           static_cast<double>(CommonChars);
    return Sim / 3.0;
}
|
||||
|
||||
/**
 * @brief filter matches below score_cutoff based on string lengths
 *
 * @return false when either length is 0 or when the score computed under the
 *         assumption that every character of the shorter sequence matches
 *         without transpositions is still below score_cutoff
 */
static inline bool jaro_length_filter(int64_t P_len, int64_t T_len, double score_cutoff)
{
    if (P_len == 0 || T_len == 0) {
        return false;
    }

    const double shorter = static_cast<double>((T_len < P_len) ? T_len : P_len);
    const double best_case =
        (shorter / static_cast<double>(P_len) + shorter / static_cast<double>(T_len) + 1.0) / 3.0;
    return best_case >= score_cutoff;
}
|
||||
|
||||
/**
 * @brief filter matches below score_cutoff based on string lengths and common characters
 *
 * @return false when there is no common character or when the score computed
 *         under the assumption of zero transpositions is still below score_cutoff
 */
static inline bool jaro_common_char_filter(int64_t P_len, int64_t T_len, int64_t CommonChars,
                                           double score_cutoff)
{
    if (CommonChars == 0) {
        return false;
    }

    const double common = static_cast<double>(CommonChars);
    const double best_case =
        (common / static_cast<double>(P_len) + common / static_cast<double>(T_len) + 1.0) / 3.0;
    return best_case >= score_cutoff;
}
|
||||
|
||||
static inline int64_t count_common_chars(const FlaggedCharsWord& flagged)
|
||||
{
|
||||
return intrinsics::popcount(flagged.P_flag);
|
||||
}
|
||||
|
||||
static inline int64_t count_common_chars(const FlaggedCharsMultiword& flagged)
|
||||
{
|
||||
int64_t CommonChars = 0;
|
||||
if (flagged.P_flag.size() < flagged.T_flag.size()) {
|
||||
for (uint64_t flag : flagged.P_flag) {
|
||||
CommonChars += intrinsics::popcount(flag);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (uint64_t flag : flagged.T_flag) {
|
||||
CommonChars += intrinsics::popcount(flag);
|
||||
}
|
||||
}
|
||||
return CommonChars;
|
||||
}
|
||||
|
||||
template <typename PM_Vec, typename InputIt1, typename InputIt2>
|
||||
static inline FlaggedCharsWord
|
||||
flag_similar_characters_word(const PM_Vec& PM, InputIt1 P_first,
|
||||
InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, int Bound)
|
||||
{
|
||||
using namespace intrinsics;
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
(void)P_len;
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
assert(P_len <= 64);
|
||||
assert(T_len <= 64);
|
||||
assert(Bound > P_len || P_len - Bound <= T_len);
|
||||
|
||||
FlaggedCharsWord flagged = {0, 0};
|
||||
|
||||
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);
|
||||
|
||||
int64_t j = 0;
|
||||
for (; j < std::min(static_cast<int64_t>(Bound), T_len); ++j) {
|
||||
uint64_t PM_j = PM.get(T_first[j]) & BoundMask & (~flagged.P_flag);
|
||||
|
||||
flagged.P_flag |= blsi(PM_j);
|
||||
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
|
||||
|
||||
BoundMask = (BoundMask << 1) | 1;
|
||||
}
|
||||
|
||||
for (; j < T_len; ++j) {
|
||||
uint64_t PM_j = PM.get(T_first[j]) & BoundMask & (~flagged.P_flag);
|
||||
|
||||
flagged.P_flag |= blsi(PM_j);
|
||||
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
|
||||
|
||||
BoundMask <<= 1;
|
||||
}
|
||||
|
||||
return flagged;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static inline void flag_similar_characters_step(const common::BlockPatternMatchVector& PM,
|
||||
CharT T_j, FlaggedCharsMultiword& flagged,
|
||||
int64_t j, SearchBoundMask BoundMask)
|
||||
{
|
||||
using namespace intrinsics;
|
||||
|
||||
int64_t j_word = j / 64;
|
||||
int64_t j_pos = j % 64;
|
||||
int64_t word = BoundMask.empty_words;
|
||||
int64_t last_word = word + BoundMask.words;
|
||||
|
||||
if (BoundMask.words == 1) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & BoundMask.last_mask & BoundMask.first_mask &
|
||||
(~flagged.P_flag[static_cast<size_t>(word)]);
|
||||
|
||||
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
|
||||
flagged.T_flag[static_cast<size_t>(j_word)] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
|
||||
return;
|
||||
}
|
||||
|
||||
if (BoundMask.first_mask) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & BoundMask.first_mask & (~flagged.P_flag[static_cast<size_t>(word)]);
|
||||
|
||||
if (PM_j) {
|
||||
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
|
||||
flagged.T_flag[static_cast<size_t>(j_word)] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
word++;
|
||||
}
|
||||
|
||||
for (; word < last_word - 1; ++word) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & (~flagged.P_flag[static_cast<size_t>(word)]);
|
||||
|
||||
if (PM_j) {
|
||||
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
|
||||
flagged.T_flag[static_cast<size_t>(j_word)] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (BoundMask.last_mask) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & BoundMask.last_mask & (~flagged.P_flag[static_cast<size_t>(word)]);
|
||||
|
||||
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
|
||||
flagged.T_flag[static_cast<size_t>(j_word)] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static inline FlaggedCharsMultiword
|
||||
flag_similar_characters_block(const common::BlockPatternMatchVector& PM, InputIt1 P_first,
|
||||
InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, int64_t Bound)
|
||||
{
|
||||
using namespace intrinsics;
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
assert(P_len > 64 || T_len > 64);
|
||||
assert(Bound > P_len || P_len - Bound <= T_len);
|
||||
assert(Bound >= 31);
|
||||
|
||||
int64_t TextWords = common::ceildiv(T_len, 64);
|
||||
int64_t PatternWords = common::ceildiv(P_len, 64);
|
||||
|
||||
FlaggedCharsMultiword flagged;
|
||||
flagged.T_flag.resize(static_cast<size_t>(TextWords));
|
||||
flagged.P_flag.resize(static_cast<size_t>(PatternWords));
|
||||
|
||||
SearchBoundMask BoundMask;
|
||||
int64_t start_range = std::min(Bound + 1, P_len);
|
||||
BoundMask.words = 1 + start_range / 64;
|
||||
BoundMask.empty_words = 0;
|
||||
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
|
||||
BoundMask.first_mask = ~UINT64_C(0);
|
||||
|
||||
for (int64_t j = 0; j < T_len; ++j) {
|
||||
flag_similar_characters_step(PM, T_first[j], flagged, j, BoundMask);
|
||||
|
||||
if (j + Bound + 1 < P_len) {
|
||||
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
|
||||
if (j + Bound + 2 < P_len && BoundMask.last_mask == ~UINT64_C(0)) {
|
||||
BoundMask.last_mask = 0;
|
||||
BoundMask.words++;
|
||||
}
|
||||
}
|
||||
|
||||
if (j >= Bound) {
|
||||
BoundMask.first_mask <<= 1;
|
||||
if (BoundMask.first_mask == 0) {
|
||||
BoundMask.first_mask = ~UINT64_C(0);
|
||||
BoundMask.words--;
|
||||
BoundMask.empty_words++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return flagged;
|
||||
}
|
||||
|
||||
template <typename PM_Vec, typename InputIt1>
|
||||
static inline int64_t count_transpositions_word(const PM_Vec& PM,
|
||||
InputIt1 T_first, InputIt1,
|
||||
const FlaggedCharsWord& flagged)
|
||||
{
|
||||
using namespace intrinsics;
|
||||
uint64_t P_flag = flagged.P_flag;
|
||||
uint64_t T_flag = flagged.T_flag;
|
||||
int64_t Transpositions = 0;
|
||||
while (T_flag) {
|
||||
uint64_t PatternFlagMask = blsi(P_flag);
|
||||
|
||||
Transpositions += !(PM.get(T_first[tzcnt(T_flag)]) & PatternFlagMask);
|
||||
|
||||
T_flag = blsr(T_flag);
|
||||
P_flag ^= PatternFlagMask;
|
||||
}
|
||||
|
||||
return Transpositions;
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
static inline int64_t
|
||||
count_transpositions_block(const common::BlockPatternMatchVector& PM, InputIt1 T_first, InputIt1,
|
||||
const FlaggedCharsMultiword& flagged, int64_t FlaggedChars)
|
||||
{
|
||||
using namespace intrinsics;
|
||||
int64_t TextWord = 0;
|
||||
int64_t PatternWord = 0;
|
||||
uint64_t T_flag = flagged.T_flag[static_cast<size_t>(TextWord)];
|
||||
uint64_t P_flag = flagged.P_flag[static_cast<size_t>(PatternWord)];
|
||||
|
||||
int64_t Transpositions = 0;
|
||||
while (FlaggedChars) {
|
||||
while (!T_flag) {
|
||||
TextWord++;
|
||||
T_first += 64;
|
||||
T_flag = flagged.T_flag[static_cast<size_t>(TextWord)];
|
||||
}
|
||||
|
||||
while (T_flag) {
|
||||
while (!P_flag) {
|
||||
PatternWord++;
|
||||
P_flag = flagged.P_flag[static_cast<size_t>(PatternWord)];
|
||||
}
|
||||
|
||||
uint64_t PatternFlagMask = blsi(P_flag);
|
||||
|
||||
Transpositions += !(PM.get(PatternWord, T_first[tzcnt(T_flag)]) & PatternFlagMask);
|
||||
|
||||
T_flag = blsr(T_flag);
|
||||
P_flag ^= PatternFlagMask;
|
||||
|
||||
FlaggedChars--;
|
||||
}
|
||||
}
|
||||
|
||||
return Transpositions;
|
||||
}
|
||||
|
||||
/**
 * @brief find bounds and skip out of bound parts of the sequences
 *
 * Since jaro uses a sliding window, some parts of T/P might never be in
 * range and can be removed ahead of time: the end iterator of the longer
 * sequence is moved so that it keeps at most (length of the shorter
 * sequence + Bound) elements.
 *
 * @return Bound = (length of the longer sequence) / 2 - 1
 */
template <typename InputIt1, typename InputIt2>
int64_t jaro_bounds(InputIt1 P_first, InputIt1& P_last, InputIt2 T_first, InputIt2& T_last)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);

    const bool text_is_longer = (T_len > P_len);
    const int64_t Bound = (text_is_longer ? T_len : P_len) / 2 - 1;

    if (text_is_longer) {
        if (T_len > P_len + Bound) {
            T_last = T_first + P_len + Bound;
        }
    }
    else if (P_len > T_len + Bound) {
        P_last = P_first + T_len + Bound;
    }
    return Bound;
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
|
||||
double score_cutoff)
|
||||
{
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
|
||||
/* filter out based on the length difference between the two strings */
|
||||
if (!jaro_length_filter(P_len, T_len, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (P_len == 1 && T_len == 1) {
|
||||
return static_cast<double>(P_first[0] == T_first[0]);
|
||||
}
|
||||
|
||||
int64_t Bound = jaro_bounds(P_first, P_last, T_first, T_last);
|
||||
|
||||
/* common prefix never includes Transpositions */
|
||||
int64_t CommonChars = common::remove_common_prefix(P_first, P_last, T_first, T_last);
|
||||
int64_t Transpositions = 0;
|
||||
int64_t P_view_len = std::distance(P_first, P_last);
|
||||
int64_t T_view_len = std::distance(T_first, T_last);
|
||||
|
||||
if (!P_view_len || !T_view_len) {
|
||||
/* already has correct number of common chars and transpositions */
|
||||
}
|
||||
else if (P_view_len <= 64 && T_view_len <= 64) {
|
||||
common::PatternMatchVector PM(P_first, P_last);
|
||||
auto flagged = flag_similar_characters_word(PM, P_first, P_last, T_first, T_last, static_cast<int>(Bound));
|
||||
CommonChars += count_common_chars(flagged);
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Transpositions = count_transpositions_word(PM, T_first, T_last, flagged);
|
||||
}
|
||||
else {
|
||||
common::BlockPatternMatchVector PM(P_first, P_last);
|
||||
auto flagged = flag_similar_characters_block(PM, P_first, P_last, T_first, T_last, Bound);
|
||||
int64_t FlaggedChars = count_common_chars(flagged);
|
||||
CommonChars += FlaggedChars;
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Transpositions = count_transpositions_block(PM, T_first, T_last, flagged, FlaggedChars);
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
|
||||
return common::result_cutoff(Sim, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_similarity(const common::BlockPatternMatchVector& PM, InputIt1 P_first, InputIt1 P_last,
|
||||
InputIt2 T_first, InputIt2 T_last, double score_cutoff)
|
||||
{
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
|
||||
/* filter out based on the length difference between the two strings */
|
||||
if (!jaro_length_filter(P_len, T_len, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (P_len == 1 && T_len == 1) {
|
||||
return static_cast<double>(P_first[0] == T_first[0]);
|
||||
}
|
||||
|
||||
int64_t Bound = jaro_bounds(P_first, P_last, T_first, T_last);
|
||||
|
||||
/* common prefix never includes Transpositions */
|
||||
int64_t CommonChars = 0;
|
||||
int64_t Transpositions = 0;
|
||||
int64_t P_view_len = std::distance(P_first, P_last);
|
||||
int64_t T_view_len = std::distance(T_first, T_last);
|
||||
|
||||
if (!P_view_len || !T_view_len) {
|
||||
/* already has correct number of common chars and transpositions */
|
||||
}
|
||||
else if (P_view_len <= 64 && T_view_len <= 64) {
|
||||
auto flagged = flag_similar_characters_word(PM, P_first, P_last, T_first, T_last, static_cast<int>(Bound));
|
||||
CommonChars += count_common_chars(flagged);
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Transpositions = count_transpositions_word(PM, T_first, T_last, flagged);
|
||||
}
|
||||
else {
|
||||
auto flagged = flag_similar_characters_block(PM, P_first, P_last, T_first, T_last, Bound);
|
||||
int64_t FlaggedChars = count_common_chars(flagged);
|
||||
CommonChars += FlaggedChars;
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Transpositions = count_transpositions_block(PM, T_first, T_last, flagged, FlaggedChars);
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
|
||||
return common::result_cutoff(Sim, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_winkler_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
|
||||
double prefix_weight, double score_cutoff)
|
||||
{
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
int64_t min_len = std::min(P_len, T_len);
|
||||
int64_t prefix = 0;
|
||||
int64_t max_prefix = std::min<int64_t>(min_len, 4);
|
||||
|
||||
for (; prefix < max_prefix; ++prefix) {
|
||||
if (T_first[prefix] != P_first[prefix]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double jaro_score_cutoff = score_cutoff;
|
||||
if (jaro_score_cutoff > 0.7) {
|
||||
double prefix_sim = static_cast<double>(prefix) * prefix_weight;
|
||||
|
||||
if (prefix_sim >= 1.0) {
|
||||
jaro_score_cutoff = 0.7;
|
||||
}
|
||||
else {
|
||||
jaro_score_cutoff =
|
||||
std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
|
||||
}
|
||||
}
|
||||
|
||||
double Sim = jaro_similarity(P_first, P_last, T_first, T_last, jaro_score_cutoff);
|
||||
if (Sim > 0.7) {
|
||||
Sim += static_cast<double>(prefix) * prefix_weight * (1.0 - Sim);
|
||||
}
|
||||
|
||||
return common::result_cutoff(Sim, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_winkler_similarity(const common::BlockPatternMatchVector& PM, InputIt1 P_first,
|
||||
InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
|
||||
double prefix_weight, double score_cutoff)
|
||||
{
|
||||
int64_t P_len = std::distance(P_first, P_last);
|
||||
int64_t T_len = std::distance(T_first, T_last);
|
||||
int64_t min_len = std::min(P_len, T_len);
|
||||
int64_t prefix = 0;
|
||||
int64_t max_prefix = std::min<int64_t>(min_len, 4);
|
||||
|
||||
for (; prefix < max_prefix; ++prefix) {
|
||||
if (T_first[prefix] != P_first[prefix]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double jaro_score_cutoff = score_cutoff;
|
||||
if (jaro_score_cutoff > 0.7) {
|
||||
double prefix_sim = static_cast<double>(prefix) * prefix_weight;
|
||||
|
||||
if (prefix_sim >= 1.0) {
|
||||
jaro_score_cutoff = 0.7;
|
||||
}
|
||||
else {
|
||||
jaro_score_cutoff =
|
||||
std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
|
||||
}
|
||||
}
|
||||
|
||||
double Sim = jaro_similarity(PM, P_first, P_last, T_first, T_last, jaro_score_cutoff);
|
||||
if (Sim > 0.7) {
|
||||
Sim += static_cast<double>(prefix) * prefix_weight * (1.0 - Sim);
|
||||
}
|
||||
|
||||
return common::result_cutoff(Sim, score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
} // namespace duckdb_jaro_winkler
|
||||
186
external/duckdb/third_party/jaro_winkler/jaro_winkler.hpp
vendored
Normal file
186
external/duckdb/third_party/jaro_winkler/jaro_winkler.hpp
vendored
Normal file
@@ -0,0 +1,186 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "details/common.hpp"
|
||||
#include "details/jaro_impl.hpp"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
namespace duckdb_jaro_winkler {
|
||||
|
||||
/**
|
||||
* @defgroup jaro_winkler jaro_winkler
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
 * @brief Calculates the jaro winkler similarity
 *
 * @tparam InputIt1 Iterator type of the first sequence
 * @tparam InputIt2 Iterator type of the second sequence
 *
 * @param first1
 *   start of the first sequence, which is compared with the second sequence
 * @param last1
 *   end of the first sequence
 * @param first2
 *   start of the second sequence, which is compared with the first sequence
 * @param last2
 *   end of the second sequence
 * @param prefix_weight
 *   Weight used for the common prefix of the two strings.
 *   Has to be between 0 and 0.25. Default is 0.1.
 * @param score_cutoff
 *   Optional argument for a score threshold as a float between 0 and 1.
 *   For similarity < score_cutoff 0 is returned instead. Default is 0,
 *   which deactivates this behaviour.
 *
 * @return jaro winkler similarity between the two sequences
 *   as a float between 0 and 1
 *
 * @throws std::invalid_argument if prefix_weight is not between 0 and 0.25
 */
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
typename std::enable_if<
|
||||
common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type
|
||||
jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double prefix_weight = 0.1, double score_cutoff = 0.0)
|
||||
{
|
||||
if (prefix_weight < 0.0 || prefix_weight > 0.25) {
|
||||
throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
|
||||
}
|
||||
|
||||
return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
/**
 * @brief Range convenience overload of jaro_winkler_similarity.
 *
 * Obtains iterators with std::begin / std::end and forwards to the iterator
 * overload, which performs the prefix_weight validation.
 *
 * NOTE: std::begin / std::end span the full extent of a built-in array, so a
 * raw character array such as a string literal is compared including its
 * terminating null character.
 *
 * @param s1            first sequence (anything usable with std::begin/std::end)
 * @param s2            second sequence (anything usable with std::begin/std::end)
 * @param prefix_weight forwarded to the iterator overload. Default is 0.1.
 * @param score_cutoff  forwarded to the iterator overload. Default is 0.
 *
 * @return the value produced by the iterator overload
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    auto lhs_begin = std::begin(s1);
    auto lhs_end = std::end(s1);
    auto rhs_begin = std::begin(s2);
    auto rhs_end = std::end(s2);

    return jaro_winkler_similarity(lhs_begin, lhs_end, rhs_begin, rhs_end, prefix_weight,
                                   score_cutoff);
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedJaroWinklerSimilarity {
|
||||
template <typename InputIt1>
|
||||
CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1)
|
||||
: s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_)
|
||||
{
|
||||
if (prefix_weight < 0.0 || prefix_weight > 0.25) {
|
||||
throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename S1>
|
||||
CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1)
|
||||
: CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_)
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
|
||||
{
|
||||
return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
|
||||
prefix_weight, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename S2>
|
||||
double similarity(const S2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename S2>
|
||||
double normalized_similarity(const S2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(s2, score_cutoff);
|
||||
}
|
||||
|
||||
private:
|
||||
std::basic_string<CharT1> s1;
|
||||
common::BlockPatternMatchVector PM;
|
||||
|
||||
double prefix_weight;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Calculates the jaro similarity
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param score_cutoff
|
||||
* Optional argument for a score threshold as a float between 0 and 100.
|
||||
* For similarity < score_cutoff 0 is returned instead. Default is 0,
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
* @return jaro similarity between s1 and s2
|
||||
* as a float between 0 and 100
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
/**
 * @brief Range convenience overload of jaro_similarity.
 *
 * Obtains iterators with std::begin / std::end and forwards to the iterator
 * overload.
 *
 * NOTE: std::begin / std::end span the full extent of a built-in array, so a
 * raw character array such as a string literal is compared including its
 * terminating null character.
 *
 * @param s1           first sequence (anything usable with std::begin/std::end)
 * @param s2           second sequence (anything usable with std::begin/std::end)
 * @param score_cutoff forwarded to the iterator overload. Default is 0.
 *
 * @return the value produced by the iterator overload
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    auto lhs_begin = std::begin(s1);
    auto lhs_end = std::end(s1);
    auto rhs_begin = std::begin(s2);
    auto rhs_end = std::end(s2);

    return jaro_similarity(lhs_begin, lhs_end, rhs_begin, rhs_end, score_cutoff);
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedJaroSimilarity {
|
||||
template <typename InputIt1>
|
||||
CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1)
|
||||
{}
|
||||
|
||||
template <typename S1>
|
||||
CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
|
||||
{
|
||||
return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename S2>
|
||||
double similarity(const S2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename S2>
|
||||
double normalized_similarity(const S2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
return similarity(s2, score_cutoff);
|
||||
}
|
||||
|
||||
private:
|
||||
std::basic_string<CharT1> s1;
|
||||
common::BlockPatternMatchVector PM;
|
||||
};
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace duckdb_jaro_winkler
|
||||
Reference in New Issue
Block a user