should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,20 @@
Copyright © 2022 Max Bachmann
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,270 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once
#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>
#include <vector>
namespace duckdb_jaro_winkler {
namespace common {
/**
* @defgroup Common Common
* Common utilities shared among multiple functions
* @{
*/
/* taken from https://stackoverflow.com/a/30766365/11335032 */
/**
 * Compile-time trait: `value` is true when std::iterator_traits<T> provides
 * all five member types (difference_type, pointer, reference, value_type and
 * iterator_category), and false otherwise.
 */
template <typename T>
struct is_iterator {
    /* only viable when every iterator_traits member type of Candidate exists */
    template <typename Candidate,
              typename = typename std::iterator_traits<Candidate>::iterator_category,
              typename = typename std::iterator_traits<Candidate>::value_type,
              typename = typename std::iterator_traits<Candidate>::reference,
              typename = typename std::iterator_traits<Candidate>::pointer,
              typename = typename std::iterator_traits<Candidate>::difference_type>
    static long test(Candidate&&);

    /* fallback picked when the overload above drops out of overload resolution */
    static char test(...);

    constexpr static bool value = std::is_same<long, decltype(test(std::declval<T>()))>::value;
};
/**
 * Applies a score threshold.
 *
 * @return `result` when it is greater than or equal to `score_cutoff`, otherwise 0
 */
constexpr double result_cutoff(double result, double score_cutoff)
{
    return (score_cutoff <= result) ? result : 0.0;
}
/**
 * Divides `a` by `divisor` and adds one to the truncated quotient whenever
 * the division leaves a non-zero remainder.
 */
template <typename T, typename U>
T ceildiv(T a, U divisor)
{
    const T quotient = static_cast<T>(a / divisor);
    const T remainder_bump = static_cast<T>((a % divisor) != 0);
    return quotient + remainder_bump;
}
/**
 * Strips the common prefix of the ranges [first1, last1) and [first2, last2).
 *
 * `first1` and `first2` are taken by reference and advanced past the shared
 * prefix. Elements are accessed through operator[].
 *
 * @return number of leading elements that were equal in both ranges
 */
template <typename InputIt1, typename InputIt2>
int64_t remove_common_prefix(InputIt1& first1, InputIt1 last1, InputIt2& first2, InputIt2 last2)
{
    // DuckDB passes raw pointers here; according to the original note a
    // std::mismatch based version gave compile errors, hence the manual scan.
    const int64_t limit =
        std::min<int64_t>(std::distance(first1, last1), std::distance(first2, last2));

    int64_t prefix = 0;
    while (prefix < limit && !(first1[prefix] != first2[prefix])) {
        ++prefix;
    }

    first1 += prefix;
    first2 += prefix;
    return prefix;
}
/**
 * Fixed-size (128 slots) open addressing hash map from a character key to a
 * 64 bit mask. A slot whose stored value is 0 is treated as unused.
 */
struct BitvectorHashmap {
    struct MapElem {
        uint64_t key = 0;
        uint64_t value = 0;
    };

    BitvectorHashmap() : m_map()
    {}

    /** Sets bit `pos` in the mask stored for `key`. */
    template <typename CharT>
    void insert(CharT key, int64_t pos)
    {
        insert_mask(key, 1ull << pos);
    }

    /** ORs `mask` into the mask stored for `key`. */
    template <typename CharT>
    void insert_mask(CharT key, uint64_t mask)
    {
        const uint64_t wide_key = static_cast<uint64_t>(key);
        MapElem& slot = m_map[lookup(wide_key)];
        slot.key = wide_key;
        slot.value |= mask;
    }

    /** Returns the mask stored for `key`; 0 when nothing was stored for it. */
    template <typename CharT>
    uint64_t get(CharT key) const
    {
        const uint64_t slot = lookup(static_cast<uint64_t>(key));
        return m_map[slot].value;
    }

private:
    /**
     * lookup key inside the hashmap using a similar collision resolution
     * strategy to CPython and Ruby
     *
     * Probing stops at the first slot that is either unused or already
     * holds `key`.
     */
    uint64_t lookup(uint64_t key) const
    {
        uint64_t slot = key % 128;
        uint64_t perturb = key;
        while (m_map[slot].value != 0 && m_map[slot].key != key) {
            slot = ((slot * 5) + perturb + 1) % 128;
            perturb >>= 5;
        }
        return slot;
    }

    std::array<MapElem, 128> m_map;
};
/**
 * Per-character bit masks for a single 64 bit word: bit i of the mask stored
 * for a character is set when that character was inserted at position i.
 * Keys in [0, 255] use a flat table, all other keys go to a BitvectorHashmap.
 */
struct PatternMatchVector {
    struct MapElem {
        uint64_t key = 0;
        uint64_t value = 0;
    };

    PatternMatchVector() : m_map(), m_extendedAscii()
    {}

    /** Builds the masks from the sequence [first, last). */
    template <typename InputIt1>
    PatternMatchVector(InputIt1 first, InputIt1 last) : m_map(), m_extendedAscii()
    {
        insert(first, last);
    }

    /** Registers every element of [first, last); the element at index i gets bit i. */
    template <typename InputIt1>
    void insert(InputIt1 first, InputIt1 last)
    {
        const int64_t len = std::distance(first, last);
        uint64_t mask = 1;
        for (int64_t i = 0; i < len; ++i, mask <<= 1) {
            auto key = first[i];
            if (key < 0 || key > 255) {
                m_map.insert_mask(key, mask);
            }
            else {
                m_extendedAscii[static_cast<size_t>(key)] |= mask;
            }
        }
    }

    /** Sets bit `pos` in the mask stored for `key`. */
    template <typename CharT>
    void insert(CharT key, int64_t pos)
    {
        const uint64_t mask = 1ull << pos;
        if (key < 0 || key > 255) {
            m_map.insert_mask(key, mask);
            return;
        }
        m_extendedAscii[static_cast<size_t>(key)] |= mask;
    }

    /** Returns the mask stored for `key`. */
    template <typename CharT>
    uint64_t get(CharT key) const
    {
        const bool in_table = key >= 0 && key <= 255;
        return in_table ? m_extendedAscii[static_cast<size_t>(key)] : m_map.get(key);
    }

    /**
     * compat function for BlockPatternMatchVector: same signature as its
     * block-wise getter, but only block 0 is valid here.
     */
    template <typename CharT>
    uint64_t get(int64_t block, CharT key) const
    {
        (void)block;
        assert(block == 0);
        return get(key);
    }

private:
    BitvectorHashmap m_map;
    std::array<uint64_t, 256> m_extendedAscii;
};
/**
 * Per-character bit masks for sequences of arbitrary length, split into
 * blocks of 64 positions. Keys in [0, 255] use a flat table indexed by
 * `key * block_count + block`; other keys use one BitvectorHashmap per block.
 */
struct BlockPatternMatchVector {
    BlockPatternMatchVector() : m_block_count(0)
    {}

    /** Builds the masks from the sequence [first, last). */
    template <typename InputIt1>
    BlockPatternMatchVector(InputIt1 first, InputIt1 last) : m_block_count(0)
    {
        insert(first, last);
    }

    /** Sets bit `pos` of block `block` in the mask stored for `key`. */
    template <typename CharT>
    void insert(int64_t block, CharT key, int pos)
    {
        assert(block < m_block_count);
        const uint64_t mask = 1ull << pos;
        if (key < 0 || key > 255) {
            m_map[static_cast<size_t>(block)].insert_mask(key, mask);
            return;
        }
        m_extendedAscii[static_cast<size_t>(key * m_block_count + block)] |= mask;
    }

    /**
     * Sizes the storage for the sequence [first, last) and registers every
     * element: index i ends up in block i / 64 at bit i % 64.
     */
    template <typename InputIt1>
    void insert(InputIt1 first, InputIt1 last)
    {
        const int64_t len = std::distance(first, last);
        m_block_count = ceildiv(len, 64);
        m_map.resize(static_cast<size_t>(m_block_count));
        m_extendedAscii.resize(static_cast<size_t>(m_block_count * 256));

        for (int64_t i = 0; i < len; ++i) {
            insert(i / 64, first[i], static_cast<int>(i % 64));
        }
    }

    /**
     * compat function for PatternMatchVector: returns the mask of block 0
     */
    template <typename CharT>
    uint64_t get(CharT key) const
    {
        return get(0, key);
    }

    /** Returns the mask stored for `key` within block `block`. */
    template <typename CharT>
    uint64_t get(int64_t block, CharT key) const
    {
        assert(block < m_block_count);
        const bool in_table = key >= 0 && key <= 255;
        return in_table ? m_extendedAscii[static_cast<size_t>(key * m_block_count + block)]
                        : m_map[static_cast<size_t>(block)].get(key);
    }

private:
    std::vector<BitvectorHashmap> m_map;
    std::vector<uint64_t> m_extendedAscii;
    int64_t m_block_count;
};
/**@}*/
} // namespace common
} // namespace duckdb_jaro_winkler

View File

@@ -0,0 +1,110 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once
#include <cstdint>
#if defined(_MSC_VER) && !defined(__clang__)
# include <intrin.h>
#endif
namespace duckdb_jaro_winkler {
namespace intrinsics {
/**
 * Builds a mask of type T with the `n` least significant bits set. When `n`
 * is at least the bit width of T every bit is set.
 */
template <typename T>
T bit_mask_lsb(int n)
{
    const int bit_width = static_cast<int>(sizeof(T) * 8);
    T mask = static_cast<T>(-1);
    if (n < bit_width) {
        /* all-ones + 2^n wraps around to 2^n - 1 */
        mask += static_cast<T>(1) << n;
    }
    return mask;
}
/**
 * Tests whether bit number `bit` (counted from the least significant bit) is set in `a`.
 */
template <typename T>
bool bittest(T a, int bit)
{
    return ((a >> bit) & 1) != 0;
}
/**
 * Counts the set bits of `x` with a branch free parallel bit count: sums of
 * adjacent bits are widened from 1 to 2 to 4 to 8 bits, and the per-byte sums
 * are then added up by a multiplication.
 */
static inline int64_t popcount(uint64_t x)
{
    const uint64_t every_other_bit = 0x5555555555555555;
    const uint64_t every_other_pair = 0x3333333333333333;
    const uint64_t low_nibbles = 0x0f0f0f0f0f0f0f0f;
    const uint64_t byte_ones = 0x0101010101010101;

    x = x - ((x >> 1) & every_other_bit);
    x = ((x >> 2) & every_other_pair) + (x & every_other_pair);
    x = ((x >> 4) + x) & low_nibbles;
    /* the multiplication accumulates all byte sums in the top byte */
    return static_cast<int64_t>((x * byte_ones) >> 56);
}
/**
 * Extract the lowest set bit from a. If no bits are set in a returns 0.
 *
 * @param a value to inspect
 * @return a value with only the lowest set bit of `a` set, or 0 when `a` is 0
 */
template <typename T>
T blsi(T a)
{
/* use defined(_MSC_VER) like the other guards in this header, so the check
 * does not evaluate an undefined macro on non-MSVC compilers */
#if defined(_MSC_VER) && !defined(__clang__)
#  pragma warning(push)
/* unary minus operator applied to unsigned type, result still unsigned */
#  pragma warning(disable: 4146)
#endif
    return a & -a;
#if defined(_MSC_VER) && !defined(__clang__)
#  pragma warning(pop)
#endif
}
/**
 * Clear the lowest set bit in x (0 stays 0).
 */
template <typename T>
T blsr(T x)
{
    const T lowest_bit_borrowed = x - 1;
    return x & lowest_bit_borrowed;
}
/*
 * tzcnt: index of the lowest set bit of x.
 * The gcc / clang variant forwards to __builtin_ctzll, the MSVC variants to
 * the _BitScanForward intrinsics.
 */
#if defined(_MSC_VER) && !defined(__clang__)
static inline int tzcnt(uint32_t x)
{
    unsigned long bit_index = 0;
    _BitScanForward(&bit_index, x);
    return static_cast<int>(bit_index);
}
/* NOTE(review): confirm that _BitScanForward64 is really available for _M_ARM */
#  if defined(_M_ARM) || defined(_M_X64)
static inline int tzcnt(uint64_t x)
{
    unsigned long bit_index = 0;
    _BitScanForward64(&bit_index, x);
    return static_cast<int>(bit_index);
}
#  else
/* no 64 bit scan used here: look at the low half first, then the high half */
static inline int tzcnt(uint64_t x)
{
    const uint32_t high_half = static_cast<uint32_t>(x >> 32);
    const uint32_t low_half = static_cast<uint32_t>(x & 0xFFFFFFFF);
    return (low_half != 0) ? tzcnt(low_half) : 32 + tzcnt(high_half);
}
#  endif
#else /* gcc / clang */
/* the 32 bit overload (based on __builtin_ctz) is disabled in this copy */
static inline int tzcnt(uint64_t x)
{
    return __builtin_ctzll(x);
}
#endif
} // namespace intrinsics
} // namespace duckdb_jaro_winkler

View File

@@ -0,0 +1,509 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once
#include "common.hpp"
#include "intrinsics.hpp"
namespace duckdb_jaro_winkler {
namespace detail {
/**
 * Matched-character flags for the single word algorithm (both sequences fit
 * into 64 bits).
 */
struct FlaggedCharsWord {
/* bit i is set when pattern position i was matched to a text character */
uint64_t P_flag;
/* bit j is set when text position j found a matching pattern character */
uint64_t T_flag;
};
/**
 * Matched-character flags for the multiword algorithm: the same flags as in
 * FlaggedCharsWord, stored as vectors of 64 bit words.
 */
struct FlaggedCharsMultiword {
/* flags of matched pattern positions; position i lives in word i / 64 */
std::vector<uint64_t> P_flag;
/* flags of matched text positions; position j lives in word j / 64, bit j % 64 */
std::vector<uint64_t> T_flag;
};
/**
 * Describes the window of pattern words that is searched for one text
 * character in the multiword algorithm.
 */
struct SearchBoundMask {
/* number of pattern words covered by the window */
int64_t words = 0;
/* number of pattern words in front of the window; index of its first word */
int64_t empty_words = 0;
/* mask applied to the last word of the window */
uint64_t last_mask = 0;
/* mask applied to the first word of the window */
uint64_t first_mask = 0;
};
/**
 * Pairs a word index with a position inside that word.
 * NOTE(review): not referenced by any other code visible here.
 */
struct TextPosition {
TextPosition(int64_t Word_, int64_t WordPos_) : Word(Word_), WordPos(WordPos_)
{}
/* index of the word */
int64_t Word;
/* position inside the word */
int64_t WordPos;
};
/**
 * @brief combines the match statistics into the Jaro similarity
 *
 * Computes (m / P_len + m / T_len + (m - t) / m) / 3, where m is CommonChars
 * and t is Transpositions halved (integer division).
 *
 * @param P_len length of the pattern
 * @param T_len length of the text
 * @param CommonChars number of matched characters
 * @param Transpositions number of matched characters that are out of order
 *        (halved inside this function)
 *
 * @return the similarity, or 0 when there is no common character
 */
static inline double jaro_calculate_similarity(int64_t P_len, int64_t T_len, int64_t CommonChars,
                                               int64_t Transpositions)
{
    /* without a common character the last term would be 0 / 0 and turn the
     * whole result into NaN */
    if (CommonChars == 0) {
        return 0.0;
    }

    Transpositions /= 2;
    double Sim = 0;
    Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
    Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
    Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
           static_cast<double>(CommonChars);
    return Sim / 3.0;
}
/**
 * @brief filter matches below score_cutoff based on string lengths
 *
 * Evaluates the similarity obtained when every character of the shorter
 * sequence matches without transpositions.
 *
 * @return false when either sequence is empty or that similarity is below
 *         score_cutoff, true otherwise
 */
static inline bool jaro_length_filter(int64_t P_len, int64_t T_len, double score_cutoff)
{
    if (P_len == 0 || T_len == 0) {
        return false;
    }

    const double shorter_len = static_cast<double>(std::min(P_len, T_len));
    double upper_bound =
        shorter_len / static_cast<double>(P_len) + shorter_len / static_cast<double>(T_len) + 1.0;
    upper_bound /= 3.0;
    return upper_bound >= score_cutoff;
}
/**
 * @brief filter matches below score_cutoff based on string lengths and common characters
 *
 * Evaluates the similarity obtained when none of the common characters is
 * transposed.
 *
 * @return false when there are no common characters or that similarity is
 *         below score_cutoff, true otherwise
 */
static inline bool jaro_common_char_filter(int64_t P_len, int64_t T_len, int64_t CommonChars,
                                           double score_cutoff)
{
    if (CommonChars == 0) {
        return false;
    }

    const double common = static_cast<double>(CommonChars);
    double upper_bound = common / static_cast<double>(P_len);
    upper_bound += common / static_cast<double>(T_len);
    upper_bound += 1.0;
    upper_bound /= 3.0;
    return upper_bound >= score_cutoff;
}
/**
 * @brief number of matched characters of the single word algorithm
 *        (set bits of the pattern flags)
 */
static inline int64_t count_common_chars(const FlaggedCharsWord& flagged)
{
    const uint64_t matched_pattern_bits = flagged.P_flag;
    return intrinsics::popcount(matched_pattern_bits);
}
/**
 * @brief number of matched characters of the multiword algorithm
 *
 * Sums the set bits of the flag vector that has fewer words (the text flags
 * when both have the same number of words).
 */
static inline int64_t count_common_chars(const FlaggedCharsMultiword& flagged)
{
    const std::vector<uint64_t>& shorter_flags =
        (flagged.P_flag.size() < flagged.T_flag.size()) ? flagged.P_flag : flagged.T_flag;

    int64_t CommonChars = 0;
    for (const uint64_t word : shorter_flags) {
        CommonChars += intrinsics::popcount(word);
    }
    return CommonChars;
}
/**
 * @brief flags the matching characters of P and T when both fit into one 64 bit word
 *
 * For every text position j the lowest not yet flagged pattern position that
 * holds the same character and lies inside the current search window is
 * flagged, together with bit j of the text flags.
 *
 * @param PM pattern match vector providing the occurrence mask of a character
 * @param Bound search distance; the window starts with the Bound + 1 lowest
 *        pattern positions
 *
 * @return flags of the matched pattern and text positions
 */
template <typename PM_Vec, typename InputIt1, typename InputIt2>
static inline FlaggedCharsWord
flag_similar_characters_word(const PM_Vec& PM, InputIt1 P_first,
                             InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, int Bound)
{
    using namespace intrinsics;
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);
    (void)P_len;
    assert(P_len <= 64);
    assert(T_len <= 64);
    assert(Bound > P_len || P_len - Bound <= T_len);

    FlaggedCharsWord flagged = {0, 0};
    uint64_t window = bit_mask_lsb<uint64_t>(Bound + 1);

    for (int64_t j = 0; j < T_len; ++j) {
        const uint64_t candidates = PM.get(T_first[j]) & window & (~flagged.P_flag);
        flagged.P_flag |= blsi(candidates);
        flagged.T_flag |= static_cast<uint64_t>(candidates != 0) << j;

        /* the window always moves one position up; during the first Bound
         * steps its lower end stays at position 0, so the freed bit is refilled */
        window <<= 1;
        if (j < Bound) {
            window |= 1;
        }
    }
    return flagged;
}
/**
 * @brief handles one text character of the multiword matching
 *
 * Searches the pattern words described by BoundMask for the lowest not yet
 * flagged occurrence of T_j. On success the occurrence is flagged in
 * flagged.P_flag and bit j is set in flagged.T_flag.
 *
 * @param PM block wise pattern match vector of the pattern
 * @param T_j text character at index j
 * @param flagged flags of matched pattern / text positions (updated in place)
 * @param j index of T_j inside the text
 * @param BoundMask window of pattern words to search
 */
template <typename CharT>
static inline void flag_similar_characters_step(const common::BlockPatternMatchVector& PM,
CharT T_j, FlaggedCharsMultiword& flagged,
int64_t j, SearchBoundMask BoundMask)
{
using namespace intrinsics;
/* word and bit of the text flags that belong to index j */
int64_t j_word = j / 64;
int64_t j_pos = j % 64;
/* the window spans the pattern words [word, last_word) */
int64_t word = BoundMask.empty_words;
int64_t last_word = word + BoundMask.words;
/* window inside a single word: both masks apply to that word */
if (BoundMask.words == 1) {
uint64_t PM_j = PM.get(word, T_j) & BoundMask.last_mask & BoundMask.first_mask &
(~flagged.P_flag[static_cast<size_t>(word)]);
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
flagged.T_flag[static_cast<size_t>(j_word)] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
return;
}
/* first word of the window, restricted by first_mask */
if (BoundMask.first_mask) {
uint64_t PM_j = PM.get(word, T_j) & BoundMask.first_mask & (~flagged.P_flag[static_cast<size_t>(word)]);
if (PM_j) {
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
flagged.T_flag[static_cast<size_t>(j_word)] |= 1ull << j_pos;
return;
}
word++;
}
/* words in the middle of the window are searched without a mask */
for (; word < last_word - 1; ++word) {
uint64_t PM_j = PM.get(word, T_j) & (~flagged.P_flag[static_cast<size_t>(word)]);
if (PM_j) {
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
flagged.T_flag[static_cast<size_t>(j_word)] |= 1ull << j_pos;
return;
}
}
/* last word of the window, restricted by last_mask */
if (BoundMask.last_mask) {
uint64_t PM_j = PM.get(word, T_j) & BoundMask.last_mask & (~flagged.P_flag[static_cast<size_t>(word)]);
flagged.P_flag[static_cast<size_t>(word)] |= blsi(PM_j);
flagged.T_flag[static_cast<size_t>(j_word)] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
}
}
/**
 * @brief flags the matching characters of P and T when at least one of them
 *        is longer than 64 characters
 *
 * Walks over the text, calls flag_similar_characters_step for every character
 * and moves the search window over the pattern afterwards.
 *
 * @param PM block wise pattern match vector of the pattern
 * @param Bound search distance (asserted to be at least 31)
 *
 * @return flags of the matched pattern and text positions, one 64 bit word
 *         per 64 characters
 */
template <typename InputIt1, typename InputIt2>
static inline FlaggedCharsMultiword
flag_similar_characters_block(const common::BlockPatternMatchVector& PM, InputIt1 P_first,
InputIt1 P_last, InputIt2 T_first, InputIt2 T_last, int64_t Bound)
{
using namespace intrinsics;
int64_t P_len = std::distance(P_first, P_last);
int64_t T_len = std::distance(T_first, T_last);
assert(P_len > 64 || T_len > 64);
assert(Bound > P_len || P_len - Bound <= T_len);
assert(Bound >= 31);
int64_t TextWords = common::ceildiv(T_len, 64);
int64_t PatternWords = common::ceildiv(P_len, 64);
FlaggedCharsMultiword flagged;
flagged.T_flag.resize(static_cast<size_t>(TextWords));
flagged.P_flag.resize(static_cast<size_t>(PatternWords));
SearchBoundMask BoundMask;
/* the initial window covers the first min(Bound + 1, P_len) pattern positions */
int64_t start_range = std::min(Bound + 1, P_len);
BoundMask.words = 1 + start_range / 64;
BoundMask.empty_words = 0;
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
BoundMask.first_mask = ~UINT64_C(0);
for (int64_t j = 0; j < T_len; ++j) {
flag_similar_characters_step(PM, T_first[j], flagged, j, BoundMask);
/* grow the upper end of the window while pattern positions are left */
if (j + Bound + 1 < P_len) {
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
/* last word is full and more positions follow: continue in a new word */
if (j + Bound + 2 < P_len && BoundMask.last_mask == ~UINT64_C(0)) {
BoundMask.last_mask = 0;
BoundMask.words++;
}
}
/* from j == Bound on the lower end of the window moves up as well */
if (j >= Bound) {
BoundMask.first_mask <<= 1;
/* first word left the window completely: drop it */
if (BoundMask.first_mask == 0) {
BoundMask.first_mask = ~UINT64_C(0);
BoundMask.words--;
BoundMask.empty_words++;
}
}
}
return flagged;
}
/**
 * @brief counts the flagged characters that are out of order (single word algorithm)
 *
 * The flagged text positions and the flagged pattern positions are visited in
 * parallel from lowest to highest. A pair counts when the text character does
 * not occur at the paired pattern position.
 *
 * @param PM pattern match vector of the pattern
 * @param T_first start of the text the flags refer to
 * @param flagged flags of the matched pattern / text positions
 *
 * @return number of such pairs (not halved)
 */
template <typename PM_Vec, typename InputIt1>
static inline int64_t count_transpositions_word(const PM_Vec& PM,
                                                InputIt1 T_first, InputIt1,
                                                const FlaggedCharsWord& flagged)
{
    using namespace intrinsics;
    uint64_t remaining_pattern = flagged.P_flag;
    int64_t Transpositions = 0;

    for (uint64_t remaining_text = flagged.T_flag; remaining_text != 0;
         remaining_text = blsr(remaining_text)) {
        const uint64_t pattern_bit = blsi(remaining_pattern);
        const uint64_t occurrences = PM.get(T_first[tzcnt(remaining_text)]);
        if ((occurrences & pattern_bit) == 0) {
            ++Transpositions;
        }
        remaining_pattern ^= pattern_bit;
    }
    return Transpositions;
}
/**
 * @brief counts the flagged characters that are out of order (multiword algorithm)
 *
 * Pairs the flagged text positions with the flagged pattern positions from
 * lowest to highest, moving on to the next 64 bit word whenever the current
 * one has no flags left. A pair counts when the text character does not occur
 * at the paired pattern position.
 *
 * @param PM block wise pattern match vector of the pattern
 * @param T_first start of the text the flags refer to
 * @param flagged flags of the matched pattern / text positions
 * @param FlaggedChars number of flagged pairs to process
 *
 * @return number of such pairs (not halved)
 */
template <typename InputIt1>
static inline int64_t
count_transpositions_block(const common::BlockPatternMatchVector& PM, InputIt1 T_first, InputIt1,
const FlaggedCharsMultiword& flagged, int64_t FlaggedChars)
{
using namespace intrinsics;
int64_t TextWord = 0;
int64_t PatternWord = 0;
uint64_t T_flag = flagged.T_flag[static_cast<size_t>(TextWord)];
uint64_t P_flag = flagged.P_flag[static_cast<size_t>(PatternWord)];
int64_t Transpositions = 0;
while (FlaggedChars) {
/* skip text words without flags; T_first follows so that bit k of the
 * current word always refers to T_first[k] */
while (!T_flag) {
TextWord++;
T_first += 64;
T_flag = flagged.T_flag[static_cast<size_t>(TextWord)];
}
while (T_flag) {
/* skip pattern words without flags */
while (!P_flag) {
PatternWord++;
P_flag = flagged.P_flag[static_cast<size_t>(PatternWord)];
}
/* pair the lowest flagged pattern bit with the lowest flagged text bit */
uint64_t PatternFlagMask = blsi(P_flag);
Transpositions += !(PM.get(PatternWord, T_first[tzcnt(T_flag)]) & PatternFlagMask);
/* consume both flags */
T_flag = blsr(T_flag);
P_flag ^= PatternFlagMask;
FlaggedChars--;
}
}
return Transpositions;
}
/**
 * @brief find bounds and skip out of bound parts of the sequences
 *
 * The bound is half the length of the longer sequence minus one. Since jaro
 * uses a sliding window, the part of the longer sequence that lies more than
 * `Bound` positions behind the end of the shorter one can never be in range.
 * It is cut off by moving the corresponding end iterator (P_last or T_last,
 * both taken by reference).
 *
 * @return the bound
 */
template <typename InputIt1, typename InputIt2>
int64_t jaro_bounds(InputIt1 P_first, InputIt1& P_last, InputIt2 T_first, InputIt2& T_last)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);

    const bool text_is_longer = T_len > P_len;
    const int64_t Bound = (text_is_longer ? T_len : P_len) / 2 - 1;

    if (text_is_longer) {
        if (T_len > P_len + Bound) {
            T_last = T_first + P_len + Bound;
        }
    }
    else if (P_len > T_len + Bound) {
        P_last = P_first + T_len + Bound;
    }
    return Bound;
}
/**
 * @brief jaro similarity of the sequences [P_first, P_last) and [T_first, T_last)
 *
 * @param score_cutoff results below this threshold are reported as 0
 *
 * @return the similarity, or 0 when it is below score_cutoff or when one of
 *         the sequences is empty
 */
template <typename InputIt1, typename InputIt2>
double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
                       double score_cutoff)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);

    /* the lengths alone may already rule out reaching score_cutoff */
    if (!jaro_length_filter(P_len, T_len, score_cutoff)) {
        return 0.0;
    }

    /* two single characters either match or they do not */
    if (P_len == 1 && T_len == 1) {
        return (P_first[0] == T_first[0]) ? 1.0 : 0.0;
    }

    const int64_t Bound = jaro_bounds(P_first, P_last, T_first, T_last);

    /* a common prefix contributes common chars but never transpositions */
    int64_t CommonChars = common::remove_common_prefix(P_first, P_last, T_first, T_last);
    int64_t Transpositions = 0;

    const int64_t P_view_len = std::distance(P_first, P_last);
    const int64_t T_view_len = std::distance(T_first, T_last);

    /* when one view is empty the prefix already accounts for every match */
    if (P_view_len != 0 && T_view_len != 0) {
        if (P_view_len <= 64 && T_view_len <= 64) {
            common::PatternMatchVector PM(P_first, P_last);
            auto flagged = flag_similar_characters_word(PM, P_first, P_last, T_first, T_last,
                                                        static_cast<int>(Bound));
            CommonChars += count_common_chars(flagged);
            if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
                return 0.0;
            }
            Transpositions = count_transpositions_word(PM, T_first, T_last, flagged);
        }
        else {
            common::BlockPatternMatchVector PM(P_first, P_last);
            auto flagged =
                flag_similar_characters_block(PM, P_first, P_last, T_first, T_last, Bound);
            const int64_t FlaggedChars = count_common_chars(flagged);
            CommonChars += FlaggedChars;
            if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
                return 0.0;
            }
            Transpositions =
                count_transpositions_block(PM, T_first, T_last, flagged, FlaggedChars);
        }
    }

    const double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
    return common::result_cutoff(Sim, score_cutoff);
}
/**
 * @brief jaro similarity using a BlockPatternMatchVector supplied by the caller
 *
 * Unlike the overload without PM, the common prefix is not removed here and
 * the match vector is not rebuilt.
 *
 * @param PM block wise pattern match vector; presumably built from the same
 *        pattern as [P_first, P_last) - verify against the caller
 * @param score_cutoff results below this threshold are reported as 0
 *
 * @return the similarity, or 0 when it is below score_cutoff or when one of
 *         the sequences is empty
 */
template <typename InputIt1, typename InputIt2>
double jaro_similarity(const common::BlockPatternMatchVector& PM, InputIt1 P_first, InputIt1 P_last,
                       InputIt2 T_first, InputIt2 T_last, double score_cutoff)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);

    /* the lengths alone may already rule out reaching score_cutoff */
    if (!jaro_length_filter(P_len, T_len, score_cutoff)) {
        return 0.0;
    }

    /* two single characters either match or they do not */
    if (P_len == 1 && T_len == 1) {
        return (P_first[0] == T_first[0]) ? 1.0 : 0.0;
    }

    const int64_t Bound = jaro_bounds(P_first, P_last, T_first, T_last);

    int64_t CommonChars = 0;
    int64_t Transpositions = 0;

    const int64_t P_view_len = std::distance(P_first, P_last);
    const int64_t T_view_len = std::distance(T_first, T_last);

    /* nothing to flag when one of the views is empty */
    if (P_view_len != 0 && T_view_len != 0) {
        if (P_view_len <= 64 && T_view_len <= 64) {
            auto flagged = flag_similar_characters_word(PM, P_first, P_last, T_first, T_last,
                                                        static_cast<int>(Bound));
            CommonChars += count_common_chars(flagged);
            if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
                return 0.0;
            }
            Transpositions = count_transpositions_word(PM, T_first, T_last, flagged);
        }
        else {
            auto flagged =
                flag_similar_characters_block(PM, P_first, P_last, T_first, T_last, Bound);
            const int64_t FlaggedChars = count_common_chars(flagged);
            CommonChars += FlaggedChars;
            if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) {
                return 0.0;
            }
            Transpositions =
                count_transpositions_block(PM, T_first, T_last, flagged, FlaggedChars);
        }
    }

    const double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
    return common::result_cutoff(Sim, score_cutoff);
}
/**
 * @brief jaro winkler similarity of [P_first, P_last) and [T_first, T_last)
 *
 * Jaro similarities above 0.7 are increased by
 * `prefix * prefix_weight * (1 - similarity)`, where `prefix` is the length
 * of the common prefix, capped at 4 characters.
 *
 * @param prefix_weight weight of the common prefix
 * @param score_cutoff results below this threshold are reported as 0
 *
 * @return the similarity, or 0 when it is below score_cutoff
 */
template <typename InputIt1, typename InputIt2>
double jaro_winkler_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
                               double prefix_weight, double score_cutoff)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);
    const int64_t max_prefix = std::min<int64_t>(std::min(P_len, T_len), 4);

    int64_t prefix = 0;
    while (prefix < max_prefix && !(T_first[prefix] != P_first[prefix])) {
        ++prefix;
    }
    const double prefix_sim = static_cast<double>(prefix) * prefix_weight;

    /* translate the requested cutoff into the jaro similarity needed to reach
     * it after the prefix bonus, but never below the 0.7 bonus threshold */
    double jaro_score_cutoff = score_cutoff;
    if (jaro_score_cutoff > 0.7) {
        jaro_score_cutoff =
            (prefix_sim >= 1.0)
                ? 0.7
                : std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
    }

    double Sim = jaro_similarity(P_first, P_last, T_first, T_last, jaro_score_cutoff);
    if (Sim > 0.7) {
        Sim += prefix_sim * (1.0 - Sim);
    }
    return common::result_cutoff(Sim, score_cutoff);
}
/**
 * @brief jaro winkler similarity using a BlockPatternMatchVector supplied by the caller
 *
 * Jaro similarities above 0.7 are increased by
 * `prefix * prefix_weight * (1 - similarity)`, where `prefix` is the length
 * of the common prefix, capped at 4 characters.
 *
 * @param PM block wise pattern match vector forwarded to jaro_similarity
 * @param prefix_weight weight of the common prefix
 * @param score_cutoff results below this threshold are reported as 0
 *
 * @return the similarity, or 0 when it is below score_cutoff
 */
template <typename InputIt1, typename InputIt2>
double jaro_winkler_similarity(const common::BlockPatternMatchVector& PM, InputIt1 P_first,
                               InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
                               double prefix_weight, double score_cutoff)
{
    const int64_t P_len = std::distance(P_first, P_last);
    const int64_t T_len = std::distance(T_first, T_last);
    const int64_t max_prefix = std::min<int64_t>(std::min(P_len, T_len), 4);

    int64_t prefix = 0;
    while (prefix < max_prefix && !(T_first[prefix] != P_first[prefix])) {
        ++prefix;
    }
    const double prefix_sim = static_cast<double>(prefix) * prefix_weight;

    /* translate the requested cutoff into the jaro similarity needed to reach
     * it after the prefix bonus, but never below the 0.7 bonus threshold */
    double jaro_score_cutoff = score_cutoff;
    if (jaro_score_cutoff > 0.7) {
        jaro_score_cutoff =
            (prefix_sim >= 1.0)
                ? 0.7
                : std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
    }

    double Sim = jaro_similarity(PM, P_first, P_last, T_first, T_last, jaro_score_cutoff);
    if (Sim > 0.7) {
        Sim += prefix_sim * (1.0 - Sim);
    }
    return common::result_cutoff(Sim, score_cutoff);
}
} // namespace detail
} // namespace duckdb_jaro_winkler

View File

@@ -0,0 +1,186 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once
#include "details/common.hpp"
#include "details/jaro_impl.hpp"
#include <stdexcept>
#include <string>
namespace duckdb_jaro_winkler {
/**
* @defgroup jaro_winkler jaro_winkler
* @{
*/
/**
 * @brief Calculates the jaro winkler similarity
 *
 * @tparam InputIt1 iterator type of the first sequence
 * @tparam InputIt2 iterator type of the second sequence
 *
 * @param first1, last1
 *   first sequence [first1, last1)
 * @param first2, last2
 *   second sequence [first2, last2)
 * @param prefix_weight
 *   Weight used for the common prefix of the two strings.
 *   Has to be between 0 and 0.25. Default is 0.1.
 * @param score_cutoff
 *   Optional argument for a score threshold as a float between 0 and 1.
 *   For similarity < score_cutoff 0 is returned instead. Default is 0,
 *   which deactivates this behaviour.
 *
 * @return jaro winkler similarity between the two sequences
 *   as a float between 0 and 1
 *
 * @throws std::invalid_argument when prefix_weight is below 0 or above 0.25
 */
template <typename InputIt1, typename InputIt2>
typename std::enable_if<
    common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type
jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                        double prefix_weight = 0.1, double score_cutoff = 0.0)
{
    const bool weight_out_of_range = (prefix_weight < 0.0) || (prefix_weight > 0.25);
    if (weight_out_of_range) {
        throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
    }

    return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight,
                                           score_cutoff);
}
/**
 * @brief Calculates the jaro winkler similarity of two containers
 *
 * Forwards std::begin / std::end of s1 and s2 to the iterator based overload.
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    auto first1 = std::begin(s1);
    auto last1 = std::end(s1);
    auto first2 = std::begin(s2);
    auto last2 = std::end(s2);
    return jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight, score_cutoff);
}
/**
 * @brief jaro winkler similarity with a cached first sequence
 *
 * Keeps a copy of the first sequence together with its
 * BlockPatternMatchVector, so that both are built once and reused for every
 * comparison.
 *
 * @tparam CharT1 character type of the cached sequence
 */
template <typename CharT1>
struct CachedJaroWinklerSimilarity {
    /**
     * @param first1, last1 sequence to cache
     * @param prefix_weight_ weight of the common prefix, between 0 and 0.25
     * @throws std::invalid_argument when prefix_weight_ is below 0 or above 0.25
     */
    template <typename InputIt1>
    CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1)
        : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_)
    {
        const bool weight_out_of_range = (prefix_weight < 0.0) || (prefix_weight > 0.25);
        if (weight_out_of_range) {
            throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
        }
    }

    /** Same as above for a container, using std::begin / std::end of s1_. */
    template <typename S1>
    CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1)
        : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_)
    {}

    /** jaro winkler similarity of the cached sequence and [first2, last2). */
    template <typename InputIt2>
    double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
    {
        return detail::jaro_winkler_similarity(PM, s1.begin(), s1.end(), first2, last2,
                                               prefix_weight, score_cutoff);
    }

    /** jaro winkler similarity of the cached sequence and the container s2. */
    template <typename S2>
    double similarity(const S2& s2, double score_cutoff = 0) const
    {
        return similarity(std::begin(s2), std::end(s2), score_cutoff);
    }

    /** Identical to similarity(first2, last2, score_cutoff). */
    template <typename InputIt2>
    double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
    {
        return similarity(first2, last2, score_cutoff);
    }

    /** Identical to similarity(s2, score_cutoff). */
    template <typename S2>
    double normalized_similarity(const S2& s2, double score_cutoff = 0) const
    {
        return similarity(s2, score_cutoff);
    }

private:
    std::basic_string<CharT1> s1;
    common::BlockPatternMatchVector PM;
    double prefix_weight;
};
/**
 * @brief Calculates the jaro similarity
 *
 * @tparam InputIt1 iterator type of the first sequence
 * @tparam InputIt2 iterator type of the second sequence
 *
 * @param first1, last1
 *   first sequence [first1, last1)
 * @param first2, last2
 *   second sequence [first2, last2)
 * @param score_cutoff
 *   Optional argument for a score threshold as a float between 0 and 1.
 *   For similarity < score_cutoff 0 is returned instead. Default is 0,
 *   which deactivates this behaviour.
 *
 * @return jaro similarity between the two sequences
 *   as a float between 0 and 1
 */
template <typename InputIt1, typename InputIt2>
double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                       double score_cutoff = 0.0)
{
    const double score = detail::jaro_similarity(first1, last1, first2, last2, score_cutoff);
    return score;
}
/**
 * @brief Calculates the jaro similarity of two containers
 *
 * Forwards std::begin / std::end of s1 and s2 to the iterator based overload.
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    auto first1 = std::begin(s1);
    auto last1 = std::end(s1);
    auto first2 = std::begin(s2);
    auto last2 = std::end(s2);
    return jaro_similarity(first1, last1, first2, last2, score_cutoff);
}
/**
 * @brief jaro similarity with a cached first sequence
 *
 * Keeps a copy of the first sequence together with its
 * BlockPatternMatchVector, so that both are built once and reused for every
 * comparison.
 *
 * @tparam CharT1 character type of the cached sequence
 */
template <typename CharT1>
struct CachedJaroSimilarity {
    /** Caches the sequence [first1, last1). */
    template <typename InputIt1>
    CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1)
    {}

    /** Caches the container s1_, using std::begin / std::end. */
    template <typename S1>
    CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_))
    {}

    /** jaro similarity of the cached sequence and [first2, last2). */
    template <typename InputIt2>
    double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
    {
        return detail::jaro_similarity(PM, s1.begin(), s1.end(), first2, last2, score_cutoff);
    }

    /** jaro similarity of the cached sequence and the container s2. */
    template <typename S2>
    double similarity(const S2& s2, double score_cutoff = 0) const
    {
        return similarity(std::begin(s2), std::end(s2), score_cutoff);
    }

    /** Identical to similarity(first2, last2, score_cutoff). */
    template <typename InputIt2>
    double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
    {
        return similarity(first2, last2, score_cutoff);
    }

    /** Identical to similarity(s2, score_cutoff). */
    template <typename S2>
    double normalized_similarity(const S2& s2, double score_cutoff = 0) const
    {
        return similarity(s2, score_cutoff);
    }

private:
    std::basic_string<CharT1> s1;
    common::BlockPatternMatchVector PM;
};
/**@}*/
} // namespace duckdb_jaro_winkler