should be it

This commit is contained in:
2025-10-24 19:21:19 -05:00
parent a4b23fc57c
commit f09560c7b1
14047 changed files with 3161551 additions and 1 deletions

View File

@@ -0,0 +1,186 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once
#include "details/common.hpp"
#include "details/jaro_impl.hpp"
#include <stdexcept>
namespace duckdb_jaro_winkler {
/**
* @defgroup jaro_winkler jaro_winkler
* @{
*/
/**
* @brief Calculates the jaro winkler similarity
*
* @tparam Sentence1 This is a string that can be converted to
* basic_string_view<char_type>
* @tparam Sentence2 This is a string that can be converted to
* basic_string_view<char_type>
*
* @param s1
* string to compare with s2 (for type info check Template parameters above)
* @param s2
* string to compare with s1 (for type info check Template parameters above)
* @param prefix_weight
* Weight used for the common prefix of the two strings.
* Has to be between 0 and 0.25. Default is 0.1.
* @param score_cutoff
* Optional argument for a score threshold as a float between 0 and 100.
* For similarity < score_cutoff 0 is returned instead. Default is 0,
* which deactivates this behaviour.
*
* @return jaro winkler similarity between s1 and s2
* as a float between 0 and 100
*/
template <typename InputIt1, typename InputIt2>
typename std::enable_if<
common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type
jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double prefix_weight = 0.1, double score_cutoff = 0.0)
{
if (prefix_weight < 0.0 || prefix_weight > 0.25) {
throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
}
return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight,
score_cutoff);
}
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
double score_cutoff = 0.0)
{
return jaro_winkler_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2),
prefix_weight, score_cutoff);
}
template <typename CharT1>
struct CachedJaroWinklerSimilarity {
template <typename InputIt1>
CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1)
: s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_)
{
if (prefix_weight < 0.0 || prefix_weight > 0.25) {
throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
}
}
template <typename S1>
CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1)
: CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_)
{}
template <typename InputIt2>
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
{
return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
prefix_weight, score_cutoff);
}
template <typename S2>
double similarity(const S2& s2, double score_cutoff = 0) const
{
return similarity(std::begin(s2), std::end(s2), score_cutoff);
}
template <typename InputIt2>
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
{
return similarity(first2, last2, score_cutoff);
}
template <typename S2>
double normalized_similarity(const S2& s2, double score_cutoff = 0) const
{
return similarity(s2, score_cutoff);
}
private:
std::basic_string<CharT1> s1;
common::BlockPatternMatchVector PM;
double prefix_weight;
};
/**
* @brief Calculates the jaro similarity
*
* @tparam Sentence1 This is a string that can be converted to
* basic_string_view<char_type>
* @tparam Sentence2 This is a string that can be converted to
* basic_string_view<char_type>
*
* @param s1
* string to compare with s2 (for type info check Template parameters above)
* @param s2
* string to compare with s1 (for type info check Template parameters above)
* @param score_cutoff
* Optional argument for a score threshold as a float between 0 and 100.
* For similarity < score_cutoff 0 is returned instead. Default is 0,
* which deactivates this behaviour.
*
* @return jaro similarity between s1 and s2
* as a float between 0 and 100
*/
template <typename InputIt1, typename InputIt2>
double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff = 0.0)
{
return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff);
}
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2),
score_cutoff);
}
template <typename CharT1>
struct CachedJaroSimilarity {
template <typename InputIt1>
CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1)
{}
template <typename S1>
CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_))
{}
template <typename InputIt2>
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
{
return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
score_cutoff);
}
template <typename S2>
double similarity(const S2& s2, double score_cutoff = 0) const
{
return similarity(std::begin(s2), std::end(s2), score_cutoff);
}
template <typename InputIt2>
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
{
return similarity(first2, last2, score_cutoff);
}
template <typename S2>
double normalized_similarity(const S2& s2, double score_cutoff = 0) const
{
return similarity(s2, score_cutoff);
}
private:
std::basic_string<CharT1> s1;
common::BlockPatternMatchVector PM;
};
/**@}*/
} // namespace duckdb_jaro_winkler