790 lines
27 KiB
C++
790 lines
27 KiB
C++
/* SPDX-License-Identifier: MIT */
|
|
/* Copyright © 2021 Max Bachmann */
|
|
/* Copyright © 2011 Adam Cohen */
|
|
|
|
#pragma once
|
|
#include <rapidfuzz/details/CharSet.hpp>
#include <rapidfuzz/details/PatternMatchVector.hpp>
#include <rapidfuzz/details/common.hpp>
#include <rapidfuzz/distance/Indel.hpp>
#include <stdexcept>
|
|
|
|
namespace rapidfuzz::fuzz {
|
|
|
|
/**
|
|
* @defgroup Fuzz Fuzz
|
|
* A collection of string matching algorithms from FuzzyWuzzy
|
|
* @{
|
|
*/
|
|
|
|
/**
|
|
* @brief calculates a simple ratio between two strings
|
|
*
|
|
* @details
|
|
* @code{.cpp}
|
|
* // score is 96.55
|
|
* double score = ratio("this is a test", "this is a test!")
|
|
* @endcode
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of ratio() that receives each input as an iterator pair
 * (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
|
|
|
|
#ifdef RAPIDFUZZ_SIMD
|
|
namespace experimental {
|
|
template <int MaxLen>
|
|
struct MultiRatio {
|
|
public:
|
|
MultiRatio(size_t count) : input_count(count), scorer(count)
|
|
{}
|
|
|
|
size_t result_count() const
|
|
{
|
|
return scorer.result_count();
|
|
}
|
|
|
|
template <typename Sentence1>
|
|
void insert(const Sentence1& s1_)
|
|
{
|
|
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
|
}
|
|
|
|
template <typename InputIt1>
|
|
void insert(InputIt1 first1, InputIt1 last1)
|
|
{
|
|
scorer.insert(first1, last1);
|
|
}
|
|
|
|
template <typename InputIt2>
|
|
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
|
double score_cutoff = 0.0) const
|
|
{
|
|
similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
|
|
}
|
|
|
|
template <typename Sentence2>
|
|
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
|
{
|
|
scorer.normalized_similarity(scores, score_count, s2, score_cutoff / 100.0);
|
|
|
|
for (size_t i = 0; i < input_count; ++i)
|
|
scores[i] *= 100.0;
|
|
}
|
|
|
|
private:
|
|
size_t input_count;
|
|
rapidfuzz::experimental::MultiIndel<MaxLen> scorer;
|
|
};
|
|
} /* namespace experimental */
|
|
#endif
|
|
|
|
// TODO documentation
|
|
template <typename CharT1>
|
|
struct CachedRatio {
|
|
template <typename InputIt1>
|
|
CachedRatio(InputIt1 first1, InputIt1 last1) : cached_indel(first1, last1)
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
CachedRatio(const Sentence1& s1) : cached_indel(s1)
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
// private:
|
|
CachedIndel<CharT1> cached_indel;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
CachedRatio(const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iter_value_t<InputIt1>>;
|
|
|
|
template <typename InputIt1, typename InputIt2>
|
|
ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
|
InputIt2 last2, double score_cutoff = 0);
|
|
|
|
template <typename Sentence1, typename Sentence2>
|
|
ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2,
|
|
double score_cutoff = 0);
|
|
|
|
/**
|
|
* @brief calculates the fuzz::ratio of the optimal string alignment
|
|
*
|
|
* @details
|
|
* @cite hyrro_2004 @cite wagner_fischer_1974
|
|
* @code{.cpp}
|
|
* // score is 100
|
|
* double score = partial_ratio("this is a test", "this is a test!")
|
|
* @endcode
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of partial_ratio() that receives each input as an iterator
 * pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                     double score_cutoff = 0);
|
|
|
|
// todo add real implementation
|
|
template <typename CharT1>
|
|
struct CachedPartialRatio {
|
|
template <typename>
|
|
friend struct CachedWRatio;
|
|
|
|
template <typename InputIt1>
|
|
CachedPartialRatio(InputIt1 first1, InputIt1 last1);
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialRatio(const Sentence1& s1_)
|
|
: CachedPartialRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
rapidfuzz::detail::CharSet<CharT1> s1_char_set;
|
|
CachedRatio<CharT1> cached_ratio;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Sorts the words in the strings and calculates the fuzz::ratio between
|
|
* them
|
|
*
|
|
* @details
|
|
* @code{.cpp}
|
|
* // score is 100
|
|
* double score = token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a
|
|
* bear")
|
|
* @endcode
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of token_sort_ratio() that receives each input as an
 * iterator pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                        double score_cutoff = 0);
|
|
|
|
#ifdef RAPIDFUZZ_SIMD
|
|
namespace experimental {
|
|
template <int MaxLen>
|
|
struct MultiTokenSortRatio {
|
|
public:
|
|
MultiTokenSortRatio(size_t count) : scorer(count)
|
|
{}
|
|
|
|
size_t result_count() const
|
|
{
|
|
return scorer.result_count();
|
|
}
|
|
|
|
template <typename Sentence1>
|
|
void insert(const Sentence1& s1_)
|
|
{
|
|
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
|
}
|
|
|
|
template <typename InputIt1>
|
|
void insert(InputIt1 first1, InputIt1 last1)
|
|
{
|
|
scorer.insert(detail::sorted_split(first1, last1).join());
|
|
}
|
|
|
|
template <typename InputIt2>
|
|
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
|
double score_cutoff = 0.0) const
|
|
{
|
|
scorer.similarity(scores, score_count, detail::sorted_split(first2, last2).join(), score_cutoff);
|
|
}
|
|
|
|
template <typename Sentence2>
|
|
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
|
{
|
|
similarity(scores, score_count, detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
|
}
|
|
|
|
private:
|
|
MultiRatio<MaxLen> scorer;
|
|
};
|
|
} /* namespace experimental */
|
|
#endif
|
|
|
|
// todo CachedRatio speed for equal strings vs original implementation
|
|
// TODO documentation
|
|
template <typename CharT1>
|
|
struct CachedTokenSortRatio {
|
|
template <typename InputIt1>
|
|
CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1_sorted(detail::sorted_split(first1, last1).join()), cached_ratio(s1_sorted)
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenSortRatio(const Sentence1& s1)
|
|
: CachedTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1_sorted;
|
|
CachedRatio<CharT1> cached_ratio;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSortRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Sorts the words in the strings and calculates the fuzz::partial_ratio
|
|
* between them
|
|
*
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of partial_token_sort_ratio() that receives each input as an
 * iterator pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                                double score_cutoff = 0);
|
|
|
|
// TODO documentation
|
|
template <typename CharT1>
|
|
struct CachedPartialTokenSortRatio {
|
|
template <typename InputIt1>
|
|
CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1_sorted(detail::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted)
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenSortRatio(const Sentence1& s1)
|
|
: CachedPartialTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1_sorted;
|
|
CachedPartialRatio<CharT1> cached_partial_ratio;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenSortRatio(const Sentence1& s1)
|
|
-> CachedPartialTokenSortRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedPartialTokenSortRatio(InputIt1 first1,
|
|
InputIt1 last1) -> CachedPartialTokenSortRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Compares the words in the strings based on unique and common words
|
|
* between them using fuzz::ratio
|
|
*
|
|
* @details
|
|
* @code{.cpp}
|
|
* // score1 is 83.87
|
|
* double score1 = token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a
|
|
* bear")
|
|
* // score2 is 100
|
|
* double score2 = token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
|
|
* @endcode
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of token_set_ratio() that receives each input as an iterator
 * pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                       double score_cutoff = 0);
|
|
|
|
// TODO documentation
|
|
template <typename CharT1>
|
|
struct CachedTokenSetRatio {
|
|
template <typename InputIt1>
|
|
CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenSetRatio(const Sentence1& s1_)
|
|
: CachedTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSetRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Compares the words in the strings based on unique and common words
|
|
* between them using fuzz::partial_ratio
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of partial_token_set_ratio() that receives each input as an
 * iterator pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                               double score_cutoff = 0);
|
|
|
|
// TODO documentation
|
|
template <typename CharT1>
|
|
struct CachedPartialTokenSetRatio {
|
|
template <typename InputIt1>
|
|
CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenSetRatio(const Sentence1& s1_)
|
|
: CachedPartialTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedPartialTokenSetRatio(InputIt1 first1,
|
|
InputIt1 last1) -> CachedPartialTokenSetRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Helper method that returns the maximum of fuzz::token_set_ratio and
|
|
* fuzz::token_sort_ratio (faster than manually executing the two functions)
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of token_ratio() that receives each input as an iterator
 * pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
|
|
|
|
// todo add real implementation
|
|
template <typename CharT1>
|
|
struct CachedTokenRatio {
|
|
template <typename InputIt1>
|
|
CachedTokenRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1(first1, last1),
|
|
s1_tokens(detail::sorted_split(std::begin(s1), std::end(s1))),
|
|
s1_sorted(s1_tokens.join()),
|
|
cached_ratio_s1_sorted(s1_sorted)
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenRatio(const Sentence1& s1_)
|
|
: CachedTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> s1_tokens;
|
|
std::vector<CharT1> s1_sorted;
|
|
CachedRatio<CharT1> cached_ratio_s1_sorted;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Helper method that returns the maximum of
|
|
* fuzz::partial_token_set_ratio and fuzz::partial_token_sort_ratio (faster than
|
|
* manually executing the two functions)
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of partial_token_ratio() that receives each input as an
 * iterator pair (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                           double score_cutoff = 0);
|
|
|
|
// todo add real implementation
|
|
template <typename CharT1>
|
|
struct CachedPartialTokenRatio {
|
|
template <typename InputIt1>
|
|
CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
|
|
: s1(first1, last1),
|
|
tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
|
|
s1_sorted(tokens_s1.join())
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenRatio(const Sentence1& s1_)
|
|
: CachedPartialTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
|
std::vector<CharT1> s1_sorted;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Calculates a weighted ratio based on the other ratio algorithms
|
|
*
|
|
* @details
|
|
* @todo add a detailed description
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of WRatio() that receives each input as an iterator pair
 * (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
|
|
|
|
// todo add real implementation
|
|
template <typename CharT1>
|
|
struct CachedWRatio {
|
|
template <typename InputIt1>
|
|
explicit CachedWRatio(InputIt1 first1, InputIt1 last1);
|
|
|
|
template <typename Sentence1>
|
|
CachedWRatio(const Sentence1& s1_) : CachedWRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
// todo somehow implement this using other ratios with creating PatternMatchVector
|
|
// multiple times
|
|
std::vector<CharT1> s1;
|
|
CachedPartialRatio<CharT1> cached_partial_ratio;
|
|
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
|
std::vector<CharT1> s1_sorted;
|
|
rapidfuzz::detail::BlockPatternMatchVector blockmap_s1_sorted;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedWRatio(const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**
|
|
* @brief Calculates a quick ratio between two strings using fuzz.ratio
|
|
*
|
|
* @details
|
|
* @todo add a detailed description
|
|
*
|
|
* @tparam Sentence1 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
* @tparam Sentence2 This is a string that can be converted to
|
|
* basic_string_view<char_type>
|
|
*
|
|
* @param s1 string to compare with s2 (for type info check Template parameters
|
|
* above)
|
|
* @param s2 string to compare with s1 (for type info check Template parameters
|
|
* above)
|
|
* @param score_cutoff Optional argument for a score threshold between 0% and
|
|
* 100%. Matches with a lower score than this number will not be returned.
|
|
* Defaults to 0.
|
|
*
|
|
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
|
*/
|
|
template <class Sentence1, class Sentence2>
double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);

/**
 * @brief Overload of QRatio() that receives each input as an iterator pair
 * (first1/last1 and first2/last2) instead of a sentence object.
 *
 * @param score_cutoff optional score threshold, defaults to 0
 */
template <class InputIt1, class InputIt2>
double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
|
|
|
|
#ifdef RAPIDFUZZ_SIMD
|
|
namespace experimental {
|
|
template <int MaxLen>
|
|
struct MultiQRatio {
|
|
public:
|
|
MultiQRatio(size_t count) : scorer(count)
|
|
{}
|
|
|
|
size_t result_count() const
|
|
{
|
|
return scorer.result_count();
|
|
}
|
|
|
|
template <typename Sentence1>
|
|
void insert(const Sentence1& s1_)
|
|
{
|
|
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
|
}
|
|
|
|
template <typename InputIt1>
|
|
void insert(InputIt1 first1, InputIt1 last1)
|
|
{
|
|
scorer.insert(first1, last1);
|
|
str_lens.push_back(static_cast<size_t>(std::distance(first1, last1)));
|
|
}
|
|
|
|
template <typename InputIt2>
|
|
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
|
double score_cutoff = 0.0) const
|
|
{
|
|
similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
|
|
}
|
|
|
|
template <typename Sentence2>
|
|
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
|
{
|
|
rapidfuzz::detail::Range s2_(s2);
|
|
if (s2_.empty()) {
|
|
for (size_t i = 0; i < str_lens.size(); ++i)
|
|
scores[i] = 0;
|
|
|
|
return;
|
|
}
|
|
|
|
scorer.similarity(scores, score_count, s2, score_cutoff);
|
|
|
|
for (size_t i = 0; i < str_lens.size(); ++i)
|
|
if (str_lens[i] == 0) scores[i] = 0;
|
|
}
|
|
|
|
private:
|
|
std::vector<size_t> str_lens;
|
|
MultiRatio<MaxLen> scorer;
|
|
};
|
|
} /* namespace experimental */
|
|
#endif
|
|
|
|
template <typename CharT1>
|
|
struct CachedQRatio {
|
|
template <typename InputIt1>
|
|
CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1)
|
|
{}
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedQRatio(const Sentence1& s1_) : CachedQRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
|
{}
|
|
|
|
template <typename InputIt2>
|
|
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
|
double score_hint = 0.0) const;
|
|
|
|
template <typename Sentence2>
|
|
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
|
|
|
private:
|
|
std::vector<CharT1> s1;
|
|
CachedRatio<CharT1> cached_ratio;
|
|
};
|
|
|
|
template <typename Sentence1>
|
|
explicit CachedQRatio(const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
|
|
|
|
template <typename InputIt1>
|
|
CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iter_value_t<InputIt1>>;
|
|
|
|
/**@}*/
|
|
|
|
} // namespace rapidfuzz::fuzz
|
|
|
|
#include <rapidfuzz/fuzz_impl.hpp>
|