/* SPDX-License-Identifier: MIT */ /* Copyright © 2021 Max Bachmann */ /* Copyright © 2011 Adam Cohen */ #pragma once #include #include #include #include namespace rapidfuzz::fuzz { /** * @defgroup Fuzz Fuzz * A collection of string matching algorithms from FuzzyWuzzy * @{ */ /** * @brief calculates a simple ratio between two strings * * @details * @code{.cpp} * // score is 96.55 * double score = ratio("this is a test", "this is a test!") * @endcode * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); #ifdef RAPIDFUZZ_SIMD namespace experimental { template struct MultiRatio { public: MultiRatio(size_t count) : input_count(count), scorer(count) {} size_t result_count() const { return scorer.result_count(); } template void insert(const Sentence1& s1_) { insert(detail::to_begin(s1_), detail::to_end(s1_)); } template void insert(InputIt1 first1, InputIt1 last1) { scorer.insert(first1, last1); } template void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const { similarity(scores, score_count, detail::Range(first2, last2), score_cutoff); } template void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const { scorer.normalized_similarity(scores, score_count, s2, score_cutoff / 100.0); for (size_t i = 0; i < input_count; ++i) scores[i] *= 100.0; } private: size_t input_count; rapidfuzz::experimental::MultiIndel scorer; }; } /* namespace experimental */ #endif // TODO documentation template struct CachedRatio { template CachedRatio(InputIt1 first1, InputIt1 last1) : cached_indel(first1, last1) {} template CachedRatio(const Sentence1& s1) : cached_indel(s1) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; // private: CachedIndel cached_indel; }; template CachedRatio(const Sentence1& s1) -> CachedRatio>; template CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio>; template ScoreAlignment partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); template ScoreAlignment partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); /** * @brief calculates the fuzz::ratio of the optimal string alignment * * @details * test @cite hyrro_2004 @cite wagner_fischer_1974 * @code{.cpp} * // score is 100 * double score = partial_ratio("this is a test", "this is a test!") * @endcode * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // todo add real implementation template struct CachedPartialRatio { template friend struct CachedWRatio; template CachedPartialRatio(InputIt1 first1, InputIt1 last1); template explicit CachedPartialRatio(const Sentence1& s1_) : CachedPartialRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; rapidfuzz::detail::CharSet s1_char_set; CachedRatio cached_ratio; }; template explicit CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio>; template CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio>; /** * @brief Sorts the words in the strings and calculates the fuzz::ratio between * them * * @details * @code{.cpp} * // score is 100 * double score = token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a * bear") * @endcode * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); #ifdef RAPIDFUZZ_SIMD namespace experimental { template struct MultiTokenSortRatio { public: MultiTokenSortRatio(size_t count) : scorer(count) {} size_t result_count() const { return scorer.result_count(); } template void insert(const Sentence1& s1_) { insert(detail::to_begin(s1_), detail::to_end(s1_)); } template void insert(InputIt1 first1, InputIt1 last1) { scorer.insert(detail::sorted_split(first1, last1).join()); } template void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const { scorer.similarity(scores, score_count, detail::sorted_split(first2, last2).join(), score_cutoff); } template void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const { similarity(scores, score_count, detail::to_begin(s2), detail::to_end(s2), score_cutoff); } private: MultiRatio scorer; }; } /* namespace experimental */ #endif // todo CachedRatio speed for equal strings vs original implementation // TODO documentation template struct CachedTokenSortRatio { template CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) : s1_sorted(detail::sorted_split(first1, last1).join()), cached_ratio(s1_sorted) {} template explicit CachedTokenSortRatio(const Sentence1& s1) : CachedTokenSortRatio(detail::to_begin(s1), detail::to_end(s1)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1_sorted; CachedRatio cached_ratio; }; template explicit CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio>; template CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSortRatio>; /** * @brief Sorts the words in the strings and calculates the fuzz::partial_ratio * between them * * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // TODO documentation template struct CachedPartialTokenSortRatio { template CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1) : s1_sorted(detail::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted) {} template explicit CachedPartialTokenSortRatio(const Sentence1& s1) : CachedPartialTokenSortRatio(detail::to_begin(s1), detail::to_end(s1)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1_sorted; CachedPartialRatio cached_partial_ratio; }; template explicit CachedPartialTokenSortRatio(const Sentence1& s1) -> CachedPartialTokenSortRatio>; template CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenSortRatio>; /** * @brief Compares the words in the strings based on unique and common words * between them using fuzz::ratio * * @details * @code{.cpp} * // score1 is 83.87 * double score1 = token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a * bear") * // score2 is 100 * double score2 = token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") * @endcode * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // TODO documentation template struct CachedTokenSetRatio { template CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))) {} template explicit CachedTokenSetRatio(const Sentence1& s1_) : CachedTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; detail::SplittedSentenceView::iterator> tokens_s1; }; template explicit CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio>; template CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSetRatio>; /** * @brief Compares the words in the strings based on unique and common words * between them using fuzz::partial_ratio * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // TODO documentation template struct CachedPartialTokenSetRatio { template CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))) {} template explicit CachedPartialTokenSetRatio(const Sentence1& s1_) : CachedPartialTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; detail::SplittedSentenceView::iterator> tokens_s1; }; template explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio>; template CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenSetRatio>; /** * @brief Helper method that returns the maximum of fuzz::token_set_ratio and * fuzz::token_sort_ratio (faster than manually executing the two functions) * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // todo add real implementation template struct CachedTokenRatio { template CachedTokenRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), s1_tokens(detail::sorted_split(std::begin(s1), std::end(s1))), s1_sorted(s1_tokens.join()), cached_ratio_s1_sorted(s1_sorted) {} template explicit CachedTokenRatio(const Sentence1& s1_) : CachedTokenRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; detail::SplittedSentenceView::iterator> s1_tokens; std::vector s1_sorted; CachedRatio cached_ratio_s1_sorted; }; template explicit CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio>; template CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio>; /** * @brief Helper method that returns the maximum of * fuzz::partial_token_set_ratio and fuzz::partial_token_sort_ratio (faster than * manually executing the two functions) * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // todo add real implementation template struct CachedPartialTokenRatio { template CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))), s1_sorted(tokens_s1.join()) {} template explicit CachedPartialTokenRatio(const Sentence1& s1_) : CachedPartialTokenRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; detail::SplittedSentenceView::iterator> tokens_s1; std::vector s1_sorted; }; template explicit CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio>; template CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenRatio>; /** * @brief Calculates a weighted ratio based on the other ratio algorithms * * @details * @todo add a detailed description * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); // todo add real implementation template struct CachedWRatio { template explicit CachedWRatio(InputIt1 first1, InputIt1 last1); template CachedWRatio(const Sentence1& s1_) : CachedWRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: // todo somehow implement this using other ratios with creating PatternMatchVector // multiple times std::vector s1; CachedPartialRatio cached_partial_ratio; detail::SplittedSentenceView::iterator> tokens_s1; std::vector s1_sorted; rapidfuzz::detail::BlockPatternMatchVector blockmap_s1_sorted; }; template explicit CachedWRatio(const Sentence1& s1) -> CachedWRatio>; template CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio>; /** * @brief Calculates a quick ratio between two strings using fuzz.ratio * * @details * @todo add a detailed description * * @tparam Sentence1 This is a string that can be converted to * basic_string_view * @tparam Sentence2 This is a string that can be converted to * basic_string_view * * @param s1 string to compare with s2 (for type info check Template parameters * above) * @param s2 string to compare with s1 (for type info check Template parameters * above) * @param score_cutoff Optional argument for a score threshold between 0% and * 100%. Matches with a lower score than this number will not be returned. * Defaults to 0. * * @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff */ template double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0); template double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0); #ifdef RAPIDFUZZ_SIMD namespace experimental { template struct MultiQRatio { public: MultiQRatio(size_t count) : scorer(count) {} size_t result_count() const { return scorer.result_count(); } template void insert(const Sentence1& s1_) { insert(detail::to_begin(s1_), detail::to_end(s1_)); } template void insert(InputIt1 first1, InputIt1 last1) { scorer.insert(first1, last1); str_lens.push_back(static_cast(std::distance(first1, last1))); } template void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const { similarity(scores, score_count, detail::Range(first2, last2), score_cutoff); } template void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const { rapidfuzz::detail::Range s2_(s2); if (s2_.empty()) { for (size_t i = 0; i < str_lens.size(); ++i) scores[i] = 0; return; } scorer.similarity(scores, score_count, s2, score_cutoff); for (size_t i = 0; i < str_lens.size(); ++i) if (str_lens[i] == 0) scores[i] = 0; } private: std::vector str_lens; MultiRatio scorer; }; } /* namespace experimental */ #endif template struct CachedQRatio { template CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1) {} template explicit CachedQRatio(const Sentence1& s1_) : CachedQRatio(detail::to_begin(s1_), detail::to_end(s1_)) {} template double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0, double score_hint = 0.0) const; template double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const; private: std::vector s1; CachedRatio cached_ratio; }; template explicit CachedQRatio(const Sentence1& s1) -> CachedQRatio>; template CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio>; /**@}*/ } // namespace rapidfuzz::fuzz #include