Files
noggit-red/src/external/rapidfuzz-cpp/bench/bench-levenshtein.cpp
2025-01-02 17:42:59 +00:00

224 lines
7.2 KiB
C++

#include <benchmark/benchmark.h>
#include <random>
#include <rapidfuzz/distance/Levenshtein.hpp>
#include <string>
#include <vector>
/**
 * @brief Build a random alphanumeric string.
 *
 * Each character is drawn uniformly from the 62 characters 0-9, A-Z, a-z.
 * Despite the parameter name, the result always has exactly `max_length`
 * characters; a zero or negative `max_length` yields an empty string.
 *
 * @param max_length number of characters to generate
 * @return the generated string
 */
std::string generate(int max_length)
{
    static constexpr char possible_characters[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    // sizeof counts the terminating NUL, so the last valid index is sizeof - 2.
    static constexpr int last_index = static_cast<int>(sizeof(possible_characters)) - 2;

    // Seed a single engine per thread instead of constructing a
    // std::random_device and a fresh Mersenne Twister on every call: the
    // benchmarks call this function thousands of times during setup.
    thread_local std::mt19937 engine(std::random_device{}());
    std::uniform_int_distribution<> dist(0, last_index);

    std::string ret;
    if (max_length > 0) {
        // The final size is known, so allocate once up front.
        ret.reserve(static_cast<std::size_t>(max_length));
    }
    for (int i = 0; i < max_length; i++) {
        ret += possible_characters[static_cast<std::size_t>(dist(engine))];
    }
    return ret;
}
/**
 * @brief Concatenate `b` copies of `a`.
 *
 * @tparam T character type of the string
 * @param a  string to repeat
 * @param b  number of repetitions; 0 yields an empty string
 * @return `a` repeated `b` times
 */
template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
{
    std::basic_string<T> output;
    // The final length is known up front, so allocate once instead of
    // growing the buffer repeatedly while appending.
    output.reserve(a.size() * b);
    while (b--)
        output += a;
    return output;
}
/**
 * Benchmark: rapidfuzz::levenshtein_distance on one short string compared
 * against itself (both inputs are the same object).
 */
static void BM_LevWeightedDist1(benchmark::State& state)
{
    const std::string text("aaaaa aaaaa");

    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(text, text));
    }

    state.SetLabel("Similar Strings");
}
/**
 * Benchmark: rapidfuzz::levenshtein_distance on two short strings of equal
 * length that share only the space character in the middle.
 */
static void BM_LevWeightedDist2(benchmark::State& state)
{
    const std::string lhs("aaaaa aaaaa");
    const std::string rhs("bbbbb bbbbb");

    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(lhs, rhs));
    }

    state.SetLabel("Different Strings");
}
/**
 * Benchmark: rapidfuzz::levenshtein_normalized_distance on one short string
 * compared against itself (both inputs are the same object).
 */
static void BM_LevNormWeightedDist1(benchmark::State& state)
{
    const std::string text("aaaaa aaaaa");

    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(text, text));
    }

    state.SetLabel("Similar Strings");
}
/**
 * Benchmark: rapidfuzz::levenshtein_normalized_distance on two short strings
 * of equal length that share only the space character in the middle.
 */
static void BM_LevNormWeightedDist2(benchmark::State& state)
{
    const std::string lhs("aaaaa aaaaa");
    const std::string rhs("bbbbb bbbbb");

    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(lhs, rhs));
    }

    state.SetLabel("Different Strings");
}
/**
 * Benchmark: levenshtein_distance with uniform weights {1, 1, 1} on two long
 * strings that differ only in their first and last character.
 *
 * Benchmark arguments:
 *   range(0) - length of both strings
 *   range(1) - score_cutoff forwarded to levenshtein_distance
 *
 * Reports "Rate" and "InvRate" counters based on iterations * length.
 */
static void BM_LevLongSimilarSequence(benchmark::State& state)
{
    const size_t length = static_cast<size_t>(state.range(0));
    const size_t cutoff = static_cast<size_t>(state.range(1));

    // "a" + (length - 2) * "b" + "a"   versus   length * "b"
    const std::string filler = str_multiply(std::string("b"), static_cast<unsigned int>(length - 2));
    const std::string lhs = std::string("a") + filler + std::string("a");
    const std::string rhs = str_multiply(std::string("b"), static_cast<unsigned int>(length));

    size_t iterations = 0;
    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(lhs, rhs, {1, 1, 1}, cutoff));
        ++iterations;
    }

    const double processed = static_cast<double>(iterations * length);
    state.counters["Rate"] = benchmark::Counter(processed, benchmark::Counter::kIsRate);
    state.counters["InvRate"] =
        benchmark::Counter(processed, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
}
/**
 * Benchmark: levenshtein_distance with uniform weights {1, 1, 1} on two long
 * strings of equal length that have no character in common.
 *
 * Benchmark arguments:
 *   range(0) - length of both strings
 *   range(1) - score_cutoff forwarded to levenshtein_distance
 *
 * Reports "Rate" and "InvRate" counters based on iterations * length.
 */
static void BM_LevLongNonSimilarSequence(benchmark::State& state)
{
    const size_t length = static_cast<size_t>(state.range(0));
    const size_t cutoff = static_cast<size_t>(state.range(1));

    // length * "a"   versus   length * "b"
    const std::string lhs = str_multiply(std::string("a"), static_cast<unsigned int>(length));
    const std::string rhs = str_multiply(std::string("b"), static_cast<unsigned int>(length));

    size_t iterations = 0;
    for (auto _ : state) {
        benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(lhs, rhs, {1, 1, 1}, cutoff));
        ++iterations;
    }

    const double processed = static_cast<double>(iterations * length);
    state.counters["Rate"] = benchmark::Counter(processed, benchmark::Counter::kIsRate);
    state.counters["InvRate"] =
        benchmark::Counter(processed, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
}
/**
 * Benchmark: plain levenshtein_distance over every pair taken from 256 and
 * 10000 random strings, each MaxLen characters long.
 *
 * Reports "Rate" and "InvRate" counters based on the number of compared pairs.
 */
template <size_t MaxLen>
static void BM_Levenshtein(benchmark::State& state)
{
    std::vector<std::string> queries;
    std::vector<std::string> choices;
    for (int n = 0; n != 256; ++n)
        queries.push_back(generate(static_cast<int>(MaxLen)));
    for (int n = 0; n != 10000; ++n)
        choices.push_back(generate(static_cast<int>(MaxLen)));

    size_t comparisons = 0;
    for (auto _ : state) {
        // Outer loop over the large set, inner loop over the small one.
        for (const auto& choice : choices)
            for (const auto& query : queries)
                benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(query, choice));

        comparisons += queries.size() * choices.size();
    }

    const double total = static_cast<double>(comparisons);
    state.counters["Rate"] = benchmark::Counter(total, benchmark::Counter::kIsRate);
    state.counters["InvRate"] =
        benchmark::Counter(total, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
}
/**
 * Benchmark: CachedLevenshtein<char>. For each of 256 random strings a scorer
 * is constructed once and its similarity() is evaluated against 10000 random
 * strings; all strings are MaxLen characters long. Scorer construction happens
 * inside the timed loop.
 *
 * Reports "Rate" and "InvRate" counters based on the number of compared pairs.
 */
template <size_t MaxLen>
static void BM_Levenshtein_Cached(benchmark::State& state)
{
    std::vector<std::string> queries;
    std::vector<std::string> choices;
    for (int n = 0; n != 256; ++n)
        queries.push_back(generate(static_cast<int>(MaxLen)));
    for (int n = 0; n != 10000; ++n)
        choices.push_back(generate(static_cast<int>(MaxLen)));

    size_t comparisons = 0;
    for (auto _ : state) {
        for (const auto& query : queries) {
            rapidfuzz::CachedLevenshtein<char> scorer(query);
            for (const auto& choice : choices)
                benchmark::DoNotOptimize(scorer.similarity(choice));
        }

        comparisons += queries.size() * choices.size();
    }

    const double total = static_cast<double>(comparisons);
    state.counters["Rate"] = benchmark::Counter(total, benchmark::Counter::kIsRate);
    state.counters["InvRate"] =
        benchmark::Counter(total, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
}
#ifdef RAPIDFUZZ_SIMD
/**
 * Benchmark: experimental::MultiLevenshtein<MaxLen>. Each iteration builds a
 * scorer, inserts 64 random strings into it and evaluates similarity() against
 * 10000 random strings, writing into a 64-element result buffer. All strings
 * are MaxLen characters long.
 *
 * Reports "Rate" and "InvRate" counters based on the number of compared pairs.
 */
template <size_t MaxLen>
static void BM_Levenshtein_SIMD(benchmark::State& state)
{
    std::vector<std::string> queries;
    std::vector<std::string> choices;
    std::vector<size_t> scores(64);
    for (int n = 0; n != 64; ++n)
        queries.push_back(generate(static_cast<int>(MaxLen)));
    for (int n = 0; n != 10000; ++n)
        choices.push_back(generate(static_cast<int>(MaxLen)));

    size_t comparisons = 0;
    for (auto _ : state) {
        rapidfuzz::experimental::MultiLevenshtein<MaxLen> scorer(queries.size());
        for (const auto& query : queries)
            scorer.insert(query);

        // NOTE(review): the score buffer is never handed to
        // benchmark::DoNotOptimize - confirm the work cannot be elided.
        for (const auto& choice : choices)
            scorer.similarity(scores.data(), scores.size(), choice);

        comparisons += queries.size() * choices.size();
    }

    const double total = static_cast<double>(comparisons);
    state.counters["Rate"] = benchmark::Counter(total, benchmark::Counter::kIsRate);
    state.counters["InvRate"] =
        benchmark::Counter(total, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
}
#endif
// Long-sequence benchmarks. Each Args({len, score_cutoff}) pair is read back
// inside the benchmark via state.range(0) and state.range(1).
BENCHMARK(BM_LevLongSimilarSequence)
->Args({100, 30})
->Args({500, 30})
->Args({5000, 30})
->Args({10000, 30})
->Args({20000, 30})
->Args({50000, 30});
BENCHMARK(BM_LevLongNonSimilarSequence)
->Args({100, 30})
->Args({500, 30})
->Args({5000, 30})
->Args({10000, 30})
->Args({20000, 30})
->Args({50000, 30});
// Short fixed-string benchmarks (no arguments).
BENCHMARK(BM_LevWeightedDist1);
BENCHMARK(BM_LevWeightedDist2);
BENCHMARK(BM_LevNormWeightedDist1);
BENCHMARK(BM_LevNormWeightedDist2);
// Random-string benchmarks; the template argument is MaxLen, the length of
// every generated string.
BENCHMARK_TEMPLATE(BM_Levenshtein, 8);
BENCHMARK_TEMPLATE(BM_Levenshtein, 16);
BENCHMARK_TEMPLATE(BM_Levenshtein, 32);
BENCHMARK_TEMPLATE(BM_Levenshtein, 64);
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 8);
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 16);
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 32);
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 64);
// The SIMD variant is only defined, and therefore only registered, when
// RAPIDFUZZ_SIMD is defined.
#ifdef RAPIDFUZZ_SIMD
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 8);
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 16);
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 32);
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 64);
#endif
// Google Benchmark macro that supplies the program's entry point.
BENCHMARK_MAIN();