AreaTrigger Tool
This commit is contained in:
@@ -142,6 +142,7 @@ ADD_SUBDIRECTORY("${EXTERNAL_SOURCE_DIR}/NodeEditor")
option(BUILD_LIBNOISE_EXAMPLES "Build libnoise examples" OFF)
ADD_SUBDIRECTORY("${EXTERNAL_SOURCE_DIR}/libnoise")
ADD_SUBDIRECTORY("${EXTERNAL_SOURCE_DIR}/glm")
ADD_SUBDIRECTORY("${EXTERNAL_SOURCE_DIR}/rapidfuzz-cpp")

# Add the found include directories to our include list.
INCLUDE_DIRECTORIES(SYSTEM "${CMAKE_SOURCE_DIR}/include/")
@@ -360,6 +361,7 @@ TARGET_LINK_LIBRARIES (noggit
  sol2::sane
  blizzard-archive-library
  blizzard-database-library
  rapidfuzz::rapidfuzz
)

#add distribution themes
@@ -519,4 +521,4 @@ if(${FAST_BUILD_NOGGIT_JUMBO})
  set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/external/imguizmo/ImGuizmo.cpp" PROPERTIES SKIP_UNITY_BUILD_INCLUSION TRUE)
  set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/external/imguizmo/ImSequencer.cpp" PROPERTIES SKIP_UNITY_BUILD_INCLUSION TRUE)
  set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/noggit/rendering/WorldRender.cpp" PROPERTIES SKIP_UNITY_BUILD_INCLUSION TRUE)
endif()
endif()
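The hunks above add `rapidfuzz-cpp` as a vendored subdirectory and link `rapidfuzz::rapidfuzz` into the `noggit` target, presumably so the new AreaTrigger tool can fuzzy-search the descriptions shipped in `dist/noggit-definitions/AreatriggerDescriptions.csv`. A minimal sketch of that kind of lookup, assuming the descriptions have already been loaded into an id-to-text map; the map, cutoff, and function name are hypothetical, only `rapidfuzz::fuzz::CachedRatio` comes from the library:

```cpp
#include <rapidfuzz/fuzz.hpp>

#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical helper: rank AreaTrigger descriptions against a search-box query.
// Returns (id, score) pairs for every description scoring at least score_cutoff.
std::vector<std::pair<int, double>> search_areatriggers(
    std::string const& query,
    std::map<int, std::string> const& descriptions, // id -> description text
    double score_cutoff = 60.0)
{
  std::vector<std::pair<int, double>> hits;
  rapidfuzz::fuzz::CachedRatio<char> scorer(query); // reuse the query across all comparisons

  for (auto const& [id, text] : descriptions)
  {
    double score = scorer.similarity(text, score_cutoff);
    if (score >= score_cutoff)
      hits.emplace_back(id, score);
  }
  return hits;
}
```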
1220  dist/noggit-definitions/AreatriggerDescriptions.csv (vendored, new file)
File diff suppressed because it is too large.
Binary file not shown.
28  src/external/rapidfuzz-cpp/.clang-format (vendored, new file)
@@ -0,0 +1,28 @@
ColumnLimit: 110
IndentWidth: 4
AccessModifierOffset: -4

AllowShortIfStatementsOnASingleLine: true
PointerAlignment: Left
AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: None
BreakBeforeBraces: Custom
AlwaysBreakTemplateDeclarations: true
BraceWrapping:
  SplitEmptyFunction: false
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: MultiLine
  AfterEnum: false
  AfterFunction: true
  AfterNamespace: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: true
  BeforeElse: true
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
AllowAllConstructorInitializersOnNextLine: true
ConstructorInitializerAllOnOneLineOrOnePerLine: true
AllowShortCaseLabelsOnASingleLine: true
IndentPPDirectives: AfterHash
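To make the effect of these settings concrete, here is a small, purely illustrative C++ fragment laid out the way this `.clang-format` asks for it: a brace on a new line after functions but not after classes or namespaces, short `if` statements kept on one line, 4-space indentation, and left-aligned pointers.

```cpp
// Illustration only — not part of the commit.
namespace demo {
class Matcher {
public:
    explicit Matcher(int* scores) : scores_(scores) {}

    int best() const
    {
        if (!scores_) return 0;
        return scores_[0];
    }

private:
    int* scores_;
};
} // namespace demo
```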
1  src/external/rapidfuzz-cpp/.gitattributes (vendored, new file)
@@ -0,0 +1 @@
*.impl linguist-language=C++
2  src/external/rapidfuzz-cpp/.github/FUNDING.yml (vendored, new file)
@@ -0,0 +1,2 @@
github: maxbachmann
custom: ["https://www.paypal.com/donate/?hosted_button_id=VGWQBBD5CTWJU"]
195  src/external/rapidfuzz-cpp/.github/RapidFuzz.svg (vendored, new file)
@@ -0,0 +1,195 @@
[SVG markup omitted — Visio/Inkscape-exported RapidFuzz logo: an outlined blue rectangle containing the text "Rapid" next to a filled blue rectangle containing the italic white text "Fuzz". Size: 5.2 KiB]
202  src/external/rapidfuzz-cpp/.github/workflows/cmake.yml (vendored, new file)
@@ -0,0 +1,202 @@
|
||||
name: CMake
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
BUILD_TYPE: Release
|
||||
|
||||
jobs:
|
||||
build_linux_clang:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
BUILD_TYPE: [Release, Debug]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 -DRAPIDFUZZ_BUILD_FUZZERS=1 -DCMAKE_CXX_COMPILER=clang++
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Test
|
||||
working-directory: build
|
||||
run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure
|
||||
|
||||
- name: Fuzz Test
|
||||
working-directory: build
|
||||
run: |
|
||||
fuzzing/fuzz_lcs_similarity -max_total_time=30
|
||||
fuzzing/fuzz_levenshtein_distance -max_total_time=30
|
||||
fuzzing/fuzz_levenshtein_editops -max_total_time=30
|
||||
fuzzing/fuzz_indel_distance -max_total_time=30
|
||||
fuzzing/fuzz_indel_editops -max_total_time=30
|
||||
fuzzing/fuzz_osa_distance -max_total_time=30
|
||||
fuzzing/fuzz_damerau_levenshtein_distance -max_total_time=30
|
||||
|
||||
build_linux_clang_32:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
BUILD_TYPE: [Release, Debug]
|
||||
env:
|
||||
CXXFLAGS: -m32
|
||||
CFLAGS: -m32
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y libc6-dev-i386 g++-multilib
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 -DRAPIDFUZZ_BUILD_FUZZERS=1 -DCMAKE_CXX_COMPILER=clang++
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Test
|
||||
working-directory: build
|
||||
run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure
|
||||
|
||||
- name: Fuzz Test
|
||||
working-directory: build
|
||||
run: |
|
||||
fuzzing/fuzz_lcs_similarity -max_total_time=30
|
||||
fuzzing/fuzz_levenshtein_distance -max_total_time=30
|
||||
fuzzing/fuzz_levenshtein_editops -max_total_time=30
|
||||
fuzzing/fuzz_indel_distance -max_total_time=30
|
||||
fuzzing/fuzz_indel_editops -max_total_time=30
|
||||
fuzzing/fuzz_osa_distance -max_total_time=30
|
||||
fuzzing/fuzz_damerau_levenshtein_distance -max_total_time=30
|
||||
|
||||
build_linux_gcc:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
BUILD_TYPE: [Release, Debug]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1 -DCMAKE_CXX_COMPILER=g++
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Test
|
||||
working-directory: build
|
||||
run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure
|
||||
|
||||
build_windows:
|
||||
runs-on: windows-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
BUILD_TYPE: [Release, Debug]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DRAPIDFUZZ_BUILD_TESTING=1 -DRAPIDFUZZ_ENABLE_LINTERS=1
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Test
|
||||
working-directory: build
|
||||
run: ctest -C ${{matrix.BUILD_TYPE}} --rerun-failed --output-on-failure
|
||||
|
||||
build_cmake_installed:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
- name: Install RapidFuzz
|
||||
run: sudo cmake --build build --target install
|
||||
|
||||
- name: Configure example project
|
||||
working-directory: examples/cmake_installed
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
- name: Build example project
|
||||
working-directory: examples/cmake_installed
|
||||
run: cmake --build build --config ${{env.BUILD_TYPE}}
|
||||
|
||||
- name: Run example project
|
||||
working-directory: examples/cmake_installed/build
|
||||
run: ./foo
|
||||
|
||||
build_cmake_subdir:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
BUILD_TYPE: [Release, Debug]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure the library dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Build the library dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Install the library dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export
|
||||
run: sudo cmake --build build --target install
|
||||
|
||||
- name: Configure the app indirectly dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export/indirect_app
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Build the app indirectly dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export/indirect_app
|
||||
run: cmake --build build --config ${{matrix.BUILD_TYPE}}
|
||||
|
||||
- name: Run the app indirectly dependent on RapidFuzz
|
||||
working-directory: examples/cmake_export/indirect_app/build
|
||||
run: ./fooapp
|
||||
|
||||
build_cpack_installed:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Configure CMake
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
- name: Install RapidFuzz
|
||||
working-directory: build
|
||||
run: |
|
||||
cpack -G DEB
|
||||
sudo dpkg -i *.deb
|
||||
|
||||
- name: Configure example project
|
||||
working-directory: examples/cmake_installed
|
||||
run: cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
- name: Build example project
|
||||
working-directory: examples/cmake_installed
|
||||
run: cmake --build build --config ${{env.BUILD_TYPE}}
|
||||
|
||||
- name: Run example project
|
||||
working-directory: examples/cmake_installed/build
|
||||
run: ./foo
|
||||
18  src/external/rapidfuzz-cpp/.github/workflows/documentation.yml (vendored, new file)
@@ -0,0 +1,18 @@
name: documentation

on:
  push:
    branches:
      - main

jobs:
  build_docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - run: sudo apt-get install -y doxygen
      - run: doxygen ./Doxyfile
      - uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./doxygen/html
16  src/external/rapidfuzz-cpp/.gitignore (vendored, new file)
@@ -0,0 +1,16 @@
.vscode/
.cache/
.idea/
build/
.cache/
*.data
*.so
*.o
*.out
.vs/

CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
224  src/external/rapidfuzz-cpp/CHANGELOG.md (vendored, new file)
@@ -0,0 +1,224 @@
|
||||
## Changelog
|
||||
|
||||
## [3.2.0] - 2024-12-17
|
||||
### Performance
|
||||
- improve calculation of min score inside partial_ratio so it can skip more alignments
|
||||
|
||||
## [3.1.1] - 2024-10-24
|
||||
### Fixed
|
||||
- Fixed incorrect score calculation for SIMD implementations of Levenshtein and OSA on 32 bit systems
|
||||
|
||||
## [3.1.0] - 2024-10-24
|
||||
### Changed
|
||||
- split `editops_apply`/`opcodes_apply` into `*_apply_str` and `*_apply_vec`. This avoids the instantiation of
|
||||
std::basic_string for unsupported types.
|
||||
|
||||
## [3.0.5] - 2024-07-02
|
||||
### Fixed
|
||||
- the editops implementation didn't properly account for some cells in the Levenshtein matrix.
|
||||
This could lead both to incorrect results and crashes.
|
||||
|
||||
## [3.0.4] - 2024-04-07
|
||||
### Fixed
|
||||
- fix tagged version
|
||||
|
||||
## [3.0.3] - 2024-04-06
|
||||
### Fixed
|
||||
- fix potentially incorrect results of JaroWinkler when using high prefix weights
|
||||
|
||||
## [3.0.2] - 2024-03-04
|
||||
### Fixed
|
||||
- fix assert leading to compilation failures
|
||||
|
||||
## [3.0.1] - 2024-03-03
|
||||
### Fixed
|
||||
- fix doxygen warnings
|
||||
|
||||
## [3.0.0] - 2023-12-26
|
||||
### Performance
|
||||
- add banded implementation of LCS / Indel. This improves the runtime from `O((|s1|/64) * |s2|)` to `O((score_cutoff/64) * |s2|)`
|
||||
|
||||
### Changed
|
||||
- changed many types in the interface from int64_t to size_t, since they can't be negative.
|
||||
|
||||
### Fixed
|
||||
- fix incorrect transposition calculation in simd implementation of Jaro similarity
|
||||
- use posix_memalign on android
|
||||
|
||||
## [2.2.3] - 2023-11-02
|
||||
### Fixed
|
||||
- use _mm_malloc/_mm_free on macOS if aligned_alloc is unsupported
|
||||
|
||||
## [2.2.2] - 2023-10-31
|
||||
### Fixed
|
||||
- fix compilation failure on macOS
|
||||
|
||||
## [2.2.1] - 2023-10-31
|
||||
### Fixed
|
||||
- fix wraparound issue in simd implementation of Jaro and Jaro Winkler
|
||||
|
||||
## [2.2.0] - 2023-10-30
|
||||
#### Performance
|
||||
- improve performance of simd implementation for LCS and Indel by up to 50%
|
||||
- improve performance of simd implementation for Jaro and Jaro Winkler
|
||||
- improve performance of Jaro and Jaro Winkler for long sequences
|
||||
|
||||
## [2.1.1] - 2023-10-08
|
||||
### Fixed
|
||||
- fix edge case in new simd implementation of Jaro and Jaro Winkler
|
||||
|
||||
## [2.1.0] - 2023-10-08
|
||||
### Changed
|
||||
- add support for bidirectional iterators
|
||||
- add experimental simd implementation for Jaro and Jaro Winkler
|
||||
|
||||
### [2.0.0] - 2023-06-02
|
||||
#### Changed
|
||||
- added argument ``pad`` to Hamming distance. This controls whether sequences of different
|
||||
length should be padded or lead to a `std::invalid_argument` exception.
|
||||
- improve behaviour when including the project as cmake sub project
|
||||
|
||||
### [1.11.3] - 2023-04-18
|
||||
#### Fixed
|
||||
- add missing include leading to build failures on gcc 13
|
||||
|
||||
### [1.11.2] - 2023-04-17
|
||||
#### Fixed
|
||||
- fix handling of `score_cutoff > 1.0` in `Jaro` and `JaroWinkler`
|
||||
|
||||
### [1.11.1] - 2023-04-16
|
||||
#### Fixed
|
||||
- fix division by zero in simd implementation of normalized string metrics, when comparing empty strings
|
||||
|
||||
### [1.11.0] - 2023-04-16
|
||||
#### Changed
|
||||
- allow the usage of hamming for different string lengths. Length differences are handled as
|
||||
insertions / deletions
|
||||
|
||||
#### Fixed
|
||||
- fix some floating point comparisons in the test suite
|
||||
|
||||
### [1.10.4] - 2022-12-14
|
||||
#### Changed
|
||||
- Linters are now disabled in test builds by default and can be enabled using `RAPIDFUZZ_ENABLE_LINTERS`
|
||||
|
||||
### [1.10.3] - 2022-12-13
|
||||
#### Fixed
|
||||
- fix warning about `project_options` when building the test suite with `cmake>=3.24`
|
||||
|
||||
### [1.10.2] - 2022-12-01
|
||||
#### Fixed
|
||||
- `fuzz::partial_ratio` was not always symmetric when `len(s1) == len(s2)`
|
||||
- fix undefined behavior in experimental SIMD implementation
|
||||
|
||||
### [1.10.1] - 2022-11-02
|
||||
#### Fixed
|
||||
- fix broken sse2 support
|
||||
|
||||
### [1.10.0] - 2022-10-29
|
||||
#### Fixed
|
||||
- fix bug in `Levenshtein.editops` leading to crashes when used with `score_hint`
|
||||
|
||||
#### Changed
|
||||
- add `score_hint` argument to cached implementations
|
||||
- add `score_hint` argument to Levenshtein functions
|
||||
|
||||
### [1.9.0] - 2022-10-22
|
||||
#### Added
|
||||
- added `Prefix`/`Postfix` similarity
|
||||
|
||||
### [1.8.0] - 2022-10-02
|
||||
#### Fixed
|
||||
- fixed incorrect score_cutoff handling in `lcs_seq_distance`
|
||||
|
||||
#### Added
|
||||
- added experimental simd support for `ratio`/`Levenshtein`/`LCSseq`/`Indel`
|
||||
- add Jaro and JaroWinkler
|
||||
|
||||
### [1.7.0] - 2022-09-18
|
||||
#### Added
|
||||
- add editops to hamming distance
|
||||
|
||||
#### Performance
|
||||
- strip common affix in osa distance
|
||||
|
||||
### [1.6.0] - 2022-09-16
|
||||
#### Added
|
||||
- add optimal string alignment (OSA) alignment
|
||||
|
||||
### [1.5.0] - 2022-09-11
|
||||
#### Fixed
|
||||
- `fuzz::partial_ratio` did not find the optimal alignment in some edge cases
|
||||
|
||||
#### Performance
|
||||
- improve performance of `fuzz::partial_ratio`
|
||||
|
||||
### [1.4.1] - 2022-09-11
|
||||
#### Fixed
|
||||
- fix type mismatch error
|
||||
|
||||
### [1.4.0] - 2022-09-10
|
||||
#### Performance
|
||||
- improve performance of Levenshtein distance/editops calculation for long
|
||||
sequences when providing a `score_cutoff`/`score_hint`
|
||||
|
||||
### [1.3.0] - 2022-09-03
|
||||
#### Performance
|
||||
- improve performance of Levenshtein distance
|
||||
- improve performance when `score_cutoff = 1`
|
||||
- improve performance for long sequences when `3 < score_cutoff < 32`
|
||||
- improve performance of Levenshtein editops
|
||||
|
||||
#### Fixed
|
||||
- fix incorrect results of partial_ratio for long needles
|
||||
|
||||
### [1.2.0] - 2022-08-20
|
||||
#### Added
|
||||
- added damerau levenshtein implementation
|
||||
- Not API stable yet, since it will be extended with weights in a future version
|
||||
|
||||
### [1.1.1] - 2022-07-29
|
||||
#### Performance
|
||||
- improve performance for banded Levenshtein implementation
|
||||
|
||||
### [1.1.0] - 2022-07-29
|
||||
#### Fixed
|
||||
- fix banded Levenshtein implementation
|
||||
|
||||
#### Changed
|
||||
- implement Hirschberg's algorithm to reduce memory usage of
  levenshtein_editops
|
||||
|
||||
### [1.0.5] - 2022-07-23
|
||||
#### Fixed
|
||||
- fix opcode conversion for empty source sequence
|
||||
|
||||
### [1.0.4] - 2022-06-29
|
||||
#### Fixed
|
||||
- fix implementation of hamming_normalized_similarity
|
||||
- fix implementation of CachedLCSseq::distance
|
||||
|
||||
### [1.0.3] - 2022-06-24
|
||||
#### Fixed
|
||||
- fix integer wraparound in partial_ratio/partial_ratio_alignment
|
||||
|
||||
### [1.0.2] - 2022-06-11
|
||||
#### Fixed
|
||||
- fix unlimited recursion in CachedLCSseq::similarity
|
||||
- reduce compiler warnings
|
||||
|
||||
### [1.0.1] - 2022-04-16
|
||||
#### Fixed
|
||||
- fix undefined behavior in sorted_split incrementing iterator past the end
|
||||
- fix use after free in editops calculation
|
||||
- reduce compiler warnings
|
||||
|
||||
### [1.0.1] - 2022-04-16
|
||||
#### Added
|
||||
- added LCSseq (longest common subsequence) implementation
|
||||
|
||||
#### Fixed
|
||||
- reduced compiler warnings
|
||||
- consider float imprecision in score_cutoff
|
||||
- fix incorrect score_cutoff handling in token_set_ratio and token_ratio
|
||||
- fix template deduction guides on MSVC
|
||||
143  src/external/rapidfuzz-cpp/CMakeLists.txt (vendored, new file)
@@ -0,0 +1,143 @@
|
||||
# Cmake config largely taken from catch2
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
|
||||
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif()
|
||||
|
||||
# detect if RapidFuzz is being bundled,
# disable the test suite in that case
|
||||
if(NOT DEFINED PROJECT_NAME)
|
||||
set(NOT_SUBPROJECT ON)
|
||||
# If RapidFuzz is not being used as a subproject via `add_subdirectory`,
|
||||
# usually installation is required
|
||||
option(RAPIDFUZZ_INSTALL "Install rapidfuzz" ON)
|
||||
else()
|
||||
set(NOT_SUBPROJECT OFF)
|
||||
# If RapidFuzz is being used as a subproject via `add_subdirectory`,
|
||||
# chances are that the "main project" does not include RapidFuzz headers
|
||||
# in any of its headers, in which case installation is not needed.
|
||||
option(RAPIDFUZZ_INSTALL "Install rapidfuzz (Projects embedding rapidfuzz may want to turn this OFF.)" OFF)
|
||||
endif()
|
||||
|
||||
option(RAPIDFUZZ_BUILD_TESTING "Build tests" OFF)
|
||||
option(RAPIDFUZZ_ENABLE_LINTERS "Enable Linters for the test builds" OFF)
|
||||
option(RAPIDFUZZ_BUILD_BENCHMARKS "Build benchmarks" OFF)
|
||||
option(RAPIDFUZZ_BUILD_FUZZERS "Build fuzzers" OFF)
|
||||
|
||||
# RapidFuzz's build breaks if done in-tree. You probably should not build
|
||||
# things in tree anyway, but we can allow projects that include RapidFuzz
|
||||
# as a subproject to build in-tree as long as it is not in our tree.
|
||||
if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
|
||||
endif()
|
||||
|
||||
project(rapidfuzz LANGUAGES CXX VERSION 3.2.0)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
# Basic paths
|
||||
set(BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
set(SOURCES_DIR ${BASE_DIR}/rapidfuzz)
|
||||
set(TEST_DIR ${BASE_DIR}/test)
|
||||
set(BENCHMARK_DIR ${BASE_DIR}/tests/bench)
|
||||
set(EXAMPLES_DIR ${BASE_DIR}/examples)
|
||||
|
||||
add_library(rapidfuzz INTERFACE)
|
||||
|
||||
# provide a namespaced alias for clients to 'link' against if RapidFuzz is included as a sub-project
|
||||
add_library(rapidfuzz::rapidfuzz ALIAS rapidfuzz)
|
||||
|
||||
target_compile_features(rapidfuzz INTERFACE cxx_std_17)
|
||||
|
||||
target_include_directories(rapidfuzz
|
||||
INTERFACE
|
||||
$<BUILD_INTERFACE:${SOURCES_DIR}/..>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
|
||||
)
|
||||
|
||||
# Build tests only if requested
|
||||
if(RAPIDFUZZ_BUILD_TESTING AND NOT_SUBPROJECT)
|
||||
include(CTest)
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
|
||||
# Build examples only if requested
|
||||
if(RAPIDFUZZ_BUILD_EXAMPLES)
|
||||
#add_subdirectory(examples)
|
||||
endif()
|
||||
|
||||
# Build benchmarks only if requested
|
||||
if(RAPIDFUZZ_BUILD_BENCHMARKS)
|
||||
add_subdirectory(bench)
|
||||
endif()
|
||||
|
||||
# Build fuzz tests only if requested
|
||||
if(RAPIDFUZZ_BUILD_FUZZERS)
|
||||
add_subdirectory(fuzzing)
|
||||
endif()
|
||||
|
||||
if (RAPIDFUZZ_INSTALL)
|
||||
set(RAPIDFUZZ_CMAKE_CONFIG_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/rapidfuzz")
|
||||
|
||||
install(
|
||||
TARGETS
|
||||
rapidfuzz
|
||||
EXPORT
|
||||
rapidfuzzTargets
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_LIBDIR}
|
||||
)
|
||||
|
||||
install(
|
||||
EXPORT
|
||||
rapidfuzzTargets
|
||||
NAMESPACE
|
||||
rapidfuzz::
|
||||
DESTINATION
|
||||
${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION}
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY
|
||||
rapidfuzz
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_INCLUDEDIR}
|
||||
FILES_MATCHING
|
||||
PATTERN "*.hpp"
|
||||
PATTERN "*.impl"
|
||||
)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_LIST_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
|
||||
INSTALL_DESTINATION ${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION}
|
||||
)
|
||||
|
||||
write_basic_package_version_file(
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
|
||||
COMPATIBILITY SameMajorVersion
|
||||
)
|
||||
|
||||
install(
|
||||
FILES
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
|
||||
DESTINATION
|
||||
${RAPIDFUZZ_CMAKE_CONFIG_DESTINATION}
|
||||
)
|
||||
|
||||
# CPack/CMake started taking the package version from project version 3.12
|
||||
# So we need to set the version manually for older CMake versions
|
||||
if(${CMAKE_VERSION} VERSION_LESS "3.12.0")
|
||||
set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
|
||||
endif()
|
||||
|
||||
set(CPACK_PACKAGE_VENDOR "Max Bachmann")
|
||||
set(CPACK_PACKAGE_CONTACT "https://github.com/rapidfuzz/rapidfuzz-cpp")
|
||||
include(CPack)
|
||||
|
||||
endif(RAPIDFUZZ_INSTALL)
|
||||
105  src/external/rapidfuzz-cpp/Doxyfile (vendored, new file)
@@ -0,0 +1,105 @@
|
||||
# Doxyfile 1.8.20
|
||||
|
||||
PROJECT_NAME = RapidFuzz
|
||||
|
||||
OUTPUT_DIRECTORY = doxygen
|
||||
|
||||
|
||||
|
||||
# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
|
||||
# to include (a tag file for) the STL sources as input, then you should set this
|
||||
# tag to YES in order to let doxygen match functions declarations and
|
||||
# definitions whose arguments contain STL classes (e.g. func(std::string);
|
||||
# versus func(std::string) {}). This also make the inheritance and collaboration
|
||||
# diagrams that involve STL classes more complete and accurate.
|
||||
# The default value is: NO.
|
||||
|
||||
BUILTIN_STL_SUPPORT = YES
|
||||
|
||||
|
||||
EXTRACT_PRIVATE = YES
|
||||
|
||||
|
||||
|
||||
EXTRACT_STATIC = YES
|
||||
|
||||
|
||||
HIDE_UNDOC_MEMBERS = YES
|
||||
|
||||
HIDE_UNDOC_CLASSES = YES
|
||||
|
||||
|
||||
# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
|
||||
# their full class and namespace scopes in the documentation. If set to YES, the
|
||||
# scope will be hidden.
|
||||
# The default value is: NO.
|
||||
|
||||
HIDE_SCOPE_NAMES = NO
|
||||
|
||||
# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
|
||||
# append additional text to a page's title, such as Class Reference. If set to
|
||||
# YES the compound reference will be hidden.
|
||||
# The default value is: NO.
|
||||
|
||||
HIDE_COMPOUND_REFERENCE= NO
|
||||
|
||||
# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
|
||||
# the files that are included by a file in the documentation of that file.
|
||||
# The default value is: YES.
|
||||
|
||||
SHOW_INCLUDE_FILES = YES
|
||||
|
||||
# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
|
||||
# grouped member an include statement to the documentation, telling the reader
|
||||
# which file to include in order to use the member.
|
||||
# The default value is: NO.
|
||||
|
||||
SHOW_GROUPED_MEMB_INC = YES
|
||||
|
||||
|
||||
# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
|
||||
# list. This list is created by putting \todo commands in the documentation.
|
||||
# The default value is: YES.
|
||||
|
||||
GENERATE_TODOLIST = NO
|
||||
|
||||
SHOW_FILES = NO
|
||||
|
||||
|
||||
# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
|
||||
# the reference definitions. This must be a list of .bib files. The .bib
|
||||
# extension is automatically appended if omitted. This requires the bibtex tool
|
||||
# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
|
||||
# For LaTeX the style of the bibliography can be controlled using
|
||||
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
|
||||
# search path. See also \cite for info how to create references.
|
||||
|
||||
CITE_BIB_FILES = docs/literature/hyrro_lcs_2004 \
|
||||
docs/literature/hyrro_2002 \
|
||||
docs/literature/hyrro_2004 \
|
||||
docs/literature/myers_1999 \
|
||||
docs/literature/wagner_fischer_1974
|
||||
|
||||
|
||||
EXTRA_PACKAGES = amsmath xr amsfonts
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the input files
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
INPUT = rapidfuzz
|
||||
|
||||
FILE_PATTERNS = *.c \
|
||||
*.cxx \
|
||||
*.cpp \
|
||||
*.h \
|
||||
*.hpp \
|
||||
*.md
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the LaTeX output
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
GENERATE_LATEX = NO
|
||||
|
||||
HAVE_DOT = YES
|
||||
21  src/external/rapidfuzz-cpp/LICENSE (vendored, new file)
@@ -0,0 +1,21 @@
Copyright © 2020 Max Bachmann
Copyright © 2011 Adam Cohen

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
242  src/external/rapidfuzz-cpp/README.md (vendored, new file)
@@ -0,0 +1,242 @@
|
||||
<h1 align="center">
|
||||
<img src="https://raw.githubusercontent.com/rapidfuzz/rapidfuzz/master/docs/img/RapidFuzz.svg?sanitize=true" alt="RapidFuzz" width="400">
|
||||
</h1>
|
||||
<h4 align="center">Rapid fuzzy string matching in C++ using the Levenshtein Distance</h4>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/rapidfuzz/rapidfuzz-cpp/actions">
|
||||
<img src="https://github.com/rapidfuzz/rapidfuzz-cpp/workflows/CMake/badge.svg"
|
||||
alt="Continuous Integration">
|
||||
</a>
|
||||
<a href="https://rapidfuzz.github.io/rapidfuzz-cpp">
|
||||
<img src="https://img.shields.io/badge/-documentation-blue"
|
||||
alt="Documentation">
|
||||
</a>
|
||||
<a href="https://github.com/rapidfuzz/rapidfuzz-cpp/blob/dev/LICENSE">
|
||||
<img src="https://img.shields.io/github/license/rapidfuzz/rapidfuzz-cpp"
|
||||
alt="GitHub license">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="#description">Description</a> •
|
||||
<a href="#installation">Installation</a> •
|
||||
<a href="#usage">Usage</a> •
|
||||
<a href="#license">License</a>
|
||||
</p>
|
||||
|
||||
---
|
||||
## Description
|
||||
RapidFuzz is a fast string matching library for Python and C++, which uses the string similarity calculations from [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy). However, there are two aspects that set RapidFuzz apart from FuzzyWuzzy:
1) It is MIT licensed, so it can be used under whichever license you choose for your project, whereas FuzzyWuzzy forces you to adopt the GPL license.
2) It is mostly written in C++ and, on top of this, comes with many algorithmic improvements that make string matching even faster, while still providing the same results. More details on these performance improvements in the form of benchmarks can be found [here](https://github.com/rapidfuzz/rapidfuzz/blob/master/Benchmarks.md)
|
||||
|
||||
The Library is split across multiple repositories for the different supported programming languages:
|
||||
- The C++ version is versioned in this repository
|
||||
- The Python version can be found at [rapidfuzz/rapidfuzz](https://github.com/rapidfuzz/rapidfuzz)
|
||||
|
||||
|
||||
## CMake Integration
|
||||
|
||||
There are several ways to integrate `rapidfuzz` into your CMake project.
|
||||
|
||||
### By Installing it
|
||||
```bash
|
||||
git clone https://github.com/rapidfuzz/rapidfuzz-cpp.git rapidfuzz-cpp
|
||||
cd rapidfuzz-cpp
|
||||
mkdir build && cd build
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build .
|
||||
cmake --build . --target install
|
||||
```
|
||||
|
||||
Then in your CMakeLists.txt:
|
||||
```cmake
|
||||
find_package(rapidfuzz REQUIRED)
|
||||
add_executable(foo main.cpp)
|
||||
target_link_libraries(foo rapidfuzz::rapidfuzz)
|
||||
```
|
||||
|
||||
### Add this repository as a submodule
|
||||
```bash
|
||||
git submodule add https://github.com/rapidfuzz/rapidfuzz-cpp.git 3rdparty/RapidFuzz
|
||||
```
|
||||
Then you can either:
|
||||
|
||||
1. include it as a subdirectory
|
||||
```cmake
|
||||
add_subdirectory(3rdparty/RapidFuzz)
|
||||
add_executable(foo main.cpp)
|
||||
target_link_libraries(foo rapidfuzz::rapidfuzz)
|
||||
```
|
||||
2. build it at configure time with `FetchContent`
|
||||
```cmake
|
||||
FetchContent_Declare(
|
||||
rapidfuzz
|
||||
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/RapidFuzz
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}/rapidfuzz
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR> "${CMAKE_OPT_ARGS}"
|
||||
)
|
||||
FetchContent_MakeAvailable(rapidfuzz)
|
||||
add_executable(foo main.cpp)
|
||||
target_link_libraries(foo PRIVATE rapidfuzz::rapidfuzz)
|
||||
```
|
||||
### Download it at configure time
|
||||
|
||||
If you don't want to add `rapidfuzz-cpp` as a submodule, you can also download it with `FetchContent`:
|
||||
```cmake
|
||||
FetchContent_Declare(rapidfuzz
|
||||
GIT_REPOSITORY https://github.com/rapidfuzz/rapidfuzz-cpp.git
|
||||
GIT_TAG main)
|
||||
FetchContent_MakeAvailable(rapidfuzz)
|
||||
add_executable(foo main.cpp)
|
||||
target_link_libraries(foo PRIVATE rapidfuzz::rapidfuzz)
|
||||
```
|
||||
It will be downloaded each time you run CMake in a blank folder.
|
||||
|
||||
## CMake options

The following CMake options are available:
|
||||
|
||||
1. `RAPIDFUZZ_BUILD_TESTING` : to build test (default OFF and requires [Catch2](https://github.com/catchorg/Catch2))
|
||||
2. `RAPIDFUZZ_BUILD_BENCHMARKS` : to build benchmarks (default OFF and requires [Google Benchmark](https://github.com/google/benchmark))
|
||||
3. `RAPIDFUZZ_INSTALL` : to install the library on the local machine
   - When configured independently, installation is on by default.
   - When used as a subproject, installation is turned off by default.
   - For library developers, you might want to toggle the behavior depending on your project.
   - If your project is exported via `CMake`, turn installation on, or an export error will result.
   - If your project publicly depends on `RapidFuzz` (includes `rapidfuzz.hpp` in a header),
     turn installation on, or apps depending on your project will face include errors.
|
||||
|
||||
## Usage
|
||||
```cpp
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
```
|
||||
|
||||
### Simple Ratio
|
||||
```cpp
|
||||
using rapidfuzz::fuzz::ratio;
|
||||
|
||||
// score is 96.55171966552734
|
||||
double score = rapidfuzz::fuzz::ratio("this is a test", "this is a test!");
|
||||
```
|
||||
|
||||
### Partial Ratio
|
||||
```cpp
|
||||
// score is 100
|
||||
double score = rapidfuzz::fuzz::partial_ratio("this is a test", "this is a test!");
|
||||
```
|
||||
|
||||
### Token Sort Ratio
|
||||
```cpp
|
||||
// score is 90.90908813476562
double score1 = rapidfuzz::fuzz::ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear");

// score is 100
double score2 = rapidfuzz::fuzz::token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear");
|
||||
```
|
||||
|
||||
### Token Set Ratio
|
||||
```cpp
|
||||
// score is 83.8709716796875
double score1 = rapidfuzz::fuzz::token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear");

// score is 100
double score2 = rapidfuzz::fuzz::token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear");
|
||||
```
|
||||
|
||||
### Process
|
||||
The Python implementation has a `process` module, which is used to compare e.g. a string to a list of strings. In Python this both saves you the time of implementing those features yourself and can be a lot more efficient than repeated type conversions between Python and C++. Providing an equally generic function in C++ using templates is not easily possible and would probably be slower than implementing it yourself. That's why this section describes how users can implement those features with a couple of lines of code using the C++ library.
|
||||
|
||||
### extract
|
||||
|
||||
The following function compares a query string to all strings in a list of choices. It returns all elements with a similarity of at least `score_cutoff`. In general, make use of the cached implementations when comparing one string against multiple strings.
|
||||
|
||||
|
||||
```cpp
|
||||
template <typename Sentence1,
|
||||
typename Iterable, typename Sentence2 = typename Iterable::value_type>
|
||||
std::vector<std::pair<Sentence2, double>>
|
||||
extract(const Sentence1& query, const Iterable& choices, const double score_cutoff = 0.0)
|
||||
{
|
||||
std::vector<std::pair<Sentence2, double>> results;
|
||||
|
||||
rapidfuzz::fuzz::CachedRatio<typename Sentence1::value_type> scorer(query);
|
||||
|
||||
for (const auto& choice : choices) {
|
||||
double score = scorer.similarity(choice, score_cutoff);
|
||||
|
||||
if (score >= score_cutoff) {
|
||||
results.emplace_back(choice, score);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
```
|
||||
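A short usage sketch for the `extract` helper defined above; the choice list and the cutoff of 60 are made up for illustration:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Assumes the extract() template from the previous block is visible here.
int main()
{
    std::vector<std::string> choices = {"new york mets", "new york yankees", "chicago cubs"};

    // Keep only candidates scoring at least 60 out of 100.
    for (const auto& [choice, score] : extract(std::string("new york"), choices, 60.0))
        std::cout << choice << " -> " << score << '\n';
}
```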
|
||||
### extractOne
|
||||
|
||||
The following function compares a query string to all strings in a list of choices.
|
||||
|
||||
```cpp
|
||||
template <typename Sentence1,
|
||||
typename Iterable, typename Sentence2 = typename Iterable::value_type>
|
||||
std::optional<std::pair<Sentence2, double>>
|
||||
extractOne(const Sentence1& query, const Iterable& choices, const double score_cutoff = 0.0)
|
||||
{
|
||||
bool match_found = false;
|
||||
double best_score = score_cutoff;
|
||||
Sentence2 best_match;
|
||||
|
||||
rapidfuzz::fuzz::CachedRatio<typename Sentence1::value_type> scorer(query);
|
||||
|
||||
for (const auto& choice : choices) {
|
||||
double score = scorer.similarity(choice, best_score);
|
||||
|
||||
if (score >= best_score) {
|
||||
match_found = true;
|
||||
best_score = score;
|
||||
best_match = choice;
|
||||
}
|
||||
}
|
||||
|
||||
if (!match_found) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
return std::make_pair(best_match, best_score);
|
||||
}
|
||||
```
|
||||
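And a hypothetical call site for `extractOne`, showing how the returned `std::optional` can be handled:

```cpp
// choices is a std::vector<std::string> as in the extract() example above.
if (auto best = extractOne(std::string("new york"), choices, 60.0)) {
    std::cout << "best match: " << best->first << " (score " << best->second << ")\n";
}
else {
    std::cout << "no match above the cutoff\n";
}
```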
|
||||
### multithreading
|
||||
|
||||
It is very simple to use those scorers, e.g. with OpenMP, to achieve better performance.
|
||||
|
||||
```cpp
|
||||
template <typename Sentence1,
|
||||
typename Iterable, typename Sentence2 = typename Iterable::value_type>
|
||||
std::vector<std::pair<Sentence2, double>>
|
||||
extract(const Sentence1& query, const Iterable& choices, const double score_cutoff = 0.0)
|
||||
{
|
||||
std::vector<std::pair<Sentence2, double>> results(choices.size());
|
||||
|
||||
rapidfuzz::fuzz::CachedRatio<typename Sentence1::value_type> scorer(query);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < choices.size(); ++i) {
|
||||
double score = scorer.similarity(choices[i], score_cutoff);
|
||||
results[i] = std::make_pair(choices[i], score);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
RapidFuzz is licensed under the MIT license since I believe that everyone should be able to use it without being forced to adopt the GPL license. That's why the library is based on an older version of fuzzywuzzy that was MIT-licensed as well.
|
||||
This old version of fuzzywuzzy can be found [here](https://github.com/seatgeek/fuzzywuzzy/tree/4bf28161f7005f3aa9d4d931455ac55126918df7).
|
||||
19  src/external/rapidfuzz-cpp/SECURITY.md (vendored, new file)
@@ -0,0 +1,19 @@
## Reporting Security Issues

If you believe you have found a security vulnerability in the project, please report it to us through coordinated disclosure.

**Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests.**

Instead, please send an email to oss@maxbachmann.de.

Please include as much of the information listed below as you can to help us better understand and resolve the issue:

* The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.
25  src/external/rapidfuzz-cpp/bench/CMakeLists.txt (vendored, new file)
@@ -0,0 +1,25 @@
include(FetchContent)
FetchContent_Declare(googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG v1.12.x)

FetchContent_Declare(googlebenchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG main) # need master for benchmark::benchmark

FetchContent_MakeAvailable(
  googletest
  googlebenchmark)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")

function(rapidfuzz_add_benchmark NAME SOURCE)
  add_executable(bench_${NAME} ${SOURCE})
  target_link_libraries(bench_${NAME} PRIVATE ${PROJECT_NAME})
  target_link_libraries(bench_${NAME} PRIVATE benchmark::benchmark)
endfunction()

rapidfuzz_add_benchmark(lcs bench-lcs.cpp)
rapidfuzz_add_benchmark(fuzz bench-fuzz.cpp)
rapidfuzz_add_benchmark(levenshtein bench-levenshtein.cpp)
rapidfuzz_add_benchmark(jarowinkler bench-jarowinkler.cpp)
225  src/external/rapidfuzz-cpp/bench/bench-fuzz.cpp (vendored, new file)
@@ -0,0 +1,225 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using rapidfuzz::fuzz::partial_ratio;
|
||||
using rapidfuzz::fuzz::partial_token_ratio;
|
||||
using rapidfuzz::fuzz::partial_token_set_ratio;
|
||||
using rapidfuzz::fuzz::partial_token_sort_ratio;
|
||||
using rapidfuzz::fuzz::ratio;
|
||||
using rapidfuzz::fuzz::token_ratio;
|
||||
using rapidfuzz::fuzz::token_set_ratio;
|
||||
using rapidfuzz::fuzz::token_sort_ratio;
|
||||
using rapidfuzz::fuzz::WRatio;
|
||||
|
||||
static void BM_FuzzRatio1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzRatio2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzRatio1);
|
||||
BENCHMARK(BM_FuzzRatio2);
|
||||
|
||||
static void BM_FuzzPartialRatio1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzPartialRatio2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzPartialRatio1);
|
||||
BENCHMARK(BM_FuzzPartialRatio2);
|
||||
|
||||
static void BM_FuzzTokenSort1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_sort_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzTokenSort2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_sort_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzTokenSort1);
|
||||
BENCHMARK(BM_FuzzTokenSort2);
|
||||
|
||||
static void BM_FuzzPartialTokenSort1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_sort_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzPartialTokenSort2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_sort_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzPartialTokenSort1);
|
||||
BENCHMARK(BM_FuzzPartialTokenSort2);
|
||||
|
||||
static void BM_FuzzTokenSet1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_set_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzTokenSet2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_set_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzTokenSet1);
|
||||
BENCHMARK(BM_FuzzTokenSet2);
|
||||
|
||||
static void BM_FuzzPartialTokenSet1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_set_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzPartialTokenSet2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_set_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzPartialTokenSet1);
|
||||
BENCHMARK(BM_FuzzPartialTokenSet2);
|
||||
|
||||
static void BM_FuzzToken1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzToken2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(token_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzToken1);
|
||||
BENCHMARK(BM_FuzzToken2);
|
||||
|
||||
static void BM_FuzzPartialToken1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_ratio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzPartialToken2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(partial_token_ratio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzPartialToken1);
|
||||
BENCHMARK(BM_FuzzPartialToken2);
|
||||
|
||||
static void BM_FuzzWRatio1(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(WRatio(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzWRatio3(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa aaaaa";
|
||||
std::wstring b = L"bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(WRatio(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
static void BM_FuzzWRatio2(benchmark::State& state)
|
||||
{
|
||||
std::wstring a = L"aaaaa b";
|
||||
std::wstring b = L"bbbbb bbbbbbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(WRatio(a, b));
|
||||
}
|
||||
state.SetLabel("Different length Strings");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_FuzzWRatio1);
|
||||
BENCHMARK(BM_FuzzWRatio2);
|
||||
BENCHMARK(BM_FuzzWRatio3);
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
196  src/external/rapidfuzz-cpp/bench/bench-jarowinkler.cpp (vendored, new file)
@@ -0,0 +1,196 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <random>
|
||||
#include <rapidfuzz/distance/Jaro.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
std::string generate(int max_length)
|
||||
{
|
||||
std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
std::random_device rd;
|
||||
std::mt19937 engine(rd());
|
||||
std::uniform_int_distribution<> dist(0, static_cast<int>(possible_characters.size() - 1));
|
||||
std::string ret = "";
|
||||
for (int i = 0; i < max_length; i++) {
|
||||
int random_index = dist(engine);
|
||||
ret += possible_characters[static_cast<size_t>(random_index)];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
|
||||
{
|
||||
std::basic_string<T> output;
|
||||
while (b--)
|
||||
output += a;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
static void BM_JaroLongSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1);
|
||||
std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a");
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
static void BM_JaroLongNonSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1); (void)score_cutoff; // unused: jaro_similarity is benchmarked without a cutoff here
|
||||
std::string s1 = str_multiply(std::string("a"), len);
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(s1, s2));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
template <size_t MaxLen1, size_t MaxLen2>
|
||||
static void BM_Jaro_SIMD(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
std::vector<double> results(64);
|
||||
for (int i = 0; i < 64; i++)
|
||||
seq1.push_back(generate(MaxLen1));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen2));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
rapidfuzz::experimental::MultiJaro<MaxLen1> scorer(seq1.size());
|
||||
for (const auto& str1 : seq1)
|
||||
scorer.insert(str1);
|
||||
|
||||
for (const auto& str2 : seq2)
|
||||
scorer.similarity(&results[0], results.size(), str2);
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <size_t MaxLen1, size_t MaxLen2>
|
||||
static void BM_Jaro(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen1));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen2));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
for (size_t i = 0; i < seq1.size(); ++i)
|
||||
benchmark::DoNotOptimize(rapidfuzz::jaro_similarity(seq1[i], seq2[j]));
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
template <size_t MaxLen1, size_t MaxLen2>
|
||||
static void BM_Jaro_Cached(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen1));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen2));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (const auto& str1 : seq1) {
|
||||
rapidfuzz::CachedJaro<char> scorer(str1);
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
benchmark::DoNotOptimize(scorer.similarity(seq2[j]));
|
||||
}
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 8, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 16, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 32, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 64, 64);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 8, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 16, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 32, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 64, 64);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 8, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 16, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 32, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 64, 64);
|
||||
#endif
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 8, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 16, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 32, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro, 64, 1000);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 8, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 16, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 32, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_Cached, 64, 1000);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 8, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 16, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 32, 1000);
|
||||
BENCHMARK_TEMPLATE(BM_Jaro_SIMD, 64, 1000);
|
||||
#endif
|
||||
|
||||
BENCHMARK(BM_JaroLongSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK(BM_JaroLongNonSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
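The CachedJaro pattern benchmarked above is also the natural shape for scanning one query against many choices. A minimal sketch under the assumption that CachedJaro::similarity takes an optional score_cutoff (matching the plain jaro_similarity overloads); best_jaro is a hypothetical helper, not library API.

#include <algorithm>
#include <rapidfuzz/distance/Jaro.hpp>
#include <string>
#include <vector>

// Hypothetical helper: preprocess the query once, then reuse it for every choice.
double best_jaro(const std::string& query, const std::vector<std::string>& choices)
{
    rapidfuzz::CachedJaro<char> scorer(query);
    double best = 0.0;
    for (const auto& choice : choices)
        // passing the running best as score_cutoff lets weaker candidates bail out early
        best = std::max(best, scorer.similarity(choice, best));
    return best;
}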
181
src/external/rapidfuzz-cpp/bench/bench-lcs.cpp
vendored
Normal file
@@ -0,0 +1,181 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <random>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <rapidfuzz/distance/LCSseq.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
std::string generate(int max_length)
|
||||
{
|
||||
std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
std::random_device rd;
|
||||
std::mt19937 engine(rd());
|
||||
std::uniform_int_distribution<> dist(0, static_cast<int>(possible_characters.size() - 1));
|
||||
std::string ret = "";
|
||||
for (int i = 0; i < max_length; i++) {
|
||||
int random_index = dist(engine);
|
||||
ret += possible_characters[static_cast<size_t>(random_index)];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
|
||||
{
|
||||
std::basic_string<T> output;
|
||||
while (b--)
|
||||
output += a;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
static void BM_LcsLongSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1);
|
||||
std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a");
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::lcs_seq_distance(s1, s2, score_cutoff));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
static void BM_LcsLongNonSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1);
|
||||
std::string s1 = str_multiply(std::string("a"), len);
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::lcs_seq_distance(s1, s2, score_cutoff));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
template <size_t MaxLen>
|
||||
static void BM_LCS(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
for (size_t i = 0; i < seq1.size(); ++i)
|
||||
benchmark::DoNotOptimize(rapidfuzz::lcs_seq_distance(seq1[i], seq2[j]));
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
template <size_t MaxLen>
|
||||
static void BM_LCS_Cached(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (const auto& str1 : seq1) {
|
||||
rapidfuzz::CachedLCSseq<char> scorer(str1);
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
benchmark::DoNotOptimize(scorer.similarity(seq2[j]));
|
||||
}
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
template <size_t MaxLen>
|
||||
static void BM_LCS_SIMD(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
std::vector<size_t> results(32 * 3 * 4);
|
||||
for (int i = 0; i < 32 * 3 * 4; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
rapidfuzz::experimental::MultiLCSseq<MaxLen> scorer(seq1.size());
|
||||
for (const auto& str1 : seq1)
|
||||
scorer.insert(str1);
|
||||
|
||||
for (const auto& str2 : seq2)
|
||||
scorer.similarity(&results[0], results.size(), str2);
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
#endif
|
||||
|
||||
BENCHMARK(BM_LcsLongSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 100})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK(BM_LcsLongNonSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_LCS, 8);
|
||||
BENCHMARK_TEMPLATE(BM_LCS, 16);
|
||||
BENCHMARK_TEMPLATE(BM_LCS, 32);
|
||||
BENCHMARK_TEMPLATE(BM_LCS, 64);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_LCS_Cached, 8);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_Cached, 16);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_Cached, 32);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_Cached, 64);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
BENCHMARK_TEMPLATE(BM_LCS_SIMD, 8);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_SIMD, 16);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_SIMD, 32);
|
||||
BENCHMARK_TEMPLATE(BM_LCS_SIMD, 64);
|
||||
#endif
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
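As a sanity check on what these numbers mean: lcs_seq_similarity reports the length of the longest common subsequence, and lcs_seq_distance should equal max(len1, len2) minus that similarity. A minimal sketch, assuming that relation holds for this version of the library; the strings are arbitrary examples.

#include <algorithm>
#include <cassert>
#include <rapidfuzz/distance/LCSseq.hpp>
#include <string>

int main()
{
    std::string s1 = "aaaaa aaaaa";
    std::string s2 = "aaaaa bbbbb";
    auto sim = rapidfuzz::lcs_seq_similarity(s1, s2);  // length of the LCS
    auto dist = rapidfuzz::lcs_seq_distance(s1, s2);   // positions not covered by the LCS
    // assumed relation: distance == max(len1, len2) - similarity
    assert(dist == std::max(s1.size(), s2.size()) - sim);
    return 0;
}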
224
src/external/rapidfuzz-cpp/bench/bench-levenshtein.cpp
vendored
Normal file
@@ -0,0 +1,224 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <random>
|
||||
#include <rapidfuzz/distance/Levenshtein.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
std::string generate(int max_length)
|
||||
{
|
||||
std::string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
std::random_device rd;
|
||||
std::mt19937 engine(rd());
|
||||
std::uniform_int_distribution<> dist(0, static_cast<int>(possible_characters.size() - 1));
|
||||
std::string ret = "";
|
||||
for (int i = 0; i < max_length; i++) {
|
||||
int random_index = dist(engine);
|
||||
ret += possible_characters[static_cast<size_t>(random_index)];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
|
||||
{
|
||||
std::basic_string<T> output;
|
||||
while (b--)
|
||||
output += a;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// Benchmark the weighted Levenshtein distance
|
||||
static void BM_LevWeightedDist1(benchmark::State& state)
|
||||
{
|
||||
std::string a = "aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_LevWeightedDist2(benchmark::State& state)
|
||||
{
|
||||
std::string a = "aaaaa aaaaa";
|
||||
std::string b = "bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
static void BM_LevNormWeightedDist1(benchmark::State& state)
|
||||
{
|
||||
std::string a = "aaaaa aaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_LevNormWeightedDist2(benchmark::State& state)
|
||||
{
|
||||
std::string a = "aaaaa aaaaa";
|
||||
std::string b = "bbbbb bbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_normalized_distance(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
static void BM_LevLongSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1);
|
||||
std::string s1 = std::string("a") + str_multiply(std::string("b"), (len - 2)) + std::string("a");
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
static void BM_LevLongNonSimilarSequence(benchmark::State& state)
|
||||
{
|
||||
size_t len = state.range(0);
|
||||
size_t score_cutoff = state.range(1);
|
||||
std::string s1 = str_multiply(std::string("a"), len);
|
||||
std::string s2 = str_multiply(std::string("b"), len);
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff));
|
||||
++num;
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num * len), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num * len),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
template <size_t MaxLen>
|
||||
static void BM_Levenshtein(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
for (size_t i = 0; i < seq1.size(); ++i)
|
||||
benchmark::DoNotOptimize(rapidfuzz::levenshtein_distance(seq1[i], seq2[j]));
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
template <size_t MaxLen>
|
||||
static void BM_Levenshtein_Cached(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
for (int i = 0; i < 256; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
for (const auto& str1 : seq1) {
|
||||
rapidfuzz::CachedLevenshtein<char> scorer(str1);
|
||||
for (size_t j = 0; j < seq2.size(); ++j)
|
||||
benchmark::DoNotOptimize(scorer.similarity(seq2[j]));
|
||||
}
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
template <size_t MaxLen>
|
||||
static void BM_Levenshtein_SIMD(benchmark::State& state)
|
||||
{
|
||||
std::vector<std::string> seq1;
|
||||
std::vector<std::string> seq2;
|
||||
std::vector<size_t> results(64);
|
||||
for (int i = 0; i < 64; i++)
|
||||
seq1.push_back(generate(MaxLen));
|
||||
for (int i = 0; i < 10000; i++)
|
||||
seq2.push_back(generate(MaxLen));
|
||||
|
||||
size_t num = 0;
|
||||
for (auto _ : state) {
|
||||
rapidfuzz::experimental::MultiLevenshtein<MaxLen> scorer(seq1.size());
|
||||
for (const auto& str1 : seq1)
|
||||
scorer.insert(str1);
|
||||
|
||||
for (const auto& str2 : seq2)
|
||||
scorer.similarity(&results[0], results.size(), str2);
|
||||
|
||||
num += seq1.size() * seq2.size();
|
||||
}
|
||||
|
||||
state.counters["Rate"] = benchmark::Counter(static_cast<double>(num), benchmark::Counter::kIsRate);
|
||||
state.counters["InvRate"] = benchmark::Counter(static_cast<double>(num),
|
||||
benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
|
||||
}
|
||||
#endif
|
||||
|
||||
BENCHMARK(BM_LevLongSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK(BM_LevLongNonSimilarSequence)
|
||||
->Args({100, 30})
|
||||
->Args({500, 30})
|
||||
->Args({5000, 30})
|
||||
->Args({10000, 30})
|
||||
->Args({20000, 30})
|
||||
->Args({50000, 30});
|
||||
|
||||
BENCHMARK(BM_LevWeightedDist1);
|
||||
BENCHMARK(BM_LevWeightedDist2);
|
||||
|
||||
BENCHMARK(BM_LevNormWeightedDist1);
|
||||
BENCHMARK(BM_LevNormWeightedDist2);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein, 64);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_Cached, 64);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 8);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 16);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 32);
|
||||
BENCHMARK_TEMPLATE(BM_Levenshtein_SIMD, 64);
|
||||
#endif
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
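The {1, 1, 1} weight table used above is the classic uniform Levenshtein distance; per the library's documentation, weighting substitutions as two edits ({1, 1, 2}) makes it equivalent to the Indel distance. A small sketch of that relationship, with arbitrary example strings:

#include <cassert>
#include <rapidfuzz/distance/Indel.hpp>
#include <rapidfuzz/distance/Levenshtein.hpp>
#include <string>

int main()
{
    std::string a = "aaaaa b";
    std::string b = "bbbbb bbbbbbbbb";
    auto uniform = rapidfuzz::levenshtein_distance(a, b, {1, 1, 1});  // insert/delete/substitute all cost 1
    auto as_indel = rapidfuzz::levenshtein_distance(a, b, {1, 1, 2}); // a substitution counts as delete + insert
    assert(as_indel == rapidfuzz::indel_distance(a, b));
    (void)uniform;
    return 0;
}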
9
src/external/rapidfuzz-cpp/cmake/rapidfuzzConfig.cmake.in
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
# Avoid repeatedly including the targets
|
||||
if(NOT TARGET rapidfuzz::rapidfuzz)
|
||||
# Provide path for scripts
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rapidfuzzTargets.cmake)
|
||||
endif()
|
||||
7
src/external/rapidfuzz-cpp/docs/literature/hyrro_2002.bib
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
@article{hyrro_2002,
|
||||
author = {Hyyro, Heikki},
|
||||
year = {2002},
|
||||
month = {10},
|
||||
pages = {},
|
||||
title = {Explaining and Extending the Bit-parallel Approximate String Matching Algorithm of Myers}
|
||||
}
|
||||
8
src/external/rapidfuzz-cpp/docs/literature/hyrro_2004.bib
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
@article{hyrro_2004,
|
||||
author = {Hyyro, Heikki},
|
||||
year = {2004},
|
||||
month = {08},
|
||||
pages = {},
|
||||
title = {Bit-Parallel LCS-length Computation Revisited},
|
||||
journal = {Proc. 15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)}
|
||||
}
|
||||
8
src/external/rapidfuzz-cpp/docs/literature/hyrro_lcs_2004.bib
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
@article{hyrro_lcs_2004,
|
||||
author = {Hyyro, Heikki},
|
||||
year = {2004},
|
||||
month = {08},
|
||||
pages = {},
|
||||
title = {Bit-Parallel LCS-length Computation Revisited},
|
||||
journal = {Proc. 15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004)}
|
||||
}
|
||||
22
src/external/rapidfuzz-cpp/docs/literature/myers_1999.bib
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
@article{myers_1999,
|
||||
author = {Myers, Gene},
|
||||
title = {A Fast Bit-Vector Algorithm for Approximate String Matching Based on Dynamic Programming},
|
||||
year = {1999},
|
||||
issue_date = {May 1999},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
volume = {46},
|
||||
number = {3},
|
||||
issn = {0004-5411},
|
||||
url = {https://doi.org/10.1145/316542.316550},
|
||||
doi = {10.1145/316542.316550},
|
||||
journal = {J. ACM},
|
||||
month = may,
|
||||
pages = {395–415},
|
||||
numpages = {21},
|
||||
keywords = {approximate string search, sequence comparison, bit-parallelism}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
20
src/external/rapidfuzz-cpp/docs/literature/wagner_fischer_1974.bib
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
@article{wagner_fischer_1974,
|
||||
author = {Wagner, Robert A. and Fischer, Michael J.},
|
||||
title = {The String-to-String Correction Problem},
|
||||
year = {1974},
|
||||
issue_date = {Jan. 1974},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
volume = {21},
|
||||
number = {1},
|
||||
issn = {0004-5411},
|
||||
url = {https://doi.org/10.1145/321796.321811},
|
||||
doi = {10.1145/321796.321811},
|
||||
abstract = {The string-to-string correction problem is to determine the distance between two strings as measured by the minimum cost sequence of “edit operations” needed to change the one string into the other. The edit operations investigated allow changing one symbol of a string into another single symbol, deleting one symbol from a string, or inserting a single symbol into a string. An algorithm is presented which solves this problem in time proportional to the product of the lengths of the two strings. Possible applications are to the problems of automatic spelling correction and determining the longest subsequence of characters common to two strings.},
|
||||
journal = {J. ACM},
|
||||
month = jan,
|
||||
pages = {168–173},
|
||||
numpages = {6}
|
||||
}
|
||||
|
||||
|
||||
39
src/external/rapidfuzz-cpp/examples/cmake_export/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
project(foo LANGUAGES CXX VERSION 0.0.1)
|
||||
|
||||
# The example library depends publicly on RapidFuzz (foo_lib.hpp includes a
# RapidFuzz header), so RapidFuzz has to be installed alongside it
|
||||
set(RAPIDFUZZ_INSTALL ON CACHE INTERNAL "")
|
||||
add_subdirectory(${CMAKE_SOURCE_DIR}/../..
|
||||
${CMAKE_SOURCE_DIR}/../../build)
|
||||
|
||||
add_library(foo foo_lib.cc)
|
||||
add_library(foo::foo ALIAS foo)
|
||||
target_link_libraries(foo rapidfuzz)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(FOO_CMAKE_CONFIG_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/foo")
|
||||
install(TARGETS foo EXPORT fooTargs DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
install(EXPORT fooTargs NAMESPACE foo:: DESTINATION ${FOO_CMAKE_CONFIG_DESTINATION})
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
|
||||
INSTALL_DESTINATION ${FOO_CMAKE_CONFIG_DESTINATION}
|
||||
)
|
||||
write_basic_package_version_file(
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
|
||||
COMPATIBILITY SameMajorVersion
|
||||
)
|
||||
|
||||
install(
|
||||
FILES
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
|
||||
DESTINATION
|
||||
${FOO_CMAKE_CONFIG_DESTINATION}
|
||||
)
|
||||
install(FILES foo_lib.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
10
src/external/rapidfuzz-cpp/examples/cmake_export/fooConfig.cmake.in
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
# Avoid repeatedly including the targets
|
||||
if(NOT TARGET foo::foo)
|
||||
find_package(rapidfuzz REQUIRED)
|
||||
# Provide path for scripts
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/fooTargs.cmake)
|
||||
endif()
|
||||
7
src/external/rapidfuzz-cpp/examples/cmake_export/foo_lib.cc
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
#include "foo_lib.hpp"
|
||||
|
||||
double fooFunc() {
|
||||
std::string_view a("aaaa"), b("abaa");
|
||||
FooType cache(a.begin(), a.end());
|
||||
return cache.similarity(b);
|
||||
}
|
||||
4
src/external/rapidfuzz-cpp/examples/cmake_export/foo_lib.hpp
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
|
||||
using FooType = rapidfuzz::fuzz::CachedRatio<char>;
|
||||
double fooFunc();
|
||||
5
src/external/rapidfuzz-cpp/examples/cmake_export/indirect_app/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
project(fooapp LANGUAGES CXX VERSION 0.0.1)
|
||||
find_package(foo REQUIRED)
|
||||
add_executable(fooapp foo_app.cc)
|
||||
target_link_libraries(fooapp foo::foo)
|
||||
7
src/external/rapidfuzz-cpp/examples/cmake_export/indirect_app/foo_app.cc
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
#include <iostream>
|
||||
#include <foo_lib.hpp>
|
||||
|
||||
int main() {
|
||||
std::cout << fooFunc() << '\n';
|
||||
return 0;
|
||||
}
|
||||
6
src/external/rapidfuzz-cpp/examples/cmake_installed/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.8)
|
||||
project(cmake_installed CXX)
|
||||
|
||||
find_package(rapidfuzz REQUIRED)
|
||||
add_executable(foo main.cpp)
|
||||
target_link_libraries(foo rapidfuzz::rapidfuzz)
|
||||
10
src/external/rapidfuzz-cpp/examples/cmake_installed/main.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
int main()
|
||||
{
|
||||
std::string a = "aaaa";
|
||||
std::string b = "abab";
|
||||
std::cout << rapidfuzz::fuzz::ratio(a, b) << std::endl;
|
||||
}
|
||||
11096
src/external/rapidfuzz-cpp/extras/rapidfuzz_amalgamated.hpp
vendored
Normal file
File diff suppressed because it is too large
24
src/external/rapidfuzz-cpp/fuzzing/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
function(create_fuzzer fuzzer)
|
||||
add_executable(fuzz_${fuzzer} fuzz_${fuzzer}.cpp)
|
||||
target_compile_features(fuzz_${fuzzer} PUBLIC cxx_std_17)
|
||||
target_link_libraries(fuzz_${fuzzer} PRIVATE rapidfuzz::rapidfuzz)
|
||||
|
||||
target_compile_options(fuzz_${fuzzer} PRIVATE -g -O1 -fsanitize=fuzzer,address -march=native)
|
||||
target_link_libraries(fuzz_${fuzzer} PRIVATE -fsanitize=fuzzer,address)
|
||||
endfunction(create_fuzzer)
|
||||
|
||||
create_fuzzer(lcs_similarity)
|
||||
|
||||
create_fuzzer(levenshtein_distance)
|
||||
create_fuzzer(levenshtein_editops)
|
||||
|
||||
create_fuzzer(indel_distance)
|
||||
create_fuzzer(indel_editops)
|
||||
|
||||
create_fuzzer(osa_distance)
|
||||
|
||||
create_fuzzer(damerau_levenshtein_distance)
|
||||
|
||||
create_fuzzer(jaro_similarity)
|
||||
|
||||
create_fuzzer(partial_ratio)
|
||||
53
src/external/rapidfuzz-cpp/fuzzing/fuzz_damerau_levenshtein_distance.cpp
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/DamerauLevenshtein.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/DamerauLevenshtein.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
void validate_distance(size_t reference_dist, const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2,
|
||||
size_t score_cutoff)
|
||||
{
|
||||
if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1;
|
||||
|
||||
auto dist = rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, score_cutoff);
|
||||
if (dist != reference_dist) {
|
||||
print_seq("s1", s1);
|
||||
print_seq("s2", s2);
|
||||
throw std::logic_error(std::string("osa distance failed (score_cutoff = ") +
|
||||
std::to_string(score_cutoff) + std::string(", reference_score = ") +
|
||||
std::to_string(reference_dist) + std::string(", score = ") +
|
||||
std::to_string(dist) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
size_t reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1, s2);
|
||||
|
||||
/* test small band */
|
||||
for (size_t i = 4; i < 32; ++i)
|
||||
validate_distance(reference_dist, s1, s2, i);
|
||||
|
||||
/* unrestricted */
|
||||
validate_distance(reference_dist, s1, s2, std::numeric_limits<size_t>::max());
|
||||
|
||||
/* test long sequences */
|
||||
for (unsigned int i = 2; i < 9; ++i) {
|
||||
std::vector<uint8_t> s1_ = vec_multiply(s1, pow<size_t>(2, i));
|
||||
std::vector<uint8_t> s2_ = vec_multiply(s2, pow<size_t>(2, i));
|
||||
|
||||
if (s1_.size() > 10000 || s2_.size() > 10000) break;
|
||||
|
||||
reference_dist = rapidfuzz_reference::damerau_levenshtein_distance(s1_, s2_);
|
||||
validate_distance(reference_dist, s1_, s2_, std::numeric_limits<size_t>::max());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
43
src/external/rapidfuzz-cpp/fuzzing/fuzz_indel_distance.cpp
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/Indel.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <limits>
|
||||
#include <rapidfuzz/distance/Indel.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
void validate_distance(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2, size_t score_cutoff)
|
||||
{
|
||||
auto dist = rapidfuzz::indel_distance(s1, s2, score_cutoff);
|
||||
auto reference_dist = rapidfuzz_reference::indel_distance(s1, s2, score_cutoff);
|
||||
if (dist != reference_dist) {
|
||||
print_seq("s1: ", s1);
|
||||
print_seq("s2: ", s2);
|
||||
throw std::logic_error(std::string("indel distance failed (score_cutoff = ") +
|
||||
std::to_string(score_cutoff) + std::string(", reference_score = ") +
|
||||
std::to_string(reference_dist) + std::string(", score = ") +
|
||||
std::to_string(dist) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
validate_distance(s1, s2, 0);
|
||||
validate_distance(s1, s2, 1);
|
||||
validate_distance(s1, s2, 2);
|
||||
validate_distance(s1, s2, 3);
|
||||
validate_distance(s1, s2, 4);
|
||||
/* score_cutoff to trigger banded implementation */
|
||||
validate_distance(s1, s2, s1.size() / 2);
|
||||
validate_distance(s1, s2, s2.size() / 2);
|
||||
|
||||
/* unrestricted */
|
||||
validate_distance(s1, s2, std::numeric_limits<size_t>::max());
|
||||
|
||||
return 0;
|
||||
}
|
||||
22
src/external/rapidfuzz-cpp/fuzzing/fuzz_indel_editops.cpp
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/Indel.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/distance.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
size_t score = rapidfuzz_reference::indel_distance(s1, s2);
|
||||
rapidfuzz::Editops ops = rapidfuzz::indel_editops(s1, s2);
|
||||
|
||||
if (ops.size() == score && s2 != rapidfuzz::editops_apply_vec<uint8_t>(ops, s1, s2))
|
||||
throw std::logic_error("levenshtein_editops failed");
|
||||
|
||||
return 0;
|
||||
}
|
||||
99
src/external/rapidfuzz-cpp/fuzzing/fuzz_jaro_similarity.cpp
vendored
Normal file
@@ -0,0 +1,99 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/Jaro.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/Jaro.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
bool is_close(double a, double b, double epsilon)
|
||||
{
|
||||
return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
|
||||
}
|
||||
|
||||
template <size_t MaxLen>
|
||||
void validate_simd(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
|
||||
{
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0);
|
||||
if (count == 0) return;
|
||||
|
||||
rapidfuzz::experimental::MultiJaro<MaxLen> scorer(count);
|
||||
|
||||
std::vector<std::vector<uint8_t>> strings;
|
||||
|
||||
for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) {
|
||||
if (std::distance(it1, s1.end()) < static_cast<ptrdiff_t>(MaxLen)) {
|
||||
strings.emplace_back(it1, s1.end());
|
||||
break;
|
||||
}
|
||||
else {
|
||||
strings.emplace_back(it1, it1 + MaxLen);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& s : strings)
|
||||
scorer.insert(s);
|
||||
|
||||
std::vector<double> simd_results(scorer.result_count());
|
||||
scorer.similarity(&simd_results[0], simd_results.size(), s2);
|
||||
|
||||
for (size_t i = 0; i < strings.size(); ++i) {
|
||||
double reference_sim = rapidfuzz_reference::jaro_similarity(strings[i], s2);
|
||||
|
||||
if (!is_close(simd_results[i], reference_sim, 0.0001)) {
|
||||
print_seq("s1", strings[i]);
|
||||
print_seq("s2", s2);
|
||||
throw std::logic_error(std::string("jaro similarity using simd failed (reference_score = ") +
|
||||
std::to_string(reference_sim) + std::string(", score = ") +
|
||||
std::to_string(simd_results[i]) + std::string(", i = ") +
|
||||
std::to_string(i) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
(void)s1;
|
||||
(void)s2;
|
||||
#endif
|
||||
}
|
||||
|
||||
void validate_distance(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
|
||||
{
|
||||
double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2);
|
||||
double sim = rapidfuzz::jaro_similarity(s1, s2);
|
||||
|
||||
if (!is_close(sim, reference_sim, 0.0001)) {
|
||||
print_seq("s1", s1);
|
||||
print_seq("s2", s2);
|
||||
throw std::logic_error(std::string("jaro similarity failed (reference_score = ") +
|
||||
std::to_string(reference_sim) + std::string(", score = ") +
|
||||
std::to_string(sim) + ")");
|
||||
}
|
||||
|
||||
validate_simd<8>(s1, s2);
|
||||
validate_simd<16>(s1, s2);
|
||||
validate_simd<32>(s1, s2);
|
||||
validate_simd<64>(s1, s2);
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
validate_distance(s1, s2);
|
||||
|
||||
/* test long sequences */
|
||||
for (unsigned int i = 2; i < 9; ++i) {
|
||||
std::vector<uint8_t> s1_ = vec_multiply(s1, pow<size_t>(2, i));
|
||||
std::vector<uint8_t> s2_ = vec_multiply(s2, pow<size_t>(2, i));
|
||||
|
||||
if (s1_.size() > 10000 || s2_.size() > 10000) break;
|
||||
|
||||
validate_distance(s1_, s2_);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
69
src/external/rapidfuzz-cpp/fuzzing/fuzz_lcs_similarity.cpp
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/LCSseq.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/LCSseq.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
template <size_t MaxLen>
|
||||
void validate_simd(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
|
||||
{
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0);
|
||||
rapidfuzz::experimental::MultiLCSseq<MaxLen> scorer(count);
|
||||
|
||||
std::vector<std::vector<uint8_t>> strings;
|
||||
|
||||
for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) {
|
||||
if (std::distance(it1, s1.end()) < static_cast<ptrdiff_t>(MaxLen)) {
|
||||
strings.emplace_back(it1, s1.end());
|
||||
break;
|
||||
}
|
||||
else {
|
||||
strings.emplace_back(it1, it1 + MaxLen);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& s : strings)
|
||||
scorer.insert(s);
|
||||
|
||||
std::vector<size_t> simd_results(scorer.result_count());
|
||||
scorer.similarity(&simd_results[0], simd_results.size(), s2);
|
||||
|
||||
for (size_t i = 0; i < strings.size(); ++i) {
|
||||
size_t reference_score = rapidfuzz_reference::lcs_seq_similarity(strings[i], s2);
|
||||
if (reference_score != simd_results[i]) {
|
||||
print_seq("s1: ", s1);
|
||||
print_seq("s2: ", s2);
|
||||
throw std::logic_error(std::string("lcs distance using simd failed (score_cutoff = ") +
|
||||
std::string(", reference_score = ") + std::to_string(reference_score) +
|
||||
std::string(", score = ") + std::to_string(simd_results[i]) + ")");
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)s1;
|
||||
(void)s2;
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (s1.size() == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
validate_simd<8>(s1, s2);
|
||||
validate_simd<16>(s1, s2);
|
||||
validate_simd<32>(s1, s2);
|
||||
validate_simd<64>(s1, s2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
99
src/external/rapidfuzz-cpp/fuzzing/fuzz_levenshtein_distance.cpp
vendored
Normal file
@@ -0,0 +1,99 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/Levenshtein.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/Levenshtein.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
template <size_t MaxLen>
|
||||
void validate_simd(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
|
||||
{
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0);
|
||||
if (count == 0) return;
|
||||
|
||||
rapidfuzz::experimental::MultiLevenshtein<MaxLen> scorer(count);
|
||||
|
||||
std::vector<std::vector<uint8_t>> strings;
|
||||
|
||||
for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) {
|
||||
if (std::distance(it1, s1.end()) < static_cast<ptrdiff_t>(MaxLen)) {
|
||||
strings.emplace_back(it1, s1.end());
|
||||
break;
|
||||
}
|
||||
else {
|
||||
strings.emplace_back(it1, it1 + MaxLen);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& s : strings)
|
||||
scorer.insert(s);
|
||||
|
||||
std::vector<size_t> simd_results(scorer.result_count());
|
||||
scorer.distance(&simd_results[0], simd_results.size(), s2);
|
||||
|
||||
for (size_t i = 0; i < strings.size(); ++i) {
|
||||
size_t reference_score = rapidfuzz_reference::levenshtein_distance(strings[i], s2);
|
||||
if (reference_score != simd_results[i]) {
|
||||
print_seq("s1: ", strings[i]);
|
||||
print_seq("s2: ", s2);
|
||||
throw std::logic_error(std::string("levenshtein distance using simd failed (reference_score = ") +
|
||||
std::to_string(reference_score) + std::string(", score = ") +
|
||||
std::to_string(simd_results[i]) + std::string(", i = ") +
|
||||
std::to_string(i) + ")");
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)s1;
|
||||
(void)s2;
|
||||
#endif
|
||||
}
|
||||
|
||||
void validate_distance(size_t reference_dist, const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2,
|
||||
size_t score_cutoff)
|
||||
{
|
||||
if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1;
|
||||
|
||||
auto dist = rapidfuzz::levenshtein_distance(s1, s2, {1, 1, 1}, score_cutoff);
|
||||
if (dist != reference_dist) {
|
||||
print_seq("s1: ", s1);
|
||||
print_seq("s2: ", s2);
|
||||
throw std::logic_error(std::string("levenshtein distance failed (score_cutoff = ") +
|
||||
std::to_string(score_cutoff) + std::string(", reference_score = ") +
|
||||
std::to_string(reference_dist) + std::string(", score = ") +
|
||||
std::to_string(dist) + ")");
|
||||
}
|
||||
|
||||
validate_simd<8>(s1, s2);
|
||||
validate_simd<16>(s1, s2);
|
||||
validate_simd<32>(s1, s2);
|
||||
validate_simd<64>(s1, s2);
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
size_t reference_dist = rapidfuzz_reference::levenshtein_distance(s1, s2);
|
||||
|
||||
/* test mbleven */
|
||||
for (size_t i = 0; i < 4; ++i)
|
||||
validate_distance(reference_dist, s1, s2, i);
|
||||
|
||||
/* test small band */
|
||||
for (size_t i = 4; i < 32; ++i)
|
||||
validate_distance(reference_dist, s1, s2, i);
|
||||
|
||||
/* unrestricted */
|
||||
validate_distance(reference_dist, s1, s2, std::numeric_limits<size_t>::max());
|
||||
|
||||
/* score_cutoff to trigger banded implementation */
|
||||
validate_distance(reference_dist, s1, s2, s1.size() / 2);
|
||||
validate_distance(reference_dist, s1, s2, s2.size() / 2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
44
src/external/rapidfuzz-cpp/fuzzing/fuzz_levenshtein_editops.cpp
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/Levenshtein.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/distance.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
void validate_editops(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2, size_t score,
|
||||
size_t score_hint = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint);
|
||||
if (ops.size() == score && s2 != rapidfuzz::editops_apply_vec<uint8_t>(ops, s1, s2))
|
||||
throw std::logic_error("levenshtein_editops failed");
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
/* Hirschberg's algorithm is only used for very long sequences, which the fuzzer rarely generates on
 * its own, so the inputs are repeatedly doubled below to reach that code path */
|
||||
for (int i = 0; i < 10; i++) {
|
||||
size_t score = rapidfuzz_reference::levenshtein_distance(s1, s2);
|
||||
validate_editops(s1, s2, score);
|
||||
validate_editops(s1, s2, score, 64);
|
||||
validate_editops(s1, s2, score, score != 0 ? score - 1 : 0);
|
||||
validate_editops(s1, s2, score, score);
|
||||
|
||||
if (s1.size() > 1 && s2.size() > 1) {
|
||||
auto hpos = rapidfuzz::detail::find_hirschberg_pos(rapidfuzz::detail::Range(s1),
|
||||
rapidfuzz::detail::Range(s2));
|
||||
if (hpos.left_score + hpos.right_score != score)
|
||||
throw std::logic_error("find_hirschberg_pos failed");
|
||||
}
|
||||
|
||||
s1 = vec_multiply(s1, 2);
|
||||
s2 = vec_multiply(s2, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
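The round-trip check these harnesses perform (one edit operation per unit of distance, and applying the operations reconstructs the target) can be reproduced outside the fuzzer as well. A minimal sketch using the same editops_apply_vec helper seen above, with "kitten"/"sitting" as arbitrary example inputs:

#include <cassert>
#include <cstdint>
#include <rapidfuzz/distance.hpp>
#include <vector>

int main()
{
    std::vector<uint8_t> s1 = {'k', 'i', 't', 't', 'e', 'n'};
    std::vector<uint8_t> s2 = {'s', 'i', 't', 't', 'i', 'n', 'g'};

    rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2);
    // with uniform weights the number of edit operations equals the distance
    assert(ops.size() == rapidfuzz::levenshtein_distance(s1, s2));
    // applying the operations to s1 must rebuild s2
    assert(rapidfuzz::editops_apply_vec<uint8_t>(ops, s1, s2) == s2);
    return 0;
}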
53
src/external/rapidfuzz-cpp/fuzzing/fuzz_osa_distance.cpp
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/OSA.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/OSA.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
void validate_distance(size_t reference_dist, const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2,
|
||||
size_t score_cutoff)
|
||||
{
|
||||
if (reference_dist > score_cutoff) reference_dist = score_cutoff + 1;
|
||||
|
||||
auto dist = rapidfuzz::osa_distance(s1, s2, score_cutoff);
|
||||
if (dist != reference_dist) {
|
||||
print_seq("s1", s1);
|
||||
print_seq("s2", s2);
|
||||
throw std::logic_error(std::string("osa distance failed (score_cutoff = ") +
|
||||
std::to_string(score_cutoff) + std::string(", reference_score = ") +
|
||||
std::to_string(reference_dist) + std::string(", score = ") +
|
||||
std::to_string(dist) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
size_t reference_dist = rapidfuzz_reference::osa_distance(s1, s2);
|
||||
|
||||
/* test small band */
|
||||
for (size_t i = 4; i < 32; ++i)
|
||||
validate_distance(reference_dist, s1, s2, i);
|
||||
|
||||
/* unrestricted */
|
||||
validate_distance(reference_dist, s1, s2, std::numeric_limits<size_t>::max());
|
||||
|
||||
/* test long sequences */
|
||||
for (unsigned int i = 2; i < 9; ++i) {
|
||||
std::vector<uint8_t> s1_ = vec_multiply(s1, pow<size_t>(2, i));
|
||||
std::vector<uint8_t> s2_ = vec_multiply(s2, pow<size_t>(2, i));
|
||||
|
||||
if (s1_.size() > 10000 || s2_.size() > 10000) break;
|
||||
|
||||
reference_dist = rapidfuzz_reference::osa_distance(s1_, s2_);
|
||||
validate_distance(reference_dist, s1_, s2_, std::numeric_limits<size_t>::max());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
53
src/external/rapidfuzz-cpp/fuzzing/fuzz_partial_ratio.cpp
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#include "../rapidfuzz_reference/fuzz.hpp"
|
||||
#include "fuzzing.hpp"
|
||||
#include <cstdint>
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
bool is_close(double a, double b, double epsilon)
|
||||
{
|
||||
return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
|
||||
}
|
||||
|
||||
void validate_distance(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
|
||||
{
|
||||
auto sim = rapidfuzz::fuzz::partial_ratio(s1, s2);
|
||||
auto reference_sim = rapidfuzz_reference::partial_ratio(s1, s2);
|
||||
if (!is_close(sim, reference_sim, 0.0001)) {
|
||||
print_seq("s1: ", s1);
|
||||
print_seq("s2: ", s2);
|
||||
throw std::logic_error(std::string("partial_ratio failed (reference_score = ") +
|
||||
std::to_string(reference_sim) + std::string(", score = ") +
|
||||
std::to_string(sim) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
|
||||
{
|
||||
std::vector<uint8_t> s1, s2;
|
||||
if (!extract_strings(data, size, s1, s2)) return 0;
|
||||
|
||||
validate_distance(s1, s2);
|
||||
validate_distance(s2, s1);
|
||||
|
||||
/* test long sequences */
|
||||
for (unsigned int i = 2; i < 9; ++i) {
|
||||
std::vector<uint8_t> s1_ = vec_multiply(s1, pow<size_t>(2, i));
|
||||
std::vector<uint8_t> s2_ = vec_multiply(s2, pow<size_t>(2, i));
|
||||
|
||||
if (s1_.size() > 10000 || s2_.size() > 10000) break;
|
||||
|
||||
validate_distance(s1_, s2_);
|
||||
validate_distance(s2_, s1_);
|
||||
validate_distance(s1, s2_);
|
||||
validate_distance(s2_, s1);
|
||||
validate_distance(s1_, s2);
|
||||
validate_distance(s2, s1_);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
55
src/external/rapidfuzz-cpp/fuzzing/fuzzing.hpp
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <rapidfuzz/distance/Levenshtein.hpp>
|
||||
#include <vector>
|
||||
|
||||
static inline bool extract_strings(const uint8_t* data, size_t size, std::vector<uint8_t>& s1,
|
||||
std::vector<uint8_t>& s2)
|
||||
{
|
||||
if (size <= sizeof(uint32_t)) {
|
||||
return false;
|
||||
}
|
||||
uint32_t len1 = *(uint32_t*)data;
|
||||
|
||||
if (len1 > size - sizeof(len1)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
data += sizeof(len1);
|
||||
size -= sizeof(len1);
|
||||
s1 = std::vector<uint8_t>(data, data + len1);
|
||||
s2 = std::vector<uint8_t>(data + len1, data + size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline T pow(T x, unsigned int p)
|
||||
{
|
||||
if (p == 0) return 1;
|
||||
if (p == 1) return x;
|
||||
|
||||
T tmp = pow(x, p / 2);
|
||||
if (p % 2 == 0)
|
||||
return tmp * tmp;
|
||||
else
|
||||
return x * tmp * tmp;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> vec_multiply(const std::vector<T>& a, size_t b)
|
||||
{
|
||||
std::vector<T> output;
|
||||
while (b--)
|
||||
output.insert(output.end(), a.begin(), a.end());
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void print_seq(const std::string& name, const std::vector<T>& seq)
|
||||
{
|
||||
std::cout << name << " len: " << seq.size() << " content: ";
|
||||
for (const auto& ch : seq)
|
||||
std::cout << static_cast<uint64_t>(ch) << " ";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
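The harnesses above all consume the same input layout: a uint32_t length prefix, the first sequence, then whatever bytes remain as the second sequence. A minimal sketch that builds such a buffer by hand and feeds it back through extract_strings, which can be handy when reproducing a fuzzer finding locally:

#include <cstdint>
#include <cstring>
#include <vector>
#include "fuzzing.hpp"  // the header added above

int main()
{
    std::vector<uint8_t> s1 = {'a', 'b', 'c'};
    std::vector<uint8_t> s2 = {'x', 'y'};

    // layout expected by extract_strings: [uint32_t len1][s1 bytes][remaining bytes = s2]
    std::vector<uint8_t> data(sizeof(uint32_t));
    uint32_t len1 = static_cast<uint32_t>(s1.size());
    std::memcpy(data.data(), &len1, sizeof(len1));
    data.insert(data.end(), s1.begin(), s1.end());
    data.insert(data.end(), s2.begin(), s2.end());

    std::vector<uint8_t> out1, out2;
    bool ok = extract_strings(data.data(), data.size(), out1, out2);
    return (ok && out1 == s1 && out2 == s2) ? 0 : 1;
}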
74
src/external/rapidfuzz-cpp/rapidfuzz/details/CharSet.hpp
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright (c) 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <type_traits>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
/*
|
||||
* taken from https://stackoverflow.com/a/17251989/11335032
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
bool CanTypeFitValue(const U value)
|
||||
{
|
||||
const intmax_t botT = intmax_t(std::numeric_limits<T>::min());
|
||||
const intmax_t botU = intmax_t(std::numeric_limits<U>::min());
|
||||
const uintmax_t topT = uintmax_t(std::numeric_limits<T>::max());
|
||||
const uintmax_t topU = uintmax_t(std::numeric_limits<U>::max());
|
||||
return !((botT > botU && value < static_cast<U>(botT)) || (topT < topU && value > static_cast<U>(topT)));
|
||||
}
|
||||
|
||||
template <typename CharT1, size_t size = sizeof(CharT1)>
|
||||
struct CharSet;
|
||||
|
||||
template <typename CharT1>
|
||||
struct CharSet<CharT1, 1> {
|
||||
using UCharT1 = typename std::make_unsigned<CharT1>::type;
|
||||
|
||||
std::array<bool, std::numeric_limits<UCharT1>::max() + 1> m_val;
|
||||
|
||||
CharSet() : m_val{}
|
||||
{}
|
||||
|
||||
void insert(CharT1 ch)
|
||||
{
|
||||
m_val[UCharT1(ch)] = true;
|
||||
}
|
||||
|
||||
template <typename CharT2>
|
||||
bool find(CharT2 ch) const
|
||||
{
|
||||
if (!CanTypeFitValue<CharT1>(ch)) return false;
|
||||
|
||||
return m_val[UCharT1(ch)];
|
||||
}
|
||||
};
|
||||
|
||||
template <typename CharT1, size_t size>
|
||||
struct CharSet {
|
||||
std::unordered_set<CharT1> m_val;
|
||||
|
||||
CharSet() : m_val{}
|
||||
{}
|
||||
|
||||
void insert(CharT1 ch)
|
||||
{
|
||||
m_val.insert(ch);
|
||||
}
|
||||
|
||||
template <typename CharT2>
|
||||
bool find(CharT2 ch) const
|
||||
{
|
||||
if (!CanTypeFitValue<CharT1>(ch)) return false;
|
||||
|
||||
return m_val.find(CharT1(ch)) != m_val.end();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
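A short usage sketch for this internal helper (it lives in the detail namespace, so it is not public API; the include path is assumed from the file location above): 1-byte character types get a flat 256-entry lookup table, wider types fall back to std::unordered_set, and find() first checks whether the probed value can fit into the stored character type at all.

#include <rapidfuzz/details/CharSet.hpp>

int main()
{
    rapidfuzz::detail::CharSet<char> seen;  // specialization with the flat lookup table
    seen.insert('a');
    seen.insert('b');

    bool has_a = seen.find('a');          // true
    bool has_euro = seen.find(U'\u20ac'); // false: the value cannot fit into char
    return (has_a && !has_euro) ? 0 : 1;
}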
203
src/external/rapidfuzz-cpp/rapidfuzz/details/GrowingHashmap.hpp
vendored
Normal file
@@ -0,0 +1,203 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (c) 2022 Max Bachmann */

#pragma once

#include <array>
#include <stddef.h>
#include <stdint.h>

namespace rapidfuzz::detail {

/* hashmap for integers which can only grow, but can't remove elements */
template <typename T_Key, typename T_Entry>
struct GrowingHashmap {
    using key_type = T_Key;
    using value_type = T_Entry;
    using size_type = unsigned int;

private:
    static constexpr size_type min_size = 8;
    struct MapElem {
        key_type key;
        value_type value = value_type();
    };

    int used;
    int fill;
    int mask;
    MapElem* m_map;

public:
    GrowingHashmap() : used(0), fill(0), mask(-1), m_map(nullptr)
    {}
    ~GrowingHashmap()
    {
        delete[] m_map;
    }

    GrowingHashmap(const GrowingHashmap& other) : used(other.used), fill(other.fill), mask(other.mask)
    {
        int size = mask + 1;
        m_map = new MapElem[size];
        std::copy(other.m_map, other.m_map + size, m_map);
    }

    GrowingHashmap(GrowingHashmap&& other) noexcept : GrowingHashmap()
    {
        swap(*this, other);
    }

    GrowingHashmap& operator=(GrowingHashmap other)
    {
        swap(*this, other);
        return *this;
    }

    friend void swap(GrowingHashmap& first, GrowingHashmap& second) noexcept
    {
        std::swap(first.used, second.used);
        std::swap(first.fill, second.fill);
        std::swap(first.mask, second.mask);
        std::swap(first.m_map, second.m_map);
    }

    size_type size() const
    {
        return used;
    }
    size_type capacity() const
    {
        return mask + 1;
    }
    bool empty() const
    {
        return used == 0;
    }

    value_type get(key_type key) const noexcept
    {
        if (m_map == nullptr) return value_type();

        return m_map[lookup(key)].value;
    }

    value_type& operator[](key_type key) noexcept
    {
        if (m_map == nullptr) allocate();

        size_t i = lookup(key);

        if (m_map[i].value == value_type()) {
            /* resize when 2/3 full */
            if (++fill * 3 >= (mask + 1) * 2) {
                grow((used + 1) * 2);
                i = lookup(key);
            }

            used++;
        }

        m_map[i].key = key;
        return m_map[i].value;
    }

private:
    void allocate()
    {
        mask = min_size - 1;
        m_map = new MapElem[min_size];
    }

    /**
     * lookup key inside the hashmap using a similar collision resolution
     * strategy to CPython and Ruby
     */
    size_t lookup(key_type key) const
    {
        size_t hash = static_cast<size_t>(key);
        size_t i = hash & static_cast<size_t>(mask);

        if (m_map[i].value == value_type() || m_map[i].key == key) return i;

        size_t perturb = hash;
        while (true) {
            i = (i * 5 + perturb + 1) & static_cast<size_t>(mask);
            if (m_map[i].value == value_type() || m_map[i].key == key) return i;

            perturb >>= 5;
        }
    }

    void grow(int minUsed)
    {
        int newSize = mask + 1;
        while (newSize <= minUsed)
            newSize <<= 1;

        MapElem* oldMap = m_map;
        m_map = new MapElem[static_cast<size_t>(newSize)];

        fill = used;
        mask = newSize - 1;

        for (int i = 0; used > 0; i++)
            if (oldMap[i].value != value_type()) {
                size_t j = lookup(oldMap[i].key);

                m_map[j].key = oldMap[i].key;
                m_map[j].value = oldMap[i].value;
                used--;
            }

        used = fill;
        delete[] oldMap;
    }
};

template <typename T_Key, typename T_Entry>
struct HybridGrowingHashmap {
    using key_type = T_Key;
    using value_type = T_Entry;

    HybridGrowingHashmap()
    {
        m_extendedAscii.fill(value_type());
    }

    value_type get(char key) const noexcept
    {
        /** treat char as value between 0 and 127 for performance reasons */
        return m_extendedAscii[static_cast<uint8_t>(key)];
    }

    template <typename CharT>
    value_type get(CharT key) const noexcept
    {
        if (key >= 0 && key <= 255)
            return m_extendedAscii[static_cast<uint8_t>(key)];
        else
            return m_map.get(static_cast<key_type>(key));
    }

    value_type& operator[](char key) noexcept
    {
        /** treat char as value between 0 and 127 for performance reasons */
        return m_extendedAscii[static_cast<uint8_t>(key)];
    }

    template <typename CharT>
    value_type& operator[](CharT key)
    {
        if (key >= 0 && key <= 255)
            return m_extendedAscii[static_cast<uint8_t>(key)];
        else
            return m_map[static_cast<key_type>(key)];
    }

private:
    GrowingHashmap<key_type, value_type> m_map;
    std::array<value_type, 256> m_extendedAscii;
};

} // namespace rapidfuzz::detail
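Illustrative usage sketch (not part of the commit or the vendored sources): the hybrid map serves keys in the extended ASCII range from a flat 256-slot array and only falls back to the open-addressed GrowingHashmap for larger code points. Note that `value_type()` doubles as the "empty slot" marker inside GrowingHashmap, so it is only suitable when a default-constructed value means "absent".

#include <rapidfuzz/details/GrowingHashmap.hpp>
#include <cassert>

int main()
{
    rapidfuzz::detail::HybridGrowingHashmap<uint32_t, int> counts;
    for (uint32_t ch : {0x61u, 0x61u, 0x1F600u}) // 'a', 'a', an emoji code point
        counts[ch] += 1;

    assert(counts.get('a') == 2);                // ASCII path: direct array slot
    assert(counts.get(uint32_t(0x1F600)) == 1);  // non-ASCII path: GrowingHashmap lookup
    return 0;
}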
199
src/external/rapidfuzz-cpp/rapidfuzz/details/Matrix.hpp
vendored
Normal file
@@ -0,0 +1,199 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (c) 2022 Max Bachmann */

#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <stdio.h>
#include <vector>

namespace rapidfuzz::detail {

template <typename T, bool IsConst>
struct BitMatrixView {

    using value_type = T;
    using size_type = size_t;
    using pointer = std::conditional_t<IsConst, const value_type*, value_type*>;
    using reference = std::conditional_t<IsConst, const value_type&, value_type&>;

    BitMatrixView(pointer vector, size_type cols) noexcept : m_vector(vector), m_cols(cols)
    {}

    reference operator[](size_type col) noexcept
    {
        assert(col < m_cols);
        return m_vector[col];
    }

    size_type size() const noexcept
    {
        return m_cols;
    }

private:
    pointer m_vector;
    size_type m_cols;
};

template <typename T>
struct BitMatrix {

    using value_type = T;

    BitMatrix() : m_rows(0), m_cols(0), m_matrix(nullptr)
    {}

    BitMatrix(size_t rows, size_t cols, T val) : m_rows(rows), m_cols(cols), m_matrix(nullptr)
    {
        if (m_rows && m_cols) m_matrix = new T[m_rows * m_cols];
        std::fill_n(m_matrix, m_rows * m_cols, val);
    }

    BitMatrix(const BitMatrix& other) : m_rows(other.m_rows), m_cols(other.m_cols), m_matrix(nullptr)
    {
        if (m_rows && m_cols) m_matrix = new T[m_rows * m_cols];
        std::copy(other.m_matrix, other.m_matrix + m_rows * m_cols, m_matrix);
    }

    BitMatrix(BitMatrix&& other) noexcept : m_rows(0), m_cols(0), m_matrix(nullptr)
    {
        other.swap(*this);
    }

    BitMatrix& operator=(BitMatrix&& other) noexcept
    {
        other.swap(*this);
        return *this;
    }

    BitMatrix& operator=(const BitMatrix& other)
    {
        BitMatrix temp = other;
        temp.swap(*this);
        return *this;
    }

    void swap(BitMatrix& rhs) noexcept
    {
        using std::swap;
        swap(m_rows, rhs.m_rows);
        swap(m_cols, rhs.m_cols);
        swap(m_matrix, rhs.m_matrix);
    }

    ~BitMatrix()
    {
        delete[] m_matrix;
    }

    BitMatrixView<value_type, false> operator[](size_t row) noexcept
    {
        assert(row < m_rows);
        return {&m_matrix[row * m_cols], m_cols};
    }

    BitMatrixView<value_type, true> operator[](size_t row) const noexcept
    {
        assert(row < m_rows);
        return {&m_matrix[row * m_cols], m_cols};
    }

    size_t rows() const noexcept
    {
        return m_rows;
    }

    size_t cols() const noexcept
    {
        return m_cols;
    }

private:
    size_t m_rows;
    size_t m_cols;
    T* m_matrix;
};

template <typename T>
struct ShiftedBitMatrix {
    using value_type = T;

    ShiftedBitMatrix()
    {}

    ShiftedBitMatrix(size_t rows, size_t cols, T val) : m_matrix(rows, cols, val), m_offsets(rows)
    {}

    ShiftedBitMatrix(const ShiftedBitMatrix& other) : m_matrix(other.m_matrix), m_offsets(other.m_offsets)
    {}

    ShiftedBitMatrix(ShiftedBitMatrix&& other) noexcept
    {
        other.swap(*this);
    }

    ShiftedBitMatrix& operator=(ShiftedBitMatrix&& other) noexcept
    {
        other.swap(*this);
        return *this;
    }

    ShiftedBitMatrix& operator=(const ShiftedBitMatrix& other)
    {
        ShiftedBitMatrix temp = other;
        temp.swap(*this);
        return *this;
    }

    void swap(ShiftedBitMatrix& rhs) noexcept
    {
        using std::swap;
        swap(m_matrix, rhs.m_matrix);
        swap(m_offsets, rhs.m_offsets);
    }

    bool test_bit(size_t row, size_t col, bool default_ = false) const noexcept
    {
        ptrdiff_t offset = m_offsets[row];

        if (offset < 0) {
            col += static_cast<size_t>(-offset);
        }
        else if (col >= static_cast<size_t>(offset)) {
            col -= static_cast<size_t>(offset);
        }
        /* bit on the left of the band */
        else {
            return default_;
        }

        size_t word_size = sizeof(value_type) * 8;
        size_t col_word = col / word_size;
        value_type col_mask = value_type(1) << (col % word_size);

        return bool(m_matrix[row][col_word] & col_mask);
    }

    auto operator[](size_t row) noexcept
    {
        return m_matrix[row];
    }

    auto operator[](size_t row) const noexcept
    {
        return m_matrix[row];
    }

    void set_offset(size_t row, ptrdiff_t offset)
    {
        m_offsets[row] = offset;
    }

private:
    BitMatrix<value_type> m_matrix;
    std::vector<ptrdiff_t> m_offsets;
};

} // namespace rapidfuzz::detail
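Illustrative usage sketch (not part of the commit or the vendored sources): ShiftedBitMatrix stores only a band of bits per row and records a per-row column offset, so test_bit maps a logical column back into the stored band, returning the default for columns left of it.

#include <rapidfuzz/details/Matrix.hpp>
#include <cassert>
#include <stdint.h>

int main()
{
    // 2 rows, 1 word of 64 bits per row, all bits cleared
    rapidfuzz::detail::ShiftedBitMatrix<uint64_t> m(2, 1, 0);
    m.set_offset(1, 3);          // row 1 is logically shifted by 3 columns
    m[1][0] |= uint64_t(1) << 2; // set physical bit 2 of row 1

    assert(m.test_bit(1, 5));    // logical column 5 maps to physical bit 2
    assert(!m.test_bit(1, 2));   // left of the stored band -> default (false)
    return 0;
}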
222
src/external/rapidfuzz-cpp/rapidfuzz/details/PatternMatchVector.hpp
vendored
Normal file
@@ -0,0 +1,222 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (c) 2022 Max Bachmann */

#pragma once
#include <array>
#include <stdint.h>
#include <stdio.h>

#include <rapidfuzz/details/GrowingHashmap.hpp>
#include <rapidfuzz/details/Matrix.hpp>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/details/intrinsics.hpp>

namespace rapidfuzz::detail {

struct BitvectorHashmap {
    BitvectorHashmap() : m_map()
    {}

    template <typename CharT>
    uint64_t get(CharT key) const noexcept
    {
        return m_map[lookup(static_cast<uint64_t>(key))].value;
    }

    template <typename CharT>
    uint64_t& operator[](CharT key) noexcept
    {
        uint32_t i = lookup(static_cast<uint64_t>(key));
        m_map[i].key = static_cast<uint64_t>(key);
        return m_map[i].value;
    }

private:
    /**
     * lookup key inside the hashmap using a similar collision resolution
     * strategy to CPython and Ruby
     */
    uint32_t lookup(uint64_t key) const noexcept
    {
        uint32_t i = key % 128;

        if (!m_map[i].value || m_map[i].key == key) return i;

        uint64_t perturb = key;
        while (true) {
            i = (static_cast<uint64_t>(i) * 5 + perturb + 1) % 128;
            if (!m_map[i].value || m_map[i].key == key) return i;

            perturb >>= 5;
        }
    }

    struct MapElem {
        uint64_t key = 0;
        uint64_t value = 0;
    };
    std::array<MapElem, 128> m_map;
};

struct PatternMatchVector {
    PatternMatchVector() : m_extendedAscii()
    {}

    template <typename InputIt>
    PatternMatchVector(const Range<InputIt>& s) : m_extendedAscii()
    {
        insert(s);
    }

    size_t size() const noexcept
    {
        return 1;
    }

    template <typename InputIt>
    void insert(const Range<InputIt>& s) noexcept
    {
        uint64_t mask = 1;
        for (const auto& ch : s) {
            insert_mask(ch, mask);
            mask <<= 1;
        }
    }

    template <typename CharT>
    void insert(CharT key, int64_t pos) noexcept
    {
        insert_mask(key, UINT64_C(1) << pos);
    }

    uint64_t get(char key) const noexcept
    {
        /** treat char as value between 0 and 127 for performance reasons */
        return m_extendedAscii[static_cast<uint8_t>(key)];
    }

    template <typename CharT>
    uint64_t get(CharT key) const noexcept
    {
        if (key >= 0 && key <= 255)
            return m_extendedAscii[static_cast<uint8_t>(key)];
        else
            return m_map.get(key);
    }

    template <typename CharT>
    uint64_t get(size_t block, CharT key) const noexcept
    {
        assert(block == 0);
        (void)block;
        return get(key);
    }

    void insert_mask(char key, uint64_t mask) noexcept
    {
        /** treat char as value between 0 and 127 for performance reasons */
        m_extendedAscii[static_cast<uint8_t>(key)] |= mask;
    }

    template <typename CharT>
    void insert_mask(CharT key, uint64_t mask) noexcept
    {
        if (key >= 0 && key <= 255)
            m_extendedAscii[static_cast<uint8_t>(key)] |= mask;
        else
            m_map[key] |= mask;
    }

private:
    BitvectorHashmap m_map;
    std::array<uint64_t, 256> m_extendedAscii;
};

struct BlockPatternMatchVector {
    BlockPatternMatchVector() = delete;

    BlockPatternMatchVector(size_t str_len)
        : m_block_count(ceil_div(str_len, 64)), m_map(nullptr), m_extendedAscii(256, m_block_count, 0)
    {}

    template <typename InputIt>
    BlockPatternMatchVector(const Range<InputIt>& s) : BlockPatternMatchVector(s.size())
    {
        insert(s);
    }

    ~BlockPatternMatchVector()
    {
        delete[] m_map;
    }

    size_t size() const noexcept
    {
        return m_block_count;
    }

    template <typename CharT>
    void insert(size_t block, CharT ch, int pos) noexcept
    {
        uint64_t mask = UINT64_C(1) << pos;
        insert_mask(block, ch, mask);
    }

    /**
     * @warning undefined behavior if iterator \p first is greater than \p last
     * @tparam InputIt
     * @param first
     * @param last
     */
    template <typename InputIt>
    void insert(const Range<InputIt>& s) noexcept
    {
        uint64_t mask = 1;
        size_t i = 0;
        for (auto iter = s.begin(); iter != s.end(); ++iter, ++i) {
            size_t block = i / 64;
            insert_mask(block, *iter, mask);
            mask = rotl(mask, 1);
        }
    }

    template <typename CharT>
    void insert_mask(size_t block, CharT key, uint64_t mask) noexcept
    {
        assert(block < size());
        if (key >= 0 && key <= 255)
            m_extendedAscii[static_cast<uint8_t>(key)][block] |= mask;
        else {
            if (!m_map) m_map = new BitvectorHashmap[m_block_count];
            m_map[block][key] |= mask;
        }
    }

    void insert_mask(size_t block, char key, uint64_t mask) noexcept
    {
        insert_mask(block, static_cast<uint8_t>(key), mask);
    }

    template <typename CharT>
    uint64_t get(size_t block, CharT key) const noexcept
    {
        if (key >= 0 && key <= 255)
            return m_extendedAscii[static_cast<uint8_t>(key)][block];
        else if (m_map)
            return m_map[block].get(key);
        else
            return 0;
    }

    uint64_t get(size_t block, char ch) const noexcept
    {
        return get(block, static_cast<uint8_t>(ch));
    }

private:
    size_t m_block_count;
    BitvectorHashmap* m_map;
    BitMatrix<uint64_t> m_extendedAscii;
};

} // namespace rapidfuzz::detail
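Illustrative usage sketch (not part of the commit or the vendored sources): PatternMatchVector precomputes, for every character, a bitmask of the positions where it occurs in the pattern, which is the building block of the bit-parallel matchers; BlockPatternMatchVector does the same in 64-bit blocks for longer patterns.

#include <rapidfuzz/details/PatternMatchVector.hpp>
#include <cassert>
#include <string>

int main()
{
    std::string pattern = "abab";
    rapidfuzz::detail::PatternMatchVector PM{rapidfuzz::detail::Range(pattern)};

    // bit i of the returned mask is set when pattern[i] equals the queried character
    assert(PM.get('a') == 0b0101);
    assert(PM.get('b') == 0b1010);
    assert(PM.get('z') == 0);
    return 0;
}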
230
src/external/rapidfuzz-cpp/rapidfuzz/details/Range.hpp
vendored
Normal file
@@ -0,0 +1,230 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (c) 2022 Max Bachmann */

#pragma once

#include <cassert>
#include <cstddef>
#include <iterator>
#include <limits>
#include <ostream>
#include <stdexcept>
#include <stdint.h>
#include <sys/types.h>
#include <vector>

namespace rapidfuzz::detail {

static inline void assume(bool b)
{
#if defined(__clang__)
    __builtin_assume(b);
#elif defined(__GNUC__) || defined(__GNUG__)
    if (!b) __builtin_unreachable();
#elif defined(_MSC_VER)
    __assume(b);
#endif
}

template <typename CharT>
CharT* to_begin(CharT* s)
{
    return s;
}

template <typename T>
auto to_begin(T& x)
{
    using std::begin;
    return begin(x);
}

template <typename CharT>
CharT* to_end(CharT* s)
{
    assume(s != nullptr);
    while (*s != 0)
        ++s;

    return s;
}

template <typename T>
auto to_end(T& x)
{
    using std::end;
    return end(x);
}

template <typename Iter>
class Range {
    Iter _first;
    Iter _last;
    // todo we might not want to cache the size for iterators
    // that can retrieve the size in O(1) time
    size_t _size;

public:
    using value_type = typename std::iterator_traits<Iter>::value_type;
    using iterator = Iter;
    using reverse_iterator = std::reverse_iterator<iterator>;

    constexpr Range(Iter first, Iter last) : _first(first), _last(last)
    {
        assert(std::distance(_first, _last) >= 0);
        _size = static_cast<size_t>(std::distance(_first, _last));
    }

    constexpr Range(Iter first, Iter last, size_t size) : _first(first), _last(last), _size(size)
    {}

    template <typename T>
    constexpr Range(T& x) : _first(to_begin(x)), _last(to_end(x))
    {
        assert(std::distance(_first, _last) >= 0);
        _size = static_cast<size_t>(std::distance(_first, _last));
    }

    constexpr iterator begin() const noexcept
    {
        return _first;
    }
    constexpr iterator end() const noexcept
    {
        return _last;
    }

    constexpr reverse_iterator rbegin() const noexcept
    {
        return reverse_iterator(end());
    }
    constexpr reverse_iterator rend() const noexcept
    {
        return reverse_iterator(begin());
    }

    constexpr size_t size() const
    {
        return _size;
    }

    constexpr bool empty() const
    {
        return size() == 0;
    }
    explicit constexpr operator bool() const
    {
        return !empty();
    }

    template <
        typename... Dummy, typename IterCopy = Iter,
        typename = std::enable_if_t<std::is_base_of_v<
            std::random_access_iterator_tag, typename std::iterator_traits<IterCopy>::iterator_category>>>
    constexpr decltype(auto) operator[](size_t n) const
    {
        return _first[static_cast<ptrdiff_t>(n)];
    }

    constexpr void remove_prefix(size_t n)
    {
        if constexpr (std::is_base_of_v<std::random_access_iterator_tag,
                                        typename std::iterator_traits<Iter>::iterator_category>)
            _first += static_cast<ptrdiff_t>(n);
        else
            for (size_t i = 0; i < n; ++i)
                _first++;

        _size -= n;
    }
    constexpr void remove_suffix(size_t n)
    {
        if constexpr (std::is_base_of_v<std::random_access_iterator_tag,
                                        typename std::iterator_traits<Iter>::iterator_category>)
            _last -= static_cast<ptrdiff_t>(n);
        else
            for (size_t i = 0; i < n; ++i)
                _last--;

        _size -= n;
    }

    constexpr Range subseq(size_t pos = 0, size_t count = std::numeric_limits<size_t>::max())
    {
        if (pos > size()) throw std::out_of_range("Index out of range in Range::substr");

        Range res = *this;
        res.remove_prefix(pos);
        if (count < res.size()) res.remove_suffix(res.size() - count);

        return res;
    }

    constexpr decltype(auto) front() const
    {
        return *(_first);
    }

    constexpr decltype(auto) back() const
    {
        return *(_last - 1);
    }

    constexpr Range<reverse_iterator> reversed() const
    {
        return {rbegin(), rend(), _size};
    }

    friend std::ostream& operator<<(std::ostream& os, const Range& seq)
    {
        os << "[";
        for (auto x : seq)
            os << static_cast<uint64_t>(x) << ", ";
        os << "]";
        return os;
    }
};

template <typename T>
Range(T& x) -> Range<decltype(to_begin(x))>;

template <typename InputIt1, typename InputIt2>
inline bool operator==(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return std::equal(a.begin(), a.end(), b.begin(), b.end());
}

template <typename InputIt1, typename InputIt2>
inline bool operator!=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(a == b);
}

template <typename InputIt1, typename InputIt2>
inline bool operator<(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return (std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end()));
}

template <typename InputIt1, typename InputIt2>
inline bool operator>(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return b < a;
}

template <typename InputIt1, typename InputIt2>
inline bool operator<=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(b < a);
}

template <typename InputIt1, typename InputIt2>
inline bool operator>=(const Range<InputIt1>& a, const Range<InputIt2>& b)
{
    return !(a < b);
}

template <typename InputIt>
using RangeVec = std::vector<Range<InputIt>>;

} // namespace rapidfuzz::detail
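Illustrative usage sketch (not part of the commit or the vendored sources): Range is a lightweight, iterator-pair view used throughout the library; trimming and reversing operate on the view without copying the underlying sequence.

#include <rapidfuzz/details/Range.hpp>
#include <cassert>
#include <string>

int main()
{
    std::string s = "  hello  ";
    rapidfuzz::detail::Range r(s);   // deduced as Range<std::string::iterator>

    r.remove_prefix(2);
    r.remove_suffix(2);
    assert(r.size() == 5);
    assert(r.front() == 'h' && r.back() == 'o');

    auto rev = r.reversed();         // view over reverse iterators, same storage
    assert(*rev.begin() == 'o');
    assert(r == r.subseq(0));        // subseq with defaults is a no-op copy of the view
    return 0;
}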
86
src/external/rapidfuzz-cpp/rapidfuzz/details/SplittedSentenceView.hpp
vendored
Normal file
@@ -0,0 +1,86 @@
#pragma once
#include <algorithm>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/details/type_traits.hpp>

namespace rapidfuzz::detail {

template <typename InputIt>
class SplittedSentenceView {
public:
    using CharT = iter_value_t<InputIt>;

    SplittedSentenceView(RangeVec<InputIt> sentence) noexcept(
        std::is_nothrow_move_constructible_v<RangeVec<InputIt>>)
        : m_sentence(std::move(sentence))
    {}

    size_t dedupe();
    size_t size() const;

    size_t length() const
    {
        return size();
    }

    bool empty() const
    {
        return m_sentence.empty();
    }

    size_t word_count() const
    {
        return m_sentence.size();
    }

    std::vector<CharT> join() const;

    const RangeVec<InputIt>& words() const
    {
        return m_sentence;
    }

private:
    RangeVec<InputIt> m_sentence;
};

template <typename InputIt>
size_t SplittedSentenceView<InputIt>::dedupe()
{
    size_t old_word_count = word_count();
    m_sentence.erase(std::unique(m_sentence.begin(), m_sentence.end()), m_sentence.end());
    return old_word_count - word_count();
}

template <typename InputIt>
size_t SplittedSentenceView<InputIt>::size() const
{
    if (m_sentence.empty()) return 0;

    // there is a whitespace between each word
    size_t result = m_sentence.size() - 1;
    for (const auto& word : m_sentence) {
        result += static_cast<size_t>(std::distance(word.begin(), word.end()));
    }

    return result;
}

template <typename InputIt>
auto SplittedSentenceView<InputIt>::join() const -> std::vector<CharT>
{
    if (m_sentence.empty()) {
        return std::vector<CharT>();
    }

    auto sentence_iter = m_sentence.begin();
    std::vector<CharT> joined(sentence_iter->begin(), sentence_iter->end());
    ++sentence_iter;
    for (; sentence_iter != m_sentence.end(); ++sentence_iter) {
        joined.push_back(0x20);
        joined.insert(joined.end(), sentence_iter->begin(), sentence_iter->end());
    }
    return joined;
}

} // namespace rapidfuzz::detail
101
src/external/rapidfuzz-cpp/rapidfuzz/details/common.hpp
vendored
Normal file
@@ -0,0 +1,101 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2021 Max Bachmann */

#pragma once
#include <cstring>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/details/SplittedSentenceView.hpp>
#include <rapidfuzz/details/intrinsics.hpp>
#include <rapidfuzz/details/type_traits.hpp>
#include <rapidfuzz/details/types.hpp>

#if defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)
#    include <mm_malloc.h>
#endif

namespace rapidfuzz::detail {

template <typename InputIt1, typename InputIt2, typename InputIt3>
struct DecomposedSet {
    SplittedSentenceView<InputIt1> difference_ab;
    SplittedSentenceView<InputIt2> difference_ba;
    SplittedSentenceView<InputIt3> intersection;
    DecomposedSet(SplittedSentenceView<InputIt1> diff_ab, SplittedSentenceView<InputIt2> diff_ba,
                  SplittedSentenceView<InputIt3> intersect)
        : difference_ab(std::move(diff_ab)),
          difference_ba(std::move(diff_ba)),
          intersection(std::move(intersect))
    {}
};

static inline size_t abs_diff(size_t a, size_t b)
{
    return a > b ? a - b : b - a;
}

template <typename TO, typename FROM>
TO opt_static_cast(const FROM& value)
{
    if constexpr (std::is_same_v<TO, FROM>)
        return value;
    else
        return static_cast<TO>(value);
}

/**
 * @defgroup Common Common
 * Common utilities shared among multiple functions
 * @{
 */

static inline double NormSim_to_NormDist(double score_cutoff, double imprecision = 0.00001)
{
    return std::min(1.0, 1.0 - score_cutoff + imprecision);
}

template <typename InputIt1, typename InputIt2>
DecomposedSet<InputIt1, InputIt2, InputIt1> set_decomposition(SplittedSentenceView<InputIt1> a,
                                                              SplittedSentenceView<InputIt2> b);

template <typename InputIt1, typename InputIt2>
StringAffix remove_common_affix(Range<InputIt1>& s1, Range<InputIt2>& s2);

template <typename InputIt1, typename InputIt2>
size_t remove_common_prefix(Range<InputIt1>& s1, Range<InputIt2>& s2);

template <typename InputIt1, typename InputIt2>
size_t remove_common_suffix(Range<InputIt1>& s1, Range<InputIt2>& s2);

template <typename InputIt, typename CharT = iter_value_t<InputIt>>
SplittedSentenceView<InputIt> sorted_split(InputIt first, InputIt last);

static inline void* rf_aligned_alloc(size_t alignment, size_t size)
{
#if defined(_WIN32)
    return _aligned_malloc(size, alignment);
#elif defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)
    return _mm_malloc(size, alignment);
#elif defined(__ANDROID__) && __ANDROID_API__ > 16
    void* ptr = nullptr;
    return posix_memalign(&ptr, alignment, size) ? nullptr : ptr;
#else
    return aligned_alloc(alignment, size);
#endif
}

static inline void rf_aligned_free(void* ptr)
{
#if defined(_WIN32)
    _aligned_free(ptr);
#elif defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)
    _mm_free(ptr);
#else
    free(ptr);
#endif
}

/**@}*/

} // namespace rapidfuzz::detail

#include <rapidfuzz/details/common_impl.hpp>
172
src/external/rapidfuzz-cpp/rapidfuzz/details/common_impl.hpp
vendored
Normal file
@@ -0,0 +1,172 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020 Max Bachmann */

#include <algorithm>
#include <array>
#include <iterator>

namespace rapidfuzz::detail {

template <typename InputIt1, typename InputIt2>
DecomposedSet<InputIt1, InputIt2, InputIt1> set_decomposition(SplittedSentenceView<InputIt1> a,
                                                              SplittedSentenceView<InputIt2> b)
{
    a.dedupe();
    b.dedupe();

    RangeVec<InputIt1> intersection;
    RangeVec<InputIt1> difference_ab;
    RangeVec<InputIt2> difference_ba = b.words();

    for (const auto& current_a : a.words()) {
        auto element_b = std::find(difference_ba.begin(), difference_ba.end(), current_a);

        if (element_b != difference_ba.end()) {
            difference_ba.erase(element_b);
            intersection.push_back(current_a);
        }
        else {
            difference_ab.push_back(current_a);
        }
    }

    return {difference_ab, difference_ba, intersection};
}

/**
 * Removes common prefix of two string views
 */
template <typename InputIt1, typename InputIt2>
size_t remove_common_prefix(Range<InputIt1>& s1, Range<InputIt2>& s2)
{
    auto first1 = std::begin(s1);
    size_t prefix = static_cast<size_t>(
        std::distance(first1, std::mismatch(first1, std::end(s1), std::begin(s2), std::end(s2)).first));
    s1.remove_prefix(prefix);
    s2.remove_prefix(prefix);
    return prefix;
}

/**
 * Removes common suffix of two string views
 */
template <typename InputIt1, typename InputIt2>
size_t remove_common_suffix(Range<InputIt1>& s1, Range<InputIt2>& s2)
{
    auto rfirst1 = std::rbegin(s1);
    size_t suffix = static_cast<size_t>(
        std::distance(rfirst1, std::mismatch(rfirst1, std::rend(s1), std::rbegin(s2), std::rend(s2)).first));
    s1.remove_suffix(suffix);
    s2.remove_suffix(suffix);
    return suffix;
}

/**
 * Removes common affix of two string views
 */
template <typename InputIt1, typename InputIt2>
StringAffix remove_common_affix(Range<InputIt1>& s1, Range<InputIt2>& s2)
{
    return StringAffix{remove_common_prefix(s1, s2), remove_common_suffix(s1, s2)};
}

template <typename, typename = void>
struct is_space_dispatch_tag : std::integral_constant<int, 0> {};

template <typename CharT>
struct is_space_dispatch_tag<CharT, typename std::enable_if<sizeof(CharT) == 1>::type>
    : std::integral_constant<int, 1> {};

/*
 * Implementation of is_space for char types that are at least 2 Byte in size
 */
template <typename CharT>
bool is_space_impl(const CharT ch, std::integral_constant<int, 0>)
{
    switch (ch) {
    case 0x0009:
    case 0x000A:
    case 0x000B:
    case 0x000C:
    case 0x000D:
    case 0x001C:
    case 0x001D:
    case 0x001E:
    case 0x001F:
    case 0x0020:
    case 0x0085:
    case 0x00A0:
    case 0x1680:
    case 0x2000:
    case 0x2001:
    case 0x2002:
    case 0x2003:
    case 0x2004:
    case 0x2005:
    case 0x2006:
    case 0x2007:
    case 0x2008:
    case 0x2009:
    case 0x200A:
    case 0x2028:
    case 0x2029:
    case 0x202F:
    case 0x205F:
    case 0x3000: return true;
    }
    return false;
}

/*
 * Implementation of is_space for char types that are 1 Byte in size
 */
template <typename CharT>
bool is_space_impl(const CharT ch, std::integral_constant<int, 1>)
{
    switch (ch) {
    case 0x0009:
    case 0x000A:
    case 0x000B:
    case 0x000C:
    case 0x000D:
    case 0x001C:
    case 0x001D:
    case 0x001E:
    case 0x001F:
    case 0x0020: return true;
    }
    return false;
}

/*
 * checks whether unicode characters have the bidirectional
 * type 'WS', 'B' or 'S' or the category 'Zs'
 */
template <typename CharT>
bool is_space(const CharT ch)
{
    return is_space_impl(ch, is_space_dispatch_tag<CharT>{});
}

template <typename InputIt, typename CharT>
SplittedSentenceView<InputIt> sorted_split(InputIt first, InputIt last)
{
    RangeVec<InputIt> splitted;
    auto second = first;

    for (; first != last; first = second + 1) {
        second = std::find_if(first, last, is_space<CharT>);

        if (first != second) {
            splitted.emplace_back(first, second);
        }

        if (second == last) break;
    }

    std::sort(splitted.begin(), splitted.end());

    return SplittedSentenceView<InputIt>(splitted);
}

} // namespace rapidfuzz::detail
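Illustrative usage sketch (not part of the commit or the vendored sources): remove_common_prefix/remove_common_suffix trim both views in place and report how much was removed, and sorted_split tokenizes on whitespace, sorts the words, and feeds the token-based scorers via SplittedSentenceView.

#include <rapidfuzz/details/common.hpp>
#include <cassert>
#include <string>

int main()
{
    using namespace rapidfuzz::detail;

    std::string a = "prefix middle_a suffix";
    std::string b = "prefix middle_b suffix";
    Range s1(a);
    Range s2(b);
    auto prefix = remove_common_prefix(s1, s2); // trims "prefix middle_"
    auto suffix = remove_common_suffix(s1, s2); // trims " suffix"
    assert(prefix == 14 && suffix == 7);
    assert(s1.size() == 1 && s1.front() == 'a');

    std::string words = "delta  alpha charlie";
    auto split = sorted_split(words.begin(), words.end());
    assert(split.word_count() == 3); // runs of whitespace are skipped
    auto joined = split.join();
    assert(std::string(joined.begin(), joined.end()) == "alpha charlie delta");
    return 0;
}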
548
src/external/rapidfuzz-cpp/rapidfuzz/details/distance.hpp
vendored
Normal file
@@ -0,0 +1,548 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/simd.hpp>
|
||||
#include <type_traits>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
template <typename T, typename... Args>
|
||||
struct NormalizedMetricBase {
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static double normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
Args... args, double score_cutoff, double score_hint)
|
||||
{
|
||||
return _normalized_distance(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static double normalized_distance(const Sentence1& s1, const Sentence2& s2, Args... args,
|
||||
double score_cutoff, double score_hint)
|
||||
{
|
||||
return _normalized_distance(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff,
|
||||
score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static double normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
Args... args, double score_cutoff, double score_hint)
|
||||
{
|
||||
return _normalized_similarity(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static double normalized_similarity(const Sentence1& s1, const Sentence2& s2, Args... args,
|
||||
double score_cutoff, double score_hint)
|
||||
{
|
||||
return _normalized_similarity(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff,
|
||||
score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static double _normalized_distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, Args... args,
|
||||
double score_cutoff, double score_hint)
|
||||
{
|
||||
auto maximum = T::maximum(s1, s2, args...);
|
||||
auto cutoff_distance =
|
||||
static_cast<decltype(maximum)>(std::ceil(static_cast<double>(maximum) * score_cutoff));
|
||||
auto hint_distance =
|
||||
static_cast<decltype(maximum)>(std::ceil(static_cast<double>(maximum) * score_hint));
|
||||
auto dist = T::_distance(s1, s2, std::forward<Args>(args)..., cutoff_distance, hint_distance);
|
||||
double norm_dist = (maximum != 0) ? static_cast<double>(dist) / static_cast<double>(maximum) : 0.0;
|
||||
return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static double _normalized_similarity(const Range<InputIt1>& s1, const Range<InputIt2>& s2, Args... args,
|
||||
double score_cutoff, double score_hint)
|
||||
{
|
||||
double cutoff_score = NormSim_to_NormDist(score_cutoff);
|
||||
double hint_score = NormSim_to_NormDist(score_hint);
|
||||
double norm_dist =
|
||||
_normalized_distance(s1, s2, std::forward<Args>(args)..., cutoff_score, hint_score);
|
||||
double norm_sim = 1.0 - norm_dist;
|
||||
return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
|
||||
}
|
||||
|
||||
NormalizedMetricBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance, typename... Args>
|
||||
struct DistanceBase : public NormalizedMetricBase<T, Args...> {
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static ResType distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, Args... args,
|
||||
ResType score_cutoff, ResType score_hint)
|
||||
{
|
||||
return T::_distance(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static ResType distance(const Sentence1& s1, const Sentence2& s2, Args... args, ResType score_cutoff,
|
||||
ResType score_hint)
|
||||
{
|
||||
return T::_distance(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static ResType similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, Args... args,
|
||||
ResType score_cutoff, ResType score_hint)
|
||||
{
|
||||
return _similarity(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static ResType similarity(const Sentence1& s1, const Sentence2& s2, Args... args, ResType score_cutoff,
|
||||
ResType score_hint)
|
||||
{
|
||||
return _similarity(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static ResType _similarity(Range<InputIt1> s1, Range<InputIt2> s2, Args... args, ResType score_cutoff,
|
||||
ResType score_hint)
|
||||
{
|
||||
auto maximum = T::maximum(s1, s2, args...);
|
||||
if (score_cutoff > maximum) return 0;
|
||||
|
||||
score_hint = std::min(score_cutoff, score_hint);
|
||||
ResType cutoff_distance = maximum - score_cutoff;
|
||||
ResType hint_distance = maximum - score_hint;
|
||||
ResType dist = T::_distance(s1, s2, std::forward<Args>(args)..., cutoff_distance, hint_distance);
|
||||
ResType sim = maximum - dist;
|
||||
return (sim >= score_cutoff) ? sim : 0;
|
||||
}
|
||||
|
||||
DistanceBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance, typename... Args>
|
||||
struct SimilarityBase : public NormalizedMetricBase<T, Args...> {
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static ResType distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, Args... args,
|
||||
ResType score_cutoff, ResType score_hint)
|
||||
{
|
||||
return _distance(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static ResType distance(const Sentence1& s1, const Sentence2& s2, Args... args, ResType score_cutoff,
|
||||
ResType score_hint)
|
||||
{
|
||||
return _distance(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
static ResType similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, Args... args,
|
||||
ResType score_cutoff, ResType score_hint)
|
||||
{
|
||||
return T::_similarity(Range(first1, last1), Range(first2, last2), std::forward<Args>(args)...,
|
||||
score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
static ResType similarity(const Sentence1& s1, const Sentence2& s2, Args... args, ResType score_cutoff,
|
||||
ResType score_hint)
|
||||
{
|
||||
return T::_similarity(Range(s1), Range(s2), std::forward<Args>(args)..., score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static ResType _distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, Args... args,
|
||||
ResType score_cutoff, ResType score_hint)
|
||||
{
|
||||
auto maximum = T::maximum(s1, s2, args...);
|
||||
ResType cutoff_similarity =
|
||||
(maximum >= score_cutoff) ? maximum - score_cutoff : static_cast<ResType>(WorstSimilarity);
|
||||
ResType hint_similarity =
|
||||
(maximum >= score_hint) ? maximum - score_hint : static_cast<ResType>(WorstSimilarity);
|
||||
ResType sim = T::_similarity(s1, s2, std::forward<Args>(args)..., cutoff_similarity, hint_similarity);
|
||||
ResType dist = maximum - sim;
|
||||
|
||||
if constexpr (std::is_floating_point_v<ResType>)
|
||||
return (dist <= score_cutoff) ? dist : 1.0;
|
||||
else
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
SimilarityBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct CachedNormalizedMetricBase {
|
||||
template <typename InputIt2>
|
||||
double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0,
|
||||
double score_hint = 1.0) const
|
||||
{
|
||||
return _normalized_distance(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0, double score_hint = 1.0) const
|
||||
{
|
||||
return _normalized_distance(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const
|
||||
{
|
||||
return _normalized_similarity(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const
|
||||
{
|
||||
return _normalized_similarity(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
double _normalized_distance(const Range<InputIt2>& s2, double score_cutoff, double score_hint) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
auto maximum = derived.maximum(s2);
|
||||
auto cutoff_distance =
|
||||
static_cast<decltype(maximum)>(std::ceil(static_cast<double>(maximum) * score_cutoff));
|
||||
auto hint_distance =
|
||||
static_cast<decltype(maximum)>(std::ceil(static_cast<double>(maximum) * score_hint));
|
||||
double dist = static_cast<double>(derived._distance(s2, cutoff_distance, hint_distance));
|
||||
double norm_dist = (maximum != 0) ? dist / static_cast<double>(maximum) : 0.0;
|
||||
return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
double _normalized_similarity(const Range<InputIt2>& s2, double score_cutoff, double score_hint) const
|
||||
{
|
||||
double cutoff_score = NormSim_to_NormDist(score_cutoff);
|
||||
double hint_score = NormSim_to_NormDist(score_hint);
|
||||
double norm_dist = _normalized_distance(s2, cutoff_score, hint_score);
|
||||
double norm_sim = 1.0 - norm_dist;
|
||||
return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
|
||||
}
|
||||
|
||||
CachedNormalizedMetricBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance>
|
||||
struct CachedDistanceBase : public CachedNormalizedMetricBase<T> {
|
||||
template <typename InputIt2>
|
||||
ResType distance(InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance),
|
||||
ResType score_hint = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
return derived._distance(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
ResType distance(const Sentence2& s2, ResType score_cutoff = static_cast<ResType>(WorstDistance),
|
||||
ResType score_hint = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
return derived._distance(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
ResType similarity(InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity),
|
||||
ResType score_hint = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
return _similarity(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
ResType similarity(const Sentence2& s2, ResType score_cutoff = static_cast<ResType>(WorstSimilarity),
|
||||
ResType score_hint = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
return _similarity(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
ResType _similarity(const Range<InputIt2>& s2, ResType score_cutoff, ResType score_hint) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
ResType maximum = derived.maximum(s2);
|
||||
if (score_cutoff > maximum) return 0;
|
||||
|
||||
score_hint = std::min(score_cutoff, score_hint);
|
||||
ResType cutoff_distance = maximum - score_cutoff;
|
||||
ResType hint_distance = maximum - score_hint;
|
||||
ResType dist = derived._distance(s2, cutoff_distance, hint_distance);
|
||||
ResType sim = maximum - dist;
|
||||
return (sim >= score_cutoff) ? sim : 0;
|
||||
}
|
||||
|
||||
CachedDistanceBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance>
|
||||
struct CachedSimilarityBase : public CachedNormalizedMetricBase<T> {
|
||||
template <typename InputIt2>
|
||||
ResType distance(InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance),
|
||||
ResType score_hint = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
return _distance(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
ResType distance(const Sentence2& s2, ResType score_cutoff = static_cast<ResType>(WorstDistance),
|
||||
ResType score_hint = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
return _distance(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
ResType similarity(InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity),
|
||||
ResType score_hint = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
return derived._similarity(Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
ResType similarity(const Sentence2& s2, ResType score_cutoff = static_cast<ResType>(WorstSimilarity),
|
||||
ResType score_hint = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
return derived._similarity(Range(s2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
ResType _distance(const Range<InputIt2>& s2, ResType score_cutoff, ResType score_hint) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
ResType maximum = derived.maximum(s2);
|
||||
ResType cutoff_similarity = (maximum > score_cutoff) ? maximum - score_cutoff : 0;
|
||||
ResType hint_similarity = (maximum > score_hint) ? maximum - score_hint : 0;
|
||||
ResType sim = derived._similarity(s2, cutoff_similarity, hint_similarity);
|
||||
ResType dist = maximum - sim;
|
||||
|
||||
if constexpr (std::is_floating_point_v<ResType>)
|
||||
return (dist <= score_cutoff) ? dist : 1.0;
|
||||
else
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
CachedSimilarityBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType>
|
||||
struct MultiNormalizedMetricBase {
|
||||
template <typename InputIt2>
|
||||
void normalized_distance(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0) const
|
||||
{
|
||||
_normalized_distance(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void normalized_distance(double* scores, size_t score_count, const Sentence2& s2,
|
||||
double score_cutoff = 1.0) const
|
||||
{
|
||||
_normalized_distance(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void normalized_similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0) const
|
||||
{
|
||||
_normalized_similarity(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void normalized_similarity(double* scores, size_t score_count, const Sentence2& s2,
|
||||
double score_cutoff = 0.0) const
|
||||
{
|
||||
_normalized_similarity(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
void _normalized_distance(double* scores, size_t score_count, const Range<InputIt2>& s2,
|
||||
double score_cutoff = 1.0) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
if (score_count < derived.result_count())
|
||||
throw std::invalid_argument("scores has to have >= result_count() elements");
|
||||
|
||||
// reinterpretation only works when the types have the same size
|
||||
ResType* scores_orig = nullptr;
|
||||
if constexpr (sizeof(double) == sizeof(ResType))
|
||||
scores_orig = reinterpret_cast<ResType*>(scores);
|
||||
else
|
||||
scores_orig = new ResType[derived.result_count()];
|
||||
|
||||
derived.distance(scores_orig, derived.result_count(), s2);
|
||||
|
||||
for (size_t i = 0; i < derived.get_input_count(); ++i) {
|
||||
auto maximum = derived.maximum(i, s2);
|
||||
double norm_dist =
|
||||
(maximum != 0) ? static_cast<double>(scores_orig[i]) / static_cast<double>(maximum) : 0.0;
|
||||
scores[i] = (norm_dist <= score_cutoff) ? norm_dist : 1.0;
|
||||
}
|
||||
|
||||
if constexpr (sizeof(double) != sizeof(ResType)) delete[] scores_orig;
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void _normalized_similarity(double* scores, size_t score_count, const Range<InputIt2>& s2,
|
||||
double score_cutoff) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
_normalized_distance(scores, score_count, s2);
|
||||
|
||||
for (size_t i = 0; i < derived.get_input_count(); ++i) {
|
||||
double norm_sim = 1.0 - scores[i];
|
||||
scores[i] = (norm_sim >= score_cutoff) ? norm_sim : 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
MultiNormalizedMetricBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance>
|
||||
struct MultiDistanceBase : public MultiNormalizedMetricBase<T, ResType> {
|
||||
template <typename InputIt2>
|
||||
void distance(ResType* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._distance(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void distance(ResType* scores, size_t score_count, const Sentence2& s2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._distance(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void similarity(ResType* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
_similarity(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void similarity(ResType* scores, size_t score_count, const Sentence2& s2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
_similarity(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
void _similarity(ResType* scores, size_t score_count, const Range<InputIt2>& s2,
|
||||
ResType score_cutoff) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._distance(scores, score_count, s2);
|
||||
|
||||
for (size_t i = 0; i < derived.get_input_count(); ++i) {
|
||||
ResType maximum = derived.maximum(i, s2);
|
||||
ResType sim = maximum - scores[i];
|
||||
scores[i] = (sim >= score_cutoff) ? sim : 0;
|
||||
}
|
||||
}
|
||||
|
||||
MultiDistanceBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
template <typename T, typename ResType, int64_t WorstSimilarity, int64_t WorstDistance>
|
||||
struct MultiSimilarityBase : public MultiNormalizedMetricBase<T, ResType> {
|
||||
template <typename InputIt2>
|
||||
void distance(ResType* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
_distance(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void distance(ResType* scores, size_t score_count, const Sentence2& s2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstDistance)) const
|
||||
{
|
||||
_distance(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void similarity(ResType* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._similarity(scores, score_count, Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void similarity(ResType* scores, size_t score_count, const Sentence2& s2,
|
||||
ResType score_cutoff = static_cast<ResType>(WorstSimilarity)) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._similarity(scores, score_count, Range(s2), score_cutoff);
|
||||
}
|
||||
|
||||
protected:
|
||||
template <typename InputIt2>
|
||||
void _distance(ResType* scores, size_t score_count, const Range<InputIt2>& s2, ResType score_cutoff) const
|
||||
{
|
||||
const T& derived = static_cast<const T&>(*this);
|
||||
derived._similarity(scores, score_count, s2);
|
||||
|
||||
for (size_t i = 0; i < derived.get_input_count(); ++i) {
|
||||
ResType maximum = derived.maximum(i, s2);
|
||||
ResType dist = maximum - scores[i];
|
||||
|
||||
if constexpr (std::is_floating_point_v<ResType>)
|
||||
scores[i] = (dist <= score_cutoff) ? dist : 1.0;
|
||||
else
|
||||
scores[i] = (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
}
|
||||
|
||||
MultiSimilarityBase()
|
||||
{}
|
||||
friend T;
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
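These two CRTP bases derive one batch metric from the other: a derived class only has to provide _distance (or _similarity) together with maximum(i, s2), and the base obtains the counterpart as maximum - score, applying score_cutoff afterwards (integer results that miss the cutoff collapse to score_cutoff + 1, floating-point ones to 1.0). A minimal scalar sketch of that conversion for the integer case; the helper names are illustrative only, not part of rapidfuzz:

#include <cassert>
#include <cstdint>

// similarity = maximum - distance, as in MultiDistanceBase::_similarity above
static int64_t to_similarity(int64_t distance, int64_t maximum, int64_t score_cutoff)
{
    int64_t sim = maximum - distance;
    return (sim >= score_cutoff) ? sim : 0;
}

// distance = maximum - similarity, as in MultiSimilarityBase::_distance above
// (integer variant: results beyond the cutoff become score_cutoff + 1)
static int64_t to_distance(int64_t similarity, int64_t maximum, int64_t score_cutoff)
{
    int64_t dist = maximum - similarity;
    return (dist <= score_cutoff) ? dist : score_cutoff + 1;
}

int main()
{
    // e.g. comparing strings of length 6 and 8 with the Indel metric: maximum == 6 + 8
    assert(to_similarity(/*distance=*/4, /*maximum=*/14, /*score_cutoff=*/0) == 10);
    assert(to_distance(/*similarity=*/10, /*maximum=*/14, /*score_cutoff=*/5) == 4);
    assert(to_distance(/*similarity=*/2, /*maximum=*/14, /*score_cutoff=*/5) == 6); // cutoff missed
}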
212
src/external/rapidfuzz-cpp/rapidfuzz/details/intrinsics.hpp
vendored
Normal file
@@ -0,0 +1,212 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bitset>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
#include <type_traits>
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
template <typename T>
|
||||
T bit_mask_lsb(size_t n)
|
||||
{
|
||||
T mask = static_cast<T>(-1);
|
||||
if (n < sizeof(T) * 8) {
|
||||
mask += static_cast<T>(static_cast<T>(1) << n);
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool bittest(T a, int bit)
|
||||
{
|
||||
return (a >> bit) & 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* shift right without undefined behavior for shifts >= bit width
|
||||
*/
|
||||
template <typename U>
|
||||
constexpr uint64_t shr64(uint64_t a, U shift)
|
||||
{
|
||||
return (shift < 64) ? a >> shift : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* shift left without undefined behavior for shifts >= bit width
|
||||
*/
|
||||
template <typename U>
|
||||
constexpr uint64_t shl64(uint64_t a, U shift)
|
||||
{
|
||||
return (shift < 64) ? a << shift : 0;
|
||||
}
|
||||
|
||||
constexpr uint64_t addc64(uint64_t a, uint64_t b, uint64_t carryin, uint64_t* carryout)
|
||||
{
|
||||
/* todo should use _addcarry_u64 when available */
|
||||
a += carryin;
|
||||
*carryout = a < carryin;
|
||||
a += b;
|
||||
*carryout |= a < b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
constexpr T ceil_div(T a, U divisor)
|
||||
{
|
||||
T _div = static_cast<T>(divisor);
|
||||
return a / _div + static_cast<T>(a % _div != 0);
|
||||
}
|
||||
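These helpers are small but easy to misread, so here is a usage sketch that simply restates what the definitions above compute, assuming rapidfuzz-cpp's include directory is on the compiler's include path:

#include <rapidfuzz/details/intrinsics.hpp>
#include <cassert>
#include <cstdint>

int main()
{
    using namespace rapidfuzz::detail;

    assert(bit_mask_lsb<uint8_t>(4) == 0x0F);   // lowest 4 bits set
    assert(bit_mask_lsb<uint8_t>(8) == 0xFF);   // n == bit width -> all ones

    assert(shr64(1, 64) == 0);                  // shifts >= 64 are well defined here
    assert(shl64(1, 3) == 8);

    assert(ceil_div(10, 3) == 4);               // integer division rounded up
    assert(ceil_div(9, 3) == 3);

    uint64_t carry = 0;
    assert(addc64(UINT64_MAX, 1, 0, &carry) == 0 && carry == 1);
}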
|
||||
static inline size_t popcount(uint64_t x)
|
||||
{
|
||||
return std::bitset<64>(x).count();
|
||||
}
|
||||
|
||||
static inline size_t popcount(uint32_t x)
|
||||
{
|
||||
return std::bitset<32>(x).count();
|
||||
}
|
||||
|
||||
static inline size_t popcount(uint16_t x)
|
||||
{
|
||||
return std::bitset<16>(x).count();
|
||||
}
|
||||
|
||||
static inline size_t popcount(uint8_t x)
|
||||
{
|
||||
static constexpr uint8_t bit_count[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
||||
return bit_count[x];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
constexpr T rotl(T x, unsigned int n)
|
||||
{
|
||||
unsigned int num_bits = std::numeric_limits<T>::digits;
|
||||
assert(n < num_bits);
|
||||
unsigned int count_mask = num_bits - 1;
|
||||
|
||||
#if _MSC_VER && !defined(__clang__)
|
||||
# pragma warning(push)
|
||||
/* unary minus operator applied to unsigned type, result still unsigned */
|
||||
# pragma warning(disable : 4146)
|
||||
#endif
|
||||
return (x << n) | (x >> (-n & count_mask));
|
||||
#if _MSC_VER && !defined(__clang__)
|
||||
# pragma warning(pop)
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the lowest set bit from a. If no bits are set in a returns 0.
|
||||
*/
|
||||
template <typename T>
|
||||
constexpr T blsi(T a)
|
||||
{
|
||||
#if _MSC_VER && !defined(__clang__)
|
||||
# pragma warning(push)
|
||||
/* unary minus operator applied to unsigned type, result still unsigned */
|
||||
# pragma warning(disable : 4146)
|
||||
#endif
|
||||
return a & -a;
|
||||
#if _MSC_VER && !defined(__clang__)
|
||||
# pragma warning(pop)
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the lowest set bit in a.
|
||||
*/
|
||||
template <typename T>
|
||||
constexpr T blsr(T x)
|
||||
{
|
||||
return x & (x - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets all the lower bits of the result to 1, up to and including the lowest set bit (=1) in a.
|
||||
* If a is zero, blsmsk sets all bits to 1.
|
||||
*/
|
||||
template <typename T>
|
||||
constexpr T blsmsk(T a)
|
||||
{
|
||||
return a ^ (a - 1);
|
||||
}
|
||||
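blsi, blsr and blsmsk mirror the BMI1 instructions of the same names and are the usual building blocks for walking over the set bits of a word. A short sketch of that pattern, assuming rapidfuzz-cpp's include directory is on the include path (countr_zero is defined a little further down in this same file):

#include <rapidfuzz/details/intrinsics.hpp>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    using namespace rapidfuzz::detail;

    assert(blsi(0b10110000u) == 0b00010000u);    // isolate lowest set bit
    assert(blsr(0b10110000u) == 0b10100000u);    // clear lowest set bit
    assert(blsmsk(0b10100000u) == 0b00111111u);  // mask up to and including lowest set bit

    // typical loop: visit the index of every set bit, lowest first
    std::vector<unsigned> positions;
    for (uint32_t bits = 0b10010010u; bits != 0; bits = blsr(bits))
        positions.push_back(countr_zero(bits));
    assert((positions == std::vector<unsigned>{1, 4, 7}));
}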
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
static inline unsigned int countr_zero(uint32_t x)
|
||||
{
|
||||
unsigned long trailing_zero = 0;
|
||||
_BitScanForward(&trailing_zero, x);
|
||||
return trailing_zero;
|
||||
}
|
||||
|
||||
# if defined(_M_ARM) || defined(_M_X64)
|
||||
static inline unsigned int countr_zero(uint64_t x)
|
||||
{
|
||||
unsigned long trailing_zero = 0;
|
||||
_BitScanForward64(&trailing_zero, x);
|
||||
return trailing_zero;
|
||||
}
|
||||
# else
|
||||
static inline unsigned int countr_zero(uint64_t x)
|
||||
{
|
||||
uint32_t msh = (uint32_t)(x >> 32);
|
||||
uint32_t lsh = (uint32_t)(x & 0xFFFFFFFF);
|
||||
if (lsh != 0) return countr_zero(lsh);
|
||||
return 32 + countr_zero(msh);
|
||||
}
|
||||
# endif
|
||||
|
||||
#else /* gcc / clang */
|
||||
static inline unsigned int countr_zero(uint32_t x)
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_ctz(x));
|
||||
}
|
||||
|
||||
static inline unsigned int countr_zero(uint64_t x)
|
||||
{
|
||||
return static_cast<unsigned int>(__builtin_ctzll(x));
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline unsigned int countr_zero(uint16_t x)
|
||||
{
|
||||
return countr_zero(static_cast<uint32_t>(x));
|
||||
}
|
||||
|
||||
static inline unsigned int countr_zero(uint8_t x)
|
||||
{
|
||||
return countr_zero(static_cast<uint32_t>(x));
|
||||
}
|
||||
|
||||
template <class T, T... inds, class F>
|
||||
constexpr void unroll_impl(std::integer_sequence<T, inds...>, F&& f)
|
||||
{
|
||||
(f(std::integral_constant<T, inds>{}), ...);
|
||||
}
|
||||
|
||||
template <class T, T count, class F>
|
||||
constexpr void unroll(F&& f)
|
||||
{
|
||||
unroll_impl(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
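unroll is a compile-time for loop: unroll<T, count>(f) expands to f(0), f(1), ..., f(count - 1) via a fold expression, handing each index to f as a std::integral_constant so it can still be used in constant expressions. A small usage sketch, assuming the header is on the include path:

#include <rapidfuzz/details/intrinsics.hpp>
#include <array>
#include <cassert>
#include <cstddef>

int main()
{
    using rapidfuzz::detail::unroll;

    // fill an array with i*i, with the loop fully unrolled at compile time
    std::array<int, 4> squares{};
    unroll<size_t, 4>([&](auto i) { squares[i] = static_cast<int>(i) * static_cast<int>(i); });
    assert((squares == std::array<int, 4>{0, 1, 4, 9}));
}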
21
src/external/rapidfuzz-cpp/rapidfuzz/details/simd.hpp
vendored
Normal file
@@ -0,0 +1,21 @@

/* SPDX-License-Identifier: MIT */
/* Copyright © 2022 Max Bachmann */
#pragma once

/* RAPIDFUZZ_LTO_HACK is used to differentiate functions between different
 * translation units to avoid warnings when using lto */
#ifndef RAPIDFUZZ_EXCLUDE_SIMD
#    if __AVX2__
#        define RAPIDFUZZ_SIMD
#        define RAPIDFUZZ_AVX2
#        define RAPIDFUZZ_LTO_HACK 0
#        include <rapidfuzz/details/simd_avx2.hpp>

#    elif (defined(_M_AMD64) || defined(_M_X64)) || defined(__SSE2__)
#        define RAPIDFUZZ_SIMD
#        define RAPIDFUZZ_SSE2
#        define RAPIDFUZZ_LTO_HACK 1
#        include <rapidfuzz/details/simd_sse2.hpp>
#    endif
#endif
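This header picks the widest instruction set the translation unit is compiled for (AVX2 first, then SSE2) and defines RAPIDFUZZ_SIMD plus a flavor macro accordingly; defining RAPIDFUZZ_EXCLUDE_SIMD beforehand disables both. A dispatch sketch showing how these macros are meant to be consumed by code with a SIMD and a scalar path (illustration only, not code from the library):

#include <rapidfuzz/details/simd.hpp>
#include <cstdio>

int main()
{
#if defined(RAPIDFUZZ_AVX2)
    std::puts("compiled with the AVX2 kernels (rapidfuzz::detail::simd_avx2)");
#elif defined(RAPIDFUZZ_SSE2)
    std::puts("compiled with the SSE2 kernels (rapidfuzz::detail::simd_sse2)");
#else
    std::puts("RAPIDFUZZ_SIMD not defined: scalar fallback only");
#endif
}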
647
src/external/rapidfuzz-cpp/rapidfuzz/details/simd_avx2.hpp
vendored
Normal file
@@ -0,0 +1,647 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <immintrin.h>
|
||||
#include <ostream>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace rapidfuzz {
|
||||
namespace detail {
|
||||
namespace simd_avx2 {
|
||||
|
||||
template <typename T>
|
||||
class native_simd;
|
||||
|
||||
template <>
|
||||
class native_simd<uint64_t> {
|
||||
public:
|
||||
using value_type = uint64_t;
|
||||
|
||||
static constexpr int alignment = 32;
|
||||
static const int size = 4;
|
||||
__m256i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m256i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint64_t a) noexcept
|
||||
{
|
||||
xmm = _mm256_set1_epi64x(static_cast<int64_t>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m256i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm256_set_epi64x(static_cast<int64_t>(p[3]), static_cast<int64_t>(p[2]),
|
||||
static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint64_t* p) const noexcept
|
||||
{
|
||||
_mm256_store_si256(reinterpret_cast<__m256i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_add_epi64(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_add_epi64(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_sub_epi64(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm256_sub_epi64(_mm256_setzero_si256(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_sub_epi64(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint32_t> {
|
||||
public:
|
||||
using value_type = uint32_t;
|
||||
|
||||
static constexpr int alignment = 32;
|
||||
static const int size = 8;
|
||||
__m256i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m256i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint32_t a) noexcept
|
||||
{
|
||||
xmm = _mm256_set1_epi32(static_cast<int>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m256i() const
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm256_set_epi64x(static_cast<int64_t>(p[3]), static_cast<int64_t>(p[2]),
|
||||
static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint32_t* p) const noexcept
|
||||
{
|
||||
_mm256_store_si256(reinterpret_cast<__m256i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_add_epi32(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_add_epi32(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm256_sub_epi32(_mm256_setzero_si256(), xmm);
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_sub_epi32(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_sub_epi32(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint16_t> {
|
||||
public:
|
||||
using value_type = uint16_t;
|
||||
|
||||
static constexpr int alignment = 32;
|
||||
static const int size = 16;
|
||||
__m256i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m256i val) : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint16_t a) noexcept
|
||||
{
|
||||
xmm = _mm256_set1_epi16(static_cast<short>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m256i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm256_set_epi64x(static_cast<int64_t>(p[3]), static_cast<int64_t>(p[2]),
|
||||
static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint16_t* p) const noexcept
|
||||
{
|
||||
_mm256_store_si256(reinterpret_cast<__m256i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_add_epi16(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_add_epi16(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_sub_epi16(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm256_sub_epi16(_mm256_setzero_si256(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_sub_epi16(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint8_t> {
|
||||
public:
|
||||
using value_type = uint8_t;
|
||||
|
||||
static constexpr int alignment = 32;
|
||||
static const int size = 32;
|
||||
__m256i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m256i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint8_t a) noexcept
|
||||
{
|
||||
xmm = _mm256_set1_epi8(static_cast<char>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m256i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm256_set_epi64x(static_cast<int64_t>(p[3]), static_cast<int64_t>(p[2]),
|
||||
static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint8_t* p) const noexcept
|
||||
{
|
||||
_mm256_store_si256(reinterpret_cast<__m256i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_add_epi8(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_add_epi8(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm256_sub_epi8(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm256_sub_epi8(_mm256_setzero_si256(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm256_sub_epi8(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& os, const native_simd<T>& a)
|
||||
{
|
||||
alignas(native_simd<T>::alignment) std::array<T, native_simd<T>::size> res;
|
||||
a.store(&res[0]);
|
||||
|
||||
for (size_t i = res.size() - 1; i != 0; i--)
|
||||
os << std::bitset<std::numeric_limits<T>::digits>(res[i]) << "|";
|
||||
|
||||
os << std::bitset<std::numeric_limits<T>::digits>(res[0]);
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__m256i hadd_impl(__m256i x) noexcept;
|
||||
|
||||
template <>
|
||||
inline __m256i hadd_impl<uint8_t>(__m256i x) noexcept
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m256i hadd_impl<uint16_t>(__m256i x) noexcept
|
||||
{
|
||||
const __m256i mask = _mm256_set1_epi16(0x001f);
|
||||
__m256i y = _mm256_srli_si256(x, 1);
|
||||
x = _mm256_add_epi16(x, y);
|
||||
return _mm256_and_si256(x, mask);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m256i hadd_impl<uint32_t>(__m256i x) noexcept
|
||||
{
|
||||
const __m256i mask = _mm256_set1_epi32(0x0000003F);
|
||||
x = hadd_impl<uint16_t>(x);
|
||||
__m256i y = _mm256_srli_si256(x, 2);
|
||||
x = _mm256_add_epi32(x, y);
|
||||
return _mm256_and_si256(x, mask);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m256i hadd_impl<uint64_t>(__m256i x) noexcept
|
||||
{
|
||||
return _mm256_sad_epu8(x, _mm256_setzero_si256());
|
||||
}
|
||||
|
||||
/* based on the paper `Faster Population Counts Using AVX2 Instructions` */
|
||||
template <typename T>
|
||||
native_simd<T> popcount_impl(const native_simd<T>& v) noexcept
|
||||
{
|
||||
__m256i lookup = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3,
|
||||
1, 2, 2, 3, 2, 3, 3, 4);
|
||||
const __m256i low_mask = _mm256_set1_epi8(0x0F);
|
||||
__m256i lo = _mm256_and_si256(v, low_mask);
|
||||
__m256i hi = _mm256_and_si256(_mm256_srli_epi32(v, 4), low_mask);
|
||||
__m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
|
||||
__m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);
|
||||
__m256i total = _mm256_add_epi8(popcnt1, popcnt2);
|
||||
return hadd_impl<T>(total);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::array<T, native_simd<T>::size> popcount(const native_simd<T>& a) noexcept
|
||||
{
|
||||
alignas(native_simd<T>::alignment) std::array<T, native_simd<T>::size> res;
|
||||
popcount_impl(a).store(&res[0]);
|
||||
return res;
|
||||
}
|
||||
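popcount_impl follows the nibble-lookup approach from the cited paper: each byte is split into its two nibbles, both are looked up in a 16-entry table via _mm256_shuffle_epi8, the per-byte counts are added, and hadd_impl then widens them to the requested element width. A scalar restatement of the same idea (illustration only, not library code):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar illustration of the nibble lookup used by popcount_impl above.
static unsigned popcount64_nibble_lut(uint64_t x)
{
    static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
    unsigned total = 0;
    for (size_t byte = 0; byte < 8; ++byte) {
        uint8_t b = static_cast<uint8_t>(x >> (8 * byte));
        total += lut[b & 0x0F] + lut[b >> 4];   // low nibble + high nibble
    }
    return total;   // equivalent of hadd_impl<uint64_t> summing all byte counts
}

int main()
{
    assert(popcount64_nibble_lut(0) == 0);
    assert(popcount64_nibble_lut(0xFFFFFFFFFFFFFFFFULL) == 64);
    assert(popcount64_nibble_lut(0x8000000000000001ULL) == 2);
}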
|
||||
// function andnot: a & ~ b
|
||||
template <typename T>
|
||||
native_simd<T> andnot(const native_simd<T>& a, const native_simd<T>& b)
|
||||
{
|
||||
return _mm256_andnot_si256(b, a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator==(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi8(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator==(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator==(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator==(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi64(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> operator!=(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return ~(a == b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator<<(const native_simd<uint8_t>& a, int b) noexcept
|
||||
{
|
||||
char mask = static_cast<char>(0xFF >> b);
|
||||
__m256i am = _mm256_and_si256(a, _mm256_set1_epi8(mask));
|
||||
return _mm256_slli_epi16(am, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator<<(const native_simd<uint16_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_slli_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator<<(const native_simd<uint32_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_slli_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator<<(const native_simd<uint64_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_slli_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator>>(const native_simd<uint8_t>& a, int b) noexcept
|
||||
{
|
||||
char mask = static_cast<char>(0xFF << b);
|
||||
__m256i am = _mm256_and_si256(a, _mm256_set1_epi8(mask));
|
||||
return _mm256_srli_epi16(am, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>>(const native_simd<uint16_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_srli_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator>>(const native_simd<uint32_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_srli_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>>(const native_simd<uint64_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm256_srli_epi64(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator&(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_and_si256(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator&=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a & b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator|(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_or_si256(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator|=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a | b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator^(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_xor_si256(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator^=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a ^ b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator~(const native_simd<T>& a) noexcept
|
||||
{
|
||||
return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
|
||||
}
|
||||
|
||||
// potentially we want a special native_simd<bool> for this
|
||||
static inline native_simd<uint8_t> operator>=(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a); // a == max(a,b)
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>=(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a); // a == max(a,b)
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator>=(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
return _mm256_cmpeq_epi32(_mm256_max_epu32(a, b), a); // a == max(a,b)
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept;
|
||||
|
||||
static inline native_simd<uint64_t> operator>=(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
return ~(b > a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> operator<=(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return b >= a;
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator>(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return ~(b >= a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
return ~(b >= a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator>(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
__m256i signbit = _mm256_set1_epi32(static_cast<int32_t>(0x80000000));
|
||||
__m256i a1 = _mm256_xor_si256(a, signbit);
|
||||
__m256i b1 = _mm256_xor_si256(b, signbit);
|
||||
return _mm256_cmpgt_epi32(a1, b1); // signed compare
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
__m256i sign64 = native_simd<uint64_t>(0x8000000000000000);
|
||||
__m256i aflip = _mm256_xor_si256(a, sign64);
|
||||
__m256i bflip = _mm256_xor_si256(b, sign64);
|
||||
return _mm256_cmpgt_epi64(aflip, bflip); // signed compare
|
||||
}
|
||||
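AVX2 only provides signed compares (_mm256_cmpgt_epi32 / _mm256_cmpgt_epi64), so the operators above first XOR both operands with the sign bit, which maps unsigned order onto signed order. The same trick in scalar form (illustration only; the uint32_t-to-int32_t cast is implementation-defined before C++20 but modular on the targets in question):

#include <cassert>
#include <cstdint>

// Scalar form of the sign-flip trick used by operator> above.
static bool unsigned_gt_via_signed(uint32_t a, uint32_t b)
{
    int32_t a1 = static_cast<int32_t>(a ^ 0x80000000u);
    int32_t b1 = static_cast<int32_t>(b ^ 0x80000000u);
    return a1 > b1;   // signed compare now yields the unsigned result
}

int main()
{
    assert(unsigned_gt_via_signed(0xFFFFFFFFu, 1u));          // large unsigned values stay large
    assert(!unsigned_gt_via_signed(1u, 0xFFFFFFFFu));
    assert(unsigned_gt_via_signed(0x80000000u, 0x7FFFFFFFu));
    assert(!unsigned_gt_via_signed(5u, 5u));
}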
|
||||
template <typename T>
|
||||
static inline native_simd<T> operator<(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return b > a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> max8(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_max_epu8(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> max16(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_max_epu16(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> max32(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_max_epu32(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> min8(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_min_epu8(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> min16(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_min_epu16(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> min32(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm256_min_epu32(a, b);
|
||||
}
|
||||
|
||||
/* taken from https://stackoverflow.com/a/51807800/11335032 */
|
||||
static inline native_simd<uint8_t> sllv(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& count_) noexcept
|
||||
{
|
||||
__m256i mask_hi = _mm256_set1_epi32(static_cast<int32_t>(0xFF00FF00));
|
||||
__m256i multiplier_lut = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, char(128), 64, 32, 16, 8, 4, 2, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, char(128), 64, 32, 16, 8, 4, 2, 1);
|
||||
|
||||
__m256i count_sat =
|
||||
_mm256_min_epu8(count_, _mm256_set1_epi8(8)); /* AVX shift counts are not masked. So a_i << n_i = 0
|
||||
for n_i >= 8. count_sat is always less than 9.*/
|
||||
__m256i multiplier = _mm256_shuffle_epi8(
|
||||
multiplier_lut, count_sat); /* Select the right multiplication factor in the lookup table. */
|
||||
__m256i x_lo = _mm256_mullo_epi16(a, multiplier); /* Unfortunately _mm256_mullo_epi8 doesn't exist. Split
|
||||
the 16 bit elements in a high and low part. */
|
||||
|
||||
__m256i multiplier_hi = _mm256_srli_epi16(multiplier, 8); /* The multiplier of the high bits. */
|
||||
__m256i a_hi = _mm256_and_si256(a, mask_hi); /* Mask off the low bits. */
|
||||
__m256i x_hi = _mm256_mullo_epi16(a_hi, multiplier_hi);
|
||||
__m256i x = _mm256_blendv_epi8(x_lo, x_hi, mask_hi); /* Merge the high and low part. */
|
||||
return x;
|
||||
}
|
||||
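AVX2 has no per-byte variable shift, so the uint8_t overload above emulates a_i << n_i as a multiplication by 2^n_i, picking the factor out of a 16-byte lookup table with _mm256_shuffle_epi8 and clamping counts >= 8 to zero. A scalar restatement of that idea (not library code):

#include <cassert>
#include <cstdint>

// shift-by-n as multiply-by-2^n, with counts >= 8 yielding zero,
// matching the count_sat clamping in the AVX2 version above
static uint8_t shl8_via_mul(uint8_t a, uint8_t n)
{
    static const uint16_t factor[9] = {1, 2, 4, 8, 16, 32, 64, 128, 0};
    uint8_t n_sat = n < 8 ? n : 8;
    return static_cast<uint8_t>(a * factor[n_sat]);
}

int main()
{
    assert(shl8_via_mul(0x01, 3) == 0x08);
    assert(shl8_via_mul(0xFF, 4) == 0xF0);   // high bits drop out, as with a real shift
    assert(shl8_via_mul(0x01, 9) == 0x00);   // counts >= 8 give zero
}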
|
||||
/* taken from https://stackoverflow.com/a/51805592/11335032 */
|
||||
static inline native_simd<uint16_t> sllv(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& count) noexcept
|
||||
{
|
||||
const __m256i mask = _mm256_set1_epi32(static_cast<int32_t>(0xFFFF0000));
|
||||
__m256i low_half = _mm256_sllv_epi32(a, _mm256_andnot_si256(mask, count));
|
||||
__m256i high_half = _mm256_sllv_epi32(_mm256_and_si256(mask, a), _mm256_srli_epi32(count, 16));
|
||||
return _mm256_blend_epi16(low_half, high_half, 0xAA);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> sllv(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& count) noexcept
|
||||
{
|
||||
return _mm256_sllv_epi32(a, count);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> sllv(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& count) noexcept
|
||||
{
|
||||
return _mm256_sllv_epi64(a, count);
|
||||
}
|
||||
|
||||
} // namespace simd_avx2
|
||||
} // namespace detail
|
||||
} // namespace rapidfuzz
|
||||
602
src/external/rapidfuzz-cpp/rapidfuzz/details/simd_sse2.hpp
vendored
Normal file
@@ -0,0 +1,602 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022 Max Bachmann */
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <emmintrin.h>
|
||||
#include <ostream>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace rapidfuzz {
|
||||
namespace detail {
|
||||
namespace simd_sse2 {
|
||||
|
||||
template <typename T>
|
||||
class native_simd;
|
||||
|
||||
template <>
|
||||
class native_simd<uint64_t> {
|
||||
public:
|
||||
static constexpr int alignment = 16;
|
||||
static const int size = 2;
|
||||
__m128i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m128i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint64_t a) noexcept
|
||||
{
|
||||
xmm = _mm_set1_epi64x(static_cast<int64_t>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m128i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm_set_epi64x(static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint64_t* p) const noexcept
|
||||
{
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_add_epi64(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_add_epi64(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_sub_epi64(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm_sub_epi64(_mm_setzero_si128(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_sub_epi64(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint32_t> {
|
||||
public:
|
||||
static constexpr int alignment = 16;
|
||||
static const int size = 4;
|
||||
__m128i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m128i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint32_t a) noexcept
|
||||
{
|
||||
xmm = _mm_set1_epi32(static_cast<int>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m128i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm_set_epi64x(static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint32_t* p) const noexcept
|
||||
{
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_add_epi32(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_add_epi32(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_sub_epi32(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm_sub_epi32(_mm_setzero_si128(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_sub_epi32(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint16_t> {
|
||||
public:
|
||||
static constexpr int alignment = 16;
|
||||
static const int size = 8;
|
||||
__m128i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m128i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint16_t a) noexcept
|
||||
{
|
||||
xmm = _mm_set1_epi16(static_cast<short>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m128i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm_set_epi64x(static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint16_t* p) const noexcept
|
||||
{
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_add_epi16(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_add_epi16(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_sub_epi16(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm_sub_epi16(_mm_setzero_si128(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_sub_epi16(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class native_simd<uint8_t> {
|
||||
public:
|
||||
static constexpr int alignment = 16;
|
||||
static const int size = 16;
|
||||
__m128i xmm;
|
||||
|
||||
native_simd() noexcept
|
||||
{}
|
||||
|
||||
native_simd(__m128i val) noexcept : xmm(val)
|
||||
{}
|
||||
|
||||
native_simd(uint8_t a) noexcept
|
||||
{
|
||||
xmm = _mm_set1_epi8(static_cast<char>(a));
|
||||
}
|
||||
|
||||
native_simd(const uint64_t* p) noexcept
|
||||
{
|
||||
load(p);
|
||||
}
|
||||
|
||||
operator __m128i() const noexcept
|
||||
{
|
||||
return xmm;
|
||||
}
|
||||
|
||||
native_simd load(const uint64_t* p) noexcept
|
||||
{
|
||||
xmm = _mm_set_epi64x(static_cast<int64_t>(p[1]), static_cast<int64_t>(p[0]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
void store(uint8_t* p) const noexcept
|
||||
{
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(p), xmm);
|
||||
}
|
||||
|
||||
native_simd operator+(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_add_epi8(xmm, b);
|
||||
}
|
||||
|
||||
native_simd& operator+=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_add_epi8(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
native_simd operator-(const native_simd b) const noexcept
|
||||
{
|
||||
return _mm_sub_epi8(xmm, b);
|
||||
}
|
||||
|
||||
native_simd operator-() const noexcept
|
||||
{
|
||||
return _mm_sub_epi8(_mm_setzero_si128(), xmm);
|
||||
}
|
||||
|
||||
native_simd& operator-=(const native_simd b) noexcept
|
||||
{
|
||||
xmm = _mm_sub_epi8(xmm, b);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& os, const native_simd<T>& a)
|
||||
{
|
||||
alignas(native_simd<T>::alignment) std::array<T, native_simd<T>::size> res;
|
||||
a.store(&res[0]);
|
||||
|
||||
for (size_t i = res.size() - 1; i != 0; i--)
|
||||
os << std::bitset<std::numeric_limits<T>::digits>(res[i]) << "|";
|
||||
|
||||
os << std::bitset<std::numeric_limits<T>::digits>(res[0]);
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__m128i hadd_impl(__m128i x) noexcept;
|
||||
|
||||
template <>
|
||||
inline __m128i hadd_impl<uint8_t>(__m128i x) noexcept
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i hadd_impl<uint16_t>(__m128i x) noexcept
|
||||
{
|
||||
const __m128i mask = _mm_set1_epi16(0x001f);
|
||||
__m128i y = _mm_srli_si128(x, 1);
|
||||
x = _mm_add_epi16(x, y);
|
||||
return _mm_and_si128(x, mask);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i hadd_impl<uint32_t>(__m128i x) noexcept
|
||||
{
|
||||
const __m128i mask = _mm_set1_epi32(0x0000003f);
|
||||
x = hadd_impl<uint16_t>(x);
|
||||
__m128i y = _mm_srli_si128(x, 2);
|
||||
x = _mm_add_epi32(x, y);
|
||||
return _mm_and_si128(x, mask);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i hadd_impl<uint64_t>(__m128i x) noexcept
|
||||
{
|
||||
return _mm_sad_epu8(x, _mm_setzero_si128());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> popcount_impl(const native_simd<T>& v) noexcept
|
||||
{
|
||||
const __m128i m1 = _mm_set1_epi8(0x55);
|
||||
const __m128i m2 = _mm_set1_epi8(0x33);
|
||||
const __m128i m3 = _mm_set1_epi8(0x0F);
|
||||
|
||||
/* Note: if we returned x here it would be like _mm_popcnt_epi1(x) */
|
||||
__m128i y;
|
||||
__m128i x = v;
|
||||
/* add even and odd bits*/
|
||||
y = _mm_srli_epi64(x, 1); // put even bits in odd place
|
||||
y = _mm_and_si128(y, m1); // mask out the even bits (0x55)
|
||||
x = _mm_subs_epu8(x, y); // shortcut to mask even bits and add
|
||||
/* if we just returned x here it would be like popcnt_epi2(x) */
|
||||
/* now add the half nibbles */
|
||||
y = _mm_srli_epi64(x, 2); // move half nibbles in place to add
|
||||
y = _mm_and_si128(y, m2); // mask off the extra half nibbles (0x0f)
|
||||
x = _mm_and_si128(x, m2); // ditto
|
||||
x = _mm_adds_epu8(x, y); // totals are a maximum of 5 bits (0x1f)
|
||||
/* if we just returned x here it would be like popcnt_epi4(x) */
|
||||
/* now add the nibbles */
|
||||
y = _mm_srli_epi64(x, 4); // move nibbles in place to add
|
||||
x = _mm_adds_epu8(x, y); // totals are a maximum of 6 bits (0x3f)
|
||||
x = _mm_and_si128(x, m3); // mask off the extra bits
|
||||
|
||||
/* todo use when sse3 available
|
||||
__m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
||||
const __m128i low_mask = _mm_set1_epi8(0x0F);
|
||||
__m128i lo = _mm_and_si128(v, low_mask);
|
||||
__m128i hi = _mm_and_si256(_mm_srli_epi32(v, 4), low_mask);
|
||||
__m128i popcnt1 = _mm_shuffle_epi8(lookup, lo);
|
||||
__m128i popcnt2 = _mm_shuffle_epi8(lookup, hi);
|
||||
__m128i total = _mm_add_epi8(popcnt1, popcnt2);*/
|
||||
|
||||
return hadd_impl<T>(x);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::array<T, native_simd<T>::size> popcount(const native_simd<T>& a) noexcept
|
||||
{
|
||||
alignas(native_simd<T>::alignment) std::array<T, native_simd<T>::size> res;
|
||||
popcount_impl(a).store(&res[0]);
|
||||
return res;
|
||||
}
|
||||
|
||||
// function andnot: a & ~ b
|
||||
template <typename T>
|
||||
native_simd<T> andnot(const native_simd<T>& a, const native_simd<T>& b)
|
||||
{
|
||||
return _mm_andnot_si128(b, a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator==(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return _mm_cmpeq_epi8(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator==(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
return _mm_cmpeq_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator==(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
return _mm_cmpeq_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator==(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
// no 64 compare instruction. Do two 32 bit compares
|
||||
__m128i com32 = _mm_cmpeq_epi32(a, b); // 32 bit compares
|
||||
__m128i com32s = _mm_shuffle_epi32(com32, 0xB1); // swap low and high dwords
|
||||
__m128i test = _mm_and_si128(com32, com32s); // low & high
|
||||
__m128i teste = _mm_srai_epi32(test, 31); // extend sign bit to 32 bits
|
||||
__m128i testee = _mm_shuffle_epi32(teste, 0xF5); // extend sign bit to 64 bits
|
||||
return testee;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> operator!=(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return ~(a == b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator<<(const native_simd<uint8_t>& a, int b) noexcept
|
||||
{
|
||||
char mask = static_cast<char>(0xFF >> b);
|
||||
__m128i am = _mm_and_si128(a, _mm_set1_epi8(mask));
|
||||
return _mm_slli_epi16(am, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator<<(const native_simd<uint16_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_slli_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator<<(const native_simd<uint32_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_slli_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator<<(const native_simd<uint64_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_slli_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator>>(const native_simd<uint8_t>& a, int b) noexcept
|
||||
{
|
||||
char mask = static_cast<char>(0xFF << b);
|
||||
__m128i am = _mm_and_si128(a, _mm_set1_epi8(mask));
|
||||
return _mm_srli_epi16(am, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>>(const native_simd<uint16_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_srli_epi16(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator>>(const native_simd<uint32_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_srli_epi32(a, b);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>>(const native_simd<uint64_t>& a, int b) noexcept
|
||||
{
|
||||
return _mm_srli_epi64(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator&(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm_and_si128(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator&=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a & b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator|(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm_or_si128(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator|=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a | b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator^(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator^=(native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
a = a ^ b;
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
native_simd<T> operator~(const native_simd<T>& a) noexcept
|
||||
{
|
||||
return _mm_xor_si128(a, _mm_set1_epi32(-1));
|
||||
}
|
||||
|
||||
// potentially we want a special native_simd<bool> for this
|
||||
static inline native_simd<uint8_t> operator>=(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a); // a == max(a,b)
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>=(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
/* sse4.1 */
|
||||
#if 0
|
||||
return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a); // a == max(a,b)
|
||||
#endif
|
||||
|
||||
__m128i s = _mm_subs_epu16(b, a); // b-a, saturated
|
||||
return _mm_cmpeq_epi16(s, _mm_setzero_si128()); // s == 0
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept;
|
||||
static inline native_simd<uint32_t> operator>(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept;
|
||||
|
||||
static inline native_simd<uint32_t> operator>=(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
/* sse4.1 */
|
||||
#if 0
|
||||
return (Vec4ib)_mm_cmpeq_epi32(_mm_max_epu32(a, b), a); // a == max(a,b)
|
||||
#endif
|
||||
|
||||
return ~(b > a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>=(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
return ~(b > a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline native_simd<T> operator<=(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return b >= a;
|
||||
}
|
||||
|
||||
static inline native_simd<uint8_t> operator>(const native_simd<uint8_t>& a,
|
||||
const native_simd<uint8_t>& b) noexcept
|
||||
{
|
||||
return ~(b >= a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint16_t> operator>(const native_simd<uint16_t>& a,
|
||||
const native_simd<uint16_t>& b) noexcept
|
||||
{
|
||||
return ~(b >= a);
|
||||
}
|
||||
|
||||
static inline native_simd<uint32_t> operator>(const native_simd<uint32_t>& a,
|
||||
const native_simd<uint32_t>& b) noexcept
|
||||
{
|
||||
__m128i signbit = _mm_set1_epi32(static_cast<int32_t>(0x80000000));
|
||||
__m128i a1 = _mm_xor_si128(a, signbit);
|
||||
__m128i b1 = _mm_xor_si128(b, signbit);
|
||||
return _mm_cmpgt_epi32(a1, b1); // signed compare
|
||||
}
|
||||
|
||||
static inline native_simd<uint64_t> operator>(const native_simd<uint64_t>& a,
|
||||
const native_simd<uint64_t>& b) noexcept
|
||||
{
|
||||
__m128i sign32 = _mm_set1_epi32(static_cast<int32_t>(0x80000000)); // sign bit of each dword
|
||||
__m128i aflip = _mm_xor_si128(a, sign32); // a with sign bits flipped to use signed compare
|
||||
__m128i bflip = _mm_xor_si128(b, sign32); // b with sign bits flipped to use signed compare
|
||||
__m128i equal = _mm_cmpeq_epi32(a, b); // a == b, dwords
|
||||
__m128i bigger = _mm_cmpgt_epi32(aflip, bflip); // a > b, dwords
|
||||
__m128i biggerl = _mm_shuffle_epi32(bigger, 0xA0); // a > b, low dwords copied to high dwords
|
||||
__m128i eqbig = _mm_and_si128(equal, biggerl); // high part equal and low part bigger
|
||||
__m128i hibig = _mm_or_si128(bigger, eqbig); // high part bigger or high part equal and low part bigger
|
||||
__m128i big = _mm_shuffle_epi32(hibig, 0xF5); // result copied to low part
|
||||
return big;
|
||||
}
|
||||
|
||||
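SSE2 has neither a 64-bit equality nor a 64-bit unsigned compare, so the overloads above assemble the result from 32-bit operations: the sign bits are flipped to get an unsigned order, the high dwords decide, and the low dwords only matter when the high dwords are equal. The same logic written out in scalar form (illustration only):

#include <cassert>
#include <cstdint>

// Scalar restatement of the SSE2 64-bit unsigned greater-than above:
// the high 32 bits decide; on a tie the low 32 bits decide.
static bool u64_gt_from_dwords(uint64_t a, uint64_t b)
{
    uint32_t ahi = static_cast<uint32_t>(a >> 32), alo = static_cast<uint32_t>(a);
    uint32_t bhi = static_cast<uint32_t>(b >> 32), blo = static_cast<uint32_t>(b);
    if (ahi != bhi) return ahi > bhi;   // "bigger" on the high dwords
    return alo > blo;                   // "equal and low part bigger"
}

int main()
{
    assert(u64_gt_from_dwords(0x0000000200000000ULL, 0x00000001FFFFFFFFULL));
    assert(!u64_gt_from_dwords(0x0000000100000001ULL, 0x0000000100000002ULL));
    assert(u64_gt_from_dwords(0xFFFFFFFF00000000ULL, 1ULL));
    assert(!u64_gt_from_dwords(7ULL, 7ULL));
}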
template <typename T>
|
||||
static inline native_simd<T> operator<(const native_simd<T>& a, const native_simd<T>& b) noexcept
|
||||
{
|
||||
return b > a;
|
||||
}
|
||||
|
||||
} // namespace simd_sse2
|
||||
} // namespace detail
|
||||
} // namespace rapidfuzz
|
||||
52
src/external/rapidfuzz-cpp/rapidfuzz/details/type_traits.hpp
vendored
Normal file
@@ -0,0 +1,52 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020 Max Bachmann */

#pragma once
#include <rapidfuzz/details/types.hpp>

#include <iterator>
#include <utility>

namespace rapidfuzz {

namespace detail {
template <typename T>
auto inner_type(T const*) -> T;

template <typename T>
auto inner_type(T const&) -> typename T::value_type;
} // namespace detail

template <typename T>
using char_type = decltype(detail::inner_type(std::declval<T const&>()));

/* backport of std::iter_value_t from C++20
 * This does not cover the complete functionality, but should be enough for
 * the use cases in this library
 */
template <typename T>
using iter_value_t = typename std::iterator_traits<T>::value_type;

// taken from
// https://stackoverflow.com/questions/16893992/check-if-type-can-be-explicitly-converted
template <typename From, typename To>
struct is_explicitly_convertible {
    template <typename T>
    static void f(T);

    template <typename F, typename T>
    static constexpr auto test(int /*unused*/) -> decltype(f(static_cast<T>(std::declval<F>())), true)
    {
        return true;
    }

    template <typename F, typename T>
    static constexpr auto test(...) -> bool
    {
        return false;
    }

    static bool const value = test<From, To>(0);
};

} // namespace rapidfuzz
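char_type extracts the character type of either a pointer or a container, and is_explicitly_convertible answers whether a static_cast between two types would compile. A few static_asserts as a usage sketch, assuming the header is on the include path:

#include <rapidfuzz/details/type_traits.hpp>
#include <cstdint>
#include <string>
#include <type_traits>
#include <vector>

// pointer types yield the pointee, containers yield their value_type
static_assert(std::is_same_v<rapidfuzz::char_type<const char*>, char>);
static_assert(std::is_same_v<rapidfuzz::char_type<std::wstring>, wchar_t>);
static_assert(std::is_same_v<rapidfuzz::char_type<std::vector<uint32_t>>, uint32_t>);

// mirrors static_cast-ability as a compile-time boolean
static_assert(rapidfuzz::is_explicitly_convertible<double, int>::value);
static_assert(!rapidfuzz::is_explicitly_convertible<std::string, int>::value);

int main() {}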
596
src/external/rapidfuzz-cpp/rapidfuzz/details/types.hpp
vendored
Normal file
@@ -0,0 +1,596 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2020 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <stddef.h>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
struct StringAffix {
|
||||
size_t prefix_len;
|
||||
size_t suffix_len;
|
||||
};
|
||||
|
||||
struct LevenshteinWeightTable {
|
||||
size_t insert_cost;
|
||||
size_t delete_cost;
|
||||
size_t replace_cost;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Edit operation types used by the Levenshtein distance
|
||||
*/
|
||||
enum class EditType {
|
||||
None = 0, /**< No Operation required */
|
||||
Replace = 1, /**< Replace a character in a string by another character */
|
||||
Insert = 2, /**< Insert a character into a string */
|
||||
Delete = 3 /**< Delete a character from a string */
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Edit operations used by the Levenshtein distance
|
||||
*
|
||||
* This represents an edit operation of type type which is applied to
|
||||
* the source string
|
||||
*
|
||||
* Replace: replace character at src_pos with character at dest_pos
|
||||
* Insert: insert character from dest_pos at src_pos
|
||||
* Delete: delete character at src_pos
|
||||
*/
|
||||
struct EditOp {
|
||||
EditType type; /**< type of the edit operation */
|
||||
size_t src_pos; /**< index into the source string */
|
||||
size_t dest_pos; /**< index into the destination string */
|
||||
|
||||
EditOp() : type(EditType::None), src_pos(0), dest_pos(0)
|
||||
{}
|
||||
|
||||
EditOp(EditType type_, size_t src_pos_, size_t dest_pos_)
|
||||
: type(type_), src_pos(src_pos_), dest_pos(dest_pos_)
|
||||
{}
|
||||
};
|
||||
|
||||
inline bool operator==(EditOp a, EditOp b)
|
||||
{
|
||||
return (a.type == b.type) && (a.src_pos == b.src_pos) && (a.dest_pos == b.dest_pos);
|
||||
}
|
||||
|
||||
inline bool operator!=(EditOp a, EditOp b)
|
||||
{
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Edit operations used by the Levenshtein distance
|
||||
*
|
||||
* This represents an edit operation of type type which is applied to
|
||||
* the source string
|
||||
*
|
||||
* None: s1[src_begin:src_end] == s1[dest_begin:dest_end]
|
||||
* Replace: s1[i1:i2] should be replaced by s2[dest_begin:dest_end]
|
||||
* Insert: s2[dest_begin:dest_end] should be inserted at s1[src_begin:src_begin].
|
||||
* Note that src_begin==src_end in this case.
|
||||
* Delete: s1[src_begin:src_end] should be deleted.
|
||||
* Note that dest_begin==dest_end in this case.
|
||||
*/
|
||||
struct Opcode {
|
||||
EditType type; /**< type of the edit operation */
|
||||
size_t src_begin; /**< index into the source string */
|
||||
size_t src_end; /**< index into the source string */
|
||||
size_t dest_begin; /**< index into the destination string */
|
||||
size_t dest_end; /**< index into the destination string */
|
||||
|
||||
Opcode() : type(EditType::None), src_begin(0), src_end(0), dest_begin(0), dest_end(0)
|
||||
{}
|
||||
|
||||
Opcode(EditType type_, size_t src_begin_, size_t src_end_, size_t dest_begin_, size_t dest_end_)
|
||||
: type(type_), src_begin(src_begin_), src_end(src_end_), dest_begin(dest_begin_), dest_end(dest_end_)
|
||||
{}
|
||||
};
|
||||
|
||||
inline bool operator==(Opcode a, Opcode b)
|
||||
{
|
||||
return (a.type == b.type) && (a.src_begin == b.src_begin) && (a.src_end == b.src_end) &&
|
||||
(a.dest_begin == b.dest_begin) && (a.dest_end == b.dest_end);
|
||||
}
|
||||
|
||||
inline bool operator!=(Opcode a, Opcode b)
|
||||
{
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
template <typename Vec>
|
||||
auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec
|
||||
{
|
||||
Vec new_vec;
|
||||
|
||||
if (step == 0) throw std::invalid_argument("slice step cannot be zero");
|
||||
if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops");
|
||||
|
||||
if (start < 0)
|
||||
start = std::max<int>(start + static_cast<int>(vec.size()), 0);
|
||||
else if (start > static_cast<int>(vec.size()))
|
||||
start = static_cast<int>(vec.size());
|
||||
|
||||
if (stop < 0)
|
||||
stop = std::max<int>(stop + static_cast<int>(vec.size()), 0);
|
||||
else if (stop > static_cast<int>(vec.size()))
|
||||
stop = static_cast<int>(vec.size());
|
||||
|
||||
if (start >= stop) return new_vec;
|
||||
|
||||
int count = (stop - 1 - start) / step + 1;
|
||||
new_vec.reserve(static_cast<size_t>(count));
|
||||
|
||||
for (int i = start; i < stop; i += step)
|
||||
new_vec.push_back(vec[static_cast<size_t>(i)]);
|
||||
|
||||
return new_vec;
|
||||
}
|
||||
|
||||
template <typename Vec>
|
||||
void vector_remove_slice(Vec& vec, int start, int stop, int step)
|
||||
{
|
||||
if (step == 0) throw std::invalid_argument("slice step cannot be zero");
|
||||
if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops");
|
||||
|
||||
if (start < 0)
|
||||
start = std::max<int>(start + static_cast<int>(vec.size()), 0);
|
||||
else if (start > static_cast<int>(vec.size()))
|
||||
start = static_cast<int>(vec.size());
|
||||
|
||||
if (stop < 0)
|
||||
stop = std::max<int>(stop + static_cast<int>(vec.size()), 0);
|
||||
else if (stop > static_cast<int>(vec.size()))
|
||||
stop = static_cast<int>(vec.size());
|
||||
|
||||
if (start >= stop) return;
|
||||
|
||||
auto iter = vec.begin() + start;
|
||||
for (int i = start; i < static_cast<int>(vec.size()); i++)
|
||||
if (i >= stop || ((i - start) % step != 0)) *(iter++) = vec[static_cast<size_t>(i)];
|
||||
|
||||
vec.resize(static_cast<size_t>(std::distance(vec.begin(), iter)));
|
||||
vec.shrink_to_fit();
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
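vector_slice and vector_remove_slice implement Python-style slicing: negative start/stop values count from the end, out-of-range values are clamped, and step must be positive. Editops::slice and Editops::remove_slice below forward to them. A small sketch on a plain vector, assuming the header is on the include path:

#include <rapidfuzz/details/types.hpp>
#include <cassert>
#include <vector>

int main()
{
    using rapidfuzz::detail::vector_slice;
    using rapidfuzz::detail::vector_remove_slice;

    std::vector<int> v{0, 1, 2, 3, 4, 5, 6, 7};

    // Python-style semantics: v[1:7:2] -> {1, 3, 5}; negative indices count from the end
    assert((vector_slice(v, 1, 7, 2) == std::vector<int>{1, 3, 5}));
    assert((vector_slice(v, -3, 100, 1) == std::vector<int>{5, 6, 7}));   // stop clamped to size()

    // remove v[0:4:1] in place, keeping the rest
    vector_remove_slice(v, 0, 4, 1);
    assert((v == std::vector<int>{4, 5, 6, 7}));
}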
|
||||
class Opcodes;
|
||||
|
||||
class Editops : private std::vector<EditOp> {
|
||||
public:
|
||||
using std::vector<EditOp>::size_type;
|
||||
|
||||
Editops() noexcept : src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
Editops(size_type count, const EditOp& value) : std::vector<EditOp>(count, value), src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
explicit Editops(size_type count) : std::vector<EditOp>(count), src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
Editops(const Editops& other)
|
||||
: std::vector<EditOp>(other), src_len(other.src_len), dest_len(other.dest_len)
|
||||
{}
|
||||
|
||||
Editops(const Opcodes& other);
|
||||
|
||||
Editops(Editops&& other) noexcept
|
||||
{
|
||||
swap(other);
|
||||
}
|
||||
|
||||
Editops& operator=(Editops other) noexcept
|
||||
{
|
||||
swap(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/* Element access */
|
||||
using std::vector<EditOp>::at;
|
||||
using std::vector<EditOp>::operator[];
|
||||
using std::vector<EditOp>::front;
|
||||
using std::vector<EditOp>::back;
|
||||
using std::vector<EditOp>::data;
|
||||
|
||||
/* Iterators */
|
||||
using std::vector<EditOp>::begin;
|
||||
using std::vector<EditOp>::cbegin;
|
||||
using std::vector<EditOp>::end;
|
||||
using std::vector<EditOp>::cend;
|
||||
using std::vector<EditOp>::rbegin;
|
||||
using std::vector<EditOp>::crbegin;
|
||||
using std::vector<EditOp>::rend;
|
||||
using std::vector<EditOp>::crend;
|
||||
|
||||
/* Capacity */
|
||||
using std::vector<EditOp>::empty;
|
||||
using std::vector<EditOp>::size;
|
||||
using std::vector<EditOp>::max_size;
|
||||
using std::vector<EditOp>::reserve;
|
||||
using std::vector<EditOp>::capacity;
|
||||
using std::vector<EditOp>::shrink_to_fit;
|
||||
|
||||
/* Modifiers */
|
||||
using std::vector<EditOp>::clear;
|
||||
using std::vector<EditOp>::insert;
|
||||
using std::vector<EditOp>::emplace;
|
||||
using std::vector<EditOp>::erase;
|
||||
using std::vector<EditOp>::push_back;
|
||||
using std::vector<EditOp>::emplace_back;
|
||||
using std::vector<EditOp>::pop_back;
|
||||
using std::vector<EditOp>::resize;
|
||||
|
||||
void swap(Editops& rhs) noexcept
|
||||
{
|
||||
std::swap(src_len, rhs.src_len);
|
||||
std::swap(dest_len, rhs.dest_len);
|
||||
std::vector<EditOp>::swap(rhs);
|
||||
}
|
||||
|
||||
Editops slice(int start, int stop, int step = 1) const
|
||||
{
|
||||
Editops ed_slice = detail::vector_slice(*this, start, stop, step);
|
||||
ed_slice.src_len = src_len;
|
||||
ed_slice.dest_len = dest_len;
|
||||
return ed_slice;
|
||||
}
|
||||
|
||||
void remove_slice(int start, int stop, int step = 1)
|
||||
{
|
||||
detail::vector_remove_slice(*this, start, stop, step);
|
||||
}
|
||||
|
||||
Editops reverse() const
|
||||
{
|
||||
Editops reversed = *this;
|
||||
std::reverse(reversed.begin(), reversed.end());
|
||||
return reversed;
|
||||
}
|
||||
|
||||
size_t get_src_len() const noexcept
|
||||
{
|
||||
return src_len;
|
||||
}
|
||||
void set_src_len(size_t len) noexcept
|
||||
{
|
||||
src_len = len;
|
||||
}
|
||||
size_t get_dest_len() const noexcept
|
||||
{
|
||||
return dest_len;
|
||||
}
|
||||
void set_dest_len(size_t len) noexcept
|
||||
{
|
||||
dest_len = len;
|
||||
}
|
||||
|
||||
Editops inverse() const
|
||||
{
|
||||
Editops inv_ops = *this;
|
||||
std::swap(inv_ops.src_len, inv_ops.dest_len);
|
||||
for (auto& op : inv_ops) {
|
||||
std::swap(op.src_pos, op.dest_pos);
|
||||
if (op.type == EditType::Delete)
|
||||
op.type = EditType::Insert;
|
||||
else if (op.type == EditType::Insert)
|
||||
op.type = EditType::Delete;
|
||||
}
|
||||
return inv_ops;
|
||||
}
|
||||
|
||||
Editops remove_subsequence(const Editops& subsequence) const
|
||||
{
|
||||
Editops result;
|
||||
result.set_src_len(src_len);
|
||||
result.set_dest_len(dest_len);
|
||||
|
||||
if (subsequence.size() > size()) throw std::invalid_argument("subsequence is not a subsequence");
|
||||
|
||||
result.resize(size() - subsequence.size());
|
||||
|
||||
/* offset to correct removed edit operations */
|
||||
int offset = 0;
|
||||
auto op_iter = begin();
|
||||
auto op_end = end();
|
||||
size_t result_pos = 0;
|
||||
for (const auto& sop : subsequence) {
|
||||
for (; op_iter != op_end && sop != *op_iter; op_iter++) {
|
||||
result[result_pos] = *op_iter;
|
||||
result[result_pos].src_pos =
|
||||
static_cast<size_t>(static_cast<ptrdiff_t>(result[result_pos].src_pos) + offset);
|
||||
result_pos++;
|
||||
}
|
||||
/* element of subsequence not part of the sequence */
|
||||
if (op_iter == op_end) throw std::invalid_argument("subsequence is not a subsequence");
|
||||
|
||||
if (sop.type == EditType::Insert)
|
||||
offset++;
|
||||
else if (sop.type == EditType::Delete)
|
||||
offset--;
|
||||
op_iter++;
|
||||
}
|
||||
|
||||
/* add remaining elements */
|
||||
for (; op_iter != op_end; op_iter++) {
|
||||
result[result_pos] = *op_iter;
|
||||
result[result_pos].src_pos =
|
||||
static_cast<size_t>(static_cast<ptrdiff_t>(result[result_pos].src_pos) + offset);
|
||||
result_pos++;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t src_len;
|
||||
size_t dest_len;
|
||||
};
|
||||
|
||||
inline bool operator==(const Editops& lhs, const Editops& rhs)
|
||||
{
|
||||
if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (lhs.size() != rhs.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(lhs.begin(), lhs.end(), rhs.begin());
|
||||
}
|
||||
|
||||
inline bool operator!=(const Editops& lhs, const Editops& rhs)
|
||||
{
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
|
||||
inline void swap(Editops& lhs, Editops& rhs) noexcept(noexcept(lhs.swap(rhs)))
|
||||
{
|
||||
lhs.swap(rhs);
|
||||
}
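Editops is a std::vector<EditOp> plus the source and destination lengths needed to interpret the operations. A minimal usage sketch built only from members shown above (EditType, the EditOp aggregate, the length setters and inverse()); the concrete strings and values are illustrative:

#include <rapidfuzz/distance.hpp>
#include <cstdio>

int main()
{
    using rapidfuzz::EditType;

    /* describe "cat" -> "cart": insert 'r' at source position 2 / destination position 2 */
    rapidfuzz::Editops ops;
    ops.push_back({EditType::Insert, 2, 2});
    ops.set_src_len(3);  /* length of "cat"  */
    ops.set_dest_len(4); /* length of "cart" */

    /* inverse() swaps source and destination, turning the Insert into a Delete */
    rapidfuzz::Editops inv = ops.inverse();
    std::printf("%zu edit(s); inverse: src_len=%zu dest_len=%zu\n",
                ops.size(), inv.get_src_len(), inv.get_dest_len());
}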
|
||||
|
||||
class Opcodes : private std::vector<Opcode> {
|
||||
public:
|
||||
using std::vector<Opcode>::size_type;
|
||||
|
||||
Opcodes() noexcept : src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
Opcodes(size_type count, const Opcode& value) : std::vector<Opcode>(count, value), src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
explicit Opcodes(size_type count) : std::vector<Opcode>(count), src_len(0), dest_len(0)
|
||||
{}
|
||||
|
||||
Opcodes(const Opcodes& other)
|
||||
: std::vector<Opcode>(other), src_len(other.src_len), dest_len(other.dest_len)
|
||||
{}
|
||||
|
||||
Opcodes(const Editops& other);
|
||||
|
||||
Opcodes(Opcodes&& other) noexcept
|
||||
{
|
||||
swap(other);
|
||||
}
|
||||
|
||||
Opcodes& operator=(Opcodes other) noexcept
|
||||
{
|
||||
swap(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/* Element access */
|
||||
using std::vector<Opcode>::at;
|
||||
using std::vector<Opcode>::operator[];
|
||||
using std::vector<Opcode>::front;
|
||||
using std::vector<Opcode>::back;
|
||||
using std::vector<Opcode>::data;
|
||||
|
||||
/* Iterators */
|
||||
using std::vector<Opcode>::begin;
|
||||
using std::vector<Opcode>::cbegin;
|
||||
using std::vector<Opcode>::end;
|
||||
using std::vector<Opcode>::cend;
|
||||
using std::vector<Opcode>::rbegin;
|
||||
using std::vector<Opcode>::crbegin;
|
||||
using std::vector<Opcode>::rend;
|
||||
using std::vector<Opcode>::crend;
|
||||
|
||||
/* Capacity */
|
||||
using std::vector<Opcode>::empty;
|
||||
using std::vector<Opcode>::size;
|
||||
using std::vector<Opcode>::max_size;
|
||||
using std::vector<Opcode>::reserve;
|
||||
using std::vector<Opcode>::capacity;
|
||||
using std::vector<Opcode>::shrink_to_fit;
|
||||
|
||||
/* Modifiers */
|
||||
using std::vector<Opcode>::clear;
|
||||
using std::vector<Opcode>::insert;
|
||||
using std::vector<Opcode>::emplace;
|
||||
using std::vector<Opcode>::erase;
|
||||
using std::vector<Opcode>::push_back;
|
||||
using std::vector<Opcode>::emplace_back;
|
||||
using std::vector<Opcode>::pop_back;
|
||||
using std::vector<Opcode>::resize;
|
||||
|
||||
void swap(Opcodes& rhs) noexcept
|
||||
{
|
||||
std::swap(src_len, rhs.src_len);
|
||||
std::swap(dest_len, rhs.dest_len);
|
||||
std::vector<Opcode>::swap(rhs);
|
||||
}
|
||||
|
||||
Opcodes slice(int start, int stop, int step = 1) const
|
||||
{
|
||||
Opcodes ed_slice = detail::vector_slice(*this, start, stop, step);
|
||||
ed_slice.src_len = src_len;
|
||||
ed_slice.dest_len = dest_len;
|
||||
return ed_slice;
|
||||
}
|
||||
|
||||
Opcodes reverse() const
|
||||
{
|
||||
Opcodes reversed = *this;
|
||||
std::reverse(reversed.begin(), reversed.end());
|
||||
return reversed;
|
||||
}
|
||||
|
||||
size_t get_src_len() const noexcept
|
||||
{
|
||||
return src_len;
|
||||
}
|
||||
void set_src_len(size_t len) noexcept
|
||||
{
|
||||
src_len = len;
|
||||
}
|
||||
size_t get_dest_len() const noexcept
|
||||
{
|
||||
return dest_len;
|
||||
}
|
||||
void set_dest_len(size_t len) noexcept
|
||||
{
|
||||
dest_len = len;
|
||||
}
|
||||
|
||||
Opcodes inverse() const
|
||||
{
|
||||
Opcodes inv_ops = *this;
|
||||
std::swap(inv_ops.src_len, inv_ops.dest_len);
|
||||
for (auto& op : inv_ops) {
|
||||
std::swap(op.src_begin, op.dest_begin);
|
||||
std::swap(op.src_end, op.dest_end);
|
||||
if (op.type == EditType::Delete)
|
||||
op.type = EditType::Insert;
|
||||
else if (op.type == EditType::Insert)
|
||||
op.type = EditType::Delete;
|
||||
}
|
||||
return inv_ops;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t src_len;
|
||||
size_t dest_len;
|
||||
};
|
||||
|
||||
inline bool operator==(const Opcodes& lhs, const Opcodes& rhs)
|
||||
{
|
||||
if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) return false;
|
||||
|
||||
if (lhs.size() != rhs.size()) return false;
|
||||
|
||||
return std::equal(lhs.begin(), lhs.end(), rhs.begin());
|
||||
}
|
||||
|
||||
inline bool operator!=(const Opcodes& lhs, const Opcodes& rhs)
|
||||
{
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
|
||||
inline void swap(Opcodes& lhs, Opcodes& rhs) noexcept(noexcept(lhs.swap(rhs)))
|
||||
{
|
||||
lhs.swap(rhs);
|
||||
}
|
||||
|
||||
inline Editops::Editops(const Opcodes& other)
|
||||
{
|
||||
src_len = other.get_src_len();
|
||||
dest_len = other.get_dest_len();
|
||||
for (const auto& op : other) {
|
||||
switch (op.type) {
|
||||
case EditType::None: break;
|
||||
|
||||
case EditType::Replace:
|
||||
for (size_t j = 0; j < op.src_end - op.src_begin; j++)
|
||||
push_back({EditType::Replace, op.src_begin + j, op.dest_begin + j});
|
||||
break;
|
||||
|
||||
case EditType::Insert:
|
||||
for (size_t j = 0; j < op.dest_end - op.dest_begin; j++)
|
||||
push_back({EditType::Insert, op.src_begin, op.dest_begin + j});
|
||||
break;
|
||||
|
||||
case EditType::Delete:
|
||||
for (size_t j = 0; j < op.src_end - op.src_begin; j++)
|
||||
push_back({EditType::Delete, op.src_begin + j, op.dest_begin});
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline Opcodes::Opcodes(const Editops& other)
|
||||
{
|
||||
src_len = other.get_src_len();
|
||||
dest_len = other.get_dest_len();
|
||||
size_t src_pos = 0;
|
||||
size_t dest_pos = 0;
|
||||
for (size_t i = 0; i < other.size();) {
|
||||
if (src_pos < other[i].src_pos || dest_pos < other[i].dest_pos) {
|
||||
push_back({EditType::None, src_pos, other[i].src_pos, dest_pos, other[i].dest_pos});
|
||||
src_pos = other[i].src_pos;
|
||||
dest_pos = other[i].dest_pos;
|
||||
}
|
||||
|
||||
size_t src_begin = src_pos;
|
||||
size_t dest_begin = dest_pos;
|
||||
EditType type = other[i].type;
|
||||
do {
|
||||
switch (type) {
|
||||
case EditType::None: break;
|
||||
|
||||
case EditType::Replace:
|
||||
src_pos++;
|
||||
dest_pos++;
|
||||
break;
|
||||
|
||||
case EditType::Insert: dest_pos++; break;
|
||||
|
||||
case EditType::Delete: src_pos++; break;
|
||||
}
|
||||
i++;
|
||||
} while (i < other.size() && other[i].type == type && src_pos == other[i].src_pos &&
|
||||
dest_pos == other[i].dest_pos);
|
||||
|
||||
push_back({type, src_begin, src_pos, dest_begin, dest_pos});
|
||||
}
|
||||
|
||||
if (src_pos < other.get_src_len() || dest_pos < other.get_dest_len()) {
|
||||
push_back({EditType::None, src_pos, other.get_src_len(), dest_pos, other.get_dest_len()});
|
||||
}
|
||||
}
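The two converting constructors above tie the per-character Editops and the ranged Opcodes representations together: adjacent operations of the same type are merged into one opcode and the gaps become EditType::None blocks. A short sketch; it assumes levenshtein_editops() from the bundled Levenshtein.hpp (pulled in through rapidfuzz/distance.hpp) to produce the operations:

#include <rapidfuzz/distance.hpp>
#include <cstdio>
#include <string>

int main()
{
    std::string s1 = "abcde";
    std::string s2 = "aXcd";

    /* per-character edit operations turning s1 into s2 */
    rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2);

    /* group them into ranged opcodes; unedited stretches show up as EditType::None */
    rapidfuzz::Opcodes opcodes(ops);
    for (const auto& op : opcodes)
        std::printf("type=%d src=[%zu,%zu) dest=[%zu,%zu)\n", static_cast<int>(op.type),
                    op.src_begin, op.src_end, op.dest_begin, op.dest_end);
}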
|
||||
|
||||
template <typename T>
|
||||
struct ScoreAlignment {
|
||||
T score; /**< resulting score of the algorithm */
|
||||
size_t src_start; /**< index into the source string */
|
||||
size_t src_end; /**< index into the source string */
|
||||
size_t dest_start; /**< index into the destination string */
|
||||
size_t dest_end; /**< index into the destination string */
|
||||
|
||||
ScoreAlignment() : score(T()), src_start(0), src_end(0), dest_start(0), dest_end(0)
|
||||
{}
|
||||
|
||||
ScoreAlignment(T score_, size_t src_start_, size_t src_end_, size_t dest_start_, size_t dest_end_)
|
||||
: score(score_),
|
||||
src_start(src_start_),
|
||||
src_end(src_end_),
|
||||
dest_start(dest_start_),
|
||||
dest_end(dest_end_)
|
||||
{}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline bool operator==(const ScoreAlignment<T>& a, const ScoreAlignment<T>& b)
|
||||
{
|
||||
return (a.score == b.score) && (a.src_start == b.src_start) && (a.src_end == b.src_end) &&
|
||||
(a.dest_start == b.dest_start) && (a.dest_end == b.dest_end);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz
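ScoreAlignment simply bundles a score with the source/destination ranges it was computed over; scorers that report alignments return it directly, so no separate example is needed here.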
|
||||
161
src/external/rapidfuzz-cpp/rapidfuzz/distance.hpp
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/distance/DamerauLevenshtein.hpp>
|
||||
#include <rapidfuzz/distance/Hamming.hpp>
|
||||
#include <rapidfuzz/distance/Indel.hpp>
|
||||
#include <rapidfuzz/distance/Jaro.hpp>
|
||||
#include <rapidfuzz/distance/JaroWinkler.hpp>
|
||||
#include <rapidfuzz/distance/LCSseq.hpp>
|
||||
#include <rapidfuzz/distance/Levenshtein.hpp>
|
||||
#include <rapidfuzz/distance/OSA.hpp>
|
||||
#include <rapidfuzz/distance/Postfix.hpp>
|
||||
#include <rapidfuzz/distance/Prefix.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
namespace detail {
|
||||
template <typename ReturnType, typename InputIt1, typename InputIt2>
|
||||
ReturnType editops_apply_impl(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2)
|
||||
{
|
||||
auto len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
auto len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
ReturnType res_str;
|
||||
res_str.resize(len1 + len2);
|
||||
size_t src_pos = 0;
|
||||
size_t dest_pos = 0;
|
||||
|
||||
for (const auto& op : ops) {
|
||||
/* matches between last and current editop */
|
||||
while (src_pos < op.src_pos) {
|
||||
res_str[dest_pos] =
|
||||
static_cast<typename ReturnType::value_type>(first1[static_cast<ptrdiff_t>(src_pos)]);
|
||||
src_pos++;
|
||||
dest_pos++;
|
||||
}
|
||||
|
||||
switch (op.type) {
|
||||
case EditType::None:
|
||||
case EditType::Replace:
|
||||
res_str[dest_pos] =
|
||||
static_cast<typename ReturnType::value_type>(first2[static_cast<ptrdiff_t>(op.dest_pos)]);
|
||||
src_pos++;
|
||||
dest_pos++;
|
||||
break;
|
||||
case EditType::Insert:
|
||||
res_str[dest_pos] =
|
||||
static_cast<typename ReturnType::value_type>(first2[static_cast<ptrdiff_t>(op.dest_pos)]);
|
||||
dest_pos++;
|
||||
break;
|
||||
case EditType::Delete: src_pos++; break;
|
||||
}
|
||||
}
|
||||
|
||||
/* matches after the last editop */
|
||||
while (src_pos < len1) {
|
||||
res_str[dest_pos] =
|
||||
static_cast<typename ReturnType::value_type>(first1[static_cast<ptrdiff_t>(src_pos)]);
|
||||
src_pos++;
|
||||
dest_pos++;
|
||||
}
|
||||
|
||||
res_str.resize(dest_pos);
|
||||
return res_str;
|
||||
}
|
||||
|
||||
template <typename ReturnType, typename InputIt1, typename InputIt2>
|
||||
ReturnType opcodes_apply_impl(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2)
|
||||
{
|
||||
auto len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
auto len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
ReturnType res_str;
|
||||
res_str.resize(len1 + len2);
|
||||
size_t dest_pos = 0;
|
||||
|
||||
for (const auto& op : ops) {
|
||||
switch (op.type) {
|
||||
case EditType::None:
|
||||
for (auto i = op.src_begin; i < op.src_end; ++i) {
|
||||
res_str[dest_pos++] =
|
||||
static_cast<typename ReturnType::value_type>(first1[static_cast<ptrdiff_t>(i)]);
|
||||
}
|
||||
break;
|
||||
case EditType::Replace:
|
||||
case EditType::Insert:
|
||||
for (auto i = op.dest_begin; i < op.dest_end; ++i) {
|
||||
res_str[dest_pos++] =
|
||||
static_cast<typename ReturnType::value_type>(first2[static_cast<ptrdiff_t>(i)]);
|
||||
}
|
||||
break;
|
||||
case EditType::Delete: break;
|
||||
}
|
||||
}
|
||||
|
||||
res_str.resize(dest_pos);
|
||||
return res_str;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename CharT, typename InputIt1, typename InputIt2>
|
||||
std::basic_string<CharT> editops_apply_str(const Editops& ops, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
return detail::editops_apply_impl<std::basic_string<CharT>>(ops, first1, last1, first2, last2);
|
||||
}
|
||||
|
||||
template <typename CharT, typename Sentence1, typename Sentence2>
|
||||
std::basic_string<CharT> editops_apply_str(const Editops& ops, const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return detail::editops_apply_impl<std::basic_string<CharT>>(ops, detail::to_begin(s1), detail::to_end(s1),
|
||||
detail::to_begin(s2), detail::to_end(s2));
|
||||
}
|
||||
|
||||
template <typename CharT, typename InputIt1, typename InputIt2>
|
||||
std::basic_string<CharT> opcodes_apply_str(const Opcodes& ops, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
return detail::opcodes_apply_impl<std::basic_string<CharT>>(ops, first1, last1, first2, last2);
|
||||
}
|
||||
|
||||
template <typename CharT, typename Sentence1, typename Sentence2>
|
||||
std::basic_string<CharT> opcodes_apply_str(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return detail::opcodes_apply_impl<std::basic_string<CharT>>(ops, detail::to_begin(s1), detail::to_end(s1),
|
||||
detail::to_begin(s2), detail::to_end(s2));
|
||||
}
|
||||
|
||||
template <typename CharT, typename InputIt1, typename InputIt2>
|
||||
std::vector<CharT> editops_apply_vec(const Editops& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2)
|
||||
{
|
||||
return detail::editops_apply_impl<std::vector<CharT>>(ops, first1, last1, first2, last2);
|
||||
}
|
||||
|
||||
template <typename CharT, typename Sentence1, typename Sentence2>
|
||||
std::vector<CharT> editops_apply_vec(const Editops& ops, const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return detail::editops_apply_impl<std::vector<CharT>>(ops, detail::to_begin(s1), detail::to_end(s1),
|
||||
detail::to_begin(s2), detail::to_end(s2));
|
||||
}
|
||||
|
||||
template <typename CharT, typename InputIt1, typename InputIt2>
|
||||
std::vector<CharT> opcodes_apply_vec(const Opcodes& ops, InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2)
|
||||
{
|
||||
return detail::opcodes_apply_impl<std::vector<CharT>>(ops, first1, last1, first2, last2);
|
||||
}
|
||||
|
||||
template <typename CharT, typename Sentence1, typename Sentence2>
|
||||
std::vector<CharT> opcodes_apply_vec(const Opcodes& ops, const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return detail::opcodes_apply_impl<std::vector<CharT>>(ops, detail::to_begin(s1), detail::to_end(s1),
|
||||
detail::to_begin(s2), detail::to_end(s2));
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz
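The apply helpers above replay previously computed operations against the original pair of sequences, which is the cheap way to materialise the destination string without rerunning the metric. A sketch, again assuming levenshtein_editops() from the bundled Levenshtein.hpp:

#include <rapidfuzz/distance.hpp>
#include <cassert>
#include <string>

int main()
{
    std::string s1 = "lewenstein";
    std::string s2 = "levenshtein";

    rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2);

    /* replaying the editops on the original pair reconstructs s2 */
    std::string patched = rapidfuzz::editops_apply_str<char>(ops, s1, s2);
    assert(patched == s2);

    /* the opcode based variant behaves the same way */
    rapidfuzz::Opcodes opcodes(ops);
    assert(rapidfuzz::opcodes_apply_str<char>(opcodes, s1, s2) == s2);
}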
|
||||
152
src/external/rapidfuzz-cpp/rapidfuzz/distance/DamerauLevenshtein.hpp
vendored
Normal file
@@ -0,0 +1,152 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
#include <algorithm>
|
||||
#include <rapidfuzz/distance/DamerauLevenshtein_impl.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
/* the API will require a change when adding custom weights */
|
||||
namespace experimental {
|
||||
/**
|
||||
* @brief Calculates the Damerau Levenshtein distance between two strings.
|
||||
*
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param max
|
||||
 * Maximum Damerau Levenshtein distance between s1 and s2 that is
|
||||
 * considered a result. If the distance is bigger than max,
|
||||
* max + 1 is returned instead. Default is std::numeric_limits<size_t>::max(),
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
* @return Damerau Levenshtein distance between s1 and s2
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::DamerauLevenshtein::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::DamerauLevenshtein::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
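Unlike the restricted OSA variant, the (experimental) unrestricted Damerau-Levenshtein distance may edit characters that already took part in a transposition, which is why the textbook pair "CA"/"ABC" scores 2. A small sketch; the score_cutoff value is only an illustration:

#include <rapidfuzz/distance.hpp>
#include <cassert>
#include <string>

int main()
{
    using namespace rapidfuzz::experimental;

    /* "CA" -> "AC" (transposition) -> "ABC" (insertion): 2 edits */
    assert(damerau_levenshtein_distance(std::string("CA"), std::string("ABC")) == 2);

    /* with a cutoff, anything more expensive is reported as score_cutoff + 1 */
    assert(damerau_levenshtein_distance(std::string("CA"), std::string("ABCDE"), 1) == 2);
}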
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::normalized_distance(first1, last1, first2, last2, score_cutoff,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Calculates a normalized Damerau Levenshtein similarity
|
||||
*
|
||||
* @details
|
||||
 * Both strings require a similar length
|
||||
*
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param score_cutoff
|
||||
* Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
* For ratio < score_cutoff 0 is returned instead. Default is 0,
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
 * @return Normalized Damerau Levenshtein similarity between s1 and s2
|
||||
* as a float between 0 and 1.0
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::normalized_similarity(first1, last1, first2, last2, score_cutoff,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::DamerauLevenshtein::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedDamerauLevenshtein : public detail::CachedDistanceBase<CachedDamerauLevenshtein<CharT1>, size_t,
|
||||
0, std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedDamerauLevenshtein(const Sentence1& s1_)
|
||||
: CachedDamerauLevenshtein(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedDistanceBase<CachedDamerauLevenshtein<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedDamerauLevenshtein<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
return rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, score_cutoff);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedDamerauLevenshtein(const Sentence1& s1_) -> CachedDamerauLevenshtein<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) -> CachedDamerauLevenshtein<iter_value_t<InputIt1>>;
|
||||
|
||||
} // namespace experimental
|
||||
} // namespace rapidfuzz
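When one query is matched against many candidates, CachedDamerauLevenshtein preprocesses the query once. A sketch of that pattern; it assumes the distance() member with the usual (s2, score_cutoff) shape inherited from detail::CachedDistanceBase, which is declared elsewhere in the library:

#include <rapidfuzz/distance.hpp>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
    using rapidfuzz::experimental::CachedDamerauLevenshtein;

    std::string query = "damerau";
    CachedDamerauLevenshtein<char> scorer(query);

    std::vector<std::string> candidates = {"levenshtein", "damerau", "hamming"};
    for (const auto& candidate : candidates)
        std::printf("%-12s -> %zu\n", candidate.c_str(), scorer.distance(candidate));
}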
|
||||
140
src/external/rapidfuzz-cpp/rapidfuzz/distance/DamerauLevenshtein_impl.hpp
vendored
Normal file
@@ -0,0 +1,140 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <rapidfuzz/details/GrowingHashmap.hpp>
|
||||
#include <rapidfuzz/details/Matrix.hpp>
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
template <typename IntType>
|
||||
struct RowId {
|
||||
IntType val = -1;
|
||||
friend bool operator==(const RowId& lhs, const RowId& rhs)
|
||||
{
|
||||
return lhs.val == rhs.val;
|
||||
}
|
||||
|
||||
friend bool operator!=(const RowId& lhs, const RowId& rhs)
|
||||
{
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* based on the paper
|
||||
* "Linear space string correction algorithm using the Damerau-Levenshtein distance"
|
||||
* from Chunchun Zhao and Sartaj Sahni
|
||||
*/
|
||||
template <typename IntType, typename InputIt1, typename InputIt2>
|
||||
size_t damerau_levenshtein_distance_zhao(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t max)
|
||||
{
|
||||
// todo check types
|
||||
IntType len1 = static_cast<IntType>(s1.size());
|
||||
IntType len2 = static_cast<IntType>(s2.size());
|
||||
IntType maxVal = static_cast<IntType>(std::max(len1, len2) + 1);
|
||||
assert(std::numeric_limits<IntType>::max() > maxVal);
|
||||
|
||||
HybridGrowingHashmap<typename Range<InputIt1>::value_type, RowId<IntType>> last_row_id;
|
||||
size_t size = s2.size() + 2;
|
||||
assume(size != 0);
|
||||
std::vector<IntType> FR_arr(size, maxVal);
|
||||
std::vector<IntType> R1_arr(size, maxVal);
|
||||
std::vector<IntType> R_arr(size);
|
||||
R_arr[0] = maxVal;
|
||||
std::iota(R_arr.begin() + 1, R_arr.end(), IntType(0));
|
||||
|
||||
IntType* R = &R_arr[1];
|
||||
IntType* R1 = &R1_arr[1];
|
||||
IntType* FR = &FR_arr[1];
|
||||
|
||||
auto iter_s1 = s1.begin();
|
||||
for (IntType i = 1; i <= len1; i++) {
|
||||
std::swap(R, R1);
|
||||
IntType last_col_id = -1;
|
||||
IntType last_i2l1 = R[0];
|
||||
R[0] = i;
|
||||
IntType T = maxVal;
|
||||
|
||||
auto iter_s2 = s2.begin();
|
||||
for (IntType j = 1; j <= len2; j++) {
|
||||
int64_t diag = R1[j - 1] + static_cast<IntType>(*iter_s1 != *iter_s2);
|
||||
int64_t left = R[j - 1] + 1;
|
||||
int64_t up = R1[j] + 1;
|
||||
int64_t temp = std::min({diag, left, up});
|
||||
|
||||
if (*iter_s1 == *iter_s2) {
|
||||
                last_col_id = j; // last occurrence of s1_i
|
||||
FR[j] = R1[j - 2]; // save H_k-1,j-2
|
||||
T = last_i2l1; // save H_i-2,l-1
|
||||
}
|
||||
else {
|
||||
int64_t k = last_row_id.get(static_cast<uint64_t>(*iter_s2)).val;
|
||||
int64_t l = last_col_id;
|
||||
|
||||
if ((j - l) == 1) {
|
||||
int64_t transpose = FR[j] + (i - k);
|
||||
temp = std::min(temp, transpose);
|
||||
}
|
||||
else if ((i - k) == 1) {
|
||||
int64_t transpose = T + (j - l);
|
||||
temp = std::min(temp, transpose);
|
||||
}
|
||||
}
|
||||
|
||||
last_i2l1 = R[j];
|
||||
R[j] = static_cast<IntType>(temp);
|
||||
iter_s2++;
|
||||
}
|
||||
last_row_id[*iter_s1].val = i;
|
||||
iter_s1++;
|
||||
}
|
||||
|
||||
size_t dist = static_cast<size_t>(R[s2.size()]);
|
||||
return (dist <= max) ? dist : max + 1;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t damerau_levenshtein_distance(Range<InputIt1> s1, Range<InputIt2> s2, size_t max)
|
||||
{
|
||||
size_t min_edits = abs_diff(s1.size(), s2.size());
|
||||
if (min_edits > max) return max + 1;
|
||||
|
||||
    /* a common affix does not affect the Levenshtein distance */
|
||||
remove_common_affix(s1, s2);
|
||||
|
||||
size_t maxVal = std::max(s1.size(), s2.size()) + 1;
|
||||
if (std::numeric_limits<int16_t>::max() > maxVal)
|
||||
return damerau_levenshtein_distance_zhao<int16_t>(s1, s2, max);
|
||||
else if (std::numeric_limits<int32_t>::max() > maxVal)
|
||||
return damerau_levenshtein_distance_zhao<int32_t>(s1, s2, max);
|
||||
else
|
||||
return damerau_levenshtein_distance_zhao<int64_t>(s1, s2, max);
|
||||
}
|
||||
|
||||
class DamerauLevenshtein
|
||||
: public DistanceBase<DamerauLevenshtein, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend DistanceBase<DamerauLevenshtein, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<DamerauLevenshtein>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint)
|
||||
{
|
||||
return damerau_levenshtein_distance(s1, s2, score_cutoff);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
172
src/external/rapidfuzz-cpp/rapidfuzz/distance/Hamming.hpp
vendored
Normal file
@@ -0,0 +1,172 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <limits>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/distance/Hamming_impl.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
/**
|
||||
* @brief Calculates the Hamming distance between two strings.
|
||||
*
|
||||
* @details
|
||||
* Both strings require a similar length
|
||||
*
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param max
|
||||
 * Maximum Hamming distance between s1 and s2 that is
|
||||
 * considered a result. If the distance is bigger than max,
|
||||
* max + 1 is returned instead. Default is std::numeric_limits<size_t>::max(),
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
* @return Hamming distance between s1 and s2
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, bool pad_ = true,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Hamming::distance(first1, last1, first2, last2, pad_, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t hamming_distance(const Sentence1& s1, const Sentence2& s2, bool pad_ = true,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Hamming::distance(s1, s2, pad_, score_cutoff, score_cutoff);
|
||||
}
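With the default pad_ = true the shorter input is conceptually padded, so the length difference is counted as mismatches; with pad_ = false sequences of different length are rejected, as implemented in Hamming_impl.hpp further down. A small sketch:

#include <rapidfuzz/distance/Hamming.hpp>
#include <cassert>
#include <stdexcept>
#include <string>

int main()
{
    /* three mismatching positions */
    assert(rapidfuzz::hamming_distance(std::string("karolin"), std::string("kathrin")) == 3);

    /* pad_ = true (default): the two missing characters count as mismatches */
    assert(rapidfuzz::hamming_distance(std::string("abcd"), std::string("ab")) == 2);

    /* pad_ = false: unequal lengths throw std::invalid_argument */
    try {
        rapidfuzz::hamming_distance(std::string("abcd"), std::string("ab"), false);
        assert(false);
    }
    catch (const std::invalid_argument&) {
    }
}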
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t hamming_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, bool pad_ = true,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Hamming::similarity(first1, last1, first2, last2, pad_, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t hamming_similarity(const Sentence1& s1, const Sentence2& s2, bool pad_ = true, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Hamming::similarity(s1, s2, pad_, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double hamming_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
bool pad_ = true, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Hamming::normalized_distance(first1, last1, first2, last2, pad_, score_cutoff,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double hamming_normalized_distance(const Sentence1& s1, const Sentence2& s2, bool pad_ = true,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Hamming::normalized_distance(s1, s2, pad_, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops hamming_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, bool pad_ = true,
|
||||
size_t score_hint = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::hamming_editops(detail::Range(first1, last1), detail::Range(first2, last2), pad_,
|
||||
score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
Editops hamming_editops(const Sentence1& s1, const Sentence2& s2, bool pad_ = true,
|
||||
size_t score_hint = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::hamming_editops(detail::Range(s1), detail::Range(s2), pad_, score_hint);
|
||||
}
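hamming_editops records one Replace per mismatching position (plus trailing Delete/Insert operations when the lengths differ), so the result can be fed straight into editops_apply_str from distance.hpp. Sketch:

#include <rapidfuzz/distance.hpp>
#include <cassert>
#include <string>

int main()
{
    std::string s1 = "karolin";
    std::string s2 = "kathrin";

    rapidfuzz::Editops ops = rapidfuzz::hamming_editops(s1, s2);
    assert(ops.size() == 3); /* one Replace per mismatching position */

    /* replaying the operations on the pair reconstructs s2 */
    assert(rapidfuzz::editops_apply_str<char>(ops, s1, s2) == s2);
}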
|
||||
|
||||
/**
|
||||
* @brief Calculates a normalized hamming similarity
|
||||
*
|
||||
* @details
|
||||
 * Both strings require a similar length
|
||||
*
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param score_cutoff
|
||||
* Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
* For ratio < score_cutoff 0 is returned instead. Default is 0,
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
 * @return Normalized hamming similarity between s1 and s2
|
||||
* as a float between 0 and 1.0
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double hamming_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
bool pad_ = true, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Hamming::normalized_similarity(first1, last1, first2, last2, pad_, score_cutoff,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double hamming_normalized_similarity(const Sentence1& s1, const Sentence2& s2, bool pad_ = true,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Hamming::normalized_similarity(s1, s2, pad_, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedHamming : public detail::CachedDistanceBase<CachedHamming<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedHamming(const Sentence1& s1_, bool pad_ = true)
|
||||
: CachedHamming(detail::to_begin(s1_), detail::to_end(s1_), pad_)
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedHamming(InputIt1 first1, InputIt1 last1, bool pad_ = true) : s1(first1, last1), pad(pad_)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedDistanceBase<CachedHamming<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedHamming<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
return detail::Hamming::distance(s1, s2, pad, score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
bool pad;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedHamming(const Sentence1& s1_, bool pad_ = true) -> CachedHamming<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedHamming(InputIt1 first1, InputIt1 last1, bool pad_ = true) -> CachedHamming<iter_value_t<InputIt1>>;
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace rapidfuzz
|
||||
60
src/external/rapidfuzz-cpp/rapidfuzz/distance/Hamming_impl.hpp
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
class Hamming : public DistanceBase<Hamming, size_t, 0, std::numeric_limits<int64_t>::max(), bool> {
|
||||
friend DistanceBase<Hamming, size_t, 0, std::numeric_limits<int64_t>::max(), bool>;
|
||||
friend NormalizedMetricBase<Hamming, bool>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2, bool)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, bool pad,
|
||||
size_t score_cutoff, [[maybe_unused]] size_t score_hint)
|
||||
{
|
||||
if (!pad && s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length.");
|
||||
|
||||
size_t min_len = std::min(s1.size(), s2.size());
|
||||
size_t dist = std::max(s1.size(), s2.size());
|
||||
auto iter_s1 = s1.begin();
|
||||
auto iter_s2 = s2.begin();
|
||||
for (size_t i = 0; i < min_len; ++i)
|
||||
dist -= bool(*(iter_s1++) == *(iter_s2++));
|
||||
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops hamming_editops(const Range<InputIt1>& s1, const Range<InputIt2>& s2, bool pad, size_t)
|
||||
{
|
||||
if (!pad && s1.size() != s2.size()) throw std::invalid_argument("Sequences are not the same length.");
|
||||
|
||||
Editops ops;
|
||||
size_t min_len = std::min(s1.size(), s2.size());
|
||||
size_t i = 0;
|
||||
for (; i < min_len; ++i)
|
||||
if (s1[i] != s2[i]) ops.emplace_back(EditType::Replace, i, i);
|
||||
|
||||
for (; i < s1.size(); ++i)
|
||||
ops.emplace_back(EditType::Delete, i, s2.size());
|
||||
|
||||
for (; i < s2.size(); ++i)
|
||||
ops.emplace_back(EditType::Insert, s1.size(), i);
|
||||
|
||||
ops.set_src_len(s1.size());
|
||||
ops.set_dest_len(s2.size());
|
||||
return ops;
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
191
src/external/rapidfuzz-cpp/rapidfuzz/distance/Indel.hpp
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <limits>
|
||||
#include <rapidfuzz/distance/Indel_impl.hpp>
|
||||
#include <rapidfuzz/distance/LCSseq.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Indel::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t indel_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Indel::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
                        size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Indel::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t indel_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Indel::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double indel_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Indel::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double indel_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Indel::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double indel_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Indel::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double indel_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Indel::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops indel_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
return lcs_seq_editops(first1, last1, first2, last2);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
Editops indel_editops(const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return lcs_seq_editops(s1, s2);
|
||||
}
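Indel is the insertion/deletion-only variant: a substitution has to be expressed as one deletion plus one insertion, and the metric is computed from the LCS similarity (see Indel_impl.hpp below). Sketch:

#include <rapidfuzz/distance/Indel.hpp>
#include <cassert>
#include <string>

int main()
{
    /* one inserted character */
    assert(rapidfuzz::indel_distance(std::string("abc"), std::string("abcd")) == 1);

    /* a substitution costs a deletion plus an insertion */
    assert(rapidfuzz::indel_distance(std::string("abc"), std::string("abX")) == 2);

    /* normalized similarity = 1 - dist / (len1 + len2); here 1 - 1/7 */
    double sim = rapidfuzz::indel_normalized_similarity(std::string("abc"), std::string("abcd"));
    assert(sim > 0.856 && sim < 0.858);
}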
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiIndel
|
||||
: public detail::MultiDistanceBase<MultiIndel<MaxLen>, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
private:
|
||||
friend detail::MultiDistanceBase<MultiIndel<MaxLen>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::MultiNormalizedMetricBase<MultiIndel<MaxLen>, size_t>;
|
||||
|
||||
public:
|
||||
MultiIndel(size_t count) : scorer(count)
|
||||
{}
|
||||
|
||||
/**
|
||||
* @brief get minimum size required for result vectors passed into
|
||||
* - distance
|
||||
* - similarity
|
||||
* - normalized_distance
|
||||
* - normalized_similarity
|
||||
*
|
||||
* @return minimum vector size
|
||||
*/
|
||||
size_t result_count() const
|
||||
{
|
||||
return scorer.result_count();
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
scorer.insert(first1, last1);
|
||||
str_lens.push_back(static_cast<size_t>(std::distance(first1, last1)));
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename InputIt2>
|
||||
void _distance(size_t* scores, size_t score_count, const detail::Range<InputIt2>& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max()) const
|
||||
{
|
||||
scorer.similarity(scores, score_count, s2);
|
||||
|
||||
for (size_t i = 0; i < get_input_count(); ++i) {
|
||||
size_t maximum_ = maximum(i, s2);
|
||||
size_t dist = maximum_ - 2 * scores[i];
|
||||
scores[i] = (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(size_t s1_idx, const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return str_lens[s1_idx] + s2.size();
|
||||
}
|
||||
|
||||
size_t get_input_count() const noexcept
|
||||
{
|
||||
return str_lens.size();
|
||||
}
|
||||
|
||||
std::vector<size_t> str_lens;
|
||||
MultiLCSseq<MaxLen> scorer;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedIndel
|
||||
: public detail::CachedDistanceBase<CachedIndel<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedIndel(const Sentence1& s1_) : CachedIndel(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedIndel(InputIt1 first1, InputIt1 last1)
|
||||
: s1_len(static_cast<size_t>(std::distance(first1, last1))), scorer(first1, last1)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedDistanceBase<CachedIndel<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedIndel<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return s1_len + s2.size();
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff, size_t score_hint) const
|
||||
{
|
||||
size_t maximum_ = maximum(s2);
|
||||
size_t lcs_cutoff = (maximum_ / 2 >= score_cutoff) ? maximum_ / 2 - score_cutoff : 0;
|
||||
size_t lcs_cutoff_hint = (maximum_ / 2 >= score_hint) ? maximum_ / 2 - score_hint : 0;
|
||||
size_t lcs_sim = scorer.similarity(s2, lcs_cutoff, lcs_cutoff_hint);
|
||||
size_t dist = maximum_ - 2 * lcs_sim;
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
size_t s1_len;
|
||||
CachedLCSseq<CharT1> scorer;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedIndel(const Sentence1& s1_) -> CachedIndel<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedIndel(InputIt1 first1, InputIt1 last1) -> CachedIndel<iter_value_t<InputIt1>>;
|
||||
|
||||
} // namespace rapidfuzz
|
||||
68
src/external/rapidfuzz-cpp/rapidfuzz/distance/Indel_impl.hpp
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#include <rapidfuzz/details/PatternMatchVector.hpp>
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <rapidfuzz/distance/LCSseq.hpp>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t indel_distance(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
|
||||
const Range<InputIt2>& s2, size_t score_cutoff)
|
||||
{
|
||||
size_t maximum = s1.size() + s2.size();
|
||||
size_t lcs_cutoff = (maximum / 2 >= score_cutoff) ? maximum / 2 - score_cutoff : 0;
|
||||
size_t lcs_sim = lcs_seq_similarity(block, s1, s2, lcs_cutoff);
|
||||
size_t dist = maximum - 2 * lcs_sim;
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double indel_normalized_distance(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
|
||||
const Range<InputIt2>& s2, double score_cutoff)
|
||||
{
|
||||
size_t maximum = s1.size() + s2.size();
|
||||
size_t cutoff_distance = static_cast<size_t>(std::ceil(static_cast<double>(maximum) * score_cutoff));
|
||||
size_t dist = indel_distance(block, s1, s2, cutoff_distance);
|
||||
double norm_dist = (maximum) ? static_cast<double>(dist) / static_cast<double>(maximum) : 0.0;
|
||||
return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double indel_normalized_similarity(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
|
||||
const Range<InputIt2>& s2, double score_cutoff)
|
||||
{
|
||||
double cutoff_score = NormSim_to_NormDist(score_cutoff);
|
||||
double norm_dist = indel_normalized_distance(block, s1, s2, cutoff_score);
|
||||
double norm_sim = 1.0 - norm_dist;
|
||||
return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
|
||||
}
|
||||
|
||||
class Indel : public DistanceBase<Indel, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend DistanceBase<Indel, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<Indel>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return s1.size() + s2.size();
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff,
|
||||
size_t score_hint)
|
||||
{
|
||||
size_t maximum = Indel::maximum(s1, s2);
|
||||
size_t lcs_cutoff = (maximum / 2 >= score_cutoff) ? maximum / 2 - score_cutoff : 0;
|
||||
size_t lcs_hint = (maximum / 2 >= score_hint) ? maximum / 2 - score_hint : 0;
|
||||
size_t lcs_sim = LCSseq::similarity(s1, s2, lcs_cutoff, lcs_hint);
|
||||
size_t dist = maximum - 2 * lcs_sim;
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
231
src/external/rapidfuzz-cpp/rapidfuzz/distance/Jaro.hpp
vendored
Normal file
@@ -0,0 +1,231 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/distance/Jaro_impl.hpp>
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Jaro::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Jaro::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Jaro::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Jaro::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Jaro::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Jaro::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Jaro::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Jaro::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
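jaro_similarity already returns a value in [0, 1], so the normalized_* overloads exist mainly for interface symmetry with the other metrics. The classic MARTHA/MARHTA pair (6 matches, 1 transposition) scores about 0.944:

#include <rapidfuzz/distance/Jaro.hpp>
#include <cassert>
#include <cmath>
#include <string>

int main()
{
    std::string s1 = "MARTHA";
    std::string s2 = "MARHTA";

    /* (6/6 + 6/6 + 5/6) / 3 ~= 0.944 */
    double sim = rapidfuzz::jaro_similarity(s1, s2);
    assert(std::fabs(sim - 0.9444) < 0.001);

    /* jaro_distance is the complement of the similarity */
    assert(std::fabs(rapidfuzz::jaro_distance(s1, s2) - (1.0 - sim)) < 1e-9);
}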
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiJaro : public detail::MultiSimilarityBase<MultiJaro<MaxLen>, double, 0, 1> {
|
||||
|
||||
private:
|
||||
friend detail::MultiSimilarityBase<MultiJaro<MaxLen>, double, 0, 1>;
|
||||
friend detail::MultiNormalizedMetricBase<MultiJaro<MaxLen>, double>;
|
||||
|
||||
static_assert(MaxLen == 8 || MaxLen == 16 || MaxLen == 32 || MaxLen == 64);
|
||||
|
||||
using VecType = typename std::conditional_t<
|
||||
MaxLen == 8, uint8_t,
|
||||
typename std::conditional_t<MaxLen == 16, uint16_t,
|
||||
typename std::conditional_t<MaxLen == 32, uint32_t, uint64_t>>>;
|
||||
|
||||
constexpr static size_t get_vec_size()
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
return detail::simd_avx2::native_simd<VecType>::size;
|
||||
# else
|
||||
return detail::simd_sse2::native_simd<VecType>::size;
|
||||
# endif
|
||||
}
|

    constexpr static size_t get_vec_alignment()
    {
#    ifdef RAPIDFUZZ_AVX2
        return detail::simd_avx2::native_simd<VecType>::alignment;
#    else
        return detail::simd_sse2::native_simd<VecType>::alignment;
#    endif
    }

    constexpr static size_t find_block_count(size_t count)
    {
        size_t vec_size = get_vec_size();
        size_t simd_vec_count = detail::ceil_div(count, vec_size);
        return detail::ceil_div(simd_vec_count * vec_size * MaxLen, 64);
    }

public:
    MultiJaro(size_t count) : input_count(count), PM(find_block_count(count) * 64)
    {
        /* align for avx2 so we can directly load into avx2 registers */
        str_lens_size = result_count();

        str_lens = static_cast<VecType*>(
            detail::rf_aligned_alloc(get_vec_alignment(), sizeof(VecType) * str_lens_size));
        std::fill(str_lens, str_lens + str_lens_size, VecType(0));
    }

    ~MultiJaro()
    {
        detail::rf_aligned_free(str_lens);
    }

    /**
     * @brief get minimum size required for result vectors passed into
     * - distance
     * - similarity
     * - normalized_distance
     * - normalized_similarity
     *
     * @return minimum vector size
     */
    size_t result_count() const
    {
        size_t vec_size = get_vec_size();
        size_t simd_vec_count = detail::ceil_div(input_count, vec_size);
        return simd_vec_count * vec_size;
    }

    template <typename Sentence1>
    void insert(const Sentence1& s1_)
    {
        insert(detail::to_begin(s1_), detail::to_end(s1_));
    }

    template <typename InputIt1>
    void insert(InputIt1 first1, InputIt1 last1)
    {
        auto len = std::distance(first1, last1);
        int block_pos = static_cast<int>((pos * MaxLen) % 64);
        auto block = (pos * MaxLen) / 64;
        assert(len <= MaxLen);

        if (pos >= input_count) throw std::invalid_argument("out of bounds insert");

        str_lens[pos] = static_cast<VecType>(len);
        for (; first1 != last1; ++first1) {
            PM.insert(block, *first1, block_pos);
            block_pos++;
        }
        pos++;
    }

private:
    template <typename InputIt2>
    void _similarity(double* scores, size_t score_count, const detail::Range<InputIt2>& s2,
                     double score_cutoff = 0.0) const
    {
        if (score_count < result_count())
            throw std::invalid_argument("scores has to have >= result_count() elements");

        detail::Range scores_(scores, scores + score_count);
        detail::jaro_similarity_simd<VecType>(scores_, PM, str_lens, str_lens_size, s2, score_cutoff);
    }

    template <typename InputIt2>
    double maximum([[maybe_unused]] size_t s1_idx, const detail::Range<InputIt2>&) const
    {
        return 1.0;
    }

    size_t get_input_count() const noexcept
    {
        return input_count;
    }

    size_t input_count;
    size_t pos = 0;
    detail::BlockPatternMatchVector PM;
    VecType* str_lens;
    size_t str_lens_size;
};

} /* namespace experimental */
#endif /* RAPIDFUZZ_SIMD */

template <typename CharT1>
struct CachedJaro : public detail::CachedSimilarityBase<CachedJaro<CharT1>, double, 0, 1> {
    template <typename Sentence1>
    explicit CachedJaro(const Sentence1& s1_) : CachedJaro(detail::to_begin(s1_), detail::to_end(s1_))
    {}

    template <typename InputIt1>
    CachedJaro(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(detail::Range(first1, last1))
    {}

private:
    friend detail::CachedSimilarityBase<CachedJaro<CharT1>, double, 0, 1>;
    friend detail::CachedNormalizedMetricBase<CachedJaro<CharT1>>;

    template <typename InputIt2>
    double maximum(const detail::Range<InputIt2>&) const
    {
        return 1.0;
    }

    template <typename InputIt2>
    double _similarity(const detail::Range<InputIt2>& s2, double score_cutoff,
                       [[maybe_unused]] double score_hint) const
    {
        return detail::jaro_similarity(PM, detail::Range(s1), s2, score_cutoff);
    }

    std::vector<CharT1> s1;
    detail::BlockPatternMatchVector PM;
};

template <typename Sentence1>
explicit CachedJaro(const Sentence1& s1_) -> CachedJaro<char_type<Sentence1>>;

template <typename InputIt1>
CachedJaro(InputIt1 first1, InputIt1 last1) -> CachedJaro<iter_value_t<InputIt1>>;

} // namespace rapidfuzz
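A minimal usage sketch for the classes declared above (not part of the vendored file). The public similarity() entry points used here come from the CRTP bases CachedSimilarityBase / MultiSimilarityBase, which are defined elsewhere in the library, so their exact signatures are assumed; the buffer sizing via result_count() follows the doc comment above, and the MaxLen of 16 is an illustrative choice.

// usage sketch, not part of Jaro.hpp
#include <rapidfuzz/distance/Jaro.hpp>
#include <string>
#include <vector>

double jaro_score(const std::string& pattern, const std::string& candidate)
{
    rapidfuzz::CachedJaro scorer(pattern); // pattern bitmap built once, reusable for many candidates
    return scorer.similarity(candidate);   // result in [0, 1]
}

#ifdef RAPIDFUZZ_SIMD
// score many short needles against one haystack in a single SIMD pass
std::vector<double> jaro_scores(const std::vector<std::string>& needles, const std::string& haystack)
{
    rapidfuzz::experimental::MultiJaro<16> scorer(needles.size()); // every needle must be <= 16 chars
    for (const auto& n : needles)
        scorer.insert(n);

    std::vector<double> scores(scorer.result_count()); // must hold >= result_count() elements
    scorer.similarity(scores.data(), scores.size(), haystack);
    return scores;
}
#endif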
210
src/external/rapidfuzz-cpp/rapidfuzz/distance/JaroWinkler.hpp
vendored
Normal file
@@ -0,0 +1,210 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#pragma once

#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/distance/JaroWinkler_impl.hpp>

namespace rapidfuzz {

template <typename InputIt1, typename InputIt2,
          typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
double jaro_winkler_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                             double prefix_weight = 0.1, double score_cutoff = 1.0)
{
    return detail::JaroWinkler::distance(first1, last1, first2, last2, prefix_weight, score_cutoff,
                                         score_cutoff);
}

template <typename Sentence1, typename Sentence2>
double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1,
                             double score_cutoff = 1.0)
{
    return detail::JaroWinkler::distance(s1, s2, prefix_weight, score_cutoff, score_cutoff);
}

template <typename InputIt1, typename InputIt2,
          typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
double jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                               double prefix_weight = 0.1, double score_cutoff = 0.0)
{
    return detail::JaroWinkler::similarity(first1, last1, first2, last2, prefix_weight, score_cutoff,
                                           score_cutoff);
}

template <typename Sentence1, typename Sentence2>
double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    return detail::JaroWinkler::similarity(s1, s2, prefix_weight, score_cutoff, score_cutoff);
}

template <typename InputIt1, typename InputIt2,
          typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
double jaro_winkler_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                                        double prefix_weight = 0.1, double score_cutoff = 1.0)
{
    return detail::JaroWinkler::normalized_distance(first1, last1, first2, last2, prefix_weight, score_cutoff,
                                                    score_cutoff);
}

template <typename Sentence1, typename Sentence2>
double jaro_winkler_normalized_distance(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1,
                                        double score_cutoff = 1.0)
{
    return detail::JaroWinkler::normalized_distance(s1, s2, prefix_weight, score_cutoff, score_cutoff);
}

template <typename InputIt1, typename InputIt2,
          typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
double jaro_winkler_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                                          double prefix_weight = 0.1, double score_cutoff = 0.0)
{
    return detail::JaroWinkler::normalized_similarity(first1, last1, first2, last2, prefix_weight,
                                                      score_cutoff, score_cutoff);
}

template <typename Sentence1, typename Sentence2>
double jaro_winkler_normalized_similarity(const Sentence1& s1, const Sentence2& s2,
                                          double prefix_weight = 0.1, double score_cutoff = 0.0)
{
    return detail::JaroWinkler::normalized_similarity(s1, s2, prefix_weight, score_cutoff, score_cutoff);
}

#ifdef RAPIDFUZZ_SIMD
namespace experimental {
template <int MaxLen>
struct MultiJaroWinkler : public detail::MultiSimilarityBase<MultiJaroWinkler<MaxLen>, double, 0, 1> {

private:
    friend detail::MultiSimilarityBase<MultiJaroWinkler<MaxLen>, double, 0, 1>;
    friend detail::MultiNormalizedMetricBase<MultiJaroWinkler<MaxLen>, double>;

public:
    MultiJaroWinkler(size_t count, double prefix_weight_ = 0.1) : scorer(count), prefix_weight(prefix_weight_)
    {}

    /**
     * @brief get minimum size required for result vectors passed into
     * - distance
     * - similarity
     * - normalized_distance
     * - normalized_similarity
     *
     * @return minimum vector size
     */
    size_t result_count() const
    {
        return scorer.result_count();
    }

    template <typename Sentence1>
    void insert(const Sentence1& s1_)
    {
        insert(detail::to_begin(s1_), detail::to_end(s1_));
    }

    template <typename InputIt1>
    void insert(InputIt1 first1, InputIt1 last1)
    {
        scorer.insert(first1, last1);
        size_t len = static_cast<size_t>(std::distance(first1, last1));
        std::array<uint64_t, 4> prefix;
        for (size_t i = 0; i < std::min(len, size_t(4)); ++i)
            prefix[i] = static_cast<uint64_t>(first1[static_cast<ptrdiff_t>(i)]);

        str_lens.push_back(len);
        prefixes.push_back(prefix);
    }

private:
    template <typename InputIt2>
    void _similarity(double* scores, size_t score_count, const detail::Range<InputIt2>& s2,
                     double score_cutoff = 0.0) const
    {
        if (score_count < result_count())
            throw std::invalid_argument("scores has to have >= result_count() elements");

        scorer.similarity(scores, score_count, s2, std::min(0.7, score_cutoff));

        for (size_t i = 0; i < get_input_count(); ++i) {
            if (scores[i] > 0.7) {
                size_t min_len = std::min(s2.size(), str_lens[i]);
                size_t max_prefix = std::min(min_len, size_t(4));
                size_t prefix = 0;
                for (; prefix < max_prefix; ++prefix)
                    if (static_cast<uint64_t>(s2[prefix]) != prefixes[i][prefix]) break;

                scores[i] += static_cast<double>(prefix) * prefix_weight * (1.0 - scores[i]);
                scores[i] = std::min(scores[i], 1.0);
            }

            if (scores[i] < score_cutoff) scores[i] = 0.0;
        }
    }

    template <typename InputIt2>
    double maximum([[maybe_unused]] size_t s1_idx, const detail::Range<InputIt2>&) const
    {
        return 1.0;
    }

    size_t get_input_count() const noexcept
    {
        return str_lens.size();
    }

    std::vector<size_t> str_lens;
    // todo this could lead to incorrect results when comparing uint64_t with int64_t
    std::vector<std::array<uint64_t, 4>> prefixes;
    MultiJaro<MaxLen> scorer;
    double prefix_weight;
};

} /* namespace experimental */
#endif /* RAPIDFUZZ_SIMD */

template <typename CharT1>
struct CachedJaroWinkler : public detail::CachedSimilarityBase<CachedJaroWinkler<CharT1>, double, 0, 1> {
    template <typename Sentence1>
    explicit CachedJaroWinkler(const Sentence1& s1_, double _prefix_weight = 0.1)
        : CachedJaroWinkler(detail::to_begin(s1_), detail::to_end(s1_), _prefix_weight)
    {}

    template <typename InputIt1>
    CachedJaroWinkler(InputIt1 first1, InputIt1 last1, double _prefix_weight = 0.1)
        : prefix_weight(_prefix_weight), s1(first1, last1), PM(detail::Range(first1, last1))
    {}

private:
    friend detail::CachedSimilarityBase<CachedJaroWinkler<CharT1>, double, 0, 1>;
    friend detail::CachedNormalizedMetricBase<CachedJaroWinkler<CharT1>>;

    template <typename InputIt2>
    double maximum(const detail::Range<InputIt2>&) const
    {
        return 1.0;
    }

    template <typename InputIt2>
    double _similarity(const detail::Range<InputIt2>& s2, double score_cutoff,
                       [[maybe_unused]] double score_hint) const
    {
        return detail::jaro_winkler_similarity(PM, detail::Range(s1), s2, prefix_weight, score_cutoff);
    }

    double prefix_weight;
    std::vector<CharT1> s1;
    detail::BlockPatternMatchVector PM;
};

template <typename Sentence1>
explicit CachedJaroWinkler(const Sentence1& s1_,
                           double _prefix_weight = 0.1) -> CachedJaroWinkler<char_type<Sentence1>>;

template <typename InputIt1>
CachedJaroWinkler(InputIt1 first1, InputIt1 last1,
                  double _prefix_weight = 0.1) -> CachedJaroWinkler<iter_value_t<InputIt1>>;

} // namespace rapidfuzz
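A usage sketch for the free functions declared above (not part of the vendored file); this is the call shape Noggit-side code can use for fuzzy name matching. The threshold value is illustrative only.

// usage sketch, not part of JaroWinkler.hpp
#include <rapidfuzz/distance/JaroWinkler.hpp>
#include <string>

bool roughly_equal(const std::string& a, const std::string& b)
{
    // prefix_weight = 0.1 rewards up to 4 shared leading characters;
    // passing the threshold as score_cutoff lets the scorer bail out early
    // and return 0.0 for pairs that cannot reach it.
    double sim = rapidfuzz::jaro_winkler_similarity(a, b, 0.1, 0.85);
    return sim >= 0.85;
}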
90
src/external/rapidfuzz-cpp/rapidfuzz/distance/JaroWinkler_impl.hpp
vendored
Normal file
@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#include <rapidfuzz/distance/Jaro.hpp>

namespace rapidfuzz::detail {

template <typename InputIt1, typename InputIt2>
double jaro_winkler_similarity(const Range<InputIt1>& P, const Range<InputIt2>& T, double prefix_weight,
                               double score_cutoff)
{
    size_t P_len = P.size();
    size_t T_len = T.size();
    size_t min_len = std::min(P_len, T_len);
    size_t prefix = 0;
    size_t max_prefix = std::min(min_len, size_t(4));

    for (; prefix < max_prefix; ++prefix)
        if (T[prefix] != P[prefix]) break;

    double jaro_score_cutoff = score_cutoff;
    if (jaro_score_cutoff > 0.7) {
        double prefix_sim = static_cast<double>(prefix) * prefix_weight;

        if (prefix_sim >= 1.0)
            jaro_score_cutoff = 0.7;
        else
            jaro_score_cutoff = std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
    }

    double Sim = jaro_similarity(P, T, jaro_score_cutoff);
    if (Sim > 0.7) {
        Sim += static_cast<double>(prefix) * prefix_weight * (1.0 - Sim);
        Sim = std::min(Sim, 1.0);
    }

    return (Sim >= score_cutoff) ? Sim : 0;
}

template <typename InputIt1, typename InputIt2>
double jaro_winkler_similarity(const BlockPatternMatchVector& PM, const Range<InputIt1>& P,
                               const Range<InputIt2>& T, double prefix_weight, double score_cutoff)
{
    size_t P_len = P.size();
    size_t T_len = T.size();
    size_t min_len = std::min(P_len, T_len);
    size_t prefix = 0;
    size_t max_prefix = std::min(min_len, size_t(4));

    for (; prefix < max_prefix; ++prefix)
        if (T[prefix] != P[prefix]) break;

    double jaro_score_cutoff = score_cutoff;
    if (jaro_score_cutoff > 0.7) {
        double prefix_sim = static_cast<double>(prefix) * prefix_weight;

        if (prefix_sim >= 1.0)
            jaro_score_cutoff = 0.7;
        else
            jaro_score_cutoff = std::max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0));
    }

    double Sim = jaro_similarity(PM, P, T, jaro_score_cutoff);
    if (Sim > 0.7) {
        Sim += static_cast<double>(prefix) * prefix_weight * (1.0 - Sim);
        Sim = std::min(Sim, 1.0);
    }

    return (Sim >= score_cutoff) ? Sim : 0;
}

class JaroWinkler : public SimilarityBase<JaroWinkler, double, 0, 1, double> {
    friend SimilarityBase<JaroWinkler, double, 0, 1, double>;
    friend NormalizedMetricBase<JaroWinkler, double>;

    template <typename InputIt1, typename InputIt2>
    static double maximum(const Range<InputIt1>&, const Range<InputIt2>&, double) noexcept
    {
        return 1.0;
    }

    template <typename InputIt1, typename InputIt2>
    static double _similarity(const Range<InputIt1>& s1, const Range<InputIt2>& s2, double prefix_weight,
                              double score_cutoff, [[maybe_unused]] double score_hint)
    {
        return jaro_winkler_similarity(s1, s2, prefix_weight, score_cutoff);
    }
};

} // namespace rapidfuzz::detail
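A short worked example of the prefix boost applied above (standard Jaro-Winkler reference values, added here, not taken from this diff):

// jaro_similarity("MARTHA", "MARHTA") = (6/6 + 6/6 + (6 - 1)/6) / 3 ≈ 0.9444
// shared prefix "MAR" -> prefix = 3, with prefix_weight = 0.1:
// Sim = 0.9444 + 3 * 0.1 * (1.0 - 0.9444) ≈ 0.9611
// The boost is only applied when the plain Jaro score already exceeds 0.7.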
845
src/external/rapidfuzz-cpp/rapidfuzz/distance/Jaro_impl.hpp
vendored
Normal file
@@ -0,0 +1,845 @@
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <rapidfuzz/details/PatternMatchVector.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
struct FlaggedCharsWord {
|
||||
uint64_t P_flag;
|
||||
uint64_t T_flag;
|
||||
};
|
||||
|
||||
struct FlaggedCharsMultiword {
|
||||
std::vector<uint64_t> P_flag;
|
||||
std::vector<uint64_t> T_flag;
|
||||
};
|
||||
|
||||
struct SearchBoundMask {
|
||||
size_t words = 0;
|
||||
size_t empty_words = 0;
|
||||
uint64_t last_mask = 0;
|
||||
uint64_t first_mask = 0;
|
||||
};
|
||||
|
||||
static inline double jaro_calculate_similarity(size_t P_len, size_t T_len, size_t CommonChars,
|
||||
size_t Transpositions)
|
||||
{
|
||||
Transpositions /= 2;
|
||||
double Sim = 0;
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
|
||||
Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
|
||||
static_cast<double>(CommonChars);
|
||||
return Sim / 3.0;
|
||||
}
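For reference, with m = CommonChars and t = Transpositions / 2 the value computed above is the textbook Jaro similarity (worked numbers added here, not part of the vendored file):

// Sim = (m / |P| + m / |T| + (m - t) / m) / 3
// e.g. P = "DWAYNE", T = "DUANE": m = 4, t = 0
//      Sim = (4/6 + 4/5 + 4/4) / 3 ≈ 0.822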
|
||||
|
||||
/**
|
||||
* @brief filter matches below score_cutoff based on string lengths
|
||||
*/
|
||||
static inline bool jaro_length_filter(size_t P_len, size_t T_len, double score_cutoff)
|
||||
{
|
||||
if (!T_len || !P_len) return false;
|
||||
|
||||
double min_len = static_cast<double>(std::min(P_len, T_len));
|
||||
double Sim = min_len / static_cast<double>(P_len) + min_len / static_cast<double>(T_len) + 1.0;
|
||||
Sim /= 3.0;
|
||||
return Sim >= score_cutoff;
|
||||
}
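The reasoning behind this filter (added note, not part of the vendored file): even in the best case every character of the shorter string matches with zero transpositions, so the score can never exceed (min_len/P_len + min_len/T_len + 1) / 3, and a pair whose upper bound already falls below score_cutoff is rejected without building any pattern bitmaps.

// e.g. P_len = 3, T_len = 10, score_cutoff = 0.8:
//      best possible Sim = (3/3 + 3/10 + 1) / 3 ≈ 0.767 < 0.8 -> filtered out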
|
||||
|
||||
/**
|
||||
* @brief filter matches below score_cutoff based on string lengths and common characters
|
||||
*/
|
||||
static inline bool jaro_common_char_filter(size_t P_len, size_t T_len, size_t CommonChars,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (!CommonChars) return false;
|
||||
|
||||
double Sim = 0;
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
|
||||
Sim += 1.0;
|
||||
Sim /= 3.0;
|
||||
return Sim >= score_cutoff;
|
||||
}
|
||||
|
||||
static inline size_t count_common_chars(const FlaggedCharsWord& flagged)
|
||||
{
|
||||
return popcount(flagged.P_flag);
|
||||
}
|
||||
|
||||
static inline size_t count_common_chars(const FlaggedCharsMultiword& flagged)
|
||||
{
|
||||
size_t CommonChars = 0;
|
||||
if (flagged.P_flag.size() < flagged.T_flag.size()) {
|
||||
for (uint64_t flag : flagged.P_flag) {
|
||||
CommonChars += popcount(flag);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (uint64_t flag : flagged.T_flag) {
|
||||
CommonChars += popcount(flag);
|
||||
}
|
||||
}
|
||||
return CommonChars;
|
||||
}
|
||||
|
||||
template <typename PM_Vec, typename InputIt1, typename InputIt2>
|
||||
static inline FlaggedCharsWord flag_similar_characters_word(const PM_Vec& PM,
|
||||
[[maybe_unused]] const Range<InputIt1>& P,
|
||||
const Range<InputIt2>& T, size_t Bound)
|
||||
{
|
||||
assert(P.size() <= 64);
|
||||
assert(T.size() <= 64);
|
||||
assert(Bound > P.size() || P.size() - Bound <= T.size());
|
||||
|
||||
FlaggedCharsWord flagged = {0, 0};
|
||||
|
||||
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);
|
||||
|
||||
size_t j = 0;
|
||||
auto T_iter = T.begin();
|
||||
for (; j < std::min(Bound, T.size()); ++j, ++T_iter) {
|
||||
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
|
||||
|
||||
flagged.P_flag |= blsi(PM_j);
|
||||
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
|
||||
|
||||
BoundMask = (BoundMask << 1) | 1;
|
||||
}
|
||||
|
||||
for (; j < T.size(); ++j, ++T_iter) {
|
||||
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);
|
||||
|
||||
flagged.P_flag |= blsi(PM_j);
|
||||
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
|
||||
|
||||
BoundMask <<= 1;
|
||||
}
|
||||
|
||||
return flagged;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static inline void flag_similar_characters_step(const BlockPatternMatchVector& PM, CharT T_j,
|
||||
FlaggedCharsMultiword& flagged, size_t j,
|
||||
SearchBoundMask BoundMask)
|
||||
{
|
||||
size_t j_word = j / 64;
|
||||
size_t j_pos = j % 64;
|
||||
size_t word = BoundMask.empty_words;
|
||||
size_t last_word = word + BoundMask.words;
|
||||
|
||||
if (BoundMask.words == 1) {
|
||||
uint64_t PM_j =
|
||||
PM.get(word, T_j) & BoundMask.last_mask & BoundMask.first_mask & (~flagged.P_flag[word]);
|
||||
|
||||
flagged.P_flag[word] |= blsi(PM_j);
|
||||
flagged.T_flag[j_word] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
|
||||
return;
|
||||
}
|
||||
|
||||
if (BoundMask.first_mask) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & BoundMask.first_mask & (~flagged.P_flag[word]);
|
||||
|
||||
if (PM_j) {
|
||||
flagged.P_flag[word] |= blsi(PM_j);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
word++;
|
||||
}
|
||||
|
||||
/* unroll for better performance on long sequences when access is fast */
|
||||
if (T_j >= 0 && T_j < 256) {
|
||||
for (; word + 3 < last_word - 1; word += 4) {
|
||||
uint64_t PM_j[4];
|
||||
unroll<int, 4>([&](auto i) {
|
||||
PM_j[i] = PM.get(word + i, static_cast<uint8_t>(T_j)) & (~flagged.P_flag[word + i]);
|
||||
});
|
||||
|
||||
if (PM_j[0]) {
|
||||
flagged.P_flag[word] |= blsi(PM_j[0]);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
if (PM_j[1]) {
|
||||
flagged.P_flag[word + 1] |= blsi(PM_j[1]);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
if (PM_j[2]) {
|
||||
flagged.P_flag[word + 2] |= blsi(PM_j[2]);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
if (PM_j[3]) {
|
||||
flagged.P_flag[word + 3] |= blsi(PM_j[3]);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; word < last_word - 1; ++word) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & (~flagged.P_flag[word]);
|
||||
|
||||
if (PM_j) {
|
||||
flagged.P_flag[word] |= blsi(PM_j);
|
||||
flagged.T_flag[j_word] |= 1ull << j_pos;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (BoundMask.last_mask) {
|
||||
uint64_t PM_j = PM.get(word, T_j) & BoundMask.last_mask & (~flagged.P_flag[word]);
|
||||
|
||||
flagged.P_flag[word] |= blsi(PM_j);
|
||||
flagged.T_flag[j_word] |= static_cast<uint64_t>(PM_j != 0) << j_pos;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static inline FlaggedCharsMultiword flag_similar_characters_block(const BlockPatternMatchVector& PM,
|
||||
const Range<InputIt1>& P,
|
||||
const Range<InputIt2>& T, size_t Bound)
|
||||
{
|
||||
assert(P.size() > 64 || T.size() > 64);
|
||||
assert(Bound > P.size() || P.size() - Bound <= T.size());
|
||||
assert(Bound >= 31);
|
||||
|
||||
FlaggedCharsMultiword flagged;
|
||||
flagged.T_flag.resize(ceil_div(T.size(), 64));
|
||||
flagged.P_flag.resize(ceil_div(P.size(), 64));
|
||||
|
||||
SearchBoundMask BoundMask;
|
||||
size_t start_range = std::min(Bound + 1, P.size());
|
||||
BoundMask.words = 1 + start_range / 64;
|
||||
BoundMask.empty_words = 0;
|
||||
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
|
||||
BoundMask.first_mask = ~UINT64_C(0);
|
||||
|
||||
auto T_iter = T.begin();
|
||||
for (size_t j = 0; j < T.size(); ++j, ++T_iter) {
|
||||
flag_similar_characters_step(PM, *T_iter, flagged, j, BoundMask);
|
||||
|
||||
if (j + Bound + 1 < P.size()) {
|
||||
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
|
||||
if (j + Bound + 2 < P.size() && BoundMask.last_mask == ~UINT64_C(0)) {
|
||||
BoundMask.last_mask = 0;
|
||||
BoundMask.words++;
|
||||
}
|
||||
}
|
||||
|
||||
if (j >= Bound) {
|
||||
BoundMask.first_mask <<= 1;
|
||||
if (BoundMask.first_mask == 0) {
|
||||
BoundMask.first_mask = ~UINT64_C(0);
|
||||
BoundMask.words--;
|
||||
BoundMask.empty_words++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return flagged;
|
||||
}
|
||||
|
||||
template <typename PM_Vec, typename InputIt1>
|
||||
static inline size_t count_transpositions_word(const PM_Vec& PM, const Range<InputIt1>& T,
|
||||
const FlaggedCharsWord& flagged)
|
||||
{
|
||||
uint64_t P_flag = flagged.P_flag;
|
||||
uint64_t T_flag = flagged.T_flag;
|
||||
|
||||
size_t Transpositions = 0;
|
||||
while (T_flag) {
|
||||
uint64_t PatternFlagMask = blsi(P_flag);
|
||||
|
||||
Transpositions += !(PM.get(0, T[countr_zero(T_flag)]) & PatternFlagMask);
|
||||
|
||||
T_flag = blsr(T_flag);
|
||||
P_flag ^= PatternFlagMask;
|
||||
}
|
||||
|
||||
return Transpositions;
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
static inline size_t count_transpositions_block(const BlockPatternMatchVector& PM, const Range<InputIt1>& T,
|
||||
const FlaggedCharsMultiword& flagged, size_t FlaggedChars)
|
||||
{
|
||||
size_t TextWord = 0;
|
||||
size_t PatternWord = 0;
|
||||
uint64_t T_flag = flagged.T_flag[TextWord];
|
||||
uint64_t P_flag = flagged.P_flag[PatternWord];
|
||||
|
||||
auto T_first = T.begin();
|
||||
size_t Transpositions = 0;
|
||||
while (FlaggedChars) {
|
||||
while (!T_flag) {
|
||||
TextWord++;
|
||||
T_first += 64;
|
||||
T_flag = flagged.T_flag[TextWord];
|
||||
}
|
||||
|
||||
while (T_flag) {
|
||||
while (!P_flag) {
|
||||
PatternWord++;
|
||||
P_flag = flagged.P_flag[PatternWord];
|
||||
}
|
||||
|
||||
uint64_t PatternFlagMask = blsi(P_flag);
|
||||
|
||||
Transpositions += !(PM.get(PatternWord, T_first[static_cast<ptrdiff_t>(countr_zero(T_flag))]) &
|
||||
PatternFlagMask);
|
||||
|
||||
T_flag = blsr(T_flag);
|
||||
P_flag ^= PatternFlagMask;
|
||||
|
||||
FlaggedChars--;
|
||||
}
|
||||
}
|
||||
|
||||
return Transpositions;
|
||||
}
|
||||
|
||||
// todo cleanup the split between jaro_bounds
|
||||
/**
|
||||
* @brief find bounds
|
||||
*/
|
||||
static inline size_t jaro_bounds(size_t P_len, size_t T_len)
|
||||
{
|
||||
    /* since jaro uses a sliding window some parts of T/P might never be in
     * range and can be removed ahead of time
     */
|
||||
size_t Bound = (T_len > P_len) ? T_len : P_len;
|
||||
Bound /= 2;
|
||||
if (Bound > 0) Bound--;
|
||||
|
||||
return Bound;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief find bounds and skip out of bound parts of the sequences
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static inline size_t jaro_bounds(Range<InputIt1>& P, Range<InputIt2>& T)
|
||||
{
|
||||
size_t P_len = P.size();
|
||||
size_t T_len = T.size();
|
||||
|
||||
// this is currently an early exit condition
|
||||
// if this is changed handle this below, so Bound is never below 0
|
||||
assert(P_len != 0 || T_len != 0);
|
||||
|
||||
    /* since jaro uses a sliding window some parts of T/P might never be in
     * range and can be removed ahead of time
     */
|
||||
size_t Bound = 0;
|
||||
if (T_len > P_len) {
|
||||
Bound = T_len / 2 - 1;
|
||||
if (T_len > P_len + Bound) T.remove_suffix(T_len - (P_len + Bound));
|
||||
}
|
||||
else {
|
||||
Bound = P_len / 2 - 1;
|
||||
if (P_len > T_len + Bound) P.remove_suffix(P_len - (T_len + Bound));
|
||||
}
|
||||
return Bound;
|
||||
}
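A worked instance of the trimming above (added note, not part of the vendored file):

// P_len = 12, T_len = 5: Bound = 12 / 2 - 1 = 5
// P_len > T_len + Bound (12 > 10), so the last 2 characters of P are dropped;
// they lie outside the matching window of every character of T and can never
// contribute to CommonChars.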
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static inline double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff)
|
||||
{
|
||||
size_t P_len = P.size();
|
||||
size_t T_len = T.size();
|
||||
|
||||
if (score_cutoff > 1.0) return 0.0;
|
||||
|
||||
if (!P_len && !T_len) return 1.0;
|
||||
|
||||
/* filter out based on the length difference between the two strings */
|
||||
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;
|
||||
|
||||
if (P_len == 1 && T_len == 1) return static_cast<double>(P.front() == T.front());
|
||||
|
||||
size_t Bound = jaro_bounds(P, T);
|
||||
|
||||
/* common prefix never includes Transpositions */
|
||||
size_t CommonChars = remove_common_prefix(P, T);
|
||||
size_t Transpositions = 0;
|
||||
|
||||
if (P.empty() || T.empty()) {
|
||||
/* already has correct number of common chars and transpositions */
|
||||
}
|
||||
else if (P.size() <= 64 && T.size() <= 64) {
|
||||
PatternMatchVector PM(P);
|
||||
auto flagged = flag_similar_characters_word(PM, P, T, Bound);
|
||||
CommonChars += count_common_chars(flagged);
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) return 0.0;
|
||||
|
||||
Transpositions = count_transpositions_word(PM, T, flagged);
|
||||
}
|
||||
else {
|
||||
BlockPatternMatchVector PM(P);
|
||||
auto flagged = flag_similar_characters_block(PM, P, T, Bound);
|
||||
size_t FlaggedChars = count_common_chars(flagged);
|
||||
CommonChars += FlaggedChars;
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) return 0.0;
|
||||
|
||||
Transpositions = count_transpositions_block(PM, T, flagged, FlaggedChars);
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
|
||||
return (Sim >= score_cutoff) ? Sim : 0;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static inline double jaro_similarity(const BlockPatternMatchVector& PM, Range<InputIt1> P, Range<InputIt2> T,
|
||||
double score_cutoff)
|
||||
{
|
||||
size_t P_len = P.size();
|
||||
size_t T_len = T.size();
|
||||
|
||||
if (score_cutoff > 1.0) return 0.0;
|
||||
|
||||
if (!P_len && !T_len) return 1.0;
|
||||
|
||||
/* filter out based on the length difference between the two strings */
|
||||
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;
|
||||
|
||||
if (P_len == 1 && T_len == 1) return static_cast<double>(P[0] == T[0]);
|
||||
|
||||
size_t Bound = jaro_bounds(P, T);
|
||||
|
||||
/* common prefix never includes Transpositions */
|
||||
size_t CommonChars = 0;
|
||||
size_t Transpositions = 0;
|
||||
|
||||
if (P.empty() || T.empty()) {
|
||||
/* already has correct number of common chars and transpositions */
|
||||
}
|
||||
else if (P.size() <= 64 && T.size() <= 64) {
|
||||
auto flagged = flag_similar_characters_word(PM, P, T, Bound);
|
||||
CommonChars += count_common_chars(flagged);
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) return 0.0;
|
||||
|
||||
Transpositions = count_transpositions_word(PM, T, flagged);
|
||||
}
|
||||
else {
|
||||
auto flagged = flag_similar_characters_block(PM, P, T, Bound);
|
||||
size_t FlaggedChars = count_common_chars(flagged);
|
||||
CommonChars += FlaggedChars;
|
||||
|
||||
if (!jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff)) return 0.0;
|
||||
|
||||
Transpositions = count_transpositions_block(PM, T, flagged, FlaggedChars);
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(P_len, T_len, CommonChars, Transpositions);
|
||||
return (Sim >= score_cutoff) ? Sim : 0;
|
||||
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
|
||||
template <typename VecType>
|
||||
struct JaroSimilaritySimdBounds {
|
||||
size_t maxBound = 0;
|
||||
VecType boundMaskSize;
|
||||
VecType boundMask;
|
||||
};
|
||||
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
static inline auto jaro_similarity_prepare_bound_short_s2(const VecType* s1_lengths, Range<InputIt>& s2)
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
|
||||
[[maybe_unused]] static constexpr size_t alignment = native_simd<VecType>::alignment;
|
||||
static constexpr size_t vec_width = native_simd<VecType>::size;
|
||||
assert(s2.size() <= sizeof(VecType) * 8);
|
||||
|
||||
JaroSimilaritySimdBounds<native_simd<VecType>> bounds;
|
||||
|
||||
VecType maxLen = 0;
|
||||
    // todo: permute + max to find maxLen
    // side-note: we know only the first 8 bits are actually used
|
||||
for (size_t i = 0; i < vec_width; ++i)
|
||||
if (s1_lengths[i] > maxLen) maxLen = s1_lengths[i];
|
||||
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
native_simd<VecType> zero(VecType(0));
|
||||
native_simd<VecType> one(1);
|
||||
|
||||
native_simd<VecType> s1_lengths_simd(reinterpret_cast<const uint64_t*>(s1_lengths));
|
||||
native_simd<VecType> s2_length_simd(static_cast<VecType>(s2.size()));
|
||||
|
||||
// we always know that the number does not exceed 64, so we can operate on smaller vectors if this
|
||||
// proves to be faster
|
||||
native_simd<VecType> boundSizes = max8(s1_lengths_simd, s2_length_simd) >> 1; // divide by two
|
||||
    // todo: there could be faster options since comparisons can be relatively expensive for some vector sizes
|
||||
boundSizes -= (boundSizes > zero) & one;
|
||||
|
||||
    // this can never overflow even when using larger vectors for shifting here, since in the worst case
    // of 8 bit vectors this shifts by (8/2-1)*2 = 6 bits.
    // todo: << 1 performs unneeded masking here. sllv is pretty expensive for 8 / 16 bit since it has to
    // be emulated; maybe there is a better solution.
|
||||
bounds.boundMaskSize = sllv(one, boundSizes << 1) - one;
|
||||
bounds.boundMask = sllv(one, boundSizes + one) - one;
|
||||
|
||||
bounds.maxBound = (s2.size() > maxLen) ? s2.size() : maxLen;
|
||||
bounds.maxBound /= 2;
|
||||
if (bounds.maxBound > 0) bounds.maxBound--;
|
||||
# else
|
||||
alignas(alignment) std::array<VecType, vec_width> boundMaskSize_;
|
||||
alignas(alignment) std::array<VecType, vec_width> boundMask_;
|
||||
|
||||
// todo try to find a simd implementation for sse2
|
||||
for (size_t i = 0; i < vec_width; ++i) {
|
||||
size_t Bound = jaro_bounds(static_cast<size_t>(s1_lengths[i]), s2.size());
|
||||
|
||||
if (Bound > bounds.maxBound) bounds.maxBound = Bound;
|
||||
|
||||
boundMaskSize_[i] = bit_mask_lsb<VecType>(2 * Bound);
|
||||
boundMask_[i] = bit_mask_lsb<VecType>(Bound + 1);
|
||||
}
|
||||
|
||||
bounds.boundMaskSize = native_simd<VecType>(reinterpret_cast<uint64_t*>(boundMaskSize_.data()));
|
||||
bounds.boundMask = native_simd<VecType>(reinterpret_cast<uint64_t*>(boundMask_.data()));
|
||||
# endif
|
||||
|
||||
size_t lastRelevantChar = static_cast<size_t>(maxLen) + bounds.maxBound;
|
||||
if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);
|
||||
|
||||
return bounds;
|
||||
}
|
||||
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
static inline auto jaro_similarity_prepare_bound_long_s2(const VecType* s1_lengths, Range<InputIt>& s2)
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
|
||||
static constexpr size_t vec_width = native_simd<VecType>::size;
|
||||
assert(s2.size() > sizeof(VecType) * 8);
|
||||
|
||||
JaroSimilaritySimdBounds<native_simd<VecType>> bounds;
|
||||
|
||||
VecType maxLen = 0;
|
||||
    // todo: permute + max to find maxLen
    // side-note: we know only the first 8 bits are actually used
|
||||
for (size_t i = 0; i < vec_width; ++i)
|
||||
if (s1_lengths[i] > maxLen) maxLen = s1_lengths[i];
|
||||
|
||||
bounds.maxBound = s2.size() / 2 - 1;
|
||||
bounds.boundMaskSize = native_simd<VecType>(bit_mask_lsb<VecType>(2 * bounds.maxBound));
|
||||
bounds.boundMask = native_simd<VecType>(bit_mask_lsb<VecType>(bounds.maxBound + 1));
|
||||
|
||||
size_t lastRelevantChar = static_cast<size_t>(maxLen) + bounds.maxBound;
|
||||
if (s2.size() > lastRelevantChar) s2.remove_suffix(s2.size() - lastRelevantChar);
|
||||
|
||||
return bounds;
|
||||
}
|
||||
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
static inline void
|
||||
jaro_similarity_simd_long_s2(Range<double*> scores, const detail::BlockPatternMatchVector& block,
|
||||
VecType* s1_lengths, Range<InputIt> s2, double score_cutoff) noexcept
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
|
||||
static constexpr size_t alignment = native_simd<VecType>::alignment;
|
||||
static constexpr size_t vec_width = native_simd<VecType>::size;
|
||||
static constexpr size_t vecs = native_simd<uint64_t>::size;
|
||||
assert(block.size() % vecs == 0);
|
||||
assert(s2.size() > sizeof(VecType) * 8);
|
||||
|
||||
struct AlignedAlloc {
|
||||
AlignedAlloc(size_t size) : memory(rf_aligned_alloc(native_simd<VecType>::alignment, size))
|
||||
{}
|
||||
|
||||
~AlignedAlloc()
|
||||
{
|
||||
rf_aligned_free(memory);
|
||||
}
|
||||
|
||||
void* memory = nullptr;
|
||||
};
|
||||
|
||||
native_simd<VecType> zero(VecType(0));
|
||||
native_simd<VecType> one(1);
|
||||
size_t result_index = 0;
|
||||
|
||||
size_t s2_block_count = detail::ceil_div(s2.size(), sizeof(VecType) * 8);
|
||||
AlignedAlloc memory(2 * s2_block_count * sizeof(native_simd<VecType>));
|
||||
|
||||
native_simd<VecType>* T_flag = static_cast<native_simd<VecType>*>(memory.memory);
|
||||
// reuse the same memory since counter is only required in the first half of the algorithm while
|
||||
// T_flags is required in the second half
|
||||
native_simd<VecType>* counter = static_cast<native_simd<VecType>*>(memory.memory) + s2_block_count;
|
||||
VecType* T_flags = static_cast<VecType*>(memory.memory) + s2_block_count * vec_width;
|
||||
|
||||
for (size_t cur_vec = 0; cur_vec < block.size(); cur_vec += vecs) {
|
||||
auto s2_cur = s2;
|
||||
auto bounds = jaro_similarity_prepare_bound_long_s2(s1_lengths + result_index, s2_cur);
|
||||
|
||||
native_simd<VecType> P_flag(VecType(0));
|
||||
|
||||
std::fill(T_flag, T_flag + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
|
||||
native_simd<VecType>(VecType(0)));
|
||||
std::fill(counter, counter + detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8),
|
||||
native_simd<VecType>(VecType(1)));
|
||||
|
||||
        // In case s2 is longer than all of the elements in s1_lengths, boundMaskSize
        // might have all bits set and therefore the condition ((boundMask <= boundMaskSize) & one)
        // would incorrectly always set the first bit to 1.
        // This is solved by splitting the loop into two parts: after this boundary is reached,
        // the first bit inside boundMask is no longer set.
|
||||
size_t j = 0;
|
||||
for (; j < std::min(bounds.maxBound, s2_cur.size()); ++j) {
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
|
||||
native_simd<VecType> X(stored.data());
|
||||
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);
|
||||
|
||||
P_flag |= blsi(PM_j);
|
||||
size_t T_word_index = j / (sizeof(VecType) * 8);
|
||||
T_flag[T_word_index] |= andnot(counter[T_word_index], (PM_j == zero));
|
||||
|
||||
counter[T_word_index] = counter[T_word_index] << 1;
|
||||
bounds.boundMask = (bounds.boundMask << 1) | ((bounds.boundMask <= bounds.boundMaskSize) & one);
|
||||
}
|
||||
|
||||
for (; j < s2_cur.size(); ++j) {
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
|
||||
native_simd<VecType> X(stored.data());
|
||||
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);
|
||||
|
||||
P_flag |= blsi(PM_j);
|
||||
size_t T_word_index = j / (sizeof(VecType) * 8);
|
||||
T_flag[T_word_index] |= andnot(counter[T_word_index], (PM_j == zero));
|
||||
|
||||
counter[T_word_index] = counter[T_word_index] << 1;
|
||||
bounds.boundMask = bounds.boundMask << 1;
|
||||
}
|
||||
|
||||
auto counts = popcount(P_flag);
|
||||
alignas(alignment) std::array<VecType, vec_width> P_flags;
|
||||
P_flag.store(P_flags.data());
|
||||
|
||||
for (size_t i = 0; i < detail::ceil_div(s2_cur.size(), sizeof(VecType) * 8); ++i)
|
||||
T_flag[i].store(T_flags + i * vec_width);
|
||||
|
||||
for (size_t i = 0; i < vec_width; ++i) {
|
||||
size_t CommonChars = static_cast<size_t>(counts[i]);
|
||||
if (!jaro_common_char_filter(static_cast<size_t>(s1_lengths[result_index]), s2.size(),
|
||||
CommonChars, score_cutoff))
|
||||
{
|
||||
scores[result_index] = 0.0;
|
||||
result_index++;
|
||||
continue;
|
||||
}
|
||||
|
||||
VecType P_flag_cur = P_flags[i];
|
||||
size_t Transpositions = 0;
|
||||
|
||||
static constexpr size_t vecs_per_word = vec_width / vecs;
|
||||
size_t cur_block = i / vecs_per_word;
|
||||
size_t offset = sizeof(VecType) * 8 * (i % vecs_per_word);
|
||||
|
||||
{
|
||||
size_t T_word_index = 0;
|
||||
VecType T_flag_cur = T_flags[T_word_index * vec_width + i];
|
||||
while (P_flag_cur) {
|
||||
while (!T_flag_cur) {
|
||||
++T_word_index;
|
||||
T_flag_cur = T_flags[T_word_index * vec_width + i];
|
||||
}
|
||||
|
||||
VecType PatternFlagMask = blsi(P_flag_cur);
|
||||
|
||||
uint64_t PM_j =
|
||||
block.get(cur_vec + cur_block,
|
||||
s2[countr_zero(T_flag_cur) + T_word_index * sizeof(VecType) * 8]);
|
||||
Transpositions += !(PM_j & (static_cast<uint64_t>(PatternFlagMask) << offset));
|
||||
|
||||
T_flag_cur = blsr(T_flag_cur);
|
||||
P_flag_cur ^= PatternFlagMask;
|
||||
}
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(static_cast<size_t>(s1_lengths[result_index]), s2.size(),
|
||||
CommonChars, Transpositions);
|
||||
|
||||
scores[result_index] = (Sim >= score_cutoff) ? Sim : 0;
|
||||
result_index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
static inline void
|
||||
jaro_similarity_simd_short_s2(Range<double*> scores, const detail::BlockPatternMatchVector& block,
|
||||
VecType* s1_lengths, Range<InputIt> s2, double score_cutoff) noexcept
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
|
||||
static constexpr size_t alignment = native_simd<VecType>::alignment;
|
||||
static constexpr size_t vec_width = native_simd<VecType>::size;
|
||||
static constexpr size_t vecs = native_simd<uint64_t>::size;
|
||||
assert(block.size() % vecs == 0);
|
||||
assert(s2.size() <= sizeof(VecType) * 8);
|
||||
|
||||
native_simd<VecType> zero(VecType(0));
|
||||
native_simd<VecType> one(1);
|
||||
size_t result_index = 0;
|
||||
|
||||
for (size_t cur_vec = 0; cur_vec < block.size(); cur_vec += vecs) {
|
||||
auto s2_cur = s2;
|
||||
auto bounds = jaro_similarity_prepare_bound_short_s2(s1_lengths + result_index, s2_cur);
|
||||
|
||||
native_simd<VecType> P_flag(VecType(0));
|
||||
native_simd<VecType> T_flag(VecType(0));
|
||||
native_simd<VecType> counter(VecType(1));
|
||||
|
||||
        // In case s2 is longer than all of the elements in s1_lengths, boundMaskSize
        // might have all bits set and therefore the condition ((boundMask <= boundMaskSize) & one)
        // would incorrectly always set the first bit to 1.
        // This is solved by splitting the loop into two parts: after this boundary is reached,
        // the first bit inside boundMask is no longer set.
|
||||
size_t j = 0;
|
||||
for (; j < std::min(bounds.maxBound, s2_cur.size()); ++j) {
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
|
||||
native_simd<VecType> X(stored.data());
|
||||
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);
|
||||
|
||||
P_flag |= blsi(PM_j);
|
||||
T_flag |= andnot(counter, (PM_j == zero));
|
||||
|
||||
counter = counter << 1;
|
||||
bounds.boundMask = (bounds.boundMask << 1) | ((bounds.boundMask <= bounds.boundMaskSize) & one);
|
||||
}
|
||||
|
||||
for (; j < s2_cur.size(); ++j) {
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
|
||||
native_simd<VecType> X(stored.data());
|
||||
native_simd<VecType> PM_j = andnot(X & bounds.boundMask, P_flag);
|
||||
|
||||
P_flag |= blsi(PM_j);
|
||||
T_flag |= andnot(counter, (PM_j == zero));
|
||||
|
||||
counter = counter << 1;
|
||||
bounds.boundMask = bounds.boundMask << 1;
|
||||
}
|
||||
|
||||
auto counts = popcount(P_flag);
|
||||
alignas(alignment) std::array<VecType, vec_width> P_flags;
|
||||
P_flag.store(P_flags.data());
|
||||
alignas(alignment) std::array<VecType, vec_width> T_flags;
|
||||
T_flag.store(T_flags.data());
|
||||
for (size_t i = 0; i < vec_width; ++i) {
|
||||
size_t CommonChars = static_cast<size_t>(counts[i]);
|
||||
if (!jaro_common_char_filter(static_cast<size_t>(s1_lengths[result_index]), s2.size(),
|
||||
CommonChars, score_cutoff))
|
||||
{
|
||||
scores[result_index] = 0.0;
|
||||
result_index++;
|
||||
continue;
|
||||
}
|
||||
|
||||
VecType P_flag_cur = P_flags[i];
|
||||
VecType T_flag_cur = T_flags[i];
|
||||
size_t Transpositions = 0;
|
||||
|
||||
static constexpr size_t vecs_per_word = vec_width / vecs;
|
||||
size_t cur_block = i / vecs_per_word;
|
||||
size_t offset = sizeof(VecType) * 8 * (i % vecs_per_word);
|
||||
while (P_flag_cur) {
|
||||
VecType PatternFlagMask = blsi(P_flag_cur);
|
||||
|
||||
uint64_t PM_j = block.get(cur_vec + cur_block, s2[countr_zero(T_flag_cur)]);
|
||||
Transpositions += !(PM_j & (static_cast<uint64_t>(PatternFlagMask) << offset));
|
||||
|
||||
T_flag_cur = blsr(T_flag_cur);
|
||||
P_flag_cur ^= PatternFlagMask;
|
||||
}
|
||||
|
||||
double Sim = jaro_calculate_similarity(static_cast<size_t>(s1_lengths[result_index]), s2.size(),
|
||||
CommonChars, Transpositions);
|
||||
|
||||
scores[result_index] = (Sim >= score_cutoff) ? Sim : 0;
|
||||
result_index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
static inline void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatchVector& block,
|
||||
VecType* s1_lengths, size_t s1_lengths_size, const Range<InputIt>& s2,
|
||||
double score_cutoff) noexcept
|
||||
{
|
||||
if (score_cutoff > 1.0) {
|
||||
for (size_t i = 0; i < s1_lengths_size; i++)
|
||||
scores[i] = 0.0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (s2.empty()) {
|
||||
for (size_t i = 0; i < s1_lengths_size; i++)
|
||||
scores[i] = s1_lengths[i] ? 0.0 : 1.0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (s2.size() > sizeof(VecType) * 8)
|
||||
return jaro_similarity_simd_long_s2(scores, block, s1_lengths, s2, score_cutoff);
|
||||
else
|
||||
return jaro_similarity_simd_short_s2(scores, block, s1_lengths, s2, score_cutoff);
|
||||
}
|
||||
|
||||
#endif /* RAPIDFUZZ_SIMD */
|
||||
|
||||
class Jaro : public SimilarityBase<Jaro, double, 0, 1> {
|
||||
friend SimilarityBase<Jaro, double, 0, 1>;
|
||||
friend NormalizedMetricBase<Jaro>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static double maximum(const Range<InputIt1>&, const Range<InputIt2>&) noexcept
|
||||
{
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static double _similarity(const Range<InputIt1>& s1, const Range<InputIt2>& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint)
|
||||
{
|
||||
return jaro_similarity(s1, s2, score_cutoff);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
235
src/external/rapidfuzz-cpp/rapidfuzz/distance/LCSseq.hpp
vendored
Normal file
@@ -0,0 +1,235 @@
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/distance/LCSseq_impl.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t lcs_seq_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::LCSseq::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t lcs_seq_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::LCSseq::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t lcs_seq_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::LCSseq::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t lcs_seq_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::LCSseq::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double lcs_seq_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::LCSseq::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double lcs_seq_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::LCSseq::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double lcs_seq_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::LCSseq::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double lcs_seq_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::LCSseq::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops lcs_seq_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
return detail::lcs_seq_editops(detail::Range(first1, last1), detail::Range(first2, last2));
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
Editops lcs_seq_editops(const Sentence1& s1, const Sentence2& s2)
|
||||
{
|
||||
return detail::lcs_seq_editops(detail::Range(s1), detail::Range(s2));
|
||||
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiLCSseq : public detail::MultiSimilarityBase<MultiLCSseq<MaxLen>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
private:
|
||||
friend detail::MultiSimilarityBase<MultiLCSseq<MaxLen>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::MultiNormalizedMetricBase<MultiLCSseq<MaxLen>, size_t>;
|
||||
|
||||
constexpr static size_t get_vec_size()
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace detail::simd_avx2;
|
||||
# else
|
||||
using namespace detail::simd_sse2;
|
||||
# endif
|
||||
if constexpr (MaxLen <= 8)
|
||||
return native_simd<uint8_t>::size;
|
||||
else if constexpr (MaxLen <= 16)
|
||||
return native_simd<uint16_t>::size;
|
||||
else if constexpr (MaxLen <= 32)
|
||||
return native_simd<uint32_t>::size;
|
||||
else if constexpr (MaxLen <= 64)
|
||||
return native_simd<uint64_t>::size;
|
||||
|
||||
static_assert(MaxLen <= 64);
|
||||
}
|
||||
|
||||
constexpr static size_t find_block_count(size_t count)
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(count, vec_size);
|
||||
return detail::ceil_div(simd_vec_count * vec_size * MaxLen, 64);
|
||||
}
|
||||
|
||||
public:
|
||||
MultiLCSseq(size_t count) : input_count(count), pos(0), PM(find_block_count(count) * 64)
|
||||
{
|
||||
str_lens.resize(result_count());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief get minimum size required for result vectors passed into
|
||||
* - distance
|
||||
* - similarity
|
||||
* - normalized_distance
|
||||
* - normalized_similarity
|
||||
*
|
||||
* @return minimum vector size
|
||||
*/
|
||||
size_t result_count() const
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(input_count, vec_size);
|
||||
return simd_vec_count * vec_size;
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
auto len = std::distance(first1, last1);
|
||||
int block_pos = static_cast<int>((pos * MaxLen) % 64);
|
||||
auto block = (pos * MaxLen) / 64;
|
||||
assert(len <= MaxLen);
|
||||
|
||||
if (pos >= input_count) throw std::invalid_argument("out of bounds insert");
|
||||
|
||||
str_lens[pos] = static_cast<size_t>(len);
|
||||
|
||||
for (; first1 != last1; ++first1) {
|
||||
PM.insert(block, *first1, block_pos);
|
||||
block_pos++;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename InputIt2>
|
||||
void _similarity(size_t* scores, size_t score_count, const detail::Range<InputIt2>& s2,
|
||||
size_t score_cutoff = 0) const
|
||||
{
|
||||
if (score_count < result_count())
|
||||
throw std::invalid_argument("scores has to have >= result_count() elements");
|
||||
|
||||
detail::Range scores_(scores, scores + score_count);
|
||||
if constexpr (MaxLen == 8)
|
||||
detail::lcs_simd<uint8_t>(scores_, PM, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 16)
|
||||
detail::lcs_simd<uint16_t>(scores_, PM, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 32)
|
||||
detail::lcs_simd<uint32_t>(scores_, PM, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 64)
|
||||
detail::lcs_simd<uint64_t>(scores_, PM, s2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(size_t s1_idx, const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(str_lens[s1_idx], s2.size());
|
||||
}
|
||||
|
||||
size_t get_input_count() const noexcept
|
||||
{
|
||||
return input_count;
|
||||
}
|
||||
|
||||
size_t input_count;
|
||||
size_t pos;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
std::vector<size_t> str_lens;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedLCSseq
|
||||
: detail::CachedSimilarityBase<CachedLCSseq<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedLCSseq(const Sentence1& s1_) : CachedLCSseq(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedLCSseq(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(detail::Range(first1, last1))
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedSimilarityBase<CachedLCSseq<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedLCSseq<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _similarity(const detail::Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
return detail::lcs_seq_similarity(PM, detail::Range(s1), s2, score_cutoff);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedLCSseq(const Sentence1& s1_) -> CachedLCSseq<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedLCSseq(InputIt1 first1, InputIt1 last1) -> CachedLCSseq<iter_value_t<InputIt1>>;
|
||||
|
||||
} // namespace rapidfuzz
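A usage sketch for the LCS API declared above (not part of the vendored file):

// usage sketch, not part of LCSseq.hpp
#include <rapidfuzz/distance/LCSseq.hpp>
#include <string>

size_t shared_subsequence_length(const std::string& a, const std::string& b)
{
    // length of the longest common subsequence; score_cutoff defaults to 0 (no filtering)
    return rapidfuzz::lcs_seq_similarity(a, b);
}

double shared_subsequence_ratio(const std::string& a, const std::string& b)
{
    // the same measure scaled into [0, 1] by max(|a|, |b|)
    return rapidfuzz::lcs_seq_normalized_similarity(a, b);
}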
|
||||
529
src/external/rapidfuzz-cpp/rapidfuzz/distance/LCSseq_impl.hpp
vendored
Normal file
@@ -0,0 +1,529 @@
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#include <limits>
|
||||
#include <rapidfuzz/details/Matrix.hpp>
|
||||
#include <rapidfuzz/details/PatternMatchVector.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
#include <rapidfuzz/details/intrinsics.hpp>
|
||||
#include <rapidfuzz/details/simd.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <rapidfuzz/details/types.hpp>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
template <bool RecordMatrix>
|
||||
struct LCSseqResult;
|
||||
|
||||
template <>
|
||||
struct LCSseqResult<true> {
|
||||
ShiftedBitMatrix<uint64_t> S;
|
||||
|
||||
size_t sim;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct LCSseqResult<false> {
|
||||
size_t sim;
|
||||
};
|
||||
|
||||
/*
 * An encoded mbleven model table.
 *
 * Each 8-bit integer represents an edit sequence, using two
 * bits per operation.
 *
 * Each row of 8 integers represents all possible combinations
 * of edit sequences for a given maximum edit distance and length
 * difference between the two strings that are below the maximum
 * edit distance.
 *
 * 0x1 = 01 = DELETE,
 * 0x2 = 10 = INSERT
 *
 * 0x5 -> DEL + DEL
 * 0x6 -> DEL + INS
 * 0x9 -> INS + DEL
 * 0xA -> INS + INS
 */
|
||||
static constexpr std::array<std::array<uint8_t, 6>, 14> lcs_seq_mbleven2018_matrix = {{
|
||||
/* max edit distance 1 */
|
||||
    {0}, /* case does not occur */ /* len_diff 0 */
|
||||
{0x01}, /* len_diff 1 */
|
||||
/* max edit distance 2 */
|
||||
{0x09, 0x06}, /* len_diff 0 */
|
||||
{0x01}, /* len_diff 1 */
|
||||
{0x05}, /* len_diff 2 */
|
||||
/* max edit distance 3 */
|
||||
{0x09, 0x06}, /* len_diff 0 */
|
||||
{0x25, 0x19, 0x16}, /* len_diff 1 */
|
||||
{0x05}, /* len_diff 2 */
|
||||
{0x15}, /* len_diff 3 */
|
||||
/* max edit distance 4 */
|
||||
{0x96, 0x66, 0x5A, 0x99, 0x69, 0xA5}, /* len_diff 0 */
|
||||
{0x25, 0x19, 0x16}, /* len_diff 1 */
|
||||
{0x65, 0x56, 0x95, 0x59}, /* len_diff 2 */
|
||||
{0x15}, /* len_diff 3 */
|
||||
{0x55}, /* len_diff 4 */
|
||||
}};
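A decoding note for the table above (added, not part of the vendored file), matching how lcs_seq_mbleven2018 below consumes the entries:

// On every mismatch the two lowest bits of `ops` are inspected: bit 0 advances s1
// (a deletion), bit 1 advances s2 (an insertion), then `ops >>= 2` moves on to the
// next encoded operation.
// e.g. 0x5 = 0b0101 -> two deletions, 0xA = 0b1010 -> two insertions.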

template <typename InputIt1, typename InputIt2>
size_t lcs_seq_mbleven2018(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff)
{
    auto len1 = s1.size();
    auto len2 = s2.size();
    assert(len1 != 0);
    assert(len2 != 0);

    if (len1 < len2) return lcs_seq_mbleven2018(s2, s1, score_cutoff);

    auto len_diff = len1 - len2;
    size_t max_misses = len1 + len2 - 2 * score_cutoff;
    size_t ops_index = (max_misses + max_misses * max_misses) / 2 + len_diff - 1;
    auto& possible_ops = lcs_seq_mbleven2018_matrix[ops_index];
    size_t max_len = 0;

    for (uint8_t ops : possible_ops) {
        auto iter_s1 = s1.begin();
        auto iter_s2 = s2.begin();
        size_t cur_len = 0;

        if (!ops) break;

        while (iter_s1 != s1.end() && iter_s2 != s2.end()) {
            if (*iter_s1 != *iter_s2) {
                if (!ops) break;
                if (ops & 1)
                    iter_s1++;
                else if (ops & 2)
                    iter_s2++;
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ < 10
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wconversion"
#endif
                ops >>= 2;
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ < 10
#    pragma GCC diagnostic pop
#endif
            }
            else {
                cur_len++;
                iter_s1++;
                iter_s2++;
            }
        }

        max_len = std::max(max_len, cur_len);
    }

    return (max_len >= score_cutoff) ? max_len : 0;
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
void lcs_simd(Range<size_t*> scores, const BlockPatternMatchVector& block, const Range<InputIt>& s2,
|
||||
size_t score_cutoff) noexcept
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
auto score_iter = scores.begin();
|
||||
static constexpr size_t alignment = native_simd<VecType>::alignment;
|
||||
static constexpr size_t vecs = native_simd<uint64_t>::size;
|
||||
assert(block.size() % vecs == 0);
|
||||
|
||||
static constexpr size_t interleaveCount = 3;
|
||||
|
||||
size_t cur_vec = 0;
|
||||
for (; cur_vec + interleaveCount * vecs <= block.size(); cur_vec += interleaveCount * vecs) {
|
||||
std::array<native_simd<VecType>, interleaveCount> S;
|
||||
unroll<int, interleaveCount>([&](auto j) { S[j] = static_cast<VecType>(-1); });
|
||||
|
||||
for (const auto& ch : s2) {
|
||||
unroll<int, interleaveCount>([&](auto j) {
|
||||
alignas(32) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + j * vecs + i, ch); });
|
||||
|
||||
native_simd<VecType> Matches(stored.data());
|
||||
native_simd<VecType> u = S[j] & Matches;
|
||||
S[j] = (S[j] + u) | (S[j] - u);
|
||||
});
|
||||
}
|
||||
|
||||
unroll<int, interleaveCount>([&](auto j) {
|
||||
auto counts = popcount(~S[j]);
|
||||
unroll<int, counts.size()>([&](auto i) {
|
||||
*score_iter = (counts[i] >= score_cutoff) ? static_cast<size_t>(counts[i]) : 0;
|
||||
score_iter++;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
for (; cur_vec < block.size(); cur_vec += vecs) {
|
||||
native_simd<VecType> S = static_cast<VecType>(-1);
|
||||
|
||||
for (const auto& ch : s2) {
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
|
||||
|
||||
native_simd<VecType> Matches(stored.data());
|
||||
native_simd<VecType> u = S & Matches;
|
||||
S = (S + u) | (S - u);
|
||||
}
|
||||
|
||||
auto counts = popcount(~S);
|
||||
unroll<int, counts.size()>([&](auto i) {
|
||||
*score_iter = (counts[i] >= score_cutoff) ? static_cast<size_t>(counts[i]) : 0;
|
||||
score_iter++;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <size_t N, bool RecordMatrix, typename PMV, typename InputIt1, typename InputIt2>
|
||||
auto lcs_unroll(const PMV& block, const Range<InputIt1>&, const Range<InputIt2>& s2,
|
||||
size_t score_cutoff = 0) -> LCSseqResult<RecordMatrix>
|
||||
{
|
||||
uint64_t S[N];
|
||||
unroll<size_t, N>([&](size_t i) { S[i] = ~UINT64_C(0); });
|
||||
|
||||
LCSseqResult<RecordMatrix> res;
|
||||
if constexpr (RecordMatrix) res.S = ShiftedBitMatrix<uint64_t>(s2.size(), N, ~UINT64_C(0));
|
||||
|
||||
auto iter_s2 = s2.begin();
|
||||
for (size_t i = 0; i < s2.size(); ++i) {
|
||||
uint64_t carry = 0;
|
||||
|
||||
static constexpr size_t unroll_factor = 3;
|
||||
for (unsigned int j = 0; j < N / unroll_factor; ++j) {
|
||||
unroll<size_t, unroll_factor>([&](size_t word_) {
|
||||
size_t word = word_ + j * unroll_factor;
|
||||
uint64_t Matches = block.get(word, *iter_s2);
|
||||
uint64_t u = S[word] & Matches;
|
||||
uint64_t x = addc64(S[word], u, carry, &carry);
|
||||
S[word] = x | (S[word] - u);
|
||||
|
||||
if constexpr (RecordMatrix) res.S[i][word] = S[word];
|
||||
});
|
||||
}
|
||||
|
||||
unroll<size_t, N % unroll_factor>([&](size_t word_) {
|
||||
size_t word = word_ + N / unroll_factor * unroll_factor;
|
||||
uint64_t Matches = block.get(word, *iter_s2);
|
||||
uint64_t u = S[word] & Matches;
|
||||
uint64_t x = addc64(S[word], u, carry, &carry);
|
||||
S[word] = x | (S[word] - u);
|
||||
|
||||
if constexpr (RecordMatrix) res.S[i][word] = S[word];
|
||||
});

        iter_s2++;
    }

    res.sim = 0;
    unroll<size_t, N>([&](size_t i) { res.sim += popcount(~S[i]); });

    if (res.sim < score_cutoff) res.sim = 0;

    return res;
}
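
/* Worked example (added commentary, not upstream) of the update
 * u = S & Matches; S = (S + u) | (S - u); shown on 4-bit words for brevity
 * (the real code uses 64-bit words, the high bits simply stay 1):
 * for s1 = "ab" (Matches['a'] = 0001, Matches['b'] = 0010) and s2 = "ab",
 * starting from S = 1111:
 *   'a': u = 0001, S + u = 0000, S - u = 1110  ->  S = 1110
 *   'b': u = 0010, S + u = 0000, S - u = 1100  ->  S = 1100
 * Zero bits mark characters of s1 that are part of the subsequence, so
 * res.sim = popcount(~S) = 2 = |LCS("ab", "ab")|. */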
|
||||
|
||||
/**
|
||||
* implementation is following the paper Bit-Parallel LCS-length Computation Revisited
|
||||
* from Heikki Hyyrö
|
||||
*
|
||||
* The paper refers to s1 as m and s2 as n
|
||||
*/
|
||||
template <bool RecordMatrix, typename PMV, typename InputIt1, typename InputIt2>
|
||||
auto lcs_blockwise(const PMV& PM, const Range<InputIt1>& s1, const Range<InputIt2>& s2,
|
||||
size_t score_cutoff = 0) -> LCSseqResult<RecordMatrix>
|
||||
{
|
||||
assert(score_cutoff <= s1.size());
|
||||
assert(score_cutoff <= s2.size());
|
||||
|
||||
size_t word_size = sizeof(uint64_t) * 8;
|
||||
size_t words = PM.size();
|
||||
std::vector<uint64_t> S(words, ~UINT64_C(0));
|
||||
|
||||
size_t band_width_left = s1.size() - score_cutoff;
|
||||
size_t band_width_right = s2.size() - score_cutoff;
|
||||
|
||||
LCSseqResult<RecordMatrix> res;
|
||||
if constexpr (RecordMatrix) {
|
||||
size_t full_band = band_width_left + 1 + band_width_right;
|
||||
size_t full_band_words = std::min(words, full_band / word_size + 2);
|
||||
res.S = ShiftedBitMatrix<uint64_t>(s2.size(), full_band_words, ~UINT64_C(0));
|
||||
}
|
||||
|
||||
/* first_block is the index of the first block in Ukkonen band. */
|
||||
size_t first_block = 0;
|
||||
size_t last_block = std::min(words, ceil_div(band_width_left + 1, word_size));
|
||||
|
||||
auto iter_s2 = s2.begin();
|
||||
for (size_t row = 0; row < s2.size(); ++row) {
|
||||
uint64_t carry = 0;
|
||||
|
||||
if constexpr (RecordMatrix) res.S.set_offset(row, static_cast<ptrdiff_t>(first_block * word_size));
|
||||
|
||||
for (size_t word = first_block; word < last_block; ++word) {
|
||||
const uint64_t Matches = PM.get(word, *iter_s2);
|
||||
uint64_t Stemp = S[word];
|
||||
|
||||
uint64_t u = Stemp & Matches;
|
||||
|
||||
uint64_t x = addc64(Stemp, u, carry, &carry);
|
||||
S[word] = x | (Stemp - u);
|
||||
|
||||
if constexpr (RecordMatrix) res.S[row][word - first_block] = S[word];
|
||||
}
|
||||
|
||||
if (row > band_width_right) first_block = (row - band_width_right) / word_size;
|
||||
|
||||
if (row + 1 + band_width_left <= s1.size())
|
||||
last_block = ceil_div(row + 1 + band_width_left, word_size);
|
||||
|
||||
iter_s2++;
|
||||
}
|
||||
|
||||
res.sim = 0;
|
||||
for (uint64_t Stemp : S)
|
||||
res.sim += popcount(~Stemp);
|
||||
|
||||
if (res.sim < score_cutoff) res.sim = 0;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename PMV, typename InputIt1, typename InputIt2>
|
||||
size_t longest_common_subsequence(const PMV& PM, const Range<InputIt1>& s1, const Range<InputIt2>& s2,
|
||||
size_t score_cutoff)
|
||||
{
|
||||
assert(score_cutoff <= s1.size());
|
||||
assert(score_cutoff <= s2.size());
|
||||
|
||||
size_t word_size = sizeof(uint64_t) * 8;
|
||||
size_t words = PM.size();
|
||||
size_t band_width_left = s1.size() - score_cutoff;
|
||||
size_t band_width_right = s2.size() - score_cutoff;
|
||||
size_t full_band = band_width_left + 1 + band_width_right;
|
||||
size_t full_band_words = std::min(words, full_band / word_size + 2);
|
||||
|
||||
if (full_band_words < words) return lcs_blockwise<false>(PM, s1, s2, score_cutoff).sim;
|
||||
|
||||
auto nr = ceil_div(s1.size(), 64);
|
||||
switch (nr) {
|
||||
case 0: return 0;
|
||||
case 1: return lcs_unroll<1, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 2: return lcs_unroll<2, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 3: return lcs_unroll<3, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 4: return lcs_unroll<4, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 5: return lcs_unroll<5, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 6: return lcs_unroll<6, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 7: return lcs_unroll<7, false>(PM, s1, s2, score_cutoff).sim;
|
||||
case 8: return lcs_unroll<8, false>(PM, s1, s2, score_cutoff).sim;
|
||||
default: return lcs_blockwise<false>(PM, s1, s2, score_cutoff).sim;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t longest_common_subsequence(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff)
|
||||
{
|
||||
if (s1.empty()) return 0;
|
||||
if (s1.size() <= 64) return longest_common_subsequence(PatternMatchVector(s1), s1, s2, score_cutoff);
|
||||
|
||||
return longest_common_subsequence(BlockPatternMatchVector(s1), s1, s2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t lcs_seq_similarity(const BlockPatternMatchVector& block, Range<InputIt1> s1, Range<InputIt2> s2,
|
||||
size_t score_cutoff)
|
||||
{
|
||||
auto len1 = s1.size();
|
||||
auto len2 = s2.size();
|
||||
|
||||
if (score_cutoff > len1 || score_cutoff > len2) return 0;
|
||||
|
||||
size_t max_misses = len1 + len2 - 2 * score_cutoff;
|
||||
|
||||
/* no edits are allowed */
|
||||
if (max_misses == 0 || (max_misses == 1 && len1 == len2))
|
||||
return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end()) ? len1 : 0;
|
||||
|
||||
if (max_misses < abs_diff(len1, len2)) return 0;
|
||||
|
||||
// do this first, since we can not remove any affix in encoded form
|
||||
if (max_misses >= 5) return longest_common_subsequence(block, s1, s2, score_cutoff);
|
||||
|
||||
    /* a common affix does not affect the Levenshtein distance */
|
||||
StringAffix affix = remove_common_affix(s1, s2);
|
||||
size_t lcs_sim = affix.prefix_len + affix.suffix_len;
|
||||
if (!s1.empty() && !s2.empty()) {
|
||||
size_t adjusted_cutoff = score_cutoff >= lcs_sim ? score_cutoff - lcs_sim : 0;
|
||||
lcs_sim += lcs_seq_mbleven2018(s1, s2, adjusted_cutoff);
|
||||
}
|
||||
|
||||
return (lcs_sim >= score_cutoff) ? lcs_sim : 0;
|
||||
}

template <typename InputIt1, typename InputIt2>
size_t lcs_seq_similarity(Range<InputIt1> s1, Range<InputIt2> s2, size_t score_cutoff)
{
    auto len1 = s1.size();
    auto len2 = s2.size();

    // Swapping the strings so the second string is shorter
    if (len1 < len2) return lcs_seq_similarity(s2, s1, score_cutoff);

    if (score_cutoff > len1 || score_cutoff > len2) return 0;

    size_t max_misses = len1 + len2 - 2 * score_cutoff;

    /* no edits are allowed */
    if (max_misses == 0 || (max_misses == 1 && len1 == len2))
        return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end()) ? len1 : 0;

    if (max_misses < abs_diff(len1, len2)) return 0;

    /* a common affix does not affect the Levenshtein distance */
    StringAffix affix = remove_common_affix(s1, s2);
    size_t lcs_sim = affix.prefix_len + affix.suffix_len;
    if (s1.size() && s2.size()) {
        size_t adjusted_cutoff = score_cutoff >= lcs_sim ? score_cutoff - lcs_sim : 0;
        if (max_misses < 5)
            lcs_sim += lcs_seq_mbleven2018(s1, s2, adjusted_cutoff);
        else
            lcs_sim += longest_common_subsequence(s1, s2, adjusted_cutoff);
    }

    return (lcs_sim >= score_cutoff) ? lcs_sim : 0;
}
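
/* Added commentary (not upstream): the LCS similarity relates directly to the
 * insertion/deletion-only (InDel) distance used elsewhere in this library via
 *     dist = len1 + len2 - 2 * sim.
 * E.g. for s1 = "kitten", s2 = "sitting" the longest common subsequence is
 * "ittn", so sim = 4 and the InDel distance is 6 + 7 - 2 * 4 = 5. */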
|
||||
|
||||
/**
|
||||
* @brief recover alignment from bitparallel Levenshtein matrix
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops recover_alignment(const Range<InputIt1>& s1, const Range<InputIt2>& s2,
|
||||
const LCSseqResult<true>& matrix, StringAffix affix)
|
||||
{
|
||||
size_t len1 = s1.size();
|
||||
size_t len2 = s2.size();
|
||||
size_t dist = len1 + len2 - 2 * matrix.sim;
|
||||
Editops editops(dist);
|
||||
editops.set_src_len(len1 + affix.prefix_len + affix.suffix_len);
|
||||
editops.set_dest_len(len2 + affix.prefix_len + affix.suffix_len);
|
||||
|
||||
if (dist == 0) return editops;
|
||||
|
||||
[[maybe_unused]] size_t band_width_right = s2.size() - matrix.sim;
|
||||
|
||||
auto col = len1;
|
||||
auto row = len2;
|
||||
|
||||
while (row && col) {
|
||||
/* Deletion */
|
||||
if (matrix.S.test_bit(row - 1, col - 1)) {
|
||||
assert(dist > 0);
|
||||
assert(static_cast<ptrdiff_t>(col) >=
|
||||
static_cast<ptrdiff_t>(row) - static_cast<ptrdiff_t>(band_width_right));
|
||||
dist--;
|
||||
col--;
|
||||
editops[dist].type = EditType::Delete;
|
||||
editops[dist].src_pos = col + affix.prefix_len;
|
||||
editops[dist].dest_pos = row + affix.prefix_len;
|
||||
}
|
||||
else {
|
||||
row--;
|
||||
|
||||
/* Insertion */
|
||||
if (row && !(matrix.S.test_bit(row - 1, col - 1))) {
|
||||
assert(dist > 0);
|
||||
dist--;
|
||||
editops[dist].type = EditType::Insert;
|
||||
editops[dist].src_pos = col + affix.prefix_len;
|
||||
editops[dist].dest_pos = row + affix.prefix_len;
|
||||
}
|
||||
/* Match */
|
||||
else {
|
||||
col--;
|
||||
assert(s1[col] == s2[row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (col) {
|
||||
dist--;
|
||||
col--;
|
||||
editops[dist].type = EditType::Delete;
|
||||
editops[dist].src_pos = col + affix.prefix_len;
|
||||
editops[dist].dest_pos = row + affix.prefix_len;
|
||||
}
|
||||
|
||||
while (row) {
|
||||
dist--;
|
||||
row--;
|
||||
editops[dist].type = EditType::Insert;
|
||||
editops[dist].src_pos = col + affix.prefix_len;
|
||||
editops[dist].dest_pos = row + affix.prefix_len;
|
||||
}
|
||||
|
||||
return editops;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
LCSseqResult<true> lcs_matrix(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
size_t nr = ceil_div(s1.size(), 64);
|
||||
switch (nr) {
|
||||
case 0:
|
||||
{
|
||||
LCSseqResult<true> res;
|
||||
res.sim = 0;
|
||||
return res;
|
||||
}
|
||||
case 1: return lcs_unroll<1, true>(PatternMatchVector(s1), s1, s2);
|
||||
case 2: return lcs_unroll<2, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 3: return lcs_unroll<3, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 4: return lcs_unroll<4, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 5: return lcs_unroll<5, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 6: return lcs_unroll<6, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 7: return lcs_unroll<7, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
case 8: return lcs_unroll<8, true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
default: return lcs_blockwise<true>(BlockPatternMatchVector(s1), s1, s2);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Editops lcs_seq_editops(Range<InputIt1> s1, Range<InputIt2> s2)
|
||||
{
|
||||
/* prefix and suffix are no-ops, which do not need to be added to the editops */
|
||||
StringAffix affix = remove_common_affix(s1, s2);
|
||||
|
||||
return recover_alignment(s1, s2, lcs_matrix(s1, s2), affix);
|
||||
}
|
||||
|
||||
class LCSseq : public SimilarityBase<LCSseq, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend SimilarityBase<LCSseq, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<LCSseq>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _similarity(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint)
|
||||
{
|
||||
return lcs_seq_similarity(s1, s2, score_cutoff);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
492
src/external/rapidfuzz-cpp/rapidfuzz/distance/Levenshtein.hpp
vendored
Normal file
@@ -0,0 +1,492 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#pragma once
#include <limits>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/distance/Levenshtein_impl.hpp>

namespace rapidfuzz {

/**
 * @brief Calculates the minimum number of insertions, deletions, and substitutions
 * required to change one sequence into the other according to Levenshtein with custom
 * costs for insertion, deletion and substitution
 *
 * @tparam Sentence1 This is a string that can be converted to
 * basic_string_view<char_type>
 * @tparam Sentence2 This is a string that can be converted to
 * basic_string_view<char_type>
 *
 * @param s1
 *   string to compare with s2 (for type info check Template parameters above)
 * @param s2
 *   string to compare with s1 (for type info check Template parameters above)
 * @param weights
 *   The weights for the three operations in the form
 *   (insertion, deletion, substitution). Default is {1, 1, 1},
 *   which gives all three operations a weight of 1.
 * @param max
 *   Maximum Levenshtein distance between s1 and s2, that is
 *   considered as a result. If the distance is bigger than max,
 *   max + 1 is returned instead. Default is std::numeric_limits<size_t>::max(),
 *   which deactivates this behaviour.
 *
 * @return returns the levenshtein distance between s1 and s2
 *
 * @remarks
 * @parblock
 * Depending on the input parameters different optimized implementations are used
 * to improve the performance. Worst-case performance is ``O(m * n)``.
 *
 * <b>Insertion = Deletion = Substitution:</b>
 *
 * This is known as uniform Levenshtein distance and is the distance most commonly
 * referred to as Levenshtein distance. The following implementation is used
 * with a worst-case performance of ``O([N/64]M)``.
 *
 * - if max is 0 the similarity can be calculated using a direct comparison,
 *   since no difference between the strings is allowed. The time complexity of
 *   this algorithm is ``O(N)``.
 *
 * - A common prefix/suffix of the two compared strings does not affect
 *   the Levenshtein distance, so the affix is removed before calculating the
 *   similarity.
 *
 * - If max is <= 3 the mbleven algorithm is used. This algorithm
 *   checks all possible edit operations that are possible under
 *   the threshold `max`. The time complexity of this algorithm is ``O(N)``.
 *
 * - If the length of the shorter string is <= 64 after removing the common affix
 *   Hyyrö's algorithm is used, which calculates the Levenshtein distance in
 *   parallel. The algorithm is described by @cite hyrro_2002. The time complexity of this
 *   algorithm is ``O(N)``.
 *
 * - If the length of the shorter string is >= 64 after removing the common affix
 *   a blockwise implementation of Myers' algorithm is used, which calculates
 *   the Levenshtein distance in parallel (64 characters at a time).
 *   The algorithm is described by @cite myers_1999. The time complexity of this
 *   algorithm is ``O([N/64]M)``.
 *
 *
 * <b>Insertion = Deletion, Substitution >= Insertion + Deletion:</b>
 *
 * Since every Substitution can be performed as Insertion + Deletion, this variant
 * of the Levenshtein distance only uses Insertions and Deletions. Therefore this
 * variant is often referred to as InDel-Distance. The following implementation
 * is used with a worst-case performance of ``O([N/64]M)``.
 *
 * - if max is 0 the similarity can be calculated using a direct comparison,
 *   since no difference between the strings is allowed. The time complexity of
 *   this algorithm is ``O(N)``.
 *
 * - if max is 1 and the two strings have a similar length, the similarity can be
 *   calculated using a direct comparison as well, since a substitution would cause
 *   an edit distance higher than max. The time complexity of this algorithm
 *   is ``O(N)``.
 *
 * - A common prefix/suffix of the two compared strings does not affect
 *   the Levenshtein distance, so the affix is removed before calculating the
 *   similarity.
 *
 * - If max is <= 4 the mbleven algorithm is used. This algorithm
 *   checks all possible edit operations that are possible under
 *   the threshold `max`. As a difference to the normal Levenshtein distance this
 *   algorithm can even be used up to a threshold of 4 here, since the higher weight
 *   of substitutions decreases the amount of possible edit operations.
 *   The time complexity of this algorithm is ``O(N)``.
 *
 * - If the length of the shorter string is <= 64 after removing the common affix
 *   Hyyrö's lcs algorithm is used, which calculates the InDel distance in
 *   parallel. The algorithm is described by @cite hyrro_lcs_2004 and is extended with support
 *   for UTF32 in this implementation. The time complexity of this
 *   algorithm is ``O(N)``.
 *
 * - If the length of the shorter string is >= 64 after removing the common affix
 *   a blockwise implementation of Hyyrö's lcs algorithm is used, which calculates
 *   the Levenshtein distance in parallel (64 characters at a time).
 *   The algorithm is described by @cite hyrro_lcs_2004. The time complexity of this
 *   algorithm is ``O([N/64]M)``.
 *
 * <b>Other weights:</b>
 *
 * The implementation for other weights is based on Wagner-Fischer.
 * It has a performance of ``O(N * M)`` and has a memory usage of ``O(N)``.
 * Further details can be found in @cite wagner_fischer_1974.
 * @endparblock
 *
 * @par Examples
 * @parblock
 * Find the Levenshtein distance between two strings:
 * @code{.cpp}
 * // dist is 2
 * size_t dist = levenshtein_distance("lewenstein", "levenshtein");
 * @endcode
 *
 * Setting a maximum distance allows the implementation to select
 * a more efficient implementation:
 * @code{.cpp}
 * // dist is 2
 * size_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 1}, 1);
 * @endcode
 *
 * It is possible to select different weights by passing a `weight` struct.
 * @code{.cpp}
 * // dist is 3
 * size_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 2});
 * @endcode
 * @endparblock
 */
template <typename InputIt1, typename InputIt2>
size_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                            LevenshteinWeightTable weights = {1, 1, 1},
                            size_t score_cutoff = std::numeric_limits<size_t>::max(),
                            size_t score_hint = std::numeric_limits<size_t>::max())
{
    return detail::Levenshtein::distance(first1, last1, first2, last2, weights, score_cutoff, score_hint);
}

template <typename Sentence1, typename Sentence2>
size_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
                            LevenshteinWeightTable weights = {1, 1, 1},
                            size_t score_cutoff = std::numeric_limits<size_t>::max(),
                            size_t score_hint = std::numeric_limits<size_t>::max())
{
    return detail::Levenshtein::distance(s1, s2, weights, score_cutoff, score_hint);
}
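
// Added usage sketch (illustrative, not part of the vendored header), showing
// the score_cutoff behaviour documented above:
//
//     // the actual distance is 2 (substitute 'w' -> 'v', insert 'h')
//     size_t d1 = rapidfuzz::levenshtein_distance(std::string("lewenstein"),
//                                                 std::string("levenshtein"));  // 2
//
//     // with score_cutoff = 1 the cutoff is exceeded, so score_cutoff + 1 = 2
//     // is returned as a sentinel instead of the real distance
//     size_t d2 = rapidfuzz::levenshtein_distance(std::string("lewenstein"),
//                                                 std::string("levenshtein"), {1, 1, 1}, 1);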
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, size_t score_cutoff = 0,
|
||||
size_t score_hint = 0)
|
||||
{
|
||||
return detail::Levenshtein::similarity(first1, last1, first2, last2, weights, score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t levenshtein_similarity(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, size_t score_cutoff = 0,
|
||||
size_t score_hint = 0)
|
||||
{
|
||||
return detail::Levenshtein::similarity(s1, s2, weights, score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0,
|
||||
double score_hint = 1.0)
|
||||
{
|
||||
return detail::Levenshtein::normalized_distance(first1, last1, first2, last2, weights, score_cutoff,
|
||||
score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0,
|
||||
double score_hint = 1.0)
|
||||
{
|
||||
return detail::Levenshtein::normalized_distance(s1, s2, weights, score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Calculates a normalized levenshtein distance using custom
|
||||
* costs for insertion, deletion and substitution.
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1
|
||||
* string to compare with s2 (for type info check Template parameters above)
|
||||
* @param s2
|
||||
* string to compare with s1 (for type info check Template parameters above)
|
||||
* @param weights
|
||||
* The weights for the three operations in the form
|
||||
* (insertion, deletion, substitution). Default is {1, 1, 1},
|
||||
* which gives all three operations a weight of 1.
|
||||
* @param score_cutoff
|
||||
* Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
* For ratio < score_cutoff 0 is returned instead. Default is 0,
|
||||
* which deactivates this behaviour.
|
||||
*
|
||||
* @return Normalized weighted levenshtein distance between s1 and s2
|
||||
* as a double between 0 and 1.0
|
||||
*
|
||||
* @see levenshtein()
|
||||
*
|
||||
* @remarks
|
||||
* @parblock
|
||||
* The normalization of the Levenshtein distance is performed in the following way:
|
||||
*
|
||||
* \f{align*}{
|
||||
* ratio &= \frac{distance(s1, s2)}{max_dist}
|
||||
* \f}
|
||||
* @endparblock
|
||||
*
|
||||
*
|
||||
* @par Examples
|
||||
* @parblock
|
||||
* Find the normalized Levenshtein distance between two strings:
|
||||
* @code{.cpp}
|
||||
* // ratio is 81.81818181818181
|
||||
* double ratio = normalized_levenshtein("lewenstein", "levenshtein");
|
||||
* @endcode
|
||||
*
|
||||
* Setting a score_cutoff allows the implementation to select
|
||||
* a more efficient implementation:
|
||||
* @code{.cpp}
|
||||
* // ratio is 0.0
|
||||
* double ratio = normalized_levenshtein("lewenstein", "levenshtein", {1, 1, 1}, 85.0);
|
||||
* @endcode
|
||||
*
|
||||
* It is possible to select different weights by passing a `weight` struct
|
||||
* @code{.cpp}
|
||||
* // ratio is 85.71428571428571
|
||||
* double ratio = normalized_levenshtein("lewenstein", "levenshtein", {1, 1, 2});
|
||||
* @endcode
|
||||
* @endparblock
|
||||
*/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1},
|
||||
double score_cutoff = 0.0, double score_hint = 0.0)
|
||||
{
|
||||
return detail::Levenshtein::normalized_similarity(first1, last1, first2, last2, weights, score_cutoff,
|
||||
score_hint);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1},
|
||||
double score_cutoff = 0.0, double score_hint = 0.0)
|
||||
{
|
||||
return detail::Levenshtein::normalized_similarity(s1, s2, weights, score_cutoff, score_hint);
|
||||
}

/**
 * @brief Return list of EditOp describing how to turn s1 into s2.
 *
 * @tparam Sentence1 This is a string that can be converted to
 * basic_string_view<char_type>
 * @tparam Sentence2 This is a string that can be converted to
 * basic_string_view<char_type>
 *
 * @param s1
 *   string to compare with s2 (for type info check Template parameters above)
 * @param s2
 *   string to compare with s1 (for type info check Template parameters above)
 *
 * @return Edit operations required to turn s1 into s2
 */
template <typename InputIt1, typename InputIt2>
Editops levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                            size_t score_hint = std::numeric_limits<size_t>::max())
{
    return detail::levenshtein_editops(detail::Range(first1, last1), detail::Range(first2, last2),
                                       score_hint);
}

template <typename Sentence1, typename Sentence2>
Editops levenshtein_editops(const Sentence1& s1, const Sentence2& s2,
                            size_t score_hint = std::numeric_limits<size_t>::max())
{
    return detail::levenshtein_editops(detail::Range(s1), detail::Range(s2), score_hint);
}
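
// Added usage sketch (illustrative, not part of the vendored header): the
// returned Editops behaves like a sequence of EditOp values whose fields
// (type, src_pos, dest_pos) are the ones filled in by the recovery code in the
// *_impl.hpp headers of this library.
//
//     rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(std::string("lewenstein"),
//                                                             std::string("levenshtein"));
//     for (const auto& op : ops) {
//         // op.type is an EditType such as Delete, Insert or Replace;
//         // op.src_pos indexes into s1 and op.dest_pos indexes into s2.
//     }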
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiLevenshtein : public detail::MultiDistanceBase<MultiLevenshtein<MaxLen>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
private:
|
||||
friend detail::MultiDistanceBase<MultiLevenshtein<MaxLen>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::MultiNormalizedMetricBase<MultiLevenshtein<MaxLen>, size_t>;
|
||||
|
||||
constexpr static size_t get_vec_size()
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace detail::simd_avx2;
|
||||
# else
|
||||
using namespace detail::simd_sse2;
|
||||
# endif
|
||||
if constexpr (MaxLen <= 8)
|
||||
return native_simd<uint8_t>::size;
|
||||
else if constexpr (MaxLen <= 16)
|
||||
return native_simd<uint16_t>::size;
|
||||
else if constexpr (MaxLen <= 32)
|
||||
return native_simd<uint32_t>::size;
|
||||
else if constexpr (MaxLen <= 64)
|
||||
return native_simd<uint64_t>::size;
|
||||
|
||||
static_assert(MaxLen <= 64);
|
||||
}
|
||||
|
||||
constexpr static size_t find_block_count(size_t count)
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(count, vec_size);
|
||||
return detail::ceil_div(simd_vec_count * vec_size * MaxLen, 64);
|
||||
}
|
||||
|
||||
public:
|
||||
MultiLevenshtein(size_t count, LevenshteinWeightTable aWeights = {1, 1, 1})
|
||||
: input_count(count), PM(find_block_count(count) * 64), weights(aWeights)
|
||||
{
|
||||
str_lens.resize(result_count());
|
||||
if (weights.delete_cost != 1 || weights.insert_cost != 1 || weights.replace_cost > 2)
|
||||
throw std::invalid_argument("unsupported weights");
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief get minimum size required for result vectors passed into
|
||||
* - distance
|
||||
* - similarity
|
||||
* - normalized_distance
|
||||
* - normalized_similarity
|
||||
*
|
||||
* @return minimum vector size
|
||||
*/
|
||||
size_t result_count() const
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(input_count, vec_size);
|
||||
return simd_vec_count * vec_size;
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
auto len = std::distance(first1, last1);
|
||||
int block_pos = static_cast<int>((pos * MaxLen) % 64);
|
||||
auto block = (pos * MaxLen) / 64;
|
||||
assert(len <= MaxLen);
|
||||
|
||||
if (pos >= input_count) throw std::invalid_argument("out of bounds insert");
|
||||
|
||||
str_lens[pos] = static_cast<size_t>(len);
|
||||
for (; first1 != last1; ++first1) {
|
||||
PM.insert(block, *first1, block_pos);
|
||||
block_pos++;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename InputIt2>
|
||||
void _distance(size_t* scores, size_t score_count, const detail::Range<InputIt2>& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max()) const
|
||||
{
|
||||
if (score_count < result_count())
|
||||
throw std::invalid_argument("scores has to have >= result_count() elements");
|
||||
|
||||
detail::Range scores_(scores, scores + score_count);
|
||||
if constexpr (MaxLen == 8)
|
||||
detail::levenshtein_hyrroe2003_simd<uint8_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 16)
|
||||
detail::levenshtein_hyrroe2003_simd<uint16_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 32)
|
||||
detail::levenshtein_hyrroe2003_simd<uint32_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 64)
|
||||
detail::levenshtein_hyrroe2003_simd<uint64_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(size_t s1_idx, const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return detail::levenshtein_maximum(str_lens[s1_idx], s2.size(), weights);
|
||||
}
|
||||
|
||||
size_t get_input_count() const noexcept
|
||||
{
|
||||
return input_count;
|
||||
}
|
||||
|
||||
size_t input_count;
|
||||
size_t pos = 0;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
std::vector<size_t> str_lens;
|
||||
LevenshteinWeightTable weights;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif /* RAPIDFUZZ_SIMD */
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedLevenshtein : public detail::CachedDistanceBase<CachedLevenshtein<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {1, 1, 1})
|
||||
: CachedLevenshtein(detail::to_begin(s1_), detail::to_end(s1_), aWeights)
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights = {1, 1, 1})
|
||||
: s1(first1, last1), PM(detail::Range(first1, last1)), weights(aWeights)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedDistanceBase<CachedLevenshtein<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedLevenshtein<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return detail::levenshtein_maximum(s1.size(), s2.size(), weights);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff, size_t score_hint) const
|
||||
{
|
||||
if (weights.insert_cost == weights.delete_cost) {
|
||||
/* when insertions + deletions operations are free there can not be any edit distance */
|
||||
if (weights.insert_cost == 0) return 0;
|
||||
|
||||
/* uniform Levenshtein multiplied with the common factor */
|
||||
if (weights.insert_cost == weights.replace_cost) {
|
||||
// max can make use of the common divisor of the three weights
|
||||
size_t new_score_cutoff = detail::ceil_div(score_cutoff, weights.insert_cost);
|
||||
size_t new_score_hint = detail::ceil_div(score_hint, weights.insert_cost);
|
||||
size_t dist = detail::uniform_levenshtein_distance(PM, detail::Range(s1), s2,
|
||||
new_score_cutoff, new_score_hint);
|
||||
dist *= weights.insert_cost;
|
||||
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
/*
|
||||
* when replace_cost >= insert_cost + delete_cost no substitutions are performed
|
||||
* therefore this can be implemented as InDel distance multiplied with the common factor
|
||||
*/
|
||||
else if (weights.replace_cost >= weights.insert_cost + weights.delete_cost) {
|
||||
// max can make use of the common divisor of the three weights
|
||||
size_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
|
||||
size_t dist = detail::indel_distance(PM, detail::Range(s1), s2, new_max);
|
||||
dist *= weights.insert_cost;
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return detail::generalized_levenshtein_distance(detail::Range(s1), s2, weights, score_cutoff);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
LevenshteinWeightTable weights;
|
||||
};

template <typename Sentence1>
explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {1, 1, 1})
    -> CachedLevenshtein<char_type<Sentence1>>;

template <typename InputIt1>
CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights = {1, 1, 1})
    -> CachedLevenshtein<iter_value_t<InputIt1>>;

} // namespace rapidfuzz
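
A short usage sketch of the cached scorer above (illustrative only, not part of the vendored header; it assumes the distance() member provided through detail::CachedDistanceBase):

    #include <rapidfuzz/distance/Levenshtein.hpp>
    #include <string>

    size_t weighted_example()
    {
        // With weights {1, 1, 2} a substitution costs as much as a delete + insert,
        // so the scorer can take the bit-parallel InDel path described above.
        rapidfuzz::CachedLevenshtein scorer(std::string("lewenstein"), {1, 1, 2});
        return scorer.distance(std::string("levenshtein")); // 3, matching the doc example
    }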
|
||||
1220
src/external/rapidfuzz-cpp/rapidfuzz/distance/Levenshtein_impl.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
281
src/external/rapidfuzz-cpp/rapidfuzz/distance/OSA.hpp
vendored
Normal file
@@ -0,0 +1,281 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2021 Max Bachmann */

#pragma once

#include <limits>
#include <rapidfuzz/details/common.hpp>
#include <rapidfuzz/distance/OSA_impl.hpp>

namespace rapidfuzz {

/**
 * @brief Calculates the optimal string alignment (OSA) distance between two strings.
 *
 * @details
 * Both strings require a similar length
 *
 *
 * @tparam Sentence1 This is a string that can be converted to
 * basic_string_view<char_type>
 * @tparam Sentence2 This is a string that can be converted to
 * basic_string_view<char_type>
 *
 * @param s1
 *   string to compare with s2 (for type info check Template parameters above)
 * @param s2
 *   string to compare with s1 (for type info check Template parameters above)
 * @param max
 *   Maximum OSA distance between s1 and s2, that is
 *   considered as a result. If the distance is bigger than max,
 *   max + 1 is returned instead. Default is std::numeric_limits<size_t>::max(),
 *   which deactivates this behaviour.
 *
 * @return OSA distance between s1 and s2
 */
template <typename InputIt1, typename InputIt2>
size_t osa_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                    size_t score_cutoff = std::numeric_limits<size_t>::max())
{
    return detail::OSA::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
}

template <typename Sentence1, typename Sentence2>
size_t osa_distance(const Sentence1& s1, const Sentence2& s2,
                    size_t score_cutoff = std::numeric_limits<size_t>::max())
{
    return detail::OSA::distance(s1, s2, score_cutoff, score_cutoff);
}
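
// Added commentary (not upstream): unlike the uniform Levenshtein distance, OSA
// counts an adjacent transposition as a single edit, but never touches a
// character again once it has been transposed:
//
//     rapidfuzz::osa_distance(std::string("CA"), std::string("AC"));   // 1 (one transposition)
//     rapidfuzz::osa_distance(std::string("CA"), std::string("ABC"));  // 3, whereas unrestricted
//                                                                      // Damerau-Levenshtein gives 2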
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t osa_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::OSA::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t osa_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::OSA::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double osa_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::OSA::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double osa_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::OSA::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}

/**
 * @brief Calculates a normalized OSA similarity
 *
 * @details
 * Both strings require a similar length
 *
 *
 * @tparam Sentence1 This is a string that can be converted to
 * basic_string_view<char_type>
 * @tparam Sentence2 This is a string that can be converted to
 * basic_string_view<char_type>
 *
 * @param s1
 *   string to compare with s2 (for type info check Template parameters above)
 * @param s2
 *   string to compare with s1 (for type info check Template parameters above)
 * @param score_cutoff
 *   Optional argument for a score threshold as a float between 0 and 1.0.
 *   For ratio < score_cutoff 0 is returned instead. Default is 0,
 *   which deactivates this behaviour.
 *
 * @return Normalized OSA similarity between s1 and s2
 *   as a float between 0 and 1.0
 */
template <typename InputIt1, typename InputIt2>
double osa_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
                                 double score_cutoff = 0.0)
{
    return detail::OSA::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
}

template <typename Sentence1, typename Sentence2>
double osa_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
    return detail::OSA::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
}
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiOSA
|
||||
: public detail::MultiDistanceBase<MultiOSA<MaxLen>, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
private:
|
||||
friend detail::MultiDistanceBase<MultiOSA<MaxLen>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::MultiNormalizedMetricBase<MultiOSA<MaxLen>, size_t>;
|
||||
|
||||
constexpr static size_t get_vec_size()
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace detail::simd_avx2;
|
||||
# else
|
||||
using namespace detail::simd_sse2;
|
||||
# endif
|
||||
if constexpr (MaxLen <= 8)
|
||||
return native_simd<uint8_t>::size;
|
||||
else if constexpr (MaxLen <= 16)
|
||||
return native_simd<uint16_t>::size;
|
||||
else if constexpr (MaxLen <= 32)
|
||||
return native_simd<uint32_t>::size;
|
||||
else if constexpr (MaxLen <= 64)
|
||||
return native_simd<uint64_t>::size;
|
||||
|
||||
static_assert(MaxLen <= 64);
|
||||
}
|
||||
|
||||
constexpr static size_t find_block_count(size_t count)
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(count, vec_size);
|
||||
return detail::ceil_div(simd_vec_count * vec_size * MaxLen, 64);
|
||||
}
|
||||
|
||||
public:
|
||||
MultiOSA(size_t count) : input_count(count), PM(find_block_count(count) * 64)
|
||||
{
|
||||
str_lens.resize(result_count());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief get minimum size required for result vectors passed into
|
||||
* - distance
|
||||
* - similarity
|
||||
* - normalized_distance
|
||||
* - normalized_similarity
|
||||
*
|
||||
* @return minimum vector size
|
||||
*/
|
||||
size_t result_count() const
|
||||
{
|
||||
size_t vec_size = get_vec_size();
|
||||
size_t simd_vec_count = detail::ceil_div(input_count, vec_size);
|
||||
return simd_vec_count * vec_size;
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
auto len = std::distance(first1, last1);
|
||||
int block_pos = static_cast<int>((pos * MaxLen) % 64);
|
||||
auto block = (pos * MaxLen) / 64;
|
||||
assert(len <= MaxLen);
|
||||
|
||||
if (pos >= input_count) throw std::invalid_argument("out of bounds insert");
|
||||
|
||||
str_lens[pos] = static_cast<size_t>(len);
|
||||
for (; first1 != last1; ++first1) {
|
||||
PM.insert(block, *first1, block_pos);
|
||||
block_pos++;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename InputIt2>
|
||||
void _distance(size_t* scores, size_t score_count, const detail::Range<InputIt2>& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max()) const
|
||||
{
|
||||
if (score_count < result_count())
|
||||
throw std::invalid_argument("scores has to have >= result_count() elements");
|
||||
|
||||
detail::Range scores_(scores, scores + score_count);
|
||||
if constexpr (MaxLen == 8)
|
||||
detail::osa_hyrroe2003_simd<uint8_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 16)
|
||||
detail::osa_hyrroe2003_simd<uint16_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 32)
|
||||
detail::osa_hyrroe2003_simd<uint32_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
else if constexpr (MaxLen == 64)
|
||||
detail::osa_hyrroe2003_simd<uint64_t>(scores_, PM, str_lens, s2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(size_t s1_idx, const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(str_lens[s1_idx], s2.size());
|
||||
}
|
||||
|
||||
size_t get_input_count() const noexcept
|
||||
{
|
||||
return input_count;
|
||||
}
|
||||
|
||||
size_t input_count;
|
||||
size_t pos = 0;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
std::vector<size_t> str_lens;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedOSA
|
||||
: public detail::CachedDistanceBase<CachedOSA<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedOSA(const Sentence1& s1_) : CachedOSA(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedOSA(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(detail::Range(first1, last1))
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedDistanceBase<CachedOSA<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedOSA<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
size_t res;
|
||||
if (s1.empty())
|
||||
res = s2.size();
|
||||
else if (s2.empty())
|
||||
res = s1.size();
|
||||
else if (s1.size() < 64)
|
||||
res = detail::osa_hyrroe2003(PM, detail::Range(s1), s2, score_cutoff);
|
||||
else
|
||||
res = detail::osa_hyrroe2003_block(PM, detail::Range(s1), s2, score_cutoff);
|
||||
|
||||
return (res <= score_cutoff) ? res : score_cutoff + 1;
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
detail::BlockPatternMatchVector PM;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
CachedOSA(const Sentence1& s1_) -> CachedOSA<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedOSA(InputIt1 first1, InputIt1 last1) -> CachedOSA<iter_value_t<InputIt1>>;
|
||||
/**@}*/
|
||||
|
||||
} // namespace rapidfuzz
|
||||
273
src/external/rapidfuzz-cpp/rapidfuzz/distance/OSA_impl.hpp
vendored
Normal file
@@ -0,0 +1,273 @@

/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#pragma once
#include <cstdint>
#include <rapidfuzz/details/PatternMatchVector.hpp>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/details/common.hpp>
#include <rapidfuzz/details/distance.hpp>
#include <rapidfuzz/details/simd.hpp>

namespace rapidfuzz::detail {

/**
 * @brief Bitparallel implementation of the OSA distance.
 *
 * This implementation requires the first string to have a length <= 64.
 * The algorithm used is described in @cite hyrro_2002 and has a time complexity
 * of O(N). Comments and variable names in the implementation follow the
 * paper. This implementation is used internally when the strings are short enough.
 *
 * @tparam CharT1 This is the char type of the first sentence
 * @tparam CharT2 This is the char type of the second sentence
 *
 * @param s1
 *   string to compare with s2 (for type info check Template parameters above)
 * @param s2
 *   string to compare with s1 (for type info check Template parameters above)
 *
 * @return returns the OSA distance between s1 and s2
 */
template <typename PM_Vec, typename InputIt1, typename InputIt2>
size_t osa_hyrroe2003(const PM_Vec& PM, const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t max)
{
    /* VP is set to 1^m. Shifting by bitwidth would be undefined behavior */
    uint64_t VP = ~UINT64_C(0);
    uint64_t VN = 0;
    uint64_t D0 = 0;
    uint64_t PM_j_old = 0;
    size_t currDist = s1.size();
    assert(s1.size() != 0);

    /* mask used when computing D[m,j] in the paper 10^(m-1) */
    uint64_t mask = UINT64_C(1) << (s1.size() - 1);

    /* Searching */
    for (const auto& ch : s2) {
        /* Step 1: Computing D0 */
        uint64_t PM_j = PM.get(0, ch);
        uint64_t TR = (((~D0) & PM_j) << 1) & PM_j_old;
        D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN;
        D0 = D0 | TR;

        /* Step 2: Computing HP and HN */
        uint64_t HP = VN | ~(D0 | VP);
        uint64_t HN = D0 & VP;

        /* Step 3: Computing the value D[m,j] */
        currDist += bool(HP & mask);
        currDist -= bool(HN & mask);

        /* Step 4: Computing VP and VN */
        HP = (HP << 1) | 1;
        HN = (HN << 1);

        VP = HN | ~(D0 | HP);
        VN = HP & D0;
        PM_j_old = PM_j;
    }

    return (currDist <= max) ? currDist : max + 1;
}
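
/* Added commentary (not upstream): currDist tracks D[m,j] from the paper, the
 * distance between all of s1 and the prefix of s2 processed so far. `mask`
 * selects the bit of the last row m, so a set bit in HP (positive horizontal
 * delta) increases the distance by one and a set bit in HN (negative delta)
 * decreases it by one; after the last character of s2 currDist is the OSA
 * distance, which is then checked against `max` exactly like in the blockwise
 * variant below. */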
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
template <typename VecType, typename InputIt, int _lto_hack = RAPIDFUZZ_LTO_HACK>
|
||||
void osa_hyrroe2003_simd(Range<size_t*> scores, const detail::BlockPatternMatchVector& block,
|
||||
const std::vector<size_t>& s1_lengths, const Range<InputIt>& s2,
|
||||
size_t score_cutoff) noexcept
|
||||
{
|
||||
# ifdef RAPIDFUZZ_AVX2
|
||||
using namespace simd_avx2;
|
||||
# else
|
||||
using namespace simd_sse2;
|
||||
# endif
|
||||
static constexpr size_t alignment = native_simd<VecType>::alignment;
|
||||
static constexpr size_t vec_width = native_simd<VecType>::size;
|
||||
static constexpr size_t vecs = native_simd<uint64_t>::size;
|
||||
assert(block.size() % vecs == 0);
|
||||
|
||||
native_simd<VecType> zero(VecType(0));
|
||||
native_simd<VecType> one(1);
|
||||
size_t result_index = 0;
|
||||
|
||||
for (size_t cur_vec = 0; cur_vec < block.size(); cur_vec += vecs) {
|
||||
/* VP is set to 1^m */
|
||||
native_simd<VecType> VP(static_cast<VecType>(-1));
|
||||
native_simd<VecType> VN(VecType(0));
|
||||
native_simd<VecType> D0(VecType(0));
|
||||
native_simd<VecType> PM_j_old(VecType(0));
|
||||
|
||||
alignas(alignment) std::array<VecType, vec_width> currDist_;
|
||||
unroll<int, vec_width>(
|
||||
[&](auto i) { currDist_[i] = static_cast<VecType>(s1_lengths[result_index + i]); });
|
||||
native_simd<VecType> currDist(reinterpret_cast<uint64_t*>(currDist_.data()));
|
||||
/* mask used when computing D[m,j] in the paper 10^(m-1) */
|
||||
alignas(alignment) std::array<VecType, vec_width> mask_;
|
||||
unroll<int, vec_width>([&](auto i) {
|
||||
if (s1_lengths[result_index + i] == 0)
|
||||
mask_[i] = 0;
|
||||
else
|
||||
mask_[i] = static_cast<VecType>(UINT64_C(1) << (s1_lengths[result_index + i] - 1));
|
||||
});
|
||||
native_simd<VecType> mask(reinterpret_cast<uint64_t*>(mask_.data()));
|
||||
|
||||
for (const auto& ch : s2) {
|
||||
/* Step 1: Computing D0 */
|
||||
alignas(alignment) std::array<uint64_t, vecs> stored;
|
||||
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
|
||||
|
||||
native_simd<VecType> PM_j(stored.data());
|
||||
auto TR = (andnot(PM_j, D0) << 1) & PM_j_old;
|
||||
D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN;
|
||||
D0 = D0 | TR;
|
||||
|
||||
/* Step 2: Computing HP and HN */
|
||||
auto HP = VN | ~(D0 | VP);
|
||||
auto HN = D0 & VP;
|
||||
|
||||
/* Step 3: Computing the value D[m,j] */
|
||||
currDist += andnot(one, (HP & mask) == zero);
|
||||
currDist -= andnot(one, (HN & mask) == zero);
|
||||
|
||||
/* Step 4: Computing Vp and VN */
|
||||
HP = (HP << 1) | one;
|
||||
HN = (HN << 1);
|
||||
|
||||
VP = HN | ~(D0 | HP);
|
||||
VN = HP & D0;
|
||||
PM_j_old = PM_j;
|
||||
}
|
||||
|
||||
alignas(alignment) std::array<VecType, vec_width> distances;
|
||||
currDist.store(distances.data());
|
||||
|
||||
unroll<int, vec_width>([&](auto i) {
|
||||
size_t score = 0;
|
||||
/* strings of length 0 are not handled correctly */
|
||||
if (s1_lengths[result_index] == 0) {
|
||||
score = s2.size();
|
||||
}
|
||||
/* calculate score under consideration of wraparounds in parallel counter */
|
||||
else {
|
||||
if constexpr (std::numeric_limits<VecType>::max() < std::numeric_limits<size_t>::max()) {
|
||||
size_t min_dist = abs_diff(s1_lengths[result_index], s2.size());
|
||||
size_t wraparound_score = static_cast<size_t>(std::numeric_limits<VecType>::max()) + 1;
|
||||
|
||||
score = (min_dist / wraparound_score) * wraparound_score;
|
||||
VecType remainder = static_cast<VecType>(min_dist % wraparound_score);
|
||||
|
||||
if (distances[i] < remainder) score += wraparound_score;
|
||||
}
|
||||
|
||||
score += distances[i];
|
||||
}
|
||||
scores[result_index] = (score <= score_cutoff) ? score : score_cutoff + 1;
|
||||
result_index++;
|
||||
});
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t osa_hyrroe2003_block(const BlockPatternMatchVector& PM, const Range<InputIt1>& s1,
|
||||
const Range<InputIt2>& s2, size_t max = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
struct Row {
|
||||
uint64_t VP;
|
||||
uint64_t VN;
|
||||
uint64_t D0;
|
||||
uint64_t PM;
|
||||
|
||||
Row() : VP(~UINT64_C(0)), VN(0), D0(0), PM(0)
|
||||
{}
|
||||
};
|
||||
|
||||
size_t word_size = sizeof(uint64_t) * 8;
|
||||
size_t words = PM.size();
|
||||
uint64_t Last = UINT64_C(1) << ((s1.size() - 1) % word_size);
|
||||
|
||||
size_t currDist = s1.size();
|
||||
std::vector<Row> old_vecs(words + 1);
|
||||
std::vector<Row> new_vecs(words + 1);
|
||||
|
||||
/* Searching */
|
||||
auto iter_s2 = s2.begin();
|
||||
for (size_t row = 0; row < s2.size(); ++iter_s2, ++row) {
|
||||
uint64_t HP_carry = 1;
|
||||
uint64_t HN_carry = 0;
|
||||
|
||||
for (size_t word = 0; word < words; word++) {
|
||||
/* retrieve bit vectors from last iterations */
|
||||
uint64_t VN = old_vecs[word + 1].VN;
|
||||
uint64_t VP = old_vecs[word + 1].VP;
|
||||
uint64_t D0 = old_vecs[word + 1].D0;
|
||||
/* D0 last word */
|
||||
uint64_t D0_last = old_vecs[word].D0;
|
||||
|
||||
/* PM of last char same word */
|
||||
uint64_t PM_j_old = old_vecs[word + 1].PM;
|
||||
/* PM of last word */
|
||||
uint64_t PM_last = new_vecs[word].PM;
|
||||
|
||||
uint64_t PM_j = PM.get(word, *iter_s2);
|
||||
uint64_t X = PM_j;
|
||||
uint64_t TR = ((((~D0) & X) << 1) | (((~D0_last) & PM_last) >> 63)) & PM_j_old;
|
||||
|
||||
X |= HN_carry;
|
||||
D0 = (((X & VP) + VP) ^ VP) | X | VN | TR;
|
||||
|
||||
uint64_t HP = VN | ~(D0 | VP);
|
||||
uint64_t HN = D0 & VP;
|
||||
|
||||
if (word == words - 1) {
|
||||
currDist += bool(HP & Last);
|
||||
currDist -= bool(HN & Last);
|
||||
}
|
||||
|
||||
uint64_t HP_carry_temp = HP_carry;
|
||||
HP_carry = HP >> 63;
|
||||
HP = (HP << 1) | HP_carry_temp;
|
||||
uint64_t HN_carry_temp = HN_carry;
|
||||
HN_carry = HN >> 63;
|
||||
HN = (HN << 1) | HN_carry_temp;
|
||||
|
||||
new_vecs[word + 1].VP = HN | ~(D0 | HP);
|
||||
new_vecs[word + 1].VN = HP & D0;
|
||||
new_vecs[word + 1].D0 = D0;
|
||||
new_vecs[word + 1].PM = PM_j;
|
||||
}
|
||||
|
||||
std::swap(new_vecs, old_vecs);
|
||||
}
|
||||
|
||||
return (currDist <= max) ? currDist : max + 1;
|
||||
}
|
||||
|
||||
class OSA : public DistanceBase<OSA, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend DistanceBase<OSA, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<OSA>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _distance(Range<InputIt1> s1, Range<InputIt2> s2, size_t score_cutoff, size_t score_hint)
|
||||
{
|
||||
if (s2.size() < s1.size()) return _distance(s2, s1, score_cutoff, score_hint);
|
||||
|
||||
remove_common_affix(s1, s2);
|
||||
if (s1.empty())
|
||||
return (s2.size() <= score_cutoff) ? s2.size() : score_cutoff + 1;
|
||||
else if (s1.size() < 64)
|
||||
return osa_hyrroe2003(PatternMatchVector(s1), s1, s2, score_cutoff);
|
||||
else
|
||||
return osa_hyrroe2003_block(BlockPatternMatchVector(s1), s1, s2, score_cutoff);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
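
/* Usage sketch for the OSA scorer above. This assumes the public wrapper
 * rapidfuzz::osa_distance from <rapidfuzz/distance/OSA.hpp> (declared elsewhere in
 * this vendored library) forwards to detail::OSA:
 * @code{.cpp}
 * #include <rapidfuzz/distance/OSA.hpp>
 * #include <string>
 *
 * // a single adjacent transposition, so the OSA distance should be 1
 * auto dist = rapidfuzz::osa_distance(std::string("CA"), std::string("AC"));
 * @endcode
 */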
|
||||
105
src/external/rapidfuzz-cpp/rapidfuzz/distance/Postfix.hpp
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <limits>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/distance/Postfix_impl.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t postfix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Postfix::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t postfix_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Postfix::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t postfix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Postfix::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t postfix_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Postfix::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
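
/* Usage sketch, assuming <rapidfuzz/distance/Postfix.hpp> is included: the postfix
 * similarity is simply the length of the common suffix.
 * @code{.cpp}
 * #include <string>
 *
 * // common suffix is " pie" -> similarity 4
 * size_t sim = rapidfuzz::postfix_similarity(std::string("apple pie"), std::string("cherry pie"));
 * @endcode
 */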
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double postfix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Postfix::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double postfix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Postfix::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double postfix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Postfix::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double postfix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Postfix::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedPostfix : public detail::CachedSimilarityBase<CachedPostfix<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedPostfix(const Sentence1& s1_) : CachedPostfix(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPostfix(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedSimilarityBase<CachedPostfix<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedPostfix<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _similarity(detail::Range<InputIt2> s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
return detail::Postfix::similarity(s1, s2, score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPostfix(const Sentence1& s1_) -> CachedPostfix<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPostfix(InputIt1 first1, InputIt1 last1) -> CachedPostfix<iter_value_t<InputIt1>>;
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace rapidfuzz
|
||||
30
src/external/rapidfuzz-cpp/rapidfuzz/distance/Postfix_impl.hpp
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
class Postfix : public SimilarityBase<Postfix, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend SimilarityBase<Postfix, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<Postfix>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _similarity(Range<InputIt1> s1, Range<InputIt2> s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint)
|
||||
{
|
||||
size_t dist = remove_common_suffix(s1, s2);
|
||||
return (dist >= score_cutoff) ? dist : 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
104
src/external/rapidfuzz-cpp/rapidfuzz/distance/Prefix.hpp
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <limits>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/distance/Prefix_impl.hpp>
|
||||
|
||||
namespace rapidfuzz {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t prefix_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Prefix::distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t prefix_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return detail::Prefix::distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t prefix_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Prefix::similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t prefix_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return detail::Prefix::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
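
/* Usage sketch, assuming <rapidfuzz/distance/Prefix.hpp> is included: the prefix
 * similarity is the length of the common prefix.
 * @code{.cpp}
 * #include <string>
 *
 * // common prefix is "hello " -> similarity 6
 * size_t sim = rapidfuzz::prefix_similarity(std::string("hello world"), std::string("hello there"));
 * @endcode
 */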
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double prefix_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Prefix::normalized_distance(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double prefix_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 1.0)
|
||||
{
|
||||
return detail::Prefix::normalized_distance(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double prefix_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Prefix::normalized_similarity(first1, last1, first2, last2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double prefix_normalized_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return detail::Prefix::normalized_similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedPrefix : public detail::CachedSimilarityBase<CachedPrefix<CharT1>, size_t, 0,
|
||||
std::numeric_limits<int64_t>::max()> {
|
||||
template <typename Sentence1>
|
||||
explicit CachedPrefix(const Sentence1& s1_) : CachedPrefix(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPrefix(InputIt1 first1, InputIt1 last1) : s1(first1, last1)
|
||||
{}
|
||||
|
||||
private:
|
||||
friend detail::CachedSimilarityBase<CachedPrefix<CharT1>, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend detail::CachedNormalizedMetricBase<CachedPrefix<CharT1>>;
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t maximum(const detail::Range<InputIt2>& s2) const
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
size_t _similarity(detail::Range<InputIt2> s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint) const
|
||||
{
|
||||
return detail::Prefix::similarity(s1, s2, score_cutoff, score_cutoff);
|
||||
}
|
||||
|
||||
std::vector<CharT1> s1;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPrefix(const Sentence1& s1_) -> CachedPrefix<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPrefix(InputIt1 first1, InputIt1 last1) -> CachedPrefix<iter_value_t<InputIt1>>;
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace rapidfuzz
|
||||
30
src/external/rapidfuzz-cpp/rapidfuzz/distance/Prefix_impl.hpp
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/details/Range.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/details/distance.hpp>
|
||||
|
||||
namespace rapidfuzz::detail {
|
||||
|
||||
class Prefix : public SimilarityBase<Prefix, size_t, 0, std::numeric_limits<int64_t>::max()> {
|
||||
friend SimilarityBase<Prefix, size_t, 0, std::numeric_limits<int64_t>::max()>;
|
||||
friend NormalizedMetricBase<Prefix>;
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
|
||||
{
|
||||
return std::max(s1.size(), s2.size());
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
static size_t _similarity(Range<InputIt1> s1, Range<InputIt2> s2, size_t score_cutoff,
|
||||
[[maybe_unused]] size_t score_hint)
|
||||
{
|
||||
size_t dist = remove_common_prefix(s1, s2);
|
||||
return (dist >= score_cutoff) ? dist : 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz::detail
|
||||
789
src/external/rapidfuzz-cpp/rapidfuzz/fuzz.hpp
vendored
Normal file
@@ -0,0 +1,789 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
/* Copyright © 2011 Adam Cohen */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/details/CharSet.hpp>
|
||||
#include <rapidfuzz/details/PatternMatchVector.hpp>
|
||||
#include <rapidfuzz/details/common.hpp>
|
||||
#include <rapidfuzz/distance/Indel.hpp>
|
||||
|
||||
namespace rapidfuzz::fuzz {
|
||||
|
||||
/**
|
||||
* @defgroup Fuzz Fuzz
|
||||
* A collection of string matching algorithms from FuzzyWuzzy
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief calculates a simple ratio between two strings
|
||||
*
|
||||
* @details
|
||||
* @code{.cpp}
|
||||
* // score is 96.55
|
||||
* double score = ratio("this is a test", "this is a test!")
|
||||
* @endcode
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiRatio {
|
||||
public:
|
||||
MultiRatio(size_t count) : input_count(count), scorer(count)
|
||||
{}
|
||||
|
||||
size_t result_count() const
|
||||
{
|
||||
return scorer.result_count();
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
scorer.insert(first1, last1);
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0) const
|
||||
{
|
||||
similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
scorer.normalized_similarity(scores, score_count, s2, score_cutoff / 100.0);
|
||||
|
||||
for (size_t i = 0; i < input_count; ++i)
|
||||
scores[i] *= 100.0;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t input_count;
|
||||
rapidfuzz::experimental::MultiIndel<MaxLen> scorer;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
// TODO documentation
|
||||
template <typename CharT1>
|
||||
struct CachedRatio {
|
||||
template <typename InputIt1>
|
||||
CachedRatio(InputIt1 first1, InputIt1 last1) : cached_indel(first1, last1)
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
CachedRatio(const Sentence1& s1) : cached_indel(s1)
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
// private:
|
||||
CachedIndel<CharT1> cached_indel;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
CachedRatio(const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iter_value_t<InputIt1>>;
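
/* Usage sketch for CachedRatio: the query string is preprocessed once and can then be
 * compared against many candidates. The expected score follows the ratio() example above.
 * @code{.cpp}
 * #include <rapidfuzz/fuzz.hpp>
 * #include <string>
 *
 * rapidfuzz::fuzz::CachedRatio scorer(std::string("this is a test"));
 * // about 96.55, matching the ratio() documentation above
 * double score1 = scorer.similarity(std::string("this is a test!"));
 * double score2 = scorer.similarity(std::string("this is another test"));
 * @endcode
 */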
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2, double score_cutoff = 0);
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2,
|
||||
double score_cutoff = 0);
|
||||
|
||||
/**
|
||||
* @brief calculates the fuzz::ratio of the optimal string alignment
|
||||
*
|
||||
* @details
|
||||
* See @cite hyrro_2004 @cite wagner_fischer_1974 for the underlying algorithms.
|
||||
* @code{.cpp}
|
||||
* // score is 100
|
||||
* double score = partial_ratio("this is a test", "this is a test!")
|
||||
* @endcode
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
|
||||
|
||||
// todo add real implementation
|
||||
template <typename CharT1>
|
||||
struct CachedPartialRatio {
|
||||
template <typename>
|
||||
friend struct CachedWRatio;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPartialRatio(InputIt1 first1, InputIt1 last1);
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialRatio(const Sentence1& s1_)
|
||||
: CachedPartialRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
rapidfuzz::detail::CharSet<CharT1> s1_char_set;
|
||||
CachedRatio<CharT1> cached_ratio;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Sorts the words in the strings and calculates the fuzz::ratio between
|
||||
* them
|
||||
*
|
||||
* @details
|
||||
* @code{.cpp}
|
||||
* // score is 100
|
||||
* double score = token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a
|
||||
* bear")
|
||||
* @endcode
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiTokenSortRatio {
|
||||
public:
|
||||
MultiTokenSortRatio(size_t count) : scorer(count)
|
||||
{}
|
||||
|
||||
size_t result_count() const
|
||||
{
|
||||
return scorer.result_count();
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
scorer.insert(detail::sorted_split(first1, last1).join());
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0) const
|
||||
{
|
||||
scorer.similarity(scores, score_count, detail::sorted_split(first2, last2).join(), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
similarity(scores, score_count, detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
private:
|
||||
MultiRatio<MaxLen> scorer;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
// todo CachedRatio speed for equal strings vs original implementation
|
||||
// TODO documentation
|
||||
template <typename CharT1>
|
||||
struct CachedTokenSortRatio {
|
||||
template <typename InputIt1>
|
||||
CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1_sorted(detail::sorted_split(first1, last1).join()), cached_ratio(s1_sorted)
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenSortRatio(const Sentence1& s1)
|
||||
: CachedTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1_sorted;
|
||||
CachedRatio<CharT1> cached_ratio;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSortRatio<iter_value_t<InputIt1>>;
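
/* Usage sketch for CachedTokenSortRatio: s1 is split, sorted and joined once in the
 * constructor, so repeated comparisons only need to sort the second string.
 * @code{.cpp}
 * #include <rapidfuzz/fuzz.hpp>
 * #include <string>
 *
 * rapidfuzz::fuzz::CachedTokenSortRatio scorer(std::string("fuzzy wuzzy was a bear"));
 * // word order does not matter, so this should score 100 (see token_sort_ratio above)
 * double score = scorer.similarity(std::string("wuzzy fuzzy was a bear"));
 * @endcode
 */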
|
||||
|
||||
/**
|
||||
* @brief Sorts the words in the strings and calculates the fuzz::partial_ratio
|
||||
* between them
|
||||
*
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
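
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: both strings are sorted by
 * words first, then fuzz::partial_ratio is applied to the sorted strings.
 * @code{.cpp}
 * #include <string>
 *
 * double score = rapidfuzz::fuzz::partial_token_sort_ratio(
 *     std::string("fuzzy was a bear"), std::string("wuzzy fuzzy was a bear"));
 * @endcode
 */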
|
||||
|
||||
// TODO documentation
|
||||
template <typename CharT1>
|
||||
struct CachedPartialTokenSortRatio {
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1_sorted(detail::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted)
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenSortRatio(const Sentence1& s1)
|
||||
: CachedPartialTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1_sorted;
|
||||
CachedPartialRatio<CharT1> cached_partial_ratio;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenSortRatio(const Sentence1& s1)
|
||||
-> CachedPartialTokenSortRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenSortRatio(InputIt1 first1,
|
||||
InputIt1 last1) -> CachedPartialTokenSortRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Compares the words in the strings based on unique and common words
|
||||
* between them using fuzz::ratio
|
||||
*
|
||||
* @details
|
||||
* @code{.cpp}
|
||||
* // score1 is 83.87
|
||||
* double score1 = token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a
|
||||
* bear")
|
||||
* // score2 is 100
|
||||
* double score2 = token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
|
||||
* @endcode
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
|
||||
|
||||
// TODO documentation
|
||||
template <typename CharT1>
|
||||
struct CachedTokenSetRatio {
|
||||
template <typename InputIt1>
|
||||
CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenSetRatio(const Sentence1& s1_)
|
||||
: CachedTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSetRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Compares the words in the strings based on unique and common words
|
||||
* between them using fuzz::partial_ratio
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
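
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: the unique and common words
 * of both strings are compared with fuzz::partial_ratio.
 * @code{.cpp}
 * #include <string>
 *
 * double score = rapidfuzz::fuzz::partial_token_set_ratio(
 *     std::string("fuzzy was a bear"), std::string("fuzzy fuzzy was a bear"));
 * @endcode
 */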
|
||||
|
||||
// TODO documentation
|
||||
template <typename CharT1>
|
||||
struct CachedPartialTokenSetRatio {
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenSetRatio(const Sentence1& s1_)
|
||||
: CachedPartialTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenSetRatio(InputIt1 first1,
|
||||
InputIt1 last1) -> CachedPartialTokenSetRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Helper method that returns the maximum of fuzz::token_set_ratio and
|
||||
* fuzz::token_sort_ratio (faster than manually executing the two functions)
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
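
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: token_ratio returns the
 * maximum of token_set_ratio and token_sort_ratio.
 * @code{.cpp}
 * #include <string>
 *
 * // expected to be 100, since token_set_ratio already scores these strings at 100
 * double score = rapidfuzz::fuzz::token_ratio(std::string("fuzzy was a bear"),
 *                                             std::string("fuzzy fuzzy was a bear"));
 * @endcode
 */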
|
||||
|
||||
// todo add real implementation
|
||||
template <typename CharT1>
|
||||
struct CachedTokenRatio {
|
||||
template <typename InputIt1>
|
||||
CachedTokenRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1),
|
||||
s1_tokens(detail::sorted_split(std::begin(s1), std::end(s1))),
|
||||
s1_sorted(s1_tokens.join()),
|
||||
cached_ratio_s1_sorted(s1_sorted)
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenRatio(const Sentence1& s1_)
|
||||
: CachedTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> s1_tokens;
|
||||
std::vector<CharT1> s1_sorted;
|
||||
CachedRatio<CharT1> cached_ratio_s1_sorted;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Helper method that returns the maximum of
|
||||
* fuzz::partial_token_set_ratio and fuzz::partial_token_sort_ratio (faster than
|
||||
* manually executing the two functions)
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0);
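
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: partial_token_ratio returns
 * the maximum of partial_token_set_ratio and partial_token_sort_ratio.
 * @code{.cpp}
 * #include <string>
 *
 * double score = rapidfuzz::fuzz::partial_token_ratio(
 *     std::string("fuzzy was a bear"), std::string("fuzzy fuzzy was a bear"));
 * @endcode
 */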
|
||||
|
||||
// todo add real implementation
|
||||
template <typename CharT1>
|
||||
struct CachedPartialTokenRatio {
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1),
|
||||
tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
|
||||
s1_sorted(tokens_s1.join())
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenRatio(const Sentence1& s1_)
|
||||
: CachedPartialTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
||||
std::vector<CharT1> s1_sorted;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Calculates a weighted ratio based on the other ratio algorithms
|
||||
*
|
||||
* @details
|
||||
* @todo add a detailed description
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
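
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: WRatio combines the other
 * ratio algorithms with weights that depend on the input lengths and is a reasonable
 * default scorer when no specific ratio is preferred.
 * @code{.cpp}
 * #include <string>
 *
 * double score = rapidfuzz::fuzz::WRatio(std::string("this is a test"),
 *                                        std::string("this is a test!"));
 * @endcode
 */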
|
||||
|
||||
// todo add real implementation
|
||||
template <typename CharT1>
|
||||
struct CachedWRatio {
|
||||
template <typename InputIt1>
|
||||
explicit CachedWRatio(InputIt1 first1, InputIt1 last1);
|
||||
|
||||
template <typename Sentence1>
|
||||
CachedWRatio(const Sentence1& s1_) : CachedWRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
// todo somehow implement this using other ratios with creating PatternMatchVector
|
||||
// multiple times
|
||||
std::vector<CharT1> s1;
|
||||
CachedPartialRatio<CharT1> cached_partial_ratio;
|
||||
detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
|
||||
std::vector<CharT1> s1_sorted;
|
||||
rapidfuzz::detail::BlockPatternMatchVector blockmap_s1_sorted;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedWRatio(const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**
|
||||
* @brief Calculates a quick ratio between two strings using fuzz.ratio
|
||||
*
|
||||
* @details
|
||||
* @todo add a detailed description
|
||||
*
|
||||
* @tparam Sentence1 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
* @tparam Sentence2 This is a string that can be converted to
|
||||
* basic_string_view<char_type>
|
||||
*
|
||||
* @param s1 string to compare with s2 (for type info check Template parameters
|
||||
* above)
|
||||
* @param s2 string to compare with s1 (for type info check Template parameters
|
||||
* above)
|
||||
* @param score_cutoff Optional argument for a score threshold between 0% and
|
||||
* 100%. Matches with a lower score than this number will not be returned.
|
||||
* Defaults to 0.
|
||||
*
|
||||
* @return returns the ratio between s1 and s2 or 0 when ratio < score_cutoff
|
||||
*/
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
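
/* Usage sketch, assuming <rapidfuzz/fuzz.hpp> is included: QRatio behaves like ratio(),
 * except that empty inputs score 0, as also visible in the MultiQRatio implementation below.
 * @code{.cpp}
 * #include <string>
 *
 * double score = rapidfuzz::fuzz::QRatio(std::string("this is a test"),
 *                                        std::string("this is a test!"));
 * @endcode
 */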
|
||||
|
||||
#ifdef RAPIDFUZZ_SIMD
|
||||
namespace experimental {
|
||||
template <int MaxLen>
|
||||
struct MultiQRatio {
|
||||
public:
|
||||
MultiQRatio(size_t count) : scorer(count)
|
||||
{}
|
||||
|
||||
size_t result_count() const
|
||||
{
|
||||
return scorer.result_count();
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
void insert(const Sentence1& s1_)
|
||||
{
|
||||
insert(detail::to_begin(s1_), detail::to_end(s1_));
|
||||
}
|
||||
|
||||
template <typename InputIt1>
|
||||
void insert(InputIt1 first1, InputIt1 last1)
|
||||
{
|
||||
scorer.insert(first1, last1);
|
||||
str_lens.push_back(static_cast<size_t>(std::distance(first1, last1)));
|
||||
}
|
||||
|
||||
template <typename InputIt2>
|
||||
void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0) const
|
||||
{
|
||||
similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence2>
|
||||
void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
|
||||
{
|
||||
rapidfuzz::detail::Range s2_(s2);
|
||||
if (s2_.empty()) {
|
||||
for (size_t i = 0; i < str_lens.size(); ++i)
|
||||
scores[i] = 0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
scorer.similarity(scores, score_count, s2, score_cutoff);
|
||||
|
||||
for (size_t i = 0; i < str_lens.size(); ++i)
|
||||
if (str_lens[i] == 0) scores[i] = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<size_t> str_lens;
|
||||
MultiRatio<MaxLen> scorer;
|
||||
};
|
||||
} /* namespace experimental */
|
||||
#endif
|
||||
|
||||
template <typename CharT1>
|
||||
struct CachedQRatio {
|
||||
template <typename InputIt1>
|
||||
CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1)
|
||||
{}
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedQRatio(const Sentence1& s1_) : CachedQRatio(detail::to_begin(s1_), detail::to_end(s1_))
|
||||
{}
|
||||
|
||||
template <typename InputIt2>
|
||||
double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
|
||||
double score_hint = 0.0) const;
|
||||
|
||||
template <typename Sentence2>
|
||||
double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
|
||||
|
||||
private:
|
||||
std::vector<CharT1> s1;
|
||||
CachedRatio<CharT1> cached_ratio;
|
||||
};
|
||||
|
||||
template <typename Sentence1>
|
||||
explicit CachedQRatio(const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
|
||||
|
||||
template <typename InputIt1>
|
||||
CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iter_value_t<InputIt1>>;
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace rapidfuzz::fuzz
|
||||
|
||||
#include <rapidfuzz/fuzz_impl.hpp>
|
||||
937
src/external/rapidfuzz-cpp/rapidfuzz/fuzz_impl.hpp
vendored
Normal file
@@ -0,0 +1,937 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021-present Max Bachmann */
|
||||
/* Copyright © 2011 Adam Cohen */
|
||||
|
||||
#include <limits>
|
||||
#include <rapidfuzz/details/CharSet.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <sys/types.h>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz::fuzz {
|
||||
|
||||
/**********************************************
|
||||
* ratio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
return ratio(detail::Range(first1, last1), detail::Range(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double ratio(const Sentence1& s1, const Sentence2& s2, const double score_cutoff)
|
||||
{
|
||||
return indel_normalized_similarity(s1, s2, score_cutoff / 100) * 100;
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
double score_hint) const
|
||||
{
|
||||
return similarity(detail::Range(first2, last2), score_cutoff, score_hint);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double score_hint) const
|
||||
{
|
||||
return cached_indel.normalized_similarity(s2, score_cutoff / 100, score_hint / 100) * 100;
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* partial_ratio
|
||||
*********************************************/
|
||||
|
||||
namespace fuzz_detail {
|
||||
|
||||
static constexpr double norm_distance(size_t dist, size_t lensum, double score_cutoff = 0)
|
||||
{
|
||||
double score =
|
||||
(lensum > 0) ? (100.0 - 100.0 * static_cast<double>(dist) / static_cast<double>(lensum)) : 100.0;
|
||||
|
||||
return (score >= score_cutoff) ? score : 0;
|
||||
}
|
||||
|
||||
static inline size_t score_cutoff_to_distance(double score_cutoff, size_t lensum)
|
||||
{
|
||||
return static_cast<size_t>(std::ceil(static_cast<double>(lensum) * (1.0 - score_cutoff / 100)));
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2, typename CachedCharT1>
|
||||
ScoreAlignment<double>
|
||||
partial_ratio_impl(const detail::Range<InputIt1>& s1, const detail::Range<InputIt2>& s2,
|
||||
const CachedRatio<CachedCharT1>& cached_ratio,
|
||||
const detail::CharSet<iter_value_t<InputIt1>>& s1_char_set, double score_cutoff)
|
||||
{
|
||||
ScoreAlignment<double> res;
|
||||
size_t len1 = s1.size();
|
||||
size_t len2 = s2.size();
|
||||
res.src_start = 0;
|
||||
res.src_end = len1;
|
||||
res.dest_start = 0;
|
||||
res.dest_end = len1;
|
||||
|
||||
if (len2 > len1) {
|
||||
size_t maximum = len1 * 2;
|
||||
double norm_cutoff_sim = rapidfuzz::detail::NormSim_to_NormDist(score_cutoff / 100);
|
||||
size_t cutoff_dist = static_cast<size_t>(std::ceil(static_cast<double>(maximum) * norm_cutoff_sim));
|
||||
size_t best_dist = std::numeric_limits<size_t>::max();
|
||||
std::vector<size_t> scores(len2 - len1, std::numeric_limits<size_t>::max());
|
||||
std::vector<std::pair<size_t, size_t>> windows = {{0, len2 - len1 - 1}};
|
||||
std::vector<std::pair<size_t, size_t>> new_windows;
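/* Evaluate alignment windows of length len1 inside s2: the indel distance is computed
 * at both window borders, and a window is only bisected into two halves while the
 * border distances still allow a result below cutoff_dist. Windows that cannot improve
 * on the best distance found so far are pruned. */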
|
||||
|
||||
while (!windows.empty()) {
|
||||
for (const auto& window : windows) {
|
||||
auto subseq1_first = s2.begin() + static_cast<ptrdiff_t>(window.first);
|
||||
auto subseq2_first = s2.begin() + static_cast<ptrdiff_t>(window.second);
|
||||
detail::Range subseq1(subseq1_first, subseq1_first + static_cast<ptrdiff_t>(len1));
|
||||
detail::Range subseq2(subseq2_first, subseq2_first + static_cast<ptrdiff_t>(len1));
|
||||
|
||||
if (scores[window.first] == std::numeric_limits<size_t>::max()) {
|
||||
scores[window.first] = cached_ratio.cached_indel.distance(subseq1);
|
||||
if (scores[window.first] < cutoff_dist) {
|
||||
cutoff_dist = best_dist = scores[window.first];
|
||||
res.dest_start = window.first;
|
||||
res.dest_end = window.first + len1;
|
||||
if (best_dist == 0) {
|
||||
res.score = 100;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (scores[window.second] == std::numeric_limits<size_t>::max()) {
|
||||
scores[window.second] = cached_ratio.cached_indel.distance(subseq2);
|
||||
if (scores[window.second] < cutoff_dist) {
|
||||
cutoff_dist = best_dist = scores[window.second];
|
||||
res.dest_start = window.second;
|
||||
res.dest_end = window.second + len1;
|
||||
if (best_dist == 0) {
|
||||
res.score = 100;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t cell_diff = window.second - window.first;
|
||||
if (cell_diff == 1) continue;
|
||||
|
||||
/* find the minimum score possible in the range first <-> last */
|
||||
size_t known_edits = detail::abs_diff(scores[window.first], scores[window.second]);
|
||||
/* half of the cells that are not needed for known_edits can lead to a better score */
|
||||
size_t max_score_improvement = (cell_diff - known_edits / 2) / 2 * 2;
|
||||
ptrdiff_t min_score =
|
||||
static_cast<ptrdiff_t>(std::min(scores[window.first], scores[window.second])) -
|
||||
static_cast<ptrdiff_t>(max_score_improvement);
|
||||
if (min_score < static_cast<ptrdiff_t>(cutoff_dist)) {
|
||||
size_t center = cell_diff / 2;
|
||||
new_windows.emplace_back(window.first, window.first + center);
|
||||
new_windows.emplace_back(window.first + center, window.second);
|
||||
}
|
||||
}
|
||||
|
||||
std::swap(windows, new_windows);
|
||||
new_windows.clear();
|
||||
}
|
||||
|
||||
double score = 1.0 - (static_cast<double>(best_dist) / static_cast<double>(maximum));
|
||||
score *= 100;
|
||||
if (score >= score_cutoff) score_cutoff = res.score = score;
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < len1; ++i) {
|
||||
rapidfuzz::detail::Range subseq(s2.begin(), s2.begin() + static_cast<ptrdiff_t>(i));
|
||||
if (!s1_char_set.find(subseq.back())) continue;
|
||||
|
||||
double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
|
||||
if (ls_ratio > res.score) {
|
||||
score_cutoff = res.score = ls_ratio;
|
||||
res.dest_start = 0;
|
||||
res.dest_end = i;
|
||||
if (res.score == 100.0) return res;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = len2 - len1; i < len2; ++i) {
|
||||
rapidfuzz::detail::Range subseq(s2.begin() + static_cast<ptrdiff_t>(i), s2.end());
|
||||
if (!s1_char_set.find(subseq.front())) continue;
|
||||
|
||||
double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
|
||||
if (ls_ratio > res.score) {
|
||||
score_cutoff = res.score = ls_ratio;
|
||||
res.dest_start = i;
|
||||
res.dest_end = len2;
|
||||
if (res.score == 100.0) return res;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2, typename CharT1 = iter_value_t<InputIt1>>
|
||||
ScoreAlignment<double> partial_ratio_impl(const detail::Range<InputIt1>& s1,
|
||||
const detail::Range<InputIt2>& s2, double score_cutoff)
|
||||
{
|
||||
CachedRatio<CharT1> cached_ratio(s1);
|
||||
|
||||
detail::CharSet<CharT1> s1_char_set;
|
||||
for (auto ch : s1)
|
||||
s1_char_set.insert(ch);
|
||||
|
||||
return partial_ratio_impl(s1, s2, cached_ratio, s1_char_set, score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace fuzz_detail
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
|
||||
InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
if (len1 > len2) {
|
||||
ScoreAlignment<double> result = partial_ratio_alignment(first2, last2, first1, last1, score_cutoff);
|
||||
std::swap(result.src_start, result.dest_start);
|
||||
std::swap(result.src_end, result.dest_end);
|
||||
return result;
|
||||
}
|
||||
|
||||
if (score_cutoff > 100) return ScoreAlignment<double>(0, 0, len1, 0, len1);
|
||||
|
||||
if (!len1 || !len2)
|
||||
return ScoreAlignment<double>(static_cast<double>(len1 == len2) * 100.0, 0, len1, 0, len1);
|
||||
|
||||
auto s1 = detail::Range(first1, last1);
|
||||
auto s2 = detail::Range(first2, last2);
|
||||
|
||||
auto alignment = fuzz_detail::partial_ratio_impl(s1, s2, score_cutoff);
|
||||
if (alignment.score != 100 && s1.size() == s2.size()) {
|
||||
score_cutoff = std::max(score_cutoff, alignment.score);
|
||||
auto alignment2 = fuzz_detail::partial_ratio_impl(s2, s1, score_cutoff);
|
||||
if (alignment2.score > alignment.score) {
|
||||
std::swap(alignment2.src_start, alignment2.dest_start);
|
||||
std::swap(alignment2.src_end, alignment2.dest_end);
|
||||
return alignment2;
|
||||
}
|
||||
}
|
||||
|
||||
return alignment;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return partial_ratio_alignment(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
|
||||
detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
return partial_ratio_alignment(first1, last1, first2, last2, score_cutoff).score;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return partial_ratio_alignment(s1, s2, score_cutoff).score;
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt1>
|
||||
CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1), cached_ratio(first1, last1)
|
||||
{
|
||||
for (const auto& ch : s1)
|
||||
s1_char_set.insert(ch);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
size_t len1 = s1.size();
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
if (len1 > len2)
|
||||
return partial_ratio(detail::to_begin(s1), detail::to_end(s1), first2, last2, score_cutoff);
|
||||
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
if (!len1 || !len2) return static_cast<double>(len1 == len2) * 100.0;
|
||||
|
||||
auto s1_ = detail::Range(s1);
|
||||
auto s2 = detail::Range(first2, last2);
|
||||
|
||||
double score = fuzz_detail::partial_ratio_impl(s1_, s2, cached_ratio, s1_char_set, score_cutoff).score;
|
||||
if (score != 100 && s1_.size() == s2.size()) {
|
||||
score_cutoff = std::max(score_cutoff, score);
|
||||
double score2 = fuzz_detail::partial_ratio_impl(s2, s1_, score_cutoff).score;
|
||||
if (score2 > score) return score2;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedPartialRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* token_sort_ratio
|
||||
*********************************************/
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return ratio(detail::sorted_split(first1, last1).join(), detail::sorted_split(first2, last2).join(),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
|
||||
detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return cached_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* partial_token_sort_ratio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return partial_ratio(detail::sorted_split(first1, last1).join(),
|
||||
detail::sorted_split(first2, last2).join(), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return partial_token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
|
||||
detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return cached_partial_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedPartialTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* token_set_ratio
|
||||
*********************************************/
|
||||
|
||||
namespace fuzz_detail {
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
|
||||
const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
|
||||
const double score_cutoff)
|
||||
{
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (tokens_a.empty() || tokens_b.empty()) return 0;
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
|
||||
auto intersect = decomposition.intersection;
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
// one sentence is part of the other one
|
||||
if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
|
||||
|
||||
auto diff_ab_joined = diff_ab.join();
|
||||
auto diff_ba_joined = diff_ba.join();
|
||||
|
||||
size_t ab_len = diff_ab_joined.size();
|
||||
size_t ba_len = diff_ba_joined.size();
|
||||
size_t sect_len = intersect.length();
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
|
||||
size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
|
||||
|
||||
double result = 0;
|
||||
size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
|
||||
size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
|
||||
|
||||
if (dist <= cutoff_distance) result = norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) return result;
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// since only sect is similar in them the distance can be calculated based on
|
||||
// the length difference
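// e.g. sect = "ab", diff_ab = "xy": "ab xy" vs "ab" -> distance = 1 (separator) + 2 (ab_len) = bool(sect_len) + ab_len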
|
||||
size_t sect_ab_dist = bool(sect_len) + ab_len;
|
||||
double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
|
||||
|
||||
size_t sect_ba_dist = bool(sect_len) + ba_len;
|
||||
double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
|
||||
|
||||
return std::max({result, sect_ab_ratio, sect_ba_ratio});
|
||||
}
|
||||
} // namespace fuzz_detail
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return fuzz_detail::token_set_ratio(detail::sorted_split(first1, last1),
|
||||
detail::sorted_split(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return fuzz_detail::token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* partial_token_set_ratio
|
||||
*********************************************/
|
||||
|
||||
namespace fuzz_detail {
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
|
||||
const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
|
||||
const double score_cutoff)
|
||||
{
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (tokens_a.empty() || tokens_b.empty()) return 0;
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (!decomposition.intersection.empty()) return 100;
|
||||
|
||||
return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
|
||||
score_cutoff);
|
||||
}
|
||||
} // namespace fuzz_detail
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return fuzz_detail::partial_token_set_ratio(detail::sorted_split(first1, last1),
|
||||
detail::sorted_split(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return partial_token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
|
||||
detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
return fuzz_detail::partial_token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedPartialTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* token_ratio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
auto tokens_a = detail::sorted_split(first1, last1);
|
||||
auto tokens_b = detail::sorted_split(first2, last2);
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
|
||||
auto intersect = decomposition.intersection;
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
|
||||
|
||||
auto diff_ab_joined = diff_ab.join();
|
||||
auto diff_ba_joined = diff_ba.join();
|
||||
|
||||
size_t ab_len = diff_ab_joined.size();
|
||||
size_t ba_len = diff_ba_joined.size();
|
||||
size_t sect_len = intersect.length();
|
||||
|
||||
double result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
|
||||
size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
|
||||
|
||||
size_t cutoff_distance = fuzz_detail::score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
|
||||
size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
|
||||
if (dist <= cutoff_distance)
|
||||
result = std::max(result, fuzz_detail::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) return result;
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// since only sect is similar in them the distance can be calculated based on
|
||||
// the length difference
|
||||
size_t sect_ab_dist = bool(sect_len) + ab_len;
|
||||
double sect_ab_ratio = fuzz_detail::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
|
||||
|
||||
size_t sect_ba_dist = bool(sect_len) + ba_len;
|
||||
double sect_ba_ratio = fuzz_detail::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
|
||||
|
||||
return std::max({result, sect_ab_ratio, sect_ba_ratio});
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
namespace fuzz_detail {
|
||||
template <typename CharT1, typename CachedCharT1, typename InputIt2>
|
||||
double token_ratio(const rapidfuzz::detail::SplittedSentenceView<CharT1>& s1_tokens,
|
||||
const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
auto s2_tokens = detail::sorted_split(first2, last2);
|
||||
|
||||
auto decomposition = detail::set_decomposition(s1_tokens, s2_tokens);
|
||||
auto intersect = decomposition.intersection;
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
|
||||
|
||||
auto diff_ab_joined = diff_ab.join();
|
||||
auto diff_ba_joined = diff_ba.join();
|
||||
|
||||
size_t ab_len = diff_ab_joined.size();
|
||||
size_t ba_len = diff_ba_joined.size();
|
||||
size_t sect_len = intersect.length();
|
||||
|
||||
double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
|
||||
size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
|
||||
|
||||
size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
|
||||
size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
|
||||
if (dist <= cutoff_distance)
|
||||
result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) return result;
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// since only sect is similar in them the distance can be calculated based on
|
||||
// the length difference
|
||||
size_t sect_ab_dist = bool(sect_len) + ab_len;
|
||||
double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
|
||||
|
||||
size_t sect_ba_dist = bool(sect_len) + ba_len;
|
||||
double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
|
||||
|
||||
return std::max({result, sect_ab_ratio, sect_ba_ratio});
|
||||
}
|
||||
|
||||
// todo this is a temporary solution until WRatio is properly implemented using other scorers
|
||||
template <typename CharT1, typename InputIt1, typename InputIt2>
|
||||
double token_ratio(const std::vector<CharT1>& s1_sorted,
|
||||
const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
|
||||
const detail::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
auto tokens_b = detail::sorted_split(first2, last2);
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
|
||||
auto intersect = decomposition.intersection;
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
|
||||
|
||||
auto diff_ab_joined = diff_ab.join();
|
||||
auto diff_ba_joined = diff_ba.join();
|
||||
|
||||
size_t ab_len = diff_ab_joined.size();
|
||||
size_t ba_len = diff_ba_joined.size();
|
||||
size_t sect_len = intersect.length();
|
||||
|
||||
double result = 0;
|
||||
auto s2_sorted = tokens_b.join();
|
||||
if (s1_sorted.size() < 65) {
|
||||
double norm_sim = detail::indel_normalized_similarity(blockmap_s1_sorted, detail::Range(s1_sorted),
|
||||
detail::Range(s2_sorted), score_cutoff / 100);
|
||||
result = norm_sim * 100;
|
||||
}
|
||||
else {
|
||||
result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
|
||||
}
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
|
||||
size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
|
||||
|
||||
size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
|
||||
size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
|
||||
if (dist <= cutoff_distance)
|
||||
result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) return result;
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// since only sect is similar in them the distance can be calculated based on
|
||||
// the length difference
|
||||
size_t sect_ab_dist = bool(sect_len) + ab_len;
|
||||
double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
|
||||
|
||||
size_t sect_ba_dist = bool(sect_len) + ba_len;
|
||||
double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
|
||||
|
||||
return std::max({result, sect_ab_ratio, sect_ba_ratio});
|
||||
}
|
||||
} // namespace fuzz_detail
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return fuzz_detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* partial_token_ratio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
auto tokens_a = detail::sorted_split(first1, last1);
|
||||
auto tokens_b = detail::sorted_split(first2, last2);
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (!decomposition.intersection.empty()) return 100;
|
||||
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
double result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
|
||||
|
||||
// do not calculate the same partial_ratio twice
|
||||
if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
score_cutoff = std::max(score_cutoff, result);
|
||||
return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return partial_token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
|
||||
detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
namespace fuzz_detail {
|
||||
template <typename CharT1, typename InputIt1, typename InputIt2>
|
||||
double partial_token_ratio(const std::vector<CharT1>& s1_sorted,
|
||||
const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
|
||||
InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
auto tokens_b = detail::sorted_split(first2, last2);
|
||||
|
||||
auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (!decomposition.intersection.empty()) return 100;
|
||||
|
||||
auto diff_ab = decomposition.difference_ab;
|
||||
auto diff_ba = decomposition.difference_ba;
|
||||
|
||||
double result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
|
||||
|
||||
// do not calculate the same partial_ratio twice
|
||||
if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
score_cutoff = std::max(score_cutoff, result);
|
||||
return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
|
||||
}
|
||||
|
||||
} // namespace fuzz_detail
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedPartialTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* WRatio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
constexpr double UNBASE_SCALE = 0.95;
|
||||
|
||||
auto len1 = std::distance(first1, last1);
|
||||
auto len2 = std::distance(first2, last2);
|
||||
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (!len1 || !len2) return 0;
|
||||
|
||||
double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
|
||||
: static_cast<double>(len2) / static_cast<double>(len1);
|
||||
|
||||
double end_ratio = ratio(first1, last1, first2, last2, score_cutoff);
|
||||
|
||||
if (len_ratio < 1.5) {
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
|
||||
return std::max(end_ratio, token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
|
||||
}
|
||||
|
||||
const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
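// heuristic inherited from FuzzyWuzzy: once one string is much longer, the partial scorers carry the result, scaled by 0.9, or only 0.6 when the length ratio exceeds 8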
|
||||
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
|
||||
end_ratio =
|
||||
std::max(end_ratio, partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
|
||||
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
|
||||
return std::max(end_ratio, partial_token_ratio(first1, last1, first2, last2, score_cutoff) *
|
||||
UNBASE_SCALE * PARTIAL_SCALE);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return WRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1>
|
||||
template <typename InputIt1>
|
||||
CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
|
||||
: s1(first1, last1),
|
||||
cached_partial_ratio(first1, last1),
|
||||
tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
|
||||
s1_sorted(tokens_s1.join()),
|
||||
blockmap_s1_sorted(detail::Range(s1_sorted))
|
||||
{}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
if (score_cutoff > 100) return 0;
|
||||
|
||||
constexpr double UNBASE_SCALE = 0.95;
|
||||
|
||||
size_t len1 = s1.size();
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (!len1 || !len2) return 0;
|
||||
|
||||
double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
|
||||
: static_cast<double>(len2) / static_cast<double>(len1);
|
||||
|
||||
double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
|
||||
|
||||
if (len_ratio < 1.5) {
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
|
||||
// use pre calculated values
|
||||
auto r =
|
||||
fuzz_detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
|
||||
return std::max(end_ratio, r * UNBASE_SCALE);
|
||||
}
|
||||
|
||||
const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
|
||||
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
|
||||
end_ratio =
|
||||
std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
|
||||
|
||||
score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
|
||||
auto r = fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
|
||||
return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedWRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
/**********************************************
|
||||
* QRatio
|
||||
*********************************************/
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
|
||||
{
|
||||
ptrdiff_t len1 = std::distance(first1, last1);
|
||||
ptrdiff_t len2 = std::distance(first2, last2);
|
||||
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (!len1 || !len2) return 0;
|
||||
|
||||
return ratio(first1, last1, first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
|
||||
{
|
||||
return QRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename InputIt2>
|
||||
double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
auto len2 = std::distance(first2, last2);
|
||||
|
||||
/* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
|
||||
* see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
|
||||
if (s1.empty() || !len2) return 0;
|
||||
|
||||
return cached_ratio.similarity(first2, last2, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename CharT1>
|
||||
template <typename Sentence2>
|
||||
double CachedQRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
|
||||
[[maybe_unused]] double score_hint) const
|
||||
{
|
||||
return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz::fuzz
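The scorers implemented above are exposed through <rapidfuzz/fuzz.hpp>. As a rough, hypothetical sketch (not part of this commit; the query and candidate strings are invented for illustration), ranking a list of names against a search string with the vendored library could look like this:

#include <rapidfuzz/fuzz.hpp>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::string query = "stormwind gates";
    std::vector<std::string> candidates = {"Stormwind City Gates", "Orgrimmar Rear Gate", "Gates of Ironforge"};

    for (const auto& name : candidates) {
        // WRatio combines ratio, token_ratio and the partial scorers implemented above;
        // a score_cutoff of 0 keeps every result instead of filtering low scores early.
        double score = rapidfuzz::fuzz::WRatio(query, name, 0.0);
        std::cout << name << " -> " << score << '\n';
    }
    return 0;
}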
|
||||
6
src/external/rapidfuzz-cpp/rapidfuzz/rapidfuzz_all.hpp
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <rapidfuzz/distance.hpp>
|
||||
#include <rapidfuzz/fuzz.hpp>
|
||||
76
src/external/rapidfuzz-cpp/rapidfuzz_reference/DamerauLevenshtein.hpp
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "common.hpp"
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Matrix<size_t> damerau_levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
size_t len1 = std::distance(first1, last1);
|
||||
size_t len2 = std::distance(first2, last2);
|
||||
size_t infinite = len1 + len2;
|
||||
|
||||
std::unordered_map<uint32_t, size_t> da;
|
||||
Matrix<size_t> matrix(len1 + 2, len2 + 2);
|
||||
matrix(0, 0) = infinite;
|
||||
|
||||
for (size_t i = 0; i <= len1; ++i) {
|
||||
matrix(i + 1, 0) = infinite;
|
||||
matrix(i + 1, 1) = i;
|
||||
}
|
||||
for (size_t i = 0; i <= len2; ++i) {
|
||||
matrix(0, i + 1) = infinite;
|
||||
matrix(1, i + 1) = i;
|
||||
}
|
||||
|
||||
for (size_t pos1 = 0; pos1 < len1; ++pos1) {
|
||||
size_t db = 0;
|
||||
for (size_t pos2 = 0; pos2 < len2; ++pos2) {
|
||||
size_t i1 = da[static_cast<uint32_t>(first2[pos2])];
|
||||
size_t j1 = db;
|
||||
size_t cost = 1;
|
||||
if (first1[pos1] == first2[pos2]) {
|
||||
cost = 0;
|
||||
db = pos2 + 1;
|
||||
}
|
||||
|
||||
matrix(pos1 + 2, pos2 + 2) =
|
||||
std::min({matrix(pos1 + 1, pos2 + 1) + cost, matrix(pos1 + 2, pos2 + 1) + 1,
|
||||
matrix(pos1 + 1, pos2 + 2) + 1, matrix(i1, j1) + (pos1 - i1) + 1 + (pos2 - j1)
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
da[first1[pos1]] = pos1 + 1;
|
||||
}
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
auto matrix = damerau_levenshtein_matrix(first1, last1, first2, last2);
|
||||
size_t dist = matrix.back();
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return damerau_levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
32
src/external/rapidfuzz-cpp/rapidfuzz_reference/Hamming.hpp
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t hamming_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
ptrdiff_t len1 = std::distance(first1, last1);
|
||||
ptrdiff_t len2 = std::distance(first2, last2);
|
||||
if (len1 != len2) throw std::invalid_argument("Sequences are not the same length.");
|
||||
|
||||
size_t dist = 0;
|
||||
for (ptrdiff_t i = 0; i < len1; ++i)
|
||||
dist += bool(first1[i] != first2[i]);
|
||||
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t hamming_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return hamming_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
38
src/external/rapidfuzz-cpp/rapidfuzz_reference/Indel.hpp
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Levenshtein.hpp"
|
||||
#include <limits>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t indel_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return levenshtein_distance(first1, last1, first2, last2, {1, 1, 2}, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t indel_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return levenshtein_distance(s1, s2, {1, 1, 2}, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return levenshtein_similarity(first1, last1, first2, last2, {1, 1, 2}, score_cutoff);
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double indel_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return levenshtein_similarity(s1, s2, {1, 1, 2}, score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
74
src/external/rapidfuzz-cpp/rapidfuzz_reference/Jaro.hpp
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
size_t P_len = static_cast<size_t>(std::distance(P_first, P_last));
|
||||
size_t T_len = static_cast<size_t>(std::distance(T_first, T_last));
|
||||
|
||||
if (score_cutoff > 1.0) return 0.0;
|
||||
|
||||
if (!P_len || !T_len) return double(!P_len && !T_len);
|
||||
|
||||
std::vector<int> P_flag(P_len + 1);
|
||||
std::vector<int> T_flag(T_len + 1);
|
||||
|
||||
size_t Bound = std::max(P_len, T_len) / 2;
|
||||
if (Bound > 0) Bound--;
|
||||
|
||||
size_t CommonChars = 0;
|
||||
for (size_t i = 0; i < T_len; i++) {
|
||||
size_t lowlim = (i >= Bound) ? i - Bound : 0;
|
||||
size_t hilim = (i + Bound <= P_len - 1) ? (i + Bound) : P_len - 1;
|
||||
for (size_t j = lowlim; j <= hilim; j++) {
|
||||
if (!P_flag[j] && (P_first[static_cast<ptrdiff_t>(j)] == T_first[static_cast<ptrdiff_t>(i)])) {
|
||||
T_flag[i] = 1;
|
||||
P_flag[j] = 1;
|
||||
CommonChars++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Count the number of transpositions
|
||||
size_t Transpositions = 0;
|
||||
size_t k = 0;
|
||||
for (size_t i = 0; i < T_len; i++) {
|
||||
if (T_flag[i]) {
|
||||
size_t j = k;
|
||||
for (; j < P_len; j++) {
|
||||
if (P_flag[j]) {
|
||||
k = j + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (T_first[static_cast<ptrdiff_t>(i)] != P_first[static_cast<ptrdiff_t>(j)]) Transpositions++;
|
||||
}
|
||||
}
|
||||
|
||||
Transpositions /= 2;
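// Jaro similarity = (m/|P| + m/|T| + (m - t)/m) / 3, with m = CommonChars and t = Transpositions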
|
||||
double Sim = 0;
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
|
||||
Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
|
||||
Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
|
||||
static_cast<double>(CommonChars);
|
||||
Sim /= 3.0;
|
||||
return (Sim >= score_cutoff) ? Sim : 0;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} /* namespace rapidfuzz_reference */
|
||||
35
src/external/rapidfuzz-cpp/rapidfuzz_reference/JaroWinkler.hpp
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "Jaro.hpp"
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2,
|
||||
typename = std::enable_if_t<!std::is_same_v<InputIt2, double>>>
|
||||
double jaro_winkler_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
|
||||
double prefix_weight = 0.1, double score_cutoff = 0.0)
|
||||
{
|
||||
int64_t min_len = std::min(std::distance(P_first, P_last), std::distance(T_first, T_last));
|
||||
size_t max_prefix = std::min(static_cast<size_t>(min_len), size_t(4));
|
||||
|
||||
size_t prefix = 0;
|
||||
for (; prefix < max_prefix; ++prefix)
|
||||
if (T_first[static_cast<ptrdiff_t>(prefix)] != P_first[static_cast<ptrdiff_t>(prefix)]) break;
|
||||
|
||||
double Sim = jaro_similarity(P_first, P_last, T_first, T_last);
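// Winkler prefix bonus: only scores above 0.7 are boosted, by at most four matching leading characters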
|
||||
if (Sim > 0.7) Sim += static_cast<double>(prefix) * prefix_weight * (1.0 - Sim);
|
||||
|
||||
return (Sim >= score_cutoff) ? Sim : 0;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double prefix_weight = 0.1,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
return jaro_winkler_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), prefix_weight,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
} /* namespace rapidfuzz_reference */
|
||||
25
src/external/rapidfuzz-cpp/rapidfuzz_reference/LCSseq.hpp
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "Indel.hpp"
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t lcs_seq_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = 0)
|
||||
{
|
||||
size_t maximum = static_cast<size_t>(std::distance(first1, last1) + std::distance(first2, last2));
|
||||
size_t dist = indel_distance(first1, last1, first2, last2);
|
||||
size_t sim = (maximum - dist) / 2;
|
||||
return (sim >= score_cutoff) ? sim : 0;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t lcs_seq_similarity(const Sentence1& s1, const Sentence2& s2, size_t score_cutoff = 0)
|
||||
{
|
||||
return lcs_seq_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
104
src/external/rapidfuzz-cpp/rapidfuzz_reference/Levenshtein.hpp
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "common.hpp"
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
struct LevenshteinWeightTable {
|
||||
size_t insert_cost;
|
||||
size_t delete_cost;
|
||||
size_t replace_cost;
|
||||
};
|
||||
|
||||
static inline size_t levenshtein_maximum(size_t len1, size_t len2, LevenshteinWeightTable weights)
|
||||
{
|
||||
size_t max_dist = len1 * weights.delete_cost + len2 * weights.insert_cost;
|
||||
|
||||
if (len1 >= len2)
|
||||
max_dist = std::min(max_dist, len2 * weights.replace_cost + (len1 - len2) * weights.delete_cost);
|
||||
else
|
||||
max_dist = std::min(max_dist, len1 * weights.replace_cost + (len2 - len1) * weights.insert_cost);
|
||||
|
||||
return max_dist;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Matrix<size_t> levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1})
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
Matrix<size_t> matrix(len1 + 1, len2 + 1);
|
||||
|
||||
for (size_t i = 0; i <= len1; ++i)
|
||||
matrix(i, 0) = i * weights.delete_cost;
|
||||
for (size_t i = 0; i <= len2; ++i)
|
||||
matrix(0, i) = i * weights.insert_cost;
|
||||
|
||||
for (size_t pos1 = 0; pos1 < len1; ++pos1) {
|
||||
for (size_t pos2 = 0; pos2 < len2; ++pos2) {
|
||||
size_t cost = (first1[pos1] == first2[pos2]) ? 0 : weights.replace_cost;
|
||||
|
||||
matrix(pos1 + 1, pos2 + 1) =
|
||||
std::min({matrix(pos1, pos2 + 1) + weights.delete_cost,
|
||||
matrix(pos1 + 1, pos2) + weights.insert_cost, matrix(pos1, pos2) + cost});
|
||||
}
|
||||
}
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
Matrix<size_t> levenshtein_matrix(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1})
|
||||
{
|
||||
return levenshtein_matrix(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1},
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
auto matrix = levenshtein_matrix(first1, last1, first2, last2, weights);
|
||||
size_t dist = matrix.back();
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1},
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return levenshtein_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0)
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
size_t dist = levenshtein_distance(first1, last1, first2, last2, weights);
|
||||
size_t max = levenshtein_maximum(len1, len2, weights);
|
||||
double sim = 1.0 - (double)dist / max;
|
||||
return (sim >= score_cutoff) ? sim : 0.0;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double levenshtein_similarity(const Sentence1& s1, const Sentence2& s2,
|
||||
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0)
|
||||
{
|
||||
return levenshtein_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights,
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
61
src/external/rapidfuzz-cpp/rapidfuzz_reference/OSA.hpp
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include "common.hpp"
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
Matrix<size_t> osa_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2)
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
|
||||
Matrix<size_t> matrix(static_cast<size_t>(len1) + 1, static_cast<size_t>(len2) + 1);
|
||||
|
||||
for (size_t i = 0; i <= len1; ++i)
|
||||
matrix(i, 0) = i;
|
||||
for (size_t i = 0; i <= len2; ++i)
|
||||
matrix(0, i) = i;
|
||||
|
||||
for (size_t pos1 = 0; pos1 < len1; ++pos1) {
|
||||
for (size_t pos2 = 0; pos2 < len2; ++pos2) {
|
||||
size_t cost = (first1[pos1] == first2[pos2]) ? 0 : 1;
|
||||
|
||||
matrix(pos1 + 1, pos2 + 1) =
|
||||
std::min({matrix(pos1, pos2 + 1) + 1, matrix(pos1 + 1, pos2) + 1, matrix(pos1, pos2) + cost});
|
||||
|
||||
if (pos1 == 0 || pos2 == 0) continue;
|
||||
if (first1[pos1] != first2[pos2 - 1]) continue;
|
||||
if (first1[pos1 - 1] != first2[pos2]) continue;
|
||||
|
||||
matrix(pos1 + 1, pos2 + 1) =
|
||||
std::min(matrix(pos1 + 1, pos2 + 1), matrix(pos1 - 1, pos2 - 1) + cost);
|
||||
}
|
||||
}
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
size_t osa_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
auto matrix = osa_matrix(first1, last1, first2, last2);
|
||||
size_t dist = matrix.back();
|
||||
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
size_t osa_distance(const Sentence1& s1, const Sentence2& s2,
|
||||
size_t score_cutoff = std::numeric_limits<size_t>::max())
|
||||
{
|
||||
return osa_distance(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
4
src/external/rapidfuzz-cpp/rapidfuzz_reference/README.md
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
## rapidfuzz_reference
|
||||
|
||||
This includes reference implementations of various string matching algorithms,
|
||||
which can be used to validate the results of faster implementations.
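For instance (a hypothetical check, not part of the test suite in this commit; include paths are illustrative), the optimized fuzz::ratio can be validated against the reference implementation:

#include <rapidfuzz/fuzz.hpp>
#include <rapidfuzz_reference/fuzz.hpp>
#include <cassert>
#include <cmath>
#include <string>

int main()
{
    std::string a = "kitten", b = "sitting";
    double fast = rapidfuzz::fuzz::ratio(a, b, 0.0); // optimized implementation
    double ref = rapidfuzz_reference::ratio(a, b);   // brute-force reference
    assert(std::fabs(fast - ref) < 1e-9);            // both normalize to 0..100
    return 0;
}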
|
||||
38
src/external/rapidfuzz-cpp/rapidfuzz_reference/common.hpp
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2021 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename T>
|
||||
class Matrix {
|
||||
public:
|
||||
Matrix(size_t _rows, size_t _cols) : rows(_rows), cols(_cols)
|
||||
{
|
||||
matrix = new T[rows * cols];
|
||||
std::fill(matrix, matrix + rows * cols, T());
|
||||
}
|
||||
|
||||
~Matrix()
|
||||
{
|
||||
delete[] matrix;
|
||||
}
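// note: the implicitly generated copy constructor would copy the raw pointer and cause a double delete; the reference code only returns Matrix by value and relies on copy elision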
|
||||
|
||||
T& operator()(size_t row, size_t col)
|
||||
{
|
||||
return matrix[row + col * rows];
|
||||
}
|
||||
|
||||
T& back()
|
||||
{
|
||||
return matrix[rows * cols - 1];
|
||||
}
|
||||
|
||||
size_t rows;
|
||||
size_t cols;
|
||||
T* matrix;
|
||||
};
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
63
src/external/rapidfuzz-cpp/rapidfuzz_reference/fuzz.hpp
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2022-present Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Indel.hpp"
|
||||
|
||||
namespace rapidfuzz_reference {
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0)
|
||||
{
|
||||
return indel_similarity(first1, last1, first2, last2, score_cutoff / 100.0) * 100;
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return indel_similarity(s1, s2, score_cutoff / 100.0) * 100;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_ratio_impl(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
if (len1 == 0 && len2 == 0) return 100.0;
|
||||
|
||||
if (len1 == 0 || len2 == 0) return 0.0;
|
||||
|
||||
if (len1 > len2) return partial_ratio_impl(first2, last2, first1, last1, score_cutoff);
|
||||
|
||||
double res = 0.0;
|
||||
for (ptrdiff_t i = -1 * (ptrdiff_t)len1; i < (ptrdiff_t)len2; i++) {
|
||||
ptrdiff_t start = std::max(ptrdiff_t(0), i);
|
||||
ptrdiff_t end = std::min(ptrdiff_t(len2), i + ptrdiff_t(len1));
|
||||
InputIt2 first2_ = first2 + start;
|
||||
InputIt2 last2_ = first2 + end;
|
||||
res = std::max(res, ratio(first1, last1, first2_, last2_, score_cutoff));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename InputIt1, typename InputIt2>
|
||||
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
|
||||
double score_cutoff = 0.0)
|
||||
{
|
||||
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
|
||||
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
|
||||
if (len1 != len2) return partial_ratio_impl(first1, last1, first2, last2, score_cutoff);
|
||||
|
||||
return std::max(partial_ratio_impl(first1, last1, first2, last2, score_cutoff),
|
||||
partial_ratio_impl(first2, last2, first1, last1, score_cutoff));
|
||||
}
|
||||
|
||||
template <typename Sentence1, typename Sentence2>
|
||||
double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
|
||||
{
|
||||
return partial_ratio(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
|
||||
}
|
||||
|
||||
} // namespace rapidfuzz_reference
|
||||
65
src/external/rapidfuzz-cpp/test/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
#find_package(Catch2 2 QUIET)
|
||||
if (Catch2_FOUND)
|
||||
message("Using system supplied version of Catch2")
|
||||
else()
|
||||
message("Using FetchContent to load Catch2")
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
Catch2
|
||||
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
|
||||
GIT_TAG v2.13.10
|
||||
)
|
||||
FetchContent_MakeAvailable(Catch2)
|
||||
endif()
|
||||
|
||||
if (RAPIDFUZZ_ENABLE_LINTERS)
|
||||
# include aminya & jason turner's C++ best practices recommended cmake project utilities
|
||||
message("Enable Linters on test build")
|
||||
include(FetchContent)
|
||||
|
||||
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20)
|
||||
FetchContent_Declare(_project_options URL https://github.com/aminya/project_options/archive/refs/tags/v0.26.2.zip)
|
||||
else()
|
||||
FetchContent_Declare(_project_options URL https://github.com/aminya/project_options/archive/refs/tags/v0.25.2.zip)
|
||||
endif()
|
||||
FetchContent_MakeAvailable(_project_options)
|
||||
include(${_project_options_SOURCE_DIR}/Index.cmake)
|
||||
|
||||
project_options(
|
||||
# ENABLE_CACHE
|
||||
# ENABLE_CONAN
|
||||
WARNINGS_AS_ERRORS
|
||||
# ENABLE_CPPCHECK
|
||||
# ENABLE_CLANG_TIDY
|
||||
# ENABLE_INCLUDE_WHAT_YOU_USE
|
||||
# ENABLE_COVERAGE
|
||||
# ENABLE_PCH
|
||||
# PCH_HEADERS <Eigen/Dense> <fmt/core.h> <vector> <utility> <string> <string_view>
|
||||
# ENABLE_DOXYGEN
|
||||
# ENABLE_IPO
|
||||
# ENABLE_USER_LINKER
|
||||
# ENABLE_BUILD_WITH_TIME_TRACE
|
||||
# ENABLE_UNITY
|
||||
# ENABLE_SANITIZER_ADDRESS
|
||||
# ENABLE_SANITIZER_LEAK
|
||||
# ENABLE_SANITIZER_UNDEFINED_BEHAVIOR
|
||||
# ENABLE_SANITIZER_THREAD
|
||||
# ENABLE_SANITIZER_MEMORY
|
||||
# CLANG_WARNINGS "-Weverything"
|
||||
)
|
||||
endif()
|
||||
|
||||
function(rapidfuzz_add_test test)
|
||||
add_executable(test_${test} tests-main.cpp tests-${test}.cpp)
|
||||
target_link_libraries(test_${test} ${PROJECT_NAME})
|
||||
target_link_libraries(test_${test} Catch2::Catch2)
|
||||
if (RAPIDFUZZ_ENABLE_LINTERS)
|
||||
target_link_libraries(test_${test} project_warnings)
|
||||
endif()
|
||||
add_test(NAME ${test} COMMAND test_${test})
|
||||
endfunction()
|
||||
|
||||
rapidfuzz_add_test(fuzz)
|
||||
rapidfuzz_add_test(common)
|
||||
|
||||
add_subdirectory(distance)
|
||||
69
src/external/rapidfuzz-cpp/test/common.hpp
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
|
||||
template <typename T>
|
||||
class BidirectionalIterWrapper {
|
||||
public:
|
||||
using difference_type = typename T::difference_type;
|
||||
using value_type = typename T::value_type;
|
||||
using pointer = typename T::pointer;
|
||||
using reference = typename T::reference;
|
||||
using iterator_category = std::bidirectional_iterator_tag;
|
||||
|
||||
BidirectionalIterWrapper() : iter()
|
||||
{}
|
||||
|
||||
BidirectionalIterWrapper(T iter_) : iter(iter_)
|
||||
{}
|
||||
|
||||
bool operator==(const BidirectionalIterWrapper& i) const
|
||||
{
|
||||
return iter == i.iter;
|
||||
}
|
||||
|
||||
bool operator!=(const BidirectionalIterWrapper& i) const
|
||||
{
|
||||
return !(*this == i);
|
||||
}
|
||||
|
||||
BidirectionalIterWrapper operator++(int)
|
||||
{
|
||||
BidirectionalIterWrapper cur(iter);
|
||||
++iter;
|
||||
return cur;
|
||||
}
|
||||
BidirectionalIterWrapper operator--(int)
|
||||
{
|
||||
BidirectionalIterWrapper cur(iter);
|
||||
--iter;
|
||||
return cur;
|
||||
}
|
||||
|
||||
BidirectionalIterWrapper& operator++()
|
||||
{
|
||||
++iter;
|
||||
return *this;
|
||||
}
|
||||
BidirectionalIterWrapper& operator--()
|
||||
{
|
||||
--iter;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const auto& operator*() const
|
||||
{
|
||||
return *iter;
|
||||
}
|
||||
|
||||
private:
|
||||
T iter;
|
||||
};
|
||||
|
||||
template <typename T, typename = std::enable_if_t<std::is_same_v<T, char>>>
|
||||
std::basic_string<T> str_multiply(std::basic_string<T> a, size_t b)
|
||||
{
|
||||
std::basic_string<T> output;
|
||||
while (b--)
|
||||
output += a;
|
||||
|
||||
return output;
|
||||
}
|
||||
22
src/external/rapidfuzz-cpp/test/distance/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
function(rapidfuzz_add_test test)
|
||||
add_executable(test_${test} ../tests-main.cpp tests-${test}.cpp examples/ocr.cpp examples/pythonLevenshteinIssue9.cpp)
|
||||
target_link_libraries(test_${test} PRIVATE ${PROJECT_NAME})
|
||||
target_link_libraries(test_${test} PRIVATE Catch2::Catch2)
|
||||
if (RAPIDFUZZ_ENABLE_LINTERS)
|
||||
target_link_libraries(test_${test} PRIVATE project_warnings)
|
||||
endif()
|
||||
|
||||
#target_compile_options(test_${test} PRIVATE -g -fsanitize=address)
|
||||
#target_link_libraries(test_${test} PRIVATE -fsanitize=address)
|
||||
|
||||
add_test(NAME ${test} COMMAND test_${test})
|
||||
endfunction()
|
||||
|
||||
rapidfuzz_add_test(Hamming)
|
||||
rapidfuzz_add_test(Indel)
|
||||
rapidfuzz_add_test(LCSseq)
|
||||
rapidfuzz_add_test(Levenshtein)
|
||||
rapidfuzz_add_test(DamerauLevenshtein)
|
||||
rapidfuzz_add_test(OSA)
|
||||
rapidfuzz_add_test(Jaro)
|
||||
rapidfuzz_add_test(JaroWinkler)
|
||||
Some files were not shown because too many files have changed in this diff