RapidFuzz
Loading...
Searching...
No Matches
fuzz.hpp
1/* SPDX-License-Identifier: MIT */
2/* Copyright © 2021 Max Bachmann */
3/* Copyright © 2011 Adam Cohen */
4
5#pragma once
6#include <rapidfuzz/details/CharSet.hpp>
7#include <rapidfuzz/details/PatternMatchVector.hpp>
8#include <rapidfuzz/details/common.hpp>
9#include <rapidfuzz/distance/Indel.hpp>
10
11#include <type_traits>
12
13namespace rapidfuzz::fuzz {
14
45template <typename InputIt1, typename InputIt2>
46double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
47
48template <typename Sentence1, typename Sentence2>
49double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
50
51#ifdef RAPIDFUZZ_SIMD
52namespace experimental {
53template <int MaxLen>
54struct MultiRatio {
55public:
56 MultiRatio(size_t count) : input_count(count), scorer(count)
57 {}
58
59 size_t result_count() const
60 {
61 return scorer.result_count();
62 }
63
64 template <typename Sentence1>
65 void insert(const Sentence1& s1_)
66 {
67 insert(detail::to_begin(s1_), detail::to_end(s1_));
68 }
69
70 template <typename InputIt1>
71 void insert(InputIt1 first1, InputIt1 last1)
72 {
73 scorer.insert(first1, last1);
74 }
75
76 template <typename InputIt2>
77 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
78 double score_cutoff = 0.0) const
79 {
80 similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
81 }
82
83 template <typename Sentence2>
84 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
85 {
86 scorer.normalized_similarity(scores, score_count, s2, score_cutoff / 100.0);
87
88 for (size_t i = 0; i < input_count; ++i)
89 scores[i] *= 100.0;
90 }
91
92private:
93 size_t input_count;
94 rapidfuzz::experimental::MultiIndel<MaxLen> scorer;
95};
96} /* namespace experimental */
97#endif
98
99// TODO documentation
100template <typename CharT1>
101struct CachedRatio {
102 template <typename InputIt1>
103 CachedRatio(InputIt1 first1, InputIt1 last1) : cached_indel(first1, last1)
104 {}
105
106 template <typename Sentence1>
107 CachedRatio(const Sentence1& s1) : cached_indel(s1)
108 {}
109
110 template <typename InputIt2>
111 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
112 double score_hint = 0.0) const;
113
114 template <typename Sentence2>
115 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
116
117 // private:
118 CachedIndel<CharT1> cached_indel;
119};
120
121template <typename Sentence1>
122CachedRatio(const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
123
124template <typename InputIt1>
125CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iter_value_t<InputIt1>>;
126
127template <typename InputIt1, typename InputIt2>
128ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
129 InputIt2 last2, double score_cutoff = 0);
130
131template <typename Sentence1, typename Sentence2>
132ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2,
133 double score_cutoff = 0);
134
160template <typename InputIt1, typename InputIt2>
161double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
162 double score_cutoff = 0);
163
164template <typename Sentence1, typename Sentence2>
165double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
166
167// todo add real implementation
168template <typename CharT1>
169struct CachedPartialRatio {
170 template <typename>
171 friend struct CachedWRatio;
172
173 template <typename InputIt1>
174 CachedPartialRatio(InputIt1 first1, InputIt1 last1);
175
176 template <typename Sentence1>
177 explicit CachedPartialRatio(const Sentence1& s1_)
178 : CachedPartialRatio(detail::to_begin(s1_), detail::to_end(s1_))
179 {}
180
181 template <typename InputIt2>
182 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
183 double score_hint = 0.0) const;
184
185 template <typename Sentence2>
186 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
187
188private:
189 std::basic_string<CharT1> s1;
190 rapidfuzz::detail::CharSet<CharT1> s1_char_set;
191 CachedRatio<CharT1> cached_ratio;
192};
193
194template <typename Sentence1>
195explicit CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
196
197template <typename InputIt1>
198CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iter_value_t<InputIt1>>;
199
226template <typename InputIt1, typename InputIt2>
227double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
228 double score_cutoff = 0);
229
230template <typename Sentence1, typename Sentence2>
231double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
232
233#ifdef RAPIDFUZZ_SIMD
234namespace experimental {
235template <int MaxLen>
236struct MultiTokenSortRatio {
237public:
238 MultiTokenSortRatio(size_t count) : scorer(count)
239 {}
240
241 size_t result_count() const
242 {
243 return scorer.result_count();
244 }
245
246 template <typename Sentence1>
247 void insert(const Sentence1& s1_)
248 {
249 insert(detail::to_begin(s1_), detail::to_end(s1_));
250 }
251
252 template <typename InputIt1>
253 void insert(InputIt1 first1, InputIt1 last1)
254 {
255 scorer.insert(detail::sorted_split(first1, last1).join());
256 }
257
258 template <typename InputIt2>
259 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
260 double score_cutoff = 0.0) const
261 {
262 scorer.similarity(scores, score_count, detail::sorted_split(first2, last2).join(), score_cutoff);
263 }
264
265 template <typename Sentence2>
266 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
267 {
268 similarity(scores, score_count, detail::to_begin(s2), detail::to_end(s2), score_cutoff);
269 }
270
271private:
272 MultiRatio<MaxLen> scorer;
273};
274} /* namespace experimental */
275#endif
276
277// todo CachedRatio speed for equal strings vs original implementation
278// TODO documentation
279template <typename CharT1>
280struct CachedTokenSortRatio {
281 template <typename InputIt1>
282 CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
283 : s1_sorted(detail::sorted_split(first1, last1).join()), cached_ratio(s1_sorted)
284 {}
285
286 template <typename Sentence1>
287 explicit CachedTokenSortRatio(const Sentence1& s1)
288 : CachedTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
289 {}
290
291 template <typename InputIt2>
292 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
293 double score_hint = 0.0) const;
294
295 template <typename Sentence2>
296 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
297
298private:
299 std::basic_string<CharT1> s1_sorted;
300 CachedRatio<CharT1> cached_ratio;
301};
302
303template <typename Sentence1>
304explicit CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
305
306template <typename InputIt1>
307CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSortRatio<iter_value_t<InputIt1>>;
308
329template <typename InputIt1, typename InputIt2>
330double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
331 double score_cutoff = 0);
332
333template <typename Sentence1, typename Sentence2>
334double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
335
336// TODO documentation
337template <typename CharT1>
338struct CachedPartialTokenSortRatio {
339 template <typename InputIt1>
340 CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
341 : s1_sorted(detail::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted)
342 {}
343
344 template <typename Sentence1>
345 explicit CachedPartialTokenSortRatio(const Sentence1& s1)
346 : CachedPartialTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
347 {}
348
349 template <typename InputIt2>
350 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
351 double score_hint = 0.0) const;
352
353 template <typename Sentence2>
354 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
355
356private:
357 std::basic_string<CharT1> s1_sorted;
358 CachedPartialRatio<CharT1> cached_partial_ratio;
359};
360
361template <typename Sentence1>
362explicit CachedPartialTokenSortRatio(const Sentence1& s1)
363 -> CachedPartialTokenSortRatio<char_type<Sentence1>>;
364
365template <typename InputIt1>
366CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
367 -> CachedPartialTokenSortRatio<iter_value_t<InputIt1>>;
368
397template <typename InputIt1, typename InputIt2>
398double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
399 double score_cutoff = 0);
400
401template <typename Sentence1, typename Sentence2>
402double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
403
404// TODO documentation
405template <typename CharT1>
406struct CachedTokenSetRatio {
407 template <typename InputIt1>
408 CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
409 : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
410 {}
411
412 template <typename Sentence1>
413 explicit CachedTokenSetRatio(const Sentence1& s1_)
414 : CachedTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
415 {}
416
417 template <typename InputIt2>
418 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
419 double score_hint = 0.0) const;
420
421 template <typename Sentence2>
422 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
423
424private:
425 std::basic_string<CharT1> s1;
426 detail::SplittedSentenceView<typename std::basic_string<CharT1>::iterator> tokens_s1;
427};
428
429template <typename Sentence1>
430explicit CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
431
432template <typename InputIt1>
433CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSetRatio<iter_value_t<InputIt1>>;
434
454template <typename InputIt1, typename InputIt2>
455double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
456 double score_cutoff = 0);
457
458template <typename Sentence1, typename Sentence2>
459double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
460
461// TODO documentation
462template <typename CharT1>
463struct CachedPartialTokenSetRatio {
464 template <typename InputIt1>
465 CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
466 : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
467 {}
468
469 template <typename Sentence1>
470 explicit CachedPartialTokenSetRatio(const Sentence1& s1_)
471 : CachedPartialTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
472 {}
473
474 template <typename InputIt2>
475 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
476 double score_hint = 0.0) const;
477
478 template <typename Sentence2>
479 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
480
481private:
482 std::basic_string<CharT1> s1;
483 detail::SplittedSentenceView<typename std::basic_string<CharT1>::iterator> tokens_s1;
484};
485
486template <typename Sentence1>
487explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
488
489template <typename InputIt1>
490CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
491 -> CachedPartialTokenSetRatio<iter_value_t<InputIt1>>;
492
512template <typename InputIt1, typename InputIt2>
513double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
514
515template <typename Sentence1, typename Sentence2>
516double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
517
518// todo add real implementation
519template <typename CharT1>
520struct CachedTokenRatio {
521 template <typename InputIt1>
522 CachedTokenRatio(InputIt1 first1, InputIt1 last1)
523 : s1(first1, last1),
524 s1_tokens(detail::sorted_split(std::begin(s1), std::end(s1))),
525 s1_sorted(s1_tokens.join()),
526 cached_ratio_s1_sorted(s1_sorted)
527 {}
528
529 template <typename Sentence1>
530 explicit CachedTokenRatio(const Sentence1& s1_)
531 : CachedTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
532 {}
533
534 template <typename InputIt2>
535 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
536 double score_hint = 0.0) const;
537
538 template <typename Sentence2>
539 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
540
541private:
542 std::basic_string<CharT1> s1;
543 detail::SplittedSentenceView<typename std::basic_string<CharT1>::iterator> s1_tokens;
544 std::basic_string<CharT1> s1_sorted;
545 CachedRatio<CharT1> cached_ratio_s1_sorted;
546};
547
548template <typename Sentence1>
549explicit CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
550
551template <typename InputIt1>
552CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iter_value_t<InputIt1>>;
553
574template <typename InputIt1, typename InputIt2>
575double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
576 double score_cutoff = 0);
577
578template <typename Sentence1, typename Sentence2>
579double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
580
581// todo add real implementation
582template <typename CharT1>
583struct CachedPartialTokenRatio {
584 template <typename InputIt1>
585 CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
586 : s1(first1, last1),
587 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
588 s1_sorted(tokens_s1.join())
589 {}
590
591 template <typename Sentence1>
592 explicit CachedPartialTokenRatio(const Sentence1& s1_)
593 : CachedPartialTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
594 {}
595
596 template <typename InputIt2>
597 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
598 double score_hint = 0.0) const;
599
600 template <typename Sentence2>
601 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
602
603private:
604 std::basic_string<CharT1> s1;
605 detail::SplittedSentenceView<typename std::basic_string<CharT1>::iterator> tokens_s1;
606 std::basic_string<CharT1> s1_sorted;
607};
608
609template <typename Sentence1>
610explicit CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
611
612template <typename InputIt1>
613CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenRatio<iter_value_t<InputIt1>>;
614
636template <typename InputIt1, typename InputIt2>
637double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
638
639template <typename Sentence1, typename Sentence2>
640double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
641
642// todo add real implementation
643template <typename CharT1>
644struct CachedWRatio {
645 template <typename InputIt1>
646 explicit CachedWRatio(InputIt1 first1, InputIt1 last1);
647
648 template <typename Sentence1>
649 CachedWRatio(const Sentence1& s1_) : CachedWRatio(detail::to_begin(s1_), detail::to_end(s1_))
650 {}
651
652 template <typename InputIt2>
653 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
654 double score_hint = 0.0) const;
655
656 template <typename Sentence2>
657 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
658
659private:
660 // todo somehow implement this using other ratios with creating PatternMatchVector
661 // multiple times
662 std::basic_string<CharT1> s1;
663 CachedPartialRatio<CharT1> cached_partial_ratio;
664 detail::SplittedSentenceView<typename std::basic_string<CharT1>::iterator> tokens_s1;
665 std::basic_string<CharT1> s1_sorted;
666 rapidfuzz::detail::BlockPatternMatchVector blockmap_s1_sorted;
667};
668
669template <typename Sentence1>
670explicit CachedWRatio(const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
671
672template <typename InputIt1>
673CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iter_value_t<InputIt1>>;
674
696template <typename InputIt1, typename InputIt2>
697double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
698
699template <typename Sentence1, typename Sentence2>
700double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
701
702#ifdef RAPIDFUZZ_SIMD
703namespace experimental {
704template <int MaxLen>
705struct MultiQRatio {
706public:
707 MultiQRatio(size_t count) : scorer(count)
708 {}
709
710 size_t result_count() const
711 {
712 return scorer.result_count();
713 }
714
715 template <typename Sentence1>
716 void insert(const Sentence1& s1_)
717 {
718 insert(detail::to_begin(s1_), detail::to_end(s1_));
719 }
720
721 template <typename InputIt1>
722 void insert(InputIt1 first1, InputIt1 last1)
723 {
724 scorer.insert(first1, last1);
725 str_lens.push_back(static_cast<size_t>(std::distance(first1, last1)));
726 }
727
728 template <typename InputIt2>
729 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
730 double score_cutoff = 0.0) const
731 {
732 similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
733 }
734
735 template <typename Sentence2>
736 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
737 {
738 rapidfuzz::detail::Range s2_(s2);
739 if (s2_.empty()) {
740 for (size_t i = 0; i < str_lens.size(); ++i)
741 scores[i] = 0;
742
743 return;
744 }
745
746 scorer.similarity(scores, score_count, s2, score_cutoff);
747
748 for (size_t i = 0; i < str_lens.size(); ++i)
749 if (str_lens[i] == 0) scores[i] = 0;
750 }
751
752private:
753 std::vector<size_t> str_lens;
754 MultiRatio<MaxLen> scorer;
755};
756} /* namespace experimental */
757#endif
758
759template <typename CharT1>
760struct CachedQRatio {
761 template <typename InputIt1>
762 CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1)
763 {}
764
765 template <typename Sentence1>
766 explicit CachedQRatio(const Sentence1& s1_) : CachedQRatio(detail::to_begin(s1_), detail::to_end(s1_))
767 {}
768
769 template <typename InputIt2>
770 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
771 double score_hint = 0.0) const;
772
773 template <typename Sentence2>
774 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
775
776private:
777 std::basic_string<CharT1> s1;
778 CachedRatio<CharT1> cached_ratio;
779};
780
781template <typename Sentence1>
782explicit CachedQRatio(const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
783
784template <typename InputIt1>
785CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iter_value_t<InputIt1>>;
786
789} // namespace rapidfuzz::fuzz
790
791#include <rapidfuzz/fuzz.impl>
double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
calculates a simple ratio between two strings
double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.