1 /* SPDX-License-Identifier: MIT */
2 /* Copyright © 2021 Max Bachmann */
3 /* Copyright © 2011 Adam Cohen */
4 
5 #include <rapidfuzz/details/matching_blocks.hpp>
6 #include <rapidfuzz/string_metric.hpp>
7 
8 #include <algorithm>
9 #include <cmath>
10 #include <iterator>
11 #include <unordered_map>
12 #include <vector>
13 
14 namespace rapidfuzz {
15 namespace fuzz {
16 
17 /**********************************************
18  *                  ratio
19  *********************************************/
20 
21 template <typename Sentence1, typename Sentence2>
ratio(const Sentence1 & s1,const Sentence2 & s2,const percent score_cutoff)22 percent ratio(const Sentence1& s1, const Sentence2& s2, const percent score_cutoff)
23 {
24     return string_metric::normalized_levenshtein(s1, s2, {1, 1, 2}, score_cutoff);
25 }
26 
27 template <typename Sentence1>
28 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const29 double CachedRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
30 {
31     auto s2_view = common::to_string_view(s2);
32 
33     return string_metric::detail::normalized_weighted_levenshtein(s2_view, blockmap_s1, s1_view,
34                                                                   score_cutoff);
35 }
36 
37 /**********************************************
38  *              partial_ratio
39  *********************************************/
40 
41 namespace detail {
42 
43 template <typename Sentence1, typename CachedSentence1, typename Sentence2>
44 percent
partial_ratio_short_needle(const Sentence1 & s1,const CachedRatio<CachedSentence1> & cached_ratio,const common::CharHashTable<char_type<Sentence1>,bool> & s1_char_map,const Sentence2 & s2,percent score_cutoff)45 partial_ratio_short_needle(const Sentence1& s1, const CachedRatio<CachedSentence1>& cached_ratio,
46                            const common::CharHashTable<char_type<Sentence1>, bool>& s1_char_map,
47                            const Sentence2& s2, percent score_cutoff)
48 {
49     double max_ratio = 0;
50     auto s1_view = common::to_string_view(s1);
51     auto s2_view = common::to_string_view(s2);
52 
53     for (std::size_t i = 1; i < s1_view.size(); ++i) {
54         auto long_substr = s2_view.substr(0, i);
55 
56         if (!s1_char_map[long_substr.back()]) {
57             continue;
58         }
59 
60         double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
61         if (ls_ratio > max_ratio) {
62             score_cutoff = max_ratio = ls_ratio;
63             if (ls_ratio == 100.0) {
64                 return 100.0;
65             }
66         }
67     }
68 
69     for (std::size_t i = 0; i < s2_view.size() - s1_view.size(); ++i) {
70         auto long_substr = s2_view.substr(i, s1_view.size());
71 
72         if (!s1_char_map[long_substr.back()]) {
73             continue;
74         }
75 
76         double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
77         if (ls_ratio > max_ratio) {
78             score_cutoff = max_ratio = ls_ratio;
79             if (ls_ratio == 100.0) {
80                 return 100.0;
81             }
82         }
83     }
84 
85     for (std::size_t i = s2_view.size() - s1_view.size(); i < s2_view.size(); ++i) {
86         auto long_substr = s2_view.substr(i, s1_view.size());
87 
88         if (!s1_char_map[long_substr[0]]) {
89             continue;
90         }
91 
92         double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
93         if (ls_ratio > max_ratio) {
94             score_cutoff = max_ratio = ls_ratio;
95             if (ls_ratio == 100.0) {
96                 return 100.0;
97             }
98         }
99     }
100 
101     return max_ratio;
102 }
103 
104 template <typename Sentence1, typename Sentence2, typename CharT1 = char_type<Sentence1>>
partial_ratio_short_needle(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)105 percent partial_ratio_short_needle(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
106 {
107     auto s1_view = common::to_string_view(s1);
108     CachedRatio<decltype(s1_view)> cached_ratio(s1_view);
109 
110     common::CharHashTable<CharT1, bool> s1_char_map;
111     for (const CharT1& ch : s1_view) {
112         s1_char_map.create(ch) = true;
113     }
114 
115     return partial_ratio_short_needle(s1_view, cached_ratio, s1_char_map, s2, score_cutoff);
116 }
117 
118 template <typename Sentence1, typename CachedSentence1, typename Sentence2>
partial_ratio_long_needle(const Sentence1 & s1,const CachedRatio<CachedSentence1> & cached_ratio,const Sentence2 & s2,percent score_cutoff)119 percent partial_ratio_long_needle(const Sentence1& s1,
120                                   const CachedRatio<CachedSentence1>& cached_ratio,
121                                   const Sentence2& s2, percent score_cutoff)
122 {
123     double max_ratio = 0;
124     if (score_cutoff > 100) {
125         return 0;
126     }
127 
128     auto s1_view = common::to_string_view(s1);
129     auto s2_view = common::to_string_view(s2);
130 
131     if (s1_view.empty() || s2_view.empty()) {
132         return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
133     }
134 
135     auto blocks = rapidfuzz::detail::get_matching_blocks(s1_view, s2_view);
136 
137     // when there is a full match exit early
138     for (const auto& block : blocks) {
139         if (block.length == s1_view.length()) {
140             return 100;
141         }
142     }
143 
144     for (const auto& block : blocks) {
145         std::size_t long_start = (block.dpos > block.spos) ? block.dpos - block.spos : 0;
146         auto long_substr = s2_view.substr(long_start, s1_view.length());
147 
148         double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
149         if (ls_ratio > max_ratio) {
150             score_cutoff = max_ratio = ls_ratio;
151         }
152     }
153 
154     return max_ratio;
155 }
156 
157 template <typename Sentence1, typename Sentence2>
partial_ratio_long_needle(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)158 percent partial_ratio_long_needle(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
159 {
160     auto s1_view = common::to_string_view(s1);
161     CachedRatio<decltype(s1_view)> cached_ratio(s1_view);
162 
163     return partial_ratio_long_needle(s1_view, cached_ratio, s2, score_cutoff);
164 }
165 
166 } // namespace detail
167 
168 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)169 percent partial_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
170 {
171     if (score_cutoff > 100) {
172         return 0;
173     }
174 
175     auto s1_view = common::to_string_view(s1);
176     auto s2_view = common::to_string_view(s2);
177 
178     if (s1_view.empty() || s2_view.empty()) {
179         return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
180     }
181 
182     if (s1_view.length() > s2_view.length()) {
183         return partial_ratio(s2_view, s1_view, score_cutoff);
184     }
185 
186     if (s1_view.length() <= 64) {
187         return detail::partial_ratio_short_needle(s1_view, s2_view, score_cutoff);
188     }
189     else {
190         return detail::partial_ratio_long_needle(s1_view, s2_view, score_cutoff);
191     }
192 }
193 
194 template <typename Sentence1>
CachedPartialRatio(const Sentence1 & s1)195 CachedPartialRatio<Sentence1>::CachedPartialRatio(const Sentence1& s1)
196     : s1_view(common::to_string_view(s1)), cached_ratio(s1)
197 {
198     for (const CharT1& ch : s1_view) {
199         s1_char_map.create(ch) = true;
200     }
201 }
202 
203 template <typename Sentence1>
204 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const205 double CachedPartialRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
206 {
207     auto s2_view = common::to_string_view(s2);
208 
209     if (s1_view.size() > s2_view.size()) {
210         return partial_ratio(s1_view, s2_view, score_cutoff);
211     }
212 
213     if (s1_view.empty() || s2_view.empty()) {
214         return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
215     }
216 
217     if (s1_view.length() <= 64) {
218         return detail::partial_ratio_short_needle(s1_view, cached_ratio, s1_char_map, s2_view,
219                                                   score_cutoff);
220     }
221     else {
222         return detail::partial_ratio_long_needle(s1_view, cached_ratio, s2_view, score_cutoff);
223     }
224 }
225 
226 /**********************************************
227  *             token_sort_ratio
228  *********************************************/
229 
230 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
token_sort_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)231 percent token_sort_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
232 {
233     if (score_cutoff > 100) return 0;
234 
235     return ratio(common::sorted_split(s1).join(), common::sorted_split(s2).join(), score_cutoff);
236 }
237 
238 template <typename Sentence1>
239 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const240 double CachedTokenSortRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
241 {
242     if (score_cutoff > 100) return 0;
243 
244     return cached_ratio.ratio(common::sorted_split(s2).join(), score_cutoff);
245 }
246 
247 /**********************************************
248  *          partial_token_sort_ratio
249  *********************************************/
250 
251 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_sort_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)252 percent partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
253 {
254     if (score_cutoff > 100) return 0;
255 
256     return partial_ratio(common::sorted_split(s1).join(), common::sorted_split(s2).join(),
257                          score_cutoff);
258 }
259 
260 template <typename Sentence1>
261 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const262 double CachedPartialTokenSortRatio<Sentence1>::ratio(const Sentence2& s2,
263                                                      percent score_cutoff) const
264 {
265     if (score_cutoff > 100) return 0;
266 
267     return cached_partial_ratio.ratio(common::sorted_split(s2).join(), score_cutoff);
268 }
269 
270 /**********************************************
271  *               token_set_ratio
272  *********************************************/
273 
274 namespace detail {
275 template <typename CharT1, typename CharT2>
token_set_ratio(const SplittedSentenceView<CharT1> & tokens_a,const SplittedSentenceView<CharT2> & tokens_b,const percent score_cutoff)276 percent token_set_ratio(const SplittedSentenceView<CharT1>& tokens_a,
277                         const SplittedSentenceView<CharT2>& tokens_b, const percent score_cutoff)
278 {
279     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
280      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
281     if (tokens_a.empty() || tokens_a.empty()) {
282         return 0;
283     }
284 
285     auto decomposition = common::set_decomposition(tokens_a, tokens_b);
286     auto intersect = decomposition.intersection;
287     auto diff_ab = decomposition.difference_ab;
288     auto diff_ba = decomposition.difference_ba;
289 
290     // one sentence is part of the other one
291     if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
292         return 100;
293     }
294 
295     auto diff_ab_joined = diff_ab.join();
296     auto diff_ba_joined = diff_ba.join();
297 
298     size_t ab_len = diff_ab_joined.length();
299     size_t ba_len = diff_ba_joined.length();
300     size_t sect_len = intersect.length();
301 
302     // string length sect+ab <-> sect and sect+ba <-> sect
303     size_t sect_ab_len = sect_len + !!sect_len + ab_len;
304     size_t sect_ba_len = sect_len + !!sect_len + ba_len;
305 
306     percent result = 0;
307     auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
308     size_t dist =
309         string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
310 
311     if (dist != (std::size_t)-1) {
312         result = common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
313     }
314 
315     // exit early since the other ratios are 0
316     if (!sect_len) {
317         return result;
318     }
319 
320     // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
321     // since only sect is similar in them the distance can be calculated based on
322     // the length difference
323     std::size_t sect_ab_dist = !!sect_len + ab_len;
324     percent sect_ab_ratio =
325         common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
326 
327     std::size_t sect_ba_dist = !!sect_len + ba_len;
328     percent sect_ba_ratio =
329         common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
330 
331     return std::max({result, sect_ab_ratio, sect_ba_ratio});
332 }
333 } // namespace detail
334 
335 template <typename Sentence1, typename Sentence2>
token_set_ratio(const Sentence1 & s1,const Sentence2 & s2,const percent score_cutoff)336 percent token_set_ratio(const Sentence1& s1, const Sentence2& s2, const percent score_cutoff)
337 {
338     if (score_cutoff > 100) return 0;
339 
340     return detail::token_set_ratio(common::sorted_split(s1), common::sorted_split(s2),
341                                    score_cutoff);
342 }
343 
344 template <typename Sentence1>
CachedTokenSetRatio(const Sentence1 & s1)345 CachedTokenSetRatio<Sentence1>::CachedTokenSetRatio(const Sentence1& s1)
346     : tokens_s1(common::sorted_split(s1))
347 {}
348 
349 template <typename Sentence1>
350 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const351 double CachedTokenSetRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
352 {
353     if (score_cutoff > 100) return 0;
354 
355     return detail::token_set_ratio(tokens_s1, common::sorted_split(s2), score_cutoff);
356 }
357 
358 /**********************************************
359  *          partial_token_set_ratio
360  *********************************************/
361 
362 namespace detail {
363 template <typename CharT1, typename CharT2>
partial_token_set_ratio(const SplittedSentenceView<CharT1> & tokens_a,const SplittedSentenceView<CharT2> & tokens_b,const percent score_cutoff)364 percent partial_token_set_ratio(const SplittedSentenceView<CharT1>& tokens_a,
365                                 const SplittedSentenceView<CharT2>& tokens_b,
366                                 const percent score_cutoff)
367 {
368     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
369      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
370     if (tokens_a.empty() || tokens_a.empty()) {
371         return 0;
372     }
373 
374     auto decomposition = common::set_decomposition(tokens_a, tokens_b);
375 
376     // exit early when there is a common word in both sequences
377     if (!decomposition.intersection.empty()) return 100;
378 
379     return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
380                          score_cutoff);
381 }
382 } // namespace detail
383 
384 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_set_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)385 percent partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
386 {
387     if (score_cutoff > 100) return 0;
388 
389     return detail::partial_token_set_ratio(common::sorted_split(s1), common::sorted_split(s2),
390                                            score_cutoff);
391 }
392 
393 template <typename Sentence1>
CachedPartialTokenSetRatio(const Sentence1 & s1)394 CachedPartialTokenSetRatio<Sentence1>::CachedPartialTokenSetRatio(const Sentence1& s1)
395     : tokens_s1(common::sorted_split(s1))
396 {}
397 
398 template <typename Sentence1>
399 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const400 double CachedPartialTokenSetRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
401 {
402     if (score_cutoff > 100) return 0;
403 
404     return detail::partial_token_set_ratio(tokens_s1, common::sorted_split(s2), score_cutoff);
405 }
406 
407 /**********************************************
408  *                token_ratio
409  *********************************************/
410 
411 template <typename Sentence1, typename Sentence2>
token_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)412 percent token_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
413 {
414     if (score_cutoff > 100) return 0;
415 
416     auto tokens_a = common::sorted_split(s1);
417     auto tokens_b = common::sorted_split(s2);
418 
419     auto decomposition = common::set_decomposition(tokens_a, tokens_b);
420     auto intersect = decomposition.intersection;
421     auto diff_ab = decomposition.difference_ab;
422     auto diff_ba = decomposition.difference_ba;
423 
424     if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
425         return 100;
426     }
427 
428     auto diff_ab_joined = diff_ab.join();
429     auto diff_ba_joined = diff_ba.join();
430 
431     size_t ab_len = diff_ab_joined.length();
432     size_t ba_len = diff_ba_joined.length();
433     size_t sect_len = intersect.length();
434 
435     percent result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
436 
437     // string length sect+ab <-> sect and sect+ba <-> sect
438     size_t sect_ab_len = sect_len + !!sect_len + ab_len;
439     size_t sect_ba_len = sect_len + !!sect_len + ba_len;
440 
441     auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
442     size_t dist =
443         string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
444     if (dist != (std::size_t)-1) {
445         result =
446             std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
447     }
448 
449     // exit early since the other ratios are 0
450     if (!sect_len) {
451         return result;
452     }
453 
454     // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
455     // since only sect is similar in them the distance can be calculated based on
456     // the length difference
457     std::size_t sect_ab_dist = !!sect_len + ab_len;
458     percent sect_ab_ratio =
459         common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
460 
461     std::size_t sect_ba_dist = !!sect_len + ba_len;
462     percent sect_ba_ratio =
463         common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
464 
465     return std::max({result, sect_ab_ratio, sect_ba_ratio});
466 }
467 
468 namespace detail {
469 template <typename CharT1, typename CachedSentence1, typename Sentence2>
token_ratio(const SplittedSentenceView<CharT1> & s1_tokens,const CachedRatio<CachedSentence1> & cached_ratio_s1_sorted,const Sentence2 & s2,percent score_cutoff)470 percent token_ratio(const SplittedSentenceView<CharT1>& s1_tokens,
471                     const CachedRatio<CachedSentence1>& cached_ratio_s1_sorted, const Sentence2& s2,
472                     percent score_cutoff)
473 {
474     if (score_cutoff > 100) return 0;
475 
476     auto s2_tokens = common::sorted_split(s2);
477 
478     auto decomposition = common::set_decomposition(s1_tokens, s2_tokens);
479     auto intersect = decomposition.intersection;
480     auto diff_ab = decomposition.difference_ab;
481     auto diff_ba = decomposition.difference_ba;
482 
483     if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
484         return 100;
485     }
486 
487     auto diff_ab_joined = diff_ab.join();
488     auto diff_ba_joined = diff_ba.join();
489 
490     size_t ab_len = diff_ab_joined.length();
491     size_t ba_len = diff_ba_joined.length();
492     size_t sect_len = intersect.length();
493 
494     percent result = cached_ratio_s1_sorted.ratio(s2_tokens.join(), score_cutoff);
495 
496     // string length sect+ab <-> sect and sect+ba <-> sect
497     size_t sect_ab_len = sect_len + !!sect_len + ab_len;
498     size_t sect_ba_len = sect_len + !!sect_len + ba_len;
499 
500     auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
501     size_t dist =
502         string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
503     if (dist != (std::size_t)-1) {
504         result =
505             std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
506     }
507 
508     // exit early since the other ratios are 0
509     if (!sect_len) {
510         return result;
511     }
512 
513     // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
514     // since only sect is similar in them the distance can be calculated based on
515     // the length difference
516     std::size_t sect_ab_dist = !!sect_len + ab_len;
517     percent sect_ab_ratio =
518         common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
519 
520     std::size_t sect_ba_dist = !!sect_len + ba_len;
521     percent sect_ba_ratio =
522         common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
523 
524     return std::max({result, sect_ab_ratio, sect_ba_ratio});
525 }
526 
527 // todo this is a temporary solution until WRatio is properly implemented using other scorers
528 template <typename CharT1, typename Sentence2>
token_ratio(const std::basic_string<CharT1> & s1_sorted,const SplittedSentenceView<CharT1> & tokens_s1,const common::BlockPatternMatchVector & blockmap_s1_sorted,const Sentence2 & s2,percent score_cutoff)529 percent token_ratio(const std::basic_string<CharT1>& s1_sorted,
530                     const SplittedSentenceView<CharT1>& tokens_s1,
531                     const common::BlockPatternMatchVector& blockmap_s1_sorted,
532                     const Sentence2& s2, percent score_cutoff)
533 {
534     if (score_cutoff > 100) return 0;
535 
536     auto tokens_b = common::sorted_split(s2);
537 
538     auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
539     auto intersect = decomposition.intersection;
540     auto diff_ab = decomposition.difference_ab;
541     auto diff_ba = decomposition.difference_ba;
542 
543     if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
544         return 100;
545     }
546 
547     auto diff_ab_joined = diff_ab.join();
548     auto diff_ba_joined = diff_ba.join();
549 
550     size_t ab_len = diff_ab_joined.length();
551     size_t ba_len = diff_ba_joined.length();
552     size_t sect_len = intersect.length();
553 
554     percent result = 0;
555     auto s2_sorted = tokens_b.join();
556     if (s1_sorted.size() < 65) {
557         result = string_metric::detail::normalized_weighted_levenshtein(
558             common::to_string_view(s2_sorted), blockmap_s1_sorted,
559             common::to_string_view(s1_sorted), score_cutoff);
560     }
561     else {
562         result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
563     }
564 
565     // string length sect+ab <-> sect and sect+ba <-> sect
566     size_t sect_ab_len = sect_len + !!sect_len + ab_len;
567     size_t sect_ba_len = sect_len + !!sect_len + ba_len;
568 
569     auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
570     size_t dist =
571         string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
572     if (dist != (std::size_t)-1) {
573         result =
574             std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
575     }
576 
577     // exit early since the other ratios are 0
578     if (!sect_len) {
579         return result;
580     }
581 
582     // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
583     // since only sect is similar in them the distance can be calculated based on
584     // the length difference
585     std::size_t sect_ab_dist = !!sect_len + ab_len;
586     percent sect_ab_ratio =
587         common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
588 
589     std::size_t sect_ba_dist = !!sect_len + ba_len;
590     percent sect_ba_ratio =
591         common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
592 
593     return std::max({result, sect_ab_ratio, sect_ba_ratio});
594 }
595 } // namespace detail
596 
597 template <typename Sentence1>
598 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const599 double CachedTokenRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
600 {
601     return detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, s2, score_cutoff);
602 }
603 
604 /**********************************************
605  *            partial_token_ratio
606  *********************************************/
607 
608 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)609 percent partial_token_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
610 {
611     if (score_cutoff > 100) return 0;
612 
613     auto tokens_a = common::sorted_split(s1);
614     auto tokens_b = common::sorted_split(s2);
615 
616     auto decomposition = common::set_decomposition(tokens_a, tokens_b);
617 
618     // exit early when there is a common word in both sequences
619     if (!decomposition.intersection.empty()) return 100;
620 
621     auto diff_ab = decomposition.difference_ab;
622     auto diff_ba = decomposition.difference_ba;
623 
624     percent result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
625 
626     // do not calculate the same partial_ratio twice
627     if (tokens_a.word_count() == diff_ab.word_count() &&
628         tokens_b.word_count() == diff_ba.word_count()) {
629         return result;
630     }
631 
632     score_cutoff = std::max(score_cutoff, result);
633     return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
634 }
635 
636 namespace detail {
637 template <typename CharT1, typename Sentence2>
partial_token_ratio(const std::basic_string<CharT1> & s1_sorted,const SplittedSentenceView<CharT1> & tokens_s1,const Sentence2 & s2,percent score_cutoff)638 percent partial_token_ratio(const std::basic_string<CharT1>& s1_sorted,
639                             const SplittedSentenceView<CharT1>& tokens_s1, const Sentence2& s2,
640                             percent score_cutoff)
641 {
642     if (score_cutoff > 100) return 0;
643 
644     auto tokens_b = common::sorted_split(s2);
645 
646     auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
647 
648     // exit early when there is a common word in both sequences
649     if (!decomposition.intersection.empty()) return 100;
650 
651     auto diff_ab = decomposition.difference_ab;
652     auto diff_ba = decomposition.difference_ba;
653 
654     percent result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
655 
656     // do not calculate the same partial_ratio twice
657     if (tokens_s1.word_count() == diff_ab.word_count() &&
658         tokens_b.word_count() == diff_ba.word_count()) {
659         return result;
660     }
661 
662     score_cutoff = std::max(score_cutoff, result);
663     return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
664 }
665 
666 } // namespace detail
667 
668 template <typename Sentence1>
CachedPartialTokenRatio(const Sentence1 & s1)669 CachedPartialTokenRatio<Sentence1>::CachedPartialTokenRatio(const Sentence1& s1)
670     : tokens_s1(common::sorted_split(s1))
671 {
672     s1_sorted = tokens_s1.join();
673 }
674 
675 template <typename Sentence1>
676 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const677 double CachedPartialTokenRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
678 {
679     return detail::partial_token_ratio(s1_sorted, tokens_s1, s2, score_cutoff);
680 }
681 
682 /**********************************************
683  *                  WRatio
684  *********************************************/
685 
686 template <typename Sentence1, typename Sentence2>
WRatio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)687 percent WRatio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
688 {
689     if (score_cutoff > 100) return 0;
690 
691     constexpr double UNBASE_SCALE = 0.95;
692 
693     auto s1_view = common::to_string_view(s1);
694     auto s2_view = common::to_string_view(s2);
695 
696     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
697      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
698     if (s1_view.empty() || s2_view.empty()) {
699         return 0;
700     }
701 
702     size_t len_a = s1_view.length();
703     size_t len_b = s2_view.length();
704     double len_ratio = (len_a > len_b) ? static_cast<double>(len_a) / static_cast<double>(len_b)
705                                        : static_cast<double>(len_b) / static_cast<double>(len_a);
706 
707     percent end_ratio = ratio(s1, s2, score_cutoff);
708 
709     if (len_ratio < 1.5) {
710         score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
711         return std::max(end_ratio, token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE);
712     }
713 
714     const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
715 
716     score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
717     end_ratio = std::max(end_ratio, partial_ratio(s1, s2, score_cutoff) * PARTIAL_SCALE);
718 
719     score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
720     return std::max(end_ratio,
721                     partial_token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE * PARTIAL_SCALE);
722 }
723 
724 template <typename Sentence1>
CachedWRatio(const Sentence1 & s1)725 CachedWRatio<Sentence1>::CachedWRatio(const Sentence1& s1)
726     : cached_partial_ratio(s1), tokens_s1(common::sorted_split(s1))
727 {
728     s1_view = common::to_string_view(s1);
729     s1_sorted = tokens_s1.join();
730 
731     blockmap_s1_sorted.insert(common::to_string_view(s1_sorted));
732 }
733 
734 template <typename Sentence1>
735 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const736 double CachedWRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
737 {
738     if (score_cutoff > 100) return 0;
739 
740     constexpr double UNBASE_SCALE = 0.95;
741 
742     auto s2_view = common::to_string_view(s2);
743 
744     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
745      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
746     if (s1_view.empty() || s2_view.empty()) {
747         return 0;
748     }
749 
750     size_t len_a = s1_view.length();
751     size_t len_b = s2_view.length();
752     double len_ratio = (len_a > len_b) ? static_cast<double>(len_a) / static_cast<double>(len_b)
753                                        : static_cast<double>(len_b) / static_cast<double>(len_a);
754 
755     percent end_ratio = cached_partial_ratio.cached_ratio.ratio(s2_view, score_cutoff);
756 
757     if (len_ratio < 1.5) {
758         score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
759         // use pre calculated values
760         auto r =
761             detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, s2_view, score_cutoff);
762         return std::max(end_ratio, r * UNBASE_SCALE);
763     }
764 
765     const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
766 
767     score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
768     end_ratio =
769         std::max(end_ratio, cached_partial_ratio.ratio(s2_view, score_cutoff) * PARTIAL_SCALE);
770 
771     score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
772     auto r = detail::partial_token_ratio(s1_sorted, tokens_s1, s2_view, score_cutoff);
773     return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
774 }
775 
776 /**********************************************
777  *                QRatio
778  *********************************************/
779 
780 template <typename Sentence1, typename Sentence2>
QRatio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)781 percent QRatio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
782 {
783     auto s1_view = common::to_string_view(s1);
784     auto s2_view = common::to_string_view(s2);
785 
786     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
787      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
788     if (s1_view.empty() || s2_view.empty()) {
789         return 0;
790     }
791 
792     return ratio(s1_view, s2_view, score_cutoff);
793 }
794 
795 template <typename Sentence1>
796 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const797 double CachedQRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
798 {
799     auto s2_view = common::to_string_view(s2);
800 
801     /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
802      * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
803     if (s1_view.empty() || s2_view.empty()) {
804         return 0;
805     }
806 
807     return cached_ratio.ratio(s2_view, score_cutoff);
808 }
809 
810 } // namespace fuzz
811 } // namespace rapidfuzz
812