1 /* SPDX-License-Identifier: MIT */
2 /* Copyright © 2021 Max Bachmann */
3 /* Copyright © 2011 Adam Cohen */
4
5 #include <rapidfuzz/details/matching_blocks.hpp>
6 #include <rapidfuzz/string_metric.hpp>
7
8 #include <algorithm>
9 #include <cmath>
10 #include <iterator>
11 #include <unordered_map>
12 #include <vector>
13
14 namespace rapidfuzz {
15 namespace fuzz {
16
17 /**********************************************
18 * ratio
19 *********************************************/
20
21 template <typename Sentence1, typename Sentence2>
ratio(const Sentence1 & s1,const Sentence2 & s2,const percent score_cutoff)22 percent ratio(const Sentence1& s1, const Sentence2& s2, const percent score_cutoff)
23 {
24 return string_metric::normalized_levenshtein(s1, s2, {1, 1, 2}, score_cutoff);
25 }
26
27 template <typename Sentence1>
28 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const29 double CachedRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
30 {
31 auto s2_view = common::to_string_view(s2);
32
33 return string_metric::detail::normalized_weighted_levenshtein(s2_view, blockmap_s1, s1_view,
34 score_cutoff);
35 }
36
37 /**********************************************
38 * partial_ratio
39 *********************************************/
40
41 namespace detail {
42
43 template <typename Sentence1, typename CachedSentence1, typename Sentence2>
44 percent
partial_ratio_short_needle(const Sentence1 & s1,const CachedRatio<CachedSentence1> & cached_ratio,const common::CharHashTable<char_type<Sentence1>,bool> & s1_char_map,const Sentence2 & s2,percent score_cutoff)45 partial_ratio_short_needle(const Sentence1& s1, const CachedRatio<CachedSentence1>& cached_ratio,
46 const common::CharHashTable<char_type<Sentence1>, bool>& s1_char_map,
47 const Sentence2& s2, percent score_cutoff)
48 {
49 double max_ratio = 0;
50 auto s1_view = common::to_string_view(s1);
51 auto s2_view = common::to_string_view(s2);
52
53 for (std::size_t i = 1; i < s1_view.size(); ++i) {
54 auto long_substr = s2_view.substr(0, i);
55
56 if (!s1_char_map[long_substr.back()]) {
57 continue;
58 }
59
60 double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
61 if (ls_ratio > max_ratio) {
62 score_cutoff = max_ratio = ls_ratio;
63 if (ls_ratio == 100.0) {
64 return 100.0;
65 }
66 }
67 }
68
69 for (std::size_t i = 0; i < s2_view.size() - s1_view.size(); ++i) {
70 auto long_substr = s2_view.substr(i, s1_view.size());
71
72 if (!s1_char_map[long_substr.back()]) {
73 continue;
74 }
75
76 double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
77 if (ls_ratio > max_ratio) {
78 score_cutoff = max_ratio = ls_ratio;
79 if (ls_ratio == 100.0) {
80 return 100.0;
81 }
82 }
83 }
84
85 for (std::size_t i = s2_view.size() - s1_view.size(); i < s2_view.size(); ++i) {
86 auto long_substr = s2_view.substr(i, s1_view.size());
87
88 if (!s1_char_map[long_substr[0]]) {
89 continue;
90 }
91
92 double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
93 if (ls_ratio > max_ratio) {
94 score_cutoff = max_ratio = ls_ratio;
95 if (ls_ratio == 100.0) {
96 return 100.0;
97 }
98 }
99 }
100
101 return max_ratio;
102 }
103
104 template <typename Sentence1, typename Sentence2, typename CharT1 = char_type<Sentence1>>
partial_ratio_short_needle(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)105 percent partial_ratio_short_needle(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
106 {
107 auto s1_view = common::to_string_view(s1);
108 CachedRatio<decltype(s1_view)> cached_ratio(s1_view);
109
110 common::CharHashTable<CharT1, bool> s1_char_map;
111 for (const CharT1& ch : s1_view) {
112 s1_char_map.create(ch) = true;
113 }
114
115 return partial_ratio_short_needle(s1_view, cached_ratio, s1_char_map, s2, score_cutoff);
116 }
117
118 template <typename Sentence1, typename CachedSentence1, typename Sentence2>
partial_ratio_long_needle(const Sentence1 & s1,const CachedRatio<CachedSentence1> & cached_ratio,const Sentence2 & s2,percent score_cutoff)119 percent partial_ratio_long_needle(const Sentence1& s1,
120 const CachedRatio<CachedSentence1>& cached_ratio,
121 const Sentence2& s2, percent score_cutoff)
122 {
123 double max_ratio = 0;
124 if (score_cutoff > 100) {
125 return 0;
126 }
127
128 auto s1_view = common::to_string_view(s1);
129 auto s2_view = common::to_string_view(s2);
130
131 if (s1_view.empty() || s2_view.empty()) {
132 return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
133 }
134
135 auto blocks = rapidfuzz::detail::get_matching_blocks(s1_view, s2_view);
136
137 // when there is a full match exit early
138 for (const auto& block : blocks) {
139 if (block.length == s1_view.length()) {
140 return 100;
141 }
142 }
143
144 for (const auto& block : blocks) {
145 std::size_t long_start = (block.dpos > block.spos) ? block.dpos - block.spos : 0;
146 auto long_substr = s2_view.substr(long_start, s1_view.length());
147
148 double ls_ratio = cached_ratio.ratio(long_substr, score_cutoff);
149 if (ls_ratio > max_ratio) {
150 score_cutoff = max_ratio = ls_ratio;
151 }
152 }
153
154 return max_ratio;
155 }
156
157 template <typename Sentence1, typename Sentence2>
partial_ratio_long_needle(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)158 percent partial_ratio_long_needle(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
159 {
160 auto s1_view = common::to_string_view(s1);
161 CachedRatio<decltype(s1_view)> cached_ratio(s1_view);
162
163 return partial_ratio_long_needle(s1_view, cached_ratio, s2, score_cutoff);
164 }
165
166 } // namespace detail
167
168 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)169 percent partial_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
170 {
171 if (score_cutoff > 100) {
172 return 0;
173 }
174
175 auto s1_view = common::to_string_view(s1);
176 auto s2_view = common::to_string_view(s2);
177
178 if (s1_view.empty() || s2_view.empty()) {
179 return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
180 }
181
182 if (s1_view.length() > s2_view.length()) {
183 return partial_ratio(s2_view, s1_view, score_cutoff);
184 }
185
186 if (s1_view.length() <= 64) {
187 return detail::partial_ratio_short_needle(s1_view, s2_view, score_cutoff);
188 }
189 else {
190 return detail::partial_ratio_long_needle(s1_view, s2_view, score_cutoff);
191 }
192 }
193
194 template <typename Sentence1>
CachedPartialRatio(const Sentence1 & s1)195 CachedPartialRatio<Sentence1>::CachedPartialRatio(const Sentence1& s1)
196 : s1_view(common::to_string_view(s1)), cached_ratio(s1)
197 {
198 for (const CharT1& ch : s1_view) {
199 s1_char_map.create(ch) = true;
200 }
201 }
202
203 template <typename Sentence1>
204 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const205 double CachedPartialRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
206 {
207 auto s2_view = common::to_string_view(s2);
208
209 if (s1_view.size() > s2_view.size()) {
210 return partial_ratio(s1_view, s2_view, score_cutoff);
211 }
212
213 if (s1_view.empty() || s2_view.empty()) {
214 return static_cast<double>(s1_view.empty() && s2_view.empty()) * 100.0;
215 }
216
217 if (s1_view.length() <= 64) {
218 return detail::partial_ratio_short_needle(s1_view, cached_ratio, s1_char_map, s2_view,
219 score_cutoff);
220 }
221 else {
222 return detail::partial_ratio_long_needle(s1_view, cached_ratio, s2_view, score_cutoff);
223 }
224 }
225
226 /**********************************************
227 * token_sort_ratio
228 *********************************************/
229
230 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
token_sort_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)231 percent token_sort_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
232 {
233 if (score_cutoff > 100) return 0;
234
235 return ratio(common::sorted_split(s1).join(), common::sorted_split(s2).join(), score_cutoff);
236 }
237
238 template <typename Sentence1>
239 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const240 double CachedTokenSortRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
241 {
242 if (score_cutoff > 100) return 0;
243
244 return cached_ratio.ratio(common::sorted_split(s2).join(), score_cutoff);
245 }
246
247 /**********************************************
248 * partial_token_sort_ratio
249 *********************************************/
250
251 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_sort_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)252 percent partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
253 {
254 if (score_cutoff > 100) return 0;
255
256 return partial_ratio(common::sorted_split(s1).join(), common::sorted_split(s2).join(),
257 score_cutoff);
258 }
259
260 template <typename Sentence1>
261 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const262 double CachedPartialTokenSortRatio<Sentence1>::ratio(const Sentence2& s2,
263 percent score_cutoff) const
264 {
265 if (score_cutoff > 100) return 0;
266
267 return cached_partial_ratio.ratio(common::sorted_split(s2).join(), score_cutoff);
268 }
269
270 /**********************************************
271 * token_set_ratio
272 *********************************************/
273
274 namespace detail {
275 template <typename CharT1, typename CharT2>
token_set_ratio(const SplittedSentenceView<CharT1> & tokens_a,const SplittedSentenceView<CharT2> & tokens_b,const percent score_cutoff)276 percent token_set_ratio(const SplittedSentenceView<CharT1>& tokens_a,
277 const SplittedSentenceView<CharT2>& tokens_b, const percent score_cutoff)
278 {
279 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
280 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
281 if (tokens_a.empty() || tokens_a.empty()) {
282 return 0;
283 }
284
285 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
286 auto intersect = decomposition.intersection;
287 auto diff_ab = decomposition.difference_ab;
288 auto diff_ba = decomposition.difference_ba;
289
290 // one sentence is part of the other one
291 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
292 return 100;
293 }
294
295 auto diff_ab_joined = diff_ab.join();
296 auto diff_ba_joined = diff_ba.join();
297
298 size_t ab_len = diff_ab_joined.length();
299 size_t ba_len = diff_ba_joined.length();
300 size_t sect_len = intersect.length();
301
302 // string length sect+ab <-> sect and sect+ba <-> sect
303 size_t sect_ab_len = sect_len + !!sect_len + ab_len;
304 size_t sect_ba_len = sect_len + !!sect_len + ba_len;
305
306 percent result = 0;
307 auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
308 size_t dist =
309 string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
310
311 if (dist != (std::size_t)-1) {
312 result = common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
313 }
314
315 // exit early since the other ratios are 0
316 if (!sect_len) {
317 return result;
318 }
319
320 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
321 // since only sect is similar in them the distance can be calculated based on
322 // the length difference
323 std::size_t sect_ab_dist = !!sect_len + ab_len;
324 percent sect_ab_ratio =
325 common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
326
327 std::size_t sect_ba_dist = !!sect_len + ba_len;
328 percent sect_ba_ratio =
329 common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
330
331 return std::max({result, sect_ab_ratio, sect_ba_ratio});
332 }
333 } // namespace detail
334
335 template <typename Sentence1, typename Sentence2>
token_set_ratio(const Sentence1 & s1,const Sentence2 & s2,const percent score_cutoff)336 percent token_set_ratio(const Sentence1& s1, const Sentence2& s2, const percent score_cutoff)
337 {
338 if (score_cutoff > 100) return 0;
339
340 return detail::token_set_ratio(common::sorted_split(s1), common::sorted_split(s2),
341 score_cutoff);
342 }
343
344 template <typename Sentence1>
CachedTokenSetRatio(const Sentence1 & s1)345 CachedTokenSetRatio<Sentence1>::CachedTokenSetRatio(const Sentence1& s1)
346 : tokens_s1(common::sorted_split(s1))
347 {}
348
349 template <typename Sentence1>
350 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const351 double CachedTokenSetRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
352 {
353 if (score_cutoff > 100) return 0;
354
355 return detail::token_set_ratio(tokens_s1, common::sorted_split(s2), score_cutoff);
356 }
357
358 /**********************************************
359 * partial_token_set_ratio
360 *********************************************/
361
362 namespace detail {
363 template <typename CharT1, typename CharT2>
partial_token_set_ratio(const SplittedSentenceView<CharT1> & tokens_a,const SplittedSentenceView<CharT2> & tokens_b,const percent score_cutoff)364 percent partial_token_set_ratio(const SplittedSentenceView<CharT1>& tokens_a,
365 const SplittedSentenceView<CharT2>& tokens_b,
366 const percent score_cutoff)
367 {
368 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
369 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
370 if (tokens_a.empty() || tokens_a.empty()) {
371 return 0;
372 }
373
374 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
375
376 // exit early when there is a common word in both sequences
377 if (!decomposition.intersection.empty()) return 100;
378
379 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
380 score_cutoff);
381 }
382 } // namespace detail
383
384 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_set_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)385 percent partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
386 {
387 if (score_cutoff > 100) return 0;
388
389 return detail::partial_token_set_ratio(common::sorted_split(s1), common::sorted_split(s2),
390 score_cutoff);
391 }
392
393 template <typename Sentence1>
CachedPartialTokenSetRatio(const Sentence1 & s1)394 CachedPartialTokenSetRatio<Sentence1>::CachedPartialTokenSetRatio(const Sentence1& s1)
395 : tokens_s1(common::sorted_split(s1))
396 {}
397
398 template <typename Sentence1>
399 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const400 double CachedPartialTokenSetRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
401 {
402 if (score_cutoff > 100) return 0;
403
404 return detail::partial_token_set_ratio(tokens_s1, common::sorted_split(s2), score_cutoff);
405 }
406
407 /**********************************************
408 * token_ratio
409 *********************************************/
410
411 template <typename Sentence1, typename Sentence2>
token_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)412 percent token_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
413 {
414 if (score_cutoff > 100) return 0;
415
416 auto tokens_a = common::sorted_split(s1);
417 auto tokens_b = common::sorted_split(s2);
418
419 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
420 auto intersect = decomposition.intersection;
421 auto diff_ab = decomposition.difference_ab;
422 auto diff_ba = decomposition.difference_ba;
423
424 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
425 return 100;
426 }
427
428 auto diff_ab_joined = diff_ab.join();
429 auto diff_ba_joined = diff_ba.join();
430
431 size_t ab_len = diff_ab_joined.length();
432 size_t ba_len = diff_ba_joined.length();
433 size_t sect_len = intersect.length();
434
435 percent result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
436
437 // string length sect+ab <-> sect and sect+ba <-> sect
438 size_t sect_ab_len = sect_len + !!sect_len + ab_len;
439 size_t sect_ba_len = sect_len + !!sect_len + ba_len;
440
441 auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
442 size_t dist =
443 string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
444 if (dist != (std::size_t)-1) {
445 result =
446 std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
447 }
448
449 // exit early since the other ratios are 0
450 if (!sect_len) {
451 return result;
452 }
453
454 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
455 // since only sect is similar in them the distance can be calculated based on
456 // the length difference
457 std::size_t sect_ab_dist = !!sect_len + ab_len;
458 percent sect_ab_ratio =
459 common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
460
461 std::size_t sect_ba_dist = !!sect_len + ba_len;
462 percent sect_ba_ratio =
463 common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
464
465 return std::max({result, sect_ab_ratio, sect_ba_ratio});
466 }
467
468 namespace detail {
469 template <typename CharT1, typename CachedSentence1, typename Sentence2>
token_ratio(const SplittedSentenceView<CharT1> & s1_tokens,const CachedRatio<CachedSentence1> & cached_ratio_s1_sorted,const Sentence2 & s2,percent score_cutoff)470 percent token_ratio(const SplittedSentenceView<CharT1>& s1_tokens,
471 const CachedRatio<CachedSentence1>& cached_ratio_s1_sorted, const Sentence2& s2,
472 percent score_cutoff)
473 {
474 if (score_cutoff > 100) return 0;
475
476 auto s2_tokens = common::sorted_split(s2);
477
478 auto decomposition = common::set_decomposition(s1_tokens, s2_tokens);
479 auto intersect = decomposition.intersection;
480 auto diff_ab = decomposition.difference_ab;
481 auto diff_ba = decomposition.difference_ba;
482
483 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
484 return 100;
485 }
486
487 auto diff_ab_joined = diff_ab.join();
488 auto diff_ba_joined = diff_ba.join();
489
490 size_t ab_len = diff_ab_joined.length();
491 size_t ba_len = diff_ba_joined.length();
492 size_t sect_len = intersect.length();
493
494 percent result = cached_ratio_s1_sorted.ratio(s2_tokens.join(), score_cutoff);
495
496 // string length sect+ab <-> sect and sect+ba <-> sect
497 size_t sect_ab_len = sect_len + !!sect_len + ab_len;
498 size_t sect_ba_len = sect_len + !!sect_len + ba_len;
499
500 auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
501 size_t dist =
502 string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
503 if (dist != (std::size_t)-1) {
504 result =
505 std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
506 }
507
508 // exit early since the other ratios are 0
509 if (!sect_len) {
510 return result;
511 }
512
513 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
514 // since only sect is similar in them the distance can be calculated based on
515 // the length difference
516 std::size_t sect_ab_dist = !!sect_len + ab_len;
517 percent sect_ab_ratio =
518 common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
519
520 std::size_t sect_ba_dist = !!sect_len + ba_len;
521 percent sect_ba_ratio =
522 common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
523
524 return std::max({result, sect_ab_ratio, sect_ba_ratio});
525 }
526
527 // todo this is a temporary solution until WRatio is properly implemented using other scorers
528 template <typename CharT1, typename Sentence2>
token_ratio(const std::basic_string<CharT1> & s1_sorted,const SplittedSentenceView<CharT1> & tokens_s1,const common::BlockPatternMatchVector & blockmap_s1_sorted,const Sentence2 & s2,percent score_cutoff)529 percent token_ratio(const std::basic_string<CharT1>& s1_sorted,
530 const SplittedSentenceView<CharT1>& tokens_s1,
531 const common::BlockPatternMatchVector& blockmap_s1_sorted,
532 const Sentence2& s2, percent score_cutoff)
533 {
534 if (score_cutoff > 100) return 0;
535
536 auto tokens_b = common::sorted_split(s2);
537
538 auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
539 auto intersect = decomposition.intersection;
540 auto diff_ab = decomposition.difference_ab;
541 auto diff_ba = decomposition.difference_ba;
542
543 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) {
544 return 100;
545 }
546
547 auto diff_ab_joined = diff_ab.join();
548 auto diff_ba_joined = diff_ba.join();
549
550 size_t ab_len = diff_ab_joined.length();
551 size_t ba_len = diff_ba_joined.length();
552 size_t sect_len = intersect.length();
553
554 percent result = 0;
555 auto s2_sorted = tokens_b.join();
556 if (s1_sorted.size() < 65) {
557 result = string_metric::detail::normalized_weighted_levenshtein(
558 common::to_string_view(s2_sorted), blockmap_s1_sorted,
559 common::to_string_view(s1_sorted), score_cutoff);
560 }
561 else {
562 result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
563 }
564
565 // string length sect+ab <-> sect and sect+ba <-> sect
566 size_t sect_ab_len = sect_len + !!sect_len + ab_len;
567 size_t sect_ba_len = sect_len + !!sect_len + ba_len;
568
569 auto cutoff_distance = common::score_cutoff_to_distance(score_cutoff, ab_len + ba_len);
570 size_t dist =
571 string_metric::levenshtein(diff_ab_joined, diff_ba_joined, {1, 1, 2}, cutoff_distance);
572 if (dist != (std::size_t)-1) {
573 result =
574 std::max(result, common::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
575 }
576
577 // exit early since the other ratios are 0
578 if (!sect_len) {
579 return result;
580 }
581
582 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
583 // since only sect is similar in them the distance can be calculated based on
584 // the length difference
585 std::size_t sect_ab_dist = !!sect_len + ab_len;
586 percent sect_ab_ratio =
587 common::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
588
589 std::size_t sect_ba_dist = !!sect_len + ba_len;
590 percent sect_ba_ratio =
591 common::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
592
593 return std::max({result, sect_ab_ratio, sect_ba_ratio});
594 }
595 } // namespace detail
596
597 template <typename Sentence1>
598 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const599 double CachedTokenRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
600 {
601 return detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, s2, score_cutoff);
602 }
603
604 /**********************************************
605 * partial_token_ratio
606 *********************************************/
607
608 template <typename Sentence1, typename Sentence2, typename CharT1, typename CharT2>
partial_token_ratio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)609 percent partial_token_ratio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
610 {
611 if (score_cutoff > 100) return 0;
612
613 auto tokens_a = common::sorted_split(s1);
614 auto tokens_b = common::sorted_split(s2);
615
616 auto decomposition = common::set_decomposition(tokens_a, tokens_b);
617
618 // exit early when there is a common word in both sequences
619 if (!decomposition.intersection.empty()) return 100;
620
621 auto diff_ab = decomposition.difference_ab;
622 auto diff_ba = decomposition.difference_ba;
623
624 percent result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
625
626 // do not calculate the same partial_ratio twice
627 if (tokens_a.word_count() == diff_ab.word_count() &&
628 tokens_b.word_count() == diff_ba.word_count()) {
629 return result;
630 }
631
632 score_cutoff = std::max(score_cutoff, result);
633 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
634 }
635
636 namespace detail {
637 template <typename CharT1, typename Sentence2>
partial_token_ratio(const std::basic_string<CharT1> & s1_sorted,const SplittedSentenceView<CharT1> & tokens_s1,const Sentence2 & s2,percent score_cutoff)638 percent partial_token_ratio(const std::basic_string<CharT1>& s1_sorted,
639 const SplittedSentenceView<CharT1>& tokens_s1, const Sentence2& s2,
640 percent score_cutoff)
641 {
642 if (score_cutoff > 100) return 0;
643
644 auto tokens_b = common::sorted_split(s2);
645
646 auto decomposition = common::set_decomposition(tokens_s1, tokens_b);
647
648 // exit early when there is a common word in both sequences
649 if (!decomposition.intersection.empty()) return 100;
650
651 auto diff_ab = decomposition.difference_ab;
652 auto diff_ba = decomposition.difference_ba;
653
654 percent result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
655
656 // do not calculate the same partial_ratio twice
657 if (tokens_s1.word_count() == diff_ab.word_count() &&
658 tokens_b.word_count() == diff_ba.word_count()) {
659 return result;
660 }
661
662 score_cutoff = std::max(score_cutoff, result);
663 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
664 }
665
666 } // namespace detail
667
668 template <typename Sentence1>
CachedPartialTokenRatio(const Sentence1 & s1)669 CachedPartialTokenRatio<Sentence1>::CachedPartialTokenRatio(const Sentence1& s1)
670 : tokens_s1(common::sorted_split(s1))
671 {
672 s1_sorted = tokens_s1.join();
673 }
674
675 template <typename Sentence1>
676 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const677 double CachedPartialTokenRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
678 {
679 return detail::partial_token_ratio(s1_sorted, tokens_s1, s2, score_cutoff);
680 }
681
682 /**********************************************
683 * WRatio
684 *********************************************/
685
686 template <typename Sentence1, typename Sentence2>
WRatio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)687 percent WRatio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
688 {
689 if (score_cutoff > 100) return 0;
690
691 constexpr double UNBASE_SCALE = 0.95;
692
693 auto s1_view = common::to_string_view(s1);
694 auto s2_view = common::to_string_view(s2);
695
696 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
697 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
698 if (s1_view.empty() || s2_view.empty()) {
699 return 0;
700 }
701
702 size_t len_a = s1_view.length();
703 size_t len_b = s2_view.length();
704 double len_ratio = (len_a > len_b) ? static_cast<double>(len_a) / static_cast<double>(len_b)
705 : static_cast<double>(len_b) / static_cast<double>(len_a);
706
707 percent end_ratio = ratio(s1, s2, score_cutoff);
708
709 if (len_ratio < 1.5) {
710 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
711 return std::max(end_ratio, token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE);
712 }
713
714 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
715
716 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
717 end_ratio = std::max(end_ratio, partial_ratio(s1, s2, score_cutoff) * PARTIAL_SCALE);
718
719 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
720 return std::max(end_ratio,
721 partial_token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE * PARTIAL_SCALE);
722 }
723
724 template <typename Sentence1>
CachedWRatio(const Sentence1 & s1)725 CachedWRatio<Sentence1>::CachedWRatio(const Sentence1& s1)
726 : cached_partial_ratio(s1), tokens_s1(common::sorted_split(s1))
727 {
728 s1_view = common::to_string_view(s1);
729 s1_sorted = tokens_s1.join();
730
731 blockmap_s1_sorted.insert(common::to_string_view(s1_sorted));
732 }
733
734 template <typename Sentence1>
735 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const736 double CachedWRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
737 {
738 if (score_cutoff > 100) return 0;
739
740 constexpr double UNBASE_SCALE = 0.95;
741
742 auto s2_view = common::to_string_view(s2);
743
744 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
745 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
746 if (s1_view.empty() || s2_view.empty()) {
747 return 0;
748 }
749
750 size_t len_a = s1_view.length();
751 size_t len_b = s2_view.length();
752 double len_ratio = (len_a > len_b) ? static_cast<double>(len_a) / static_cast<double>(len_b)
753 : static_cast<double>(len_b) / static_cast<double>(len_a);
754
755 percent end_ratio = cached_partial_ratio.cached_ratio.ratio(s2_view, score_cutoff);
756
757 if (len_ratio < 1.5) {
758 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
759 // use pre calculated values
760 auto r =
761 detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, s2_view, score_cutoff);
762 return std::max(end_ratio, r * UNBASE_SCALE);
763 }
764
765 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
766
767 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
768 end_ratio =
769 std::max(end_ratio, cached_partial_ratio.ratio(s2_view, score_cutoff) * PARTIAL_SCALE);
770
771 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
772 auto r = detail::partial_token_ratio(s1_sorted, tokens_s1, s2_view, score_cutoff);
773 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
774 }
775
776 /**********************************************
777 * QRatio
778 *********************************************/
779
780 template <typename Sentence1, typename Sentence2>
QRatio(const Sentence1 & s1,const Sentence2 & s2,percent score_cutoff)781 percent QRatio(const Sentence1& s1, const Sentence2& s2, percent score_cutoff)
782 {
783 auto s1_view = common::to_string_view(s1);
784 auto s2_view = common::to_string_view(s2);
785
786 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
787 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
788 if (s1_view.empty() || s2_view.empty()) {
789 return 0;
790 }
791
792 return ratio(s1_view, s2_view, score_cutoff);
793 }
794
795 template <typename Sentence1>
796 template <typename Sentence2>
ratio(const Sentence2 & s2,percent score_cutoff) const797 double CachedQRatio<Sentence1>::ratio(const Sentence2& s2, percent score_cutoff) const
798 {
799 auto s2_view = common::to_string_view(s2);
800
801 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
802 * see https://github.com/maxbachmann/RapidFuzz/issues/110 */
803 if (s1_view.empty() || s2_view.empty()) {
804 return 0;
805 }
806
807 return cached_ratio.ratio(s2_view, score_cutoff);
808 }
809
810 } // namespace fuzz
811 } // namespace rapidfuzz
812