1 //! This library implements string similarity metrics.
2
3 #![forbid(unsafe_code)]
4
5 use std::char;
6 use std::cmp::{max, min};
7 use std::collections::HashMap;
8 use std::error::Error;
9 use std::fmt::{self, Display, Formatter};
10 use std::hash::Hash;
11 use std::str::Chars;
12
/// Error returned when a similarity metric cannot be computed for the given
/// arguments.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum StrSimError {
    /// The two input sequences do not have the same number of elements.
    DifferentLengthArgs,
}

impl Display for StrSimError {
    /// Writes the human-readable message that belongs to the error variant.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        let text = match self {
            StrSimError::DifferentLengthArgs => "Differing length arguments provided",
        };

        f.write_str(text)
    }
}

// The default method implementations of `Error` are sufficient here.
impl Error for StrSimError {}

/// Outcome of a Hamming distance computation: the distance on success, or a
/// [`StrSimError`] when the inputs cannot be compared.
pub type HammingResult = Result<usize, StrSimError>;
31
32 /// Calculates the number of positions in the two sequences where the elements
33 /// differ. Returns an error if the sequences have different lengths.
generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult where Iter1: IntoIterator<Item=Elem1>, Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>34 pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult
35 where Iter1: IntoIterator<Item=Elem1>,
36 Iter2: IntoIterator<Item=Elem2>,
37 Elem1: PartialEq<Elem2> {
38 let (mut ita, mut itb) = (a.into_iter(), b.into_iter());
39 let mut count = 0;
40 loop {
41 match (ita.next(), itb.next()){
42 (Some(x), Some(y)) => if x != y { count += 1 },
43 (None, None) => return Ok(count),
44 _ => return Err(StrSimError::DifferentLengthArgs),
45 }
46 }
47 }
48
49 /// Calculates the number of positions in the two strings where the characters
50 /// differ. Returns an error if the strings have different lengths.
51 ///
52 /// ```
53 /// use strsim::{hamming, StrSimError::DifferentLengthArgs};
54 ///
55 /// assert_eq!(Ok(3), hamming("hamming", "hammers"));
56 ///
57 /// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham"));
58 /// ```
hamming(a: &str, b: &str) -> HammingResult59 pub fn hamming(a: &str, b: &str) -> HammingResult {
60 generic_hamming(a.chars(), b.chars())
61 }
62
/// Calculates the Jaro similarity between two sequences. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// Two empty sequences yield `1.0`; if exactly one of them is empty the
/// result is `0.0`.
///
/// ```
/// use strsim::generic_jaro;
///
/// assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
/// ```
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    }

    // Saturating subtraction: when both sequences hold a single element the
    // halved length is 0 and a plain `- 1` would underflow.
    let search_range = (max(a_len, b_len) / 2).saturating_sub(1);

    // `b_consumed[j]` is set once element `j` of `b` has been matched, so that
    // it cannot be matched a second time.
    let mut b_consumed = vec![false; b_len];
    let mut matches = 0.0;

    let mut transpositions = 0.0;
    let mut b_match_index = 0;

    for (i, a_elem) in a.into_iter().enumerate() {
        // Inclusive window of positions in `b` that element `i` may match.
        let min_bound = i.saturating_sub(search_range);
        let max_bound = min(b_len - 1, i + search_range);

        // Happens when `a` is so much longer than `b` that the window has
        // moved past the end of `b`.
        if min_bound > max_bound {
            continue;
        }

        let window = b
            .into_iter()
            .enumerate()
            .skip(min_bound)
            .take(max_bound - min_bound + 1);

        for (j, b_elem) in window {
            if a_elem == b_elem && !b_consumed[j] {
                b_consumed[j] = true;
                matches += 1.0;

                // A match located before the previously matched position in
                // `b` is counted as one transposition.
                // NOTE(review): the usual Jaro definition halves the number of
                // out-of-order matched pairs instead -- confirm that this
                // variant is intended before changing it; the expected values
                // in the tests depend on it.
                if j < b_match_index {
                    transpositions += 1.0;
                }
                b_match_index = j;

                break;
            }
        }
    }

    if matches == 0.0 {
        0.0
    } else {
        (1.0 / 3.0) * ((matches / a_len as f64) +
                       (matches / b_len as f64) +
                       ((matches - transpositions) / matches))
    }
}
132
/// Thin wrapper around a borrowed string. A shared reference to it can be
/// turned into an iterator over the string's characters, which is the shape
/// of input the `generic_*` functions taking `&Iter` arguments ask for.
struct StringWrapper<'a>(&'a str);

impl<'w, 's> IntoIterator for &'w StringWrapper<'s> {
    type Item = char;
    type IntoIter = Chars<'s>;

    /// Starts a fresh pass over the characters of the wrapped string.
    fn into_iter(self) -> Self::IntoIter {
        let &StringWrapper(text) = self;
        text.chars()
    }
}
143
144 /// Calculates the Jaro similarity between two strings. The returned value
145 /// is between 0.0 and 1.0 (higher value means more similar).
146 ///
147 /// ```
148 /// use strsim::jaro;
149 ///
150 /// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
151 /// 0.001);
152 /// ```
jaro(a: &str, b: &str) -> f64153 pub fn jaro(a: &str, b: &str) -> f64 {
154 generic_jaro(&StringWrapper(a), &StringWrapper(b))
155 }
156
157 /// Like Jaro but gives a boost to sequences that have a common prefix.
generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 where &'a Iter1: IntoIterator<Item=Elem1>, &'b Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>158 pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
159 where &'a Iter1: IntoIterator<Item=Elem1>,
160 &'b Iter2: IntoIterator<Item=Elem2>,
161 Elem1: PartialEq<Elem2> {
162 let jaro_distance = generic_jaro(a, b);
163
164 // Don't limit the length of the common prefix
165 let prefix_length = a.into_iter()
166 .zip(b.into_iter())
167 .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem)
168 .count();
169
170 let jaro_winkler_distance =
171 jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));
172
173 if jaro_winkler_distance <= 1.0 {
174 jaro_winkler_distance
175 } else {
176 1.0
177 }
178 }
179
180 /// Like Jaro but gives a boost to strings that have a common prefix.
181 ///
182 /// ```
183 /// use strsim::jaro_winkler;
184 ///
185 /// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
186 /// 0.001);
187 /// ```
jaro_winkler(a: &str, b: &str) -> f64188 pub fn jaro_winkler(a: &str, b: &str) -> f64 {
189 generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
190 }
191
/// Computes the Levenshtein distance: the smallest number of insertions,
/// deletions, and substitutions that turns one sequence into the other.
///
/// ```
/// use strsim::generic_levenshtein;
///
/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6]));
/// ```
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let b_len = b.into_iter().count();

    // One rolling row of the distance table. Before the first pass, `row[j]`
    // is the distance from an empty sequence to the first `j + 1` elements
    // of `b`.
    let mut row: Vec<usize> = (1..=b_len).collect();

    // If `a` is empty the outer loop never runs and this value is returned.
    let mut latest = b_len;

    for (i, a_elem) in a.into_iter().enumerate() {
        latest = i + 1;
        // Value of the cell diagonally up-left of the one being computed.
        let mut diagonal = i;

        for (b_elem, cell) in b.into_iter().zip(row.iter_mut()) {
            let substituted = if a_elem == b_elem { diagonal } else { diagonal + 1 };
            // `*cell` still holds the previous row's value for this column.
            diagonal = *cell;
            latest = min(latest + 1, min(substituted, diagonal + 1));
            *cell = latest;
        }
    }

    latest
}
227
228 /// Calculates the minimum number of insertions, deletions, and substitutions
229 /// required to change one string into the other.
230 ///
231 /// ```
232 /// use strsim::levenshtein;
233 ///
234 /// assert_eq!(3, levenshtein("kitten", "sitting"));
235 /// ```
levenshtein(a: &str, b: &str) -> usize236 pub fn levenshtein(a: &str, b: &str) -> usize {
237 generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
238 }
239
240 /// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
241 /// 1.0 (inclusive), where 1.0 means the strings are the same.
242 ///
243 /// ```
244 /// use strsim::normalized_levenshtein;
245 ///
246 /// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
247 /// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
248 /// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
249 /// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
250 /// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
251 /// ```
normalized_levenshtein(a: &str, b: &str) -> f64252 pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
253 if a.is_empty() && b.is_empty() {
254 return 1.0;
255 }
256 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
257 }
258
/// Optimal string alignment distance: Levenshtein extended with swaps of two
/// adjacent characters, where each substring can only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
    let a_len = a.chars().count();
    let b_len = b.chars().count();

    if a == b {
        return 0;
    }
    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    // Three rows of the distance table: the one being filled and the two
    // before it (the older one is needed for the transposition case).
    let mut two_rows_back: Vec<usize> = (0..=b_len).collect();
    let mut one_row_back: Vec<usize> = two_rows_back.clone();
    let mut row: Vec<usize> = vec![0; b_len + 1];

    // Placeholders; they are only read once `i > 0 && j > 0`, by which time
    // real characters have been stored.
    let mut last_a_char = char::MAX;
    let mut last_b_char = char::MAX;

    for (i, a_char) in a.chars().enumerate() {
        row[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let substitution = one_row_back[j] + if a_char == b_char { 0 } else { 1 };
            let mut best = min(row[j] + 1, min(one_row_back[j + 1] + 1, substitution));

            let swapped_pair = i > 0
                && j > 0
                && a_char != b_char
                && a_char == last_b_char
                && b_char == last_a_char;
            if swapped_pair {
                best = min(best, two_rows_back[j - 1] + 1);
            }

            row[j + 1] = best;
            last_b_char = b_char;
        }

        two_rows_back.clone_from(&one_row_back);
        one_row_back.clone_from(&row);
        last_a_char = a_char;
    }

    row[b_len]
}
312
/// Maps the 2d grid coordinate (`i`, `j`) to its position in the single
/// vector that stores a fixed grid in which consecutive `j` values are
/// `width` entries apart.
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    let row_start = j * width;
    row_start + i
}
318
319 /// Like optimal string alignment, but substrings can be edited an unlimited
320 /// number of times, and the triangle inequality holds.
321 ///
322 /// ```
323 /// use strsim::generic_damerau_levenshtein;
324 ///
325 /// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1]));
326 /// ```
generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize where Elem: Eq + Hash + Clone327 pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
328 where Elem: Eq + Hash + Clone {
329 let a_len = a_elems.len();
330 let b_len = b_elems.len();
331
332 if a_len == 0 { return b_len; }
333 if b_len == 0 { return a_len; }
334
335 let width = a_len + 2;
336 let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
337 let max_distance = a_len + b_len;
338 distances[0] = max_distance;
339
340 for i in 0..(a_len + 1) {
341 distances[flat_index(i + 1, 0, width)] = max_distance;
342 distances[flat_index(i + 1, 1, width)] = i;
343 }
344
345 for j in 0..(b_len + 1) {
346 distances[flat_index(0, j + 1, width)] = max_distance;
347 distances[flat_index(1, j + 1, width)] = j;
348 }
349
350 let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);
351
352 for i in 1..(a_len + 1) {
353 let mut db = 0;
354
355 for j in 1..(b_len + 1) {
356 let k = match elems.get(&b_elems[j - 1]) {
357 Some(&value) => value,
358 None => 0
359 };
360
361 let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
362 let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
363 let transposition_cost = distances[flat_index(k, db, width)] +
364 (i - k - 1) + 1 + (j - db - 1);
365
366 let mut substitution_cost = distances[flat_index(i, j, width)] + 1;
367 if a_elems[i - 1] == b_elems[j - 1] {
368 db = j;
369 substitution_cost -= 1;
370 }
371
372 distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost,
373 min(insertion_cost, min(deletion_cost, transposition_cost)));
374 }
375
376 elems.insert(a_elems[i - 1].clone(), i);
377 }
378
379 distances[flat_index(a_len + 1, b_len + 1, width)]
380 }
381
382 /// Like optimal string alignment, but substrings can be edited an unlimited
383 /// number of times, and the triangle inequality holds.
384 ///
385 /// ```
386 /// use strsim::damerau_levenshtein;
387 ///
388 /// assert_eq!(2, damerau_levenshtein("ab", "bca"));
389 /// ```
damerau_levenshtein(a: &str, b: &str) -> usize390 pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
391 let (x, y): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect());
392 generic_damerau_levenshtein(x.as_slice(), y.as_slice())
393 }
394
395 /// Calculates a normalized score of the Damerau–Levenshtein algorithm between
396 /// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
397 ///
398 /// ```
399 /// use strsim::normalized_damerau_levenshtein;
400 ///
401 /// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
402 /// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
403 /// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
404 /// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
405 /// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
406 /// ```
normalized_damerau_levenshtein(a: &str, b: &str) -> f64407 pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
408 if a.is_empty() && b.is_empty() {
409 return 1.0;
410 }
411 1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
412 }
413
// Unit tests for every public metric. Floating point results are compared
// against an expected value with an absolute tolerance.
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `hamming` succeeds with the distance `dist`.
    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        // `.abs()` is required: without it any result above the expected
        // value would satisfy the comparison.
        assert!((0.818 - jaro("testabctest", "testöঙ香test")).abs() < 0.001);
        assert!((0.818 - jaro("testöঙ香test", "testabctest")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_names() {
        assert!((0.392 - jaro("Friedrich Nietzsche",
                              "Jean-Paul Sartre")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
                0.001);
        assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
        assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
                                      "Fran-Paul Sartre")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
                                    "thequickbrownfoxjumpedovery")).abs() <
                0.001);
    }

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert!(normalized_levenshtein("", "second").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert!(normalized_levenshtein("first", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
    }
}
874