1/* 2 * Copyright (C) 2011 Collabora Ltd. 3 * 4 * This library is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License as published by 6 * the Free Software Foundation, either version 2.1 of the License, or 7 * (at your option) any later version. 8 * 9 * This library is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public License 15 * along with this library. If not, see <http://www.gnu.org/licenses/>. 16 * 17 * Authors: 18 * Raul Gutierrez Segales <raul.gutierrez.segales@collabora.co.uk> 19 */ 20 21using Gee; 22 23/** 24 * Likely-ness of a potential match. 25 * 26 * Note that the order should be maintained. 27 * 28 * @since 0.5.0 29 */ 30public enum Folks.MatchResult 31{ 32 /** 33 * Zero likelihood of a match. 34 * 35 * This is used in situations where two individuals should never be linked, 36 * such as when one of them has a {@link Individual.trust_level} of 37 * {@link TrustLevel.NONE}, or when the individuals are explicitly 38 * anti-linked. 39 * 40 * @since 0.6.8 41 */ 42 NONE = -1, 43 44 /** 45 * Very low likelihood of a match. 46 */ 47 VERY_LOW = 0, 48 49 /** 50 * Low likelihood of a match. 51 */ 52 LOW = 1, 53 54 /** 55 * Medium likelihood of a match. 56 */ 57 MEDIUM = 2, 58 59 /** 60 * High likelihood of a match. 61 */ 62 HIGH = 3, 63 64 /** 65 * Very high likelihood of a match. 66 */ 67 VERY_HIGH = 4, 68 69 /** 70 * Minimum likelihood of a match. 71 */ 72 MIN = NONE, 73 74 /** 75 * Maximum likelihood of a match. 76 */ 77 MAX = VERY_HIGH 78} 79 80/** 81 * Match calculator for pairs of individuals. 82 * 83 * This provides functionality to explore the degree of a potential match 84 * between two individuals. It compares the similarity of the individuals' 85 * properties to determine how likely it is that the individuals represent the 86 * same physical person. 87 * 88 * This can be used by folks clients to, for example, present suggestions of 89 * pairs of individuals which should be linked by the user. 90 * 91 * @since 0.5.0 92 */ 93public class Folks.PotentialMatch : Object 94{ 95 private Folks.Individual _individual_a; 96 private Folks.Individual _individual_b; 97 98 /** 99 * A set of e-mail addresses known to be aliases of each other, such as 100 * various forms of administrator address. 101 * 102 * @since 0.5.1 103 */ 104 public static Set<string> known_email_aliases = new SmallSet<string> (); 105 106 private static double _DIST_THRESHOLD = 0.70; 107 private const string _SEPARATORS = "._-+"; 108 109 static construct 110 { 111 PotentialMatch.known_email_aliases.add ("admin"); 112 PotentialMatch.known_email_aliases.add ("abuse"); 113 PotentialMatch.known_email_aliases.add ("webmaster"); 114 } 115 116 /** 117 * Create a new PotentialMatch. 118 * 119 * @return a new PotentialMatch 120 * 121 * @since 0.5.0 122 */ 123 public PotentialMatch () 124 { 125 base (); 126 } 127 128 /** 129 * Whether two individuals are likely to be the same person. 130 * 131 * @param a an individual to compare 132 * @param b another individual to compare 133 * 134 * @since 0.5.0 135 */ 136 public MatchResult potential_match (Individual a, Individual b) 137 { 138 this._individual_a = a; 139 this._individual_b = b; 140 MatchResult result = MatchResult.MIN; 141 142 /* Immediately discount a match if either of the individuals can't be 143 * trusted (e.g. due to containing link-local XMPP personas, which can be 144 * spoofed). */ 145 if (a.trust_level == TrustLevel.NONE || b.trust_level == TrustLevel.NONE) 146 { 147 return result; 148 } 149 150 /* Similarly, immediately discount a match if the individuals have been 151 * anti-linked by the user. */ 152 if (a.has_anti_link_with_individual (b)) 153 { 154 return result; 155 } 156 157 result = MatchResult.VERY_LOW; 158 159 /* If individuals share gender. */ 160 if (this._individual_a.gender != Gender.UNSPECIFIED && 161 this._individual_b.gender != Gender.UNSPECIFIED && 162 this._individual_a.gender != this._individual_b.gender) 163 { 164 return result; 165 } 166 167 /* If individuals share common im-addresses */ 168 result = this._inspect_im_addresses (result); 169 if (result == MatchResult.MAX) 170 return result; 171 172 /* If individuals share common e-mails */ 173 result = this._inspect_emails (result); 174 if (result == MatchResult.MAX) 175 return result; 176 177 /* If individuals share common phone numbers */ 178 result = this._inspect_phone_numbers (result); 179 if (result == MatchResult.MAX) 180 return result; 181 182 /* they have the same (normalised) name? */ 183 result = this._name_similarity (result); 184 if (result == MatchResult.MAX) 185 return result; 186 187 return result; 188 } 189 190 private MatchResult _inspect_phone_numbers (MatchResult old_result) 191 { 192 var set_a = this._individual_a.phone_numbers; 193 var set_b = this._individual_b.phone_numbers; 194 195 foreach (var phone_fd_a in set_a) 196 { 197 foreach (var phone_fd_b in set_b) 198 { 199 if (phone_fd_a.values_equal (phone_fd_b)) 200 { 201 return MatchResult.HIGH; 202 } 203 } 204 } 205 206 return old_result; 207 } 208 209 /* Approach: 210 * - taking in account family, given, prefix, suffix and additional names 211 * we give some points for each non-empty match 212 * 213 * @since 0.5.0 214 */ 215 private MatchResult _name_similarity (MatchResult old_result) 216 { 217 double similarity = 0.0; 218 bool exact_match = false; 219 220 if (this._look_alike (this._individual_a.nickname, 221 this._individual_b.nickname)) 222 { 223 similarity += 0.20; 224 } 225 226 if (this._look_alike_or_identical (this._individual_a.full_name, 227 this._individual_b.full_name, 228 out exact_match) || 229 this._look_alike_or_identical (this._individual_a.alias, 230 this._individual_b.full_name, 231 out exact_match) || 232 this._look_alike_or_identical (this._individual_a.full_name, 233 this._individual_b.alias, 234 out exact_match) || 235 this._look_alike_or_identical (this._individual_a.alias, 236 this._individual_b.alias, 237 out exact_match)) 238 { 239 similarity += 0.70; 240 } 241 242 var _a = this._individual_a.structured_name; 243 var _b = this._individual_b.structured_name; 244 245 if (_a != null && _b != null) 246 { 247 var a = (!) _a; 248 var b = (!) _b; 249 250 if (a.is_empty () == false && a.equal (b)) 251 { 252 return MatchResult.HIGH; 253 } 254 255 if (Folks.Utils._str_equal_safe (a.given_name, b.given_name)) 256 similarity += 0.20; 257 258 if (this._look_alike (a.family_name, b.family_name) && 259 this._look_alike (a.given_name, b.given_name)) 260 { 261 similarity += 0.40; 262 } 263 264 if (Folks.Utils._str_equal_safe (a.additional_names, 265 b.additional_names)) 266 similarity += 0.5; 267 268 if (Folks.Utils._str_equal_safe (a.prefixes, b.prefixes)) 269 similarity += 0.5; 270 271 if (Folks.Utils._str_equal_safe (a.suffixes, b.suffixes)) 272 similarity += 0.5; 273 } 274 275 debug ("[name_similarity] Got %f\n", similarity); 276 277 if (similarity >= PotentialMatch._DIST_THRESHOLD) 278 { 279 int inc = 2; 280 /* We need exact matches to go to at least HIGH, or otherwise its 281 not possible to get a HIGH match for e.g. a facebook telepathy 282 persona, where alias is the only piece of information 283 available */ 284 if (exact_match) 285 inc += 1; 286 return this._inc_match_level (old_result, inc); 287 } 288 289 return old_result; 290 } 291 292 /** 293 * Number of equal IM addresses between two individuals. 294 * 295 * This compares the addresses without comparing their associated protocols. 296 * 297 * @since 0.5.0 298 */ 299 private MatchResult _inspect_im_addresses (MatchResult old_result) 300 { 301 var addrs = new HashSet<string> (); 302 303 foreach (var im_a in this._individual_a.im_addresses.get_values ()) 304 { 305 addrs.add (im_a.value); 306 } 307 308 foreach (var im_b in this._individual_b.im_addresses.get_values ()) 309 { 310 if (addrs.contains (im_b.value) == true) 311 { 312 return MatchResult.HIGH; 313 } 314 } 315 316 return old_result; 317 } 318 319 /** 320 * Inspect email addresses. 321 * 322 * @since 0.5.0 323 */ 324 private MatchResult _inspect_emails (MatchResult old_result) 325 { 326 var set_a = this._individual_a.email_addresses; 327 var set_b = this._individual_b.email_addresses; 328 MatchResult result = old_result; 329 330 foreach (var fd_a in set_a) 331 { 332 string[] email_split_a = fd_a.value.split ("@"); 333 334 /* Sanity check for valid e-mail addresses. */ 335 if (email_split_a.length < 2) 336 { 337 warning ("Invalid e-mail address when looking for potential " + 338 "match: %s", fd_a.value); 339 continue; 340 } 341 342 string[] tokens_a = 343 email_split_a[0].split_set (PotentialMatch._SEPARATORS); 344 345 foreach (var fd_b in set_b) 346 { 347 string[] email_split_b = fd_b.value.split ("@"); 348 349 /* Sanity check for valid e-mail addresses. */ 350 if (email_split_b.length < 2) 351 { 352 warning ("Invalid e-mail address when looking for " + 353 "potential match: %s", fd_b.value); 354 continue; 355 } 356 357 if (fd_a.value == fd_b.value) 358 { 359 if (PotentialMatch.known_email_aliases.contains 360 (email_split_a[0]) == true) 361 { 362 if (result < MatchResult.HIGH) 363 { 364 result = MatchResult.LOW; 365 } 366 } 367 else 368 { 369 return MatchResult.HIGH; 370 } 371 } 372 else 373 { 374 string[] tokens_b = 375 email_split_b[0].split_set (PotentialMatch._SEPARATORS); 376 377 /* Do we have: first.middle.last@ ~= fml@ ? */ 378 if (this._check_initials_expansion (tokens_a, tokens_b)) 379 { 380 result = MatchResult.MEDIUM; 381 } 382 /* So we have split the user part of the e-mail 383 * address into tokens. Lets see if there is some 384 * matches between tokens. 385 * As in: first.middle.last@ ~= [first,middle,..]@ */ 386 else if (this._match_tokens (tokens_a, tokens_b)) 387 { 388 result = MatchResult.MEDIUM; 389 } 390 } 391 } 392 } 393 394 return result; 395 } 396 397 /* We are after: 398 * you.are.someone@ =~ yas@ 399 */ 400 private bool _check_initials_expansion (string[] tokens_a, string[] tokens_b) 401 { 402 if (tokens_a.length > tokens_b.length && 403 tokens_b.length == 1) 404 { 405 return this._do_check_initials_expansion (tokens_a, tokens_b[0]); 406 } 407 else if (tokens_b.length > tokens_a.length && 408 tokens_a.length == 1) 409 { 410 return this._do_check_initials_expansion (tokens_b, tokens_a[0]); 411 } 412 return false; 413 } 414 415 private bool _do_check_initials_expansion (string[] expanded_name, 416 string initials) 417 { 418 if (expanded_name.length != initials.length) 419 return false; 420 421 for (int i=0; i<expanded_name.length; i++) 422 { 423 if (expanded_name[i][0] != initials[i]) 424 return false; 425 } 426 427 return true; 428 } 429 430 /* 431 * We should probably count how many tokens matched? 432 */ 433 private bool _match_tokens (string[] tokens_a, string[] tokens_b) 434 { 435 /* To find matching items from 2 sets its more efficient 436 * to make the outer loop go with the smaller set. */ 437 if (tokens_a.length > tokens_b.length) 438 return this._do_match_tokens (tokens_a, tokens_b); 439 else 440 return this._do_match_tokens (tokens_b, tokens_a); 441 } 442 443 private bool _do_match_tokens (string[] bigger_set, string[] smaller_set) 444 { 445 for (var i=0; i < smaller_set.length; i++) 446 { 447 for (var j=0; j < bigger_set.length; j++) 448 { 449 if (smaller_set[i] == bigger_set[j]) 450 return true; 451 } 452 } 453 454 return false; 455 } 456 457 private MatchResult _inc_match_level ( 458 MatchResult current_level, int times = 1) 459 { 460 MatchResult ret = current_level + times; 461 if (ret > MatchResult.MAX) 462 ret = MatchResult.MAX; 463 464 return ret; 465 } 466 467 private bool _look_alike_or_identical (string? a, string? b, out bool exact) 468 { 469 exact = false; 470 if (a == null || a == "" || b == null || b == "") 471 { 472 return false; 473 } 474 475 return_val_if_fail (a.validate (), false); 476 return_val_if_fail (b.validate (), false); 477 478 var a_stripped = this._strip_string ((!) a); 479 var b_stripped = this._strip_string ((!) b); 480 481 var jaro_dist = this._jaro_dist (a_stripped, b_stripped); 482 483 // a and b match exactly iff their Jaro distance is 1. 484 if (jaro_dist == 1.0) 485 { 486 exact = true; 487 return true; 488 } 489 490 // a and b look alike if their Jaro distance is over the threshold. 491 return (jaro_dist >= PotentialMatch._DIST_THRESHOLD); 492 } 493 494 private bool _look_alike (string? a, string? b) 495 { 496 if (a == null || a == "" || b == null || b == "") 497 { 498 return false; 499 } 500 501 return_val_if_fail (a.validate (), false); 502 return_val_if_fail (b.validate (), false); 503 504 var a_stripped = this._strip_string ((!) a); 505 var b_stripped = this._strip_string ((!) b); 506 507 // a and b look alike if their Jaro distance is over the threshold. 508 return (this._jaro_dist (a_stripped, b_stripped) >= PotentialMatch._DIST_THRESHOLD); 509 } 510 511 /* Based on: 512 * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance 513 * 514 * d = 1/3 * ( m/|s1| + m/|s2| + (m - t)/m ) 515 * 516 * where 517 * 518 * m = matching characters 519 * t = number of transpositions 520 */ 521 private double _jaro_dist (unichar[] s1, unichar[] s2) 522 { 523 double distance; 524 int max = s1.length > s2.length ? s1.length : s2.length; 525 int max_dist = (max / 2) - 1; 526 double t; 527 double m = (double) this._matches (s1, s2, max_dist, out t); 528 double len_s1 = (double) s1.length; 529 double len_s2 = (double) s2.length; 530 double a = m / len_s1; 531 double b = m / len_s2; 532 double c = 0; 533 534 if ((int) m > 0) 535 c = (m - t) / m; 536 537 distance = (1.0/3.0) * (a + b + c); 538 539 debug ("Jaro distance: %f (a = %f, b = %f, c = %f)", distance, a, b, c); 540 541 return distance; 542 } 543 544 /** 545 * stripped_char: 546 * 547 * Returns a stripped version of @ch, removing any case, accentuation 548 * mark, or any special mark on it. 549 * 550 * Copied from Empathy's libempathy-gtk/empathy-live-search.c. 551 * 552 * Copyright (C) 2010 Collabora Ltd. 553 * Copyright (C) 2007-2010 Nokia Corporation. 554 * 555 * Authors: Felix Kaser <felix.kaser@collabora.co.uk> 556 * Xavier Claessens <xavier.claessens@collabora.co.uk> 557 * Claudio Saavedra <csaavedra@igalia.com> 558 */ 559 private unichar _stripped_char (unichar ch) 560 { 561 unichar retval[1] = { 0 }; 562 var utype = ch.type (); 563 564 switch (utype) 565 { 566 case UnicodeType.CONTROL: 567 case UnicodeType.FORMAT: 568 case UnicodeType.UNASSIGNED: 569 case UnicodeType.NON_SPACING_MARK: 570 case UnicodeType.COMBINING_MARK: 571 case UnicodeType.ENCLOSING_MARK: 572 /* Ignore those */ 573 break; 574 case UnicodeType.DECIMAL_NUMBER: 575 case UnicodeType.LETTER_NUMBER: 576 case UnicodeType.OTHER_NUMBER: 577 case UnicodeType.CONNECT_PUNCTUATION: 578 case UnicodeType.DASH_PUNCTUATION: 579 case UnicodeType.CLOSE_PUNCTUATION: 580 case UnicodeType.FINAL_PUNCTUATION: 581 case UnicodeType.INITIAL_PUNCTUATION: 582 case UnicodeType.OTHER_PUNCTUATION: 583 case UnicodeType.OPEN_PUNCTUATION: 584 case UnicodeType.CURRENCY_SYMBOL: 585 case UnicodeType.MODIFIER_SYMBOL: 586 case UnicodeType.MATH_SYMBOL: 587 case UnicodeType.OTHER_SYMBOL: 588 case UnicodeType.LINE_SEPARATOR: 589 case UnicodeType.PARAGRAPH_SEPARATOR: 590 case UnicodeType.SPACE_SEPARATOR: 591 /* Replace punctuation with spaces. */ 592 retval[0] = ' '; 593 break; 594 case UnicodeType.PRIVATE_USE: 595 case UnicodeType.SURROGATE: 596 case UnicodeType.LOWERCASE_LETTER: 597 case UnicodeType.MODIFIER_LETTER: 598 case UnicodeType.OTHER_LETTER: 599 case UnicodeType.TITLECASE_LETTER: 600 case UnicodeType.UPPERCASE_LETTER: 601 default: 602 ch = ch.tolower (); 603 ch.fully_decompose (false, retval); 604 break; 605 } 606 607 return retval[0]; 608 } 609 610 private unichar[] _strip_string (string s) 611 { 612 int next_idx = 0; 613 uint write_idx = 0; 614 unichar ch = 0; 615 unichar[] output = new unichar[s.length]; // this is a safe overestimate 616 617 while (s.get_next_char (ref next_idx, out ch)) 618 { 619 ch = this._stripped_char (ch); 620 if (ch != 0) 621 { 622 output[write_idx++] = ch; 623 } 624 } 625 626 output.length = (int) write_idx; 627 return output; 628 } 629 630 /* Calculate matches and transpositions as defined by the Jaro distance. 631 */ 632 private int _matches (unichar[] s1, unichar[] s2, int max_dist, out double t) 633 { 634 int matches = 0; 635 t = 0.0; 636 var len_s1 = s1.length; 637 638 unichar look_for = 0; 639 640 for (uint idx = 0; idx < len_s1 && (look_for = s1[idx]) != 0; idx++) 641 { 642 int contains = this._contains (s2, look_for, idx, max_dist); 643 if (contains >= 0) 644 { 645 matches++; 646 if (contains > 0) 647 t += 1.0; 648 } 649 } 650 651 debug ("%d matches and %f / 2 transpositions", matches, t); 652 653 t = t / 2.0; 654 return matches; 655 } 656 657 /* If haystack contains c in pos return 0, if it contains 658 * it within the bounds of max_dist return abs(pos-pos_found). 659 * If its not found, return -1. 660 * 661 * pos and max_dist are both in unichars. 662 * 663 * Note: haystack must have been validated using haystack.validate() before 664 * being passed to this method. */ 665 private int _contains (unichar[] haystack, unichar c, uint pos, uint max_dist) 666 { 667 var haystack_len = haystack.length; /* in unichars */ 668 669 if (pos < haystack_len && haystack[pos] == c) 670 return 0; 671 672 uint idx = ((int) pos - (int) max_dist).clamp (0, haystack_len - 1); 673 unichar ch = 0; 674 675 while (idx < pos + max_dist && idx < haystack_len && 676 (ch = haystack[idx]) != 0) 677 { 678 if (ch == c) 679 return ((int) pos - (int) idx).abs (); 680 681 idx++; 682 } 683 684 return -1; 685 } 686} 687