1/*
2 * Copyright (C) 2011 Collabora Ltd.
3 *
4 * This library is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation, either version 2.1 of the License, or
7 * (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Authors:
18 *       Raul Gutierrez Segales <raul.gutierrez.segales@collabora.co.uk>
19 */
20
21using Gee;
22
23/**
24 * Likely-ness of a potential match.
25 *
26 * Note that the order should be maintained.
27 *
28 * @since 0.5.0
29 */
30public enum Folks.MatchResult
31{
32  /**
33   * Zero likelihood of a match.
34   *
35   * This is used in situations where two individuals should never be linked,
36   * such as when one of them has a {@link Individual.trust_level} of
37   * {@link TrustLevel.NONE}, or when the individuals are explicitly
38   * anti-linked.
39   *
40   * @since 0.6.8
41   */
42  NONE = -1,
43
44  /**
45   * Very low likelihood of a match.
46   */
47  VERY_LOW = 0,
48
49  /**
50   * Low likelihood of a match.
51   */
52  LOW = 1,
53
54  /**
55   * Medium likelihood of a match.
56   */
57  MEDIUM = 2,
58
59  /**
60   * High likelihood of a match.
61   */
62  HIGH = 3,
63
64  /**
65   * Very high likelihood of a match.
66   */
67  VERY_HIGH = 4,
68
69  /**
70   * Minimum likelihood of a match.
71   */
72  MIN = NONE,
73
74  /**
75   * Maximum likelihood of a match.
76   */
77  MAX = VERY_HIGH
78}
79
80/**
81 * Match calculator for pairs of individuals.
82 *
83 * This provides functionality to explore the degree of a potential match
84 * between two individuals. It compares the similarity of the individuals'
85 * properties to determine how likely it is that the individuals represent the
86 * same physical person.
87 *
88 * This can be used by folks clients to, for example, present suggestions of
89 * pairs of individuals which should be linked by the user.
90 *
91 * @since 0.5.0
92 */
93public class Folks.PotentialMatch : Object
94{
95  private Folks.Individual _individual_a;
96  private Folks.Individual _individual_b;
97
98  /**
99   * A set of e-mail addresses known to be aliases of each other, such as
100   * various forms of administrator address.
101   *
102   * @since 0.5.1
103   */
104  public static Set<string> known_email_aliases = new SmallSet<string> ();
105
106  private static double _DIST_THRESHOLD = 0.70;
107  private const string _SEPARATORS = "._-+";
108
109  static construct
110    {
111      PotentialMatch.known_email_aliases.add ("admin");
112      PotentialMatch.known_email_aliases.add ("abuse");
113      PotentialMatch.known_email_aliases.add ("webmaster");
114    }
115
116  /**
117   * Create a new PotentialMatch.
118   *
119   * @return a new PotentialMatch
120   *
121   * @since 0.5.0
122   */
123  public PotentialMatch ()
124    {
125      base ();
126    }
127
128  /**
129   * Whether two individuals are likely to be the same person.
130   *
131   * @param a an individual to compare
132   * @param b another individual to compare
133   *
134   * @since 0.5.0
135   */
136  public MatchResult potential_match (Individual a, Individual b)
137    {
138      this._individual_a = a;
139      this._individual_b = b;
140      MatchResult result = MatchResult.MIN;
141
142      /* Immediately discount a match if either of the individuals can't be
143       * trusted (e.g. due to containing link-local XMPP personas, which can be
144       * spoofed). */
145      if (a.trust_level == TrustLevel.NONE || b.trust_level == TrustLevel.NONE)
146        {
147          return result;
148        }
149
150      /* Similarly, immediately discount a match if the individuals have been
151       * anti-linked by the user. */
152      if (a.has_anti_link_with_individual (b))
153        {
154          return result;
155        }
156
157      result = MatchResult.VERY_LOW;
158
159      /* If individuals share gender. */
160      if (this._individual_a.gender != Gender.UNSPECIFIED &&
161          this._individual_b.gender != Gender.UNSPECIFIED &&
162          this._individual_a.gender != this._individual_b.gender)
163        {
164          return result;
165        }
166
167      /* If individuals share common im-addresses */
168      result = this._inspect_im_addresses (result);
169      if (result == MatchResult.MAX)
170        return result;
171
172      /* If individuals share common e-mails */
173      result = this._inspect_emails (result);
174      if (result == MatchResult.MAX)
175        return result;
176
177      /* If individuals share common phone numbers */
178      result = this._inspect_phone_numbers (result);
179      if (result == MatchResult.MAX)
180        return result;
181
182      /* they have the same (normalised) name? */
183      result = this._name_similarity (result);
184      if (result == MatchResult.MAX)
185        return result;
186
187      return result;
188    }
189
190  private MatchResult _inspect_phone_numbers (MatchResult old_result)
191    {
192      var set_a = this._individual_a.phone_numbers;
193      var set_b = this._individual_b.phone_numbers;
194
195      foreach (var phone_fd_a in set_a)
196        {
197          foreach (var phone_fd_b in set_b)
198            {
199              if (phone_fd_a.values_equal (phone_fd_b))
200                {
201                  return MatchResult.HIGH;
202                }
203            }
204        }
205
206      return old_result;
207    }
208
209  /* Approach:
210   * - taking in account family, given, prefix, suffix and additional names
211   *   we give some points for each non-empty match
212   *
213   * @since 0.5.0
214   */
215  private MatchResult _name_similarity (MatchResult old_result)
216    {
217      double similarity = 0.0;
218      bool exact_match = false;
219
220      if (this._look_alike (this._individual_a.nickname,
221              this._individual_b.nickname))
222        {
223          similarity += 0.20;
224        }
225
226      if (this._look_alike_or_identical (this._individual_a.full_name,
227              this._individual_b.full_name,
228              out exact_match) ||
229          this._look_alike_or_identical (this._individual_a.alias,
230              this._individual_b.full_name,
231              out exact_match) ||
232          this._look_alike_or_identical (this._individual_a.full_name,
233              this._individual_b.alias,
234              out exact_match) ||
235          this._look_alike_or_identical (this._individual_a.alias,
236              this._individual_b.alias,
237              out exact_match))
238        {
239          similarity += 0.70;
240        }
241
242      var _a = this._individual_a.structured_name;
243      var _b = this._individual_b.structured_name;
244
245      if (_a != null && _b != null)
246        {
247          var a = (!) _a;
248          var b = (!) _b;
249
250          if (a.is_empty () == false && a.equal (b))
251            {
252              return MatchResult.HIGH;
253            }
254
255          if (Folks.Utils._str_equal_safe (a.given_name, b.given_name))
256            similarity += 0.20;
257
258          if (this._look_alike (a.family_name, b.family_name) &&
259              this._look_alike (a.given_name, b.given_name))
260            {
261              similarity += 0.40;
262            }
263
264          if (Folks.Utils._str_equal_safe (a.additional_names,
265                  b.additional_names))
266            similarity += 0.5;
267
268          if (Folks.Utils._str_equal_safe (a.prefixes, b.prefixes))
269            similarity += 0.5;
270
271          if (Folks.Utils._str_equal_safe (a.suffixes, b.suffixes))
272            similarity += 0.5;
273        }
274
275      debug ("[name_similarity] Got %f\n", similarity);
276
277      if (similarity >= PotentialMatch._DIST_THRESHOLD)
278        {
279          int inc = 2;
280          /* We need exact matches to go to at least HIGH, or otherwise its
281             not possible to get a HIGH match for e.g. a facebook telepathy
282             persona, where alias is the only piece of information
283             available */
284          if (exact_match)
285            inc += 1;
286          return this._inc_match_level (old_result, inc);
287        }
288
289      return old_result;
290    }
291
292  /**
293   * Number of equal IM addresses between two individuals.
294   *
295   * This compares the addresses without comparing their associated protocols.
296   *
297   * @since 0.5.0
298   */
299  private MatchResult _inspect_im_addresses (MatchResult old_result)
300    {
301      var addrs = new HashSet<string> ();
302
303      foreach (var im_a in this._individual_a.im_addresses.get_values ())
304        {
305          addrs.add (im_a.value);
306        }
307
308      foreach (var im_b in this._individual_b.im_addresses.get_values ())
309        {
310          if (addrs.contains (im_b.value) == true)
311            {
312              return MatchResult.HIGH;
313            }
314        }
315
316      return old_result;
317    }
318
319  /**
320   * Inspect email addresses.
321   *
322   * @since 0.5.0
323   */
324  private MatchResult _inspect_emails (MatchResult old_result)
325    {
326      var set_a = this._individual_a.email_addresses;
327      var set_b = this._individual_b.email_addresses;
328      MatchResult result = old_result;
329
330      foreach (var fd_a in set_a)
331        {
332          string[] email_split_a = fd_a.value.split ("@");
333
334          /* Sanity check for valid e-mail addresses. */
335          if (email_split_a.length < 2)
336            {
337              warning ("Invalid e-mail address when looking for potential " +
338                  "match: %s", fd_a.value);
339              continue;
340            }
341
342          string[] tokens_a =
343            email_split_a[0].split_set (PotentialMatch._SEPARATORS);
344
345          foreach (var fd_b in set_b)
346            {
347              string[] email_split_b = fd_b.value.split ("@");
348
349              /* Sanity check for valid e-mail addresses. */
350              if (email_split_b.length < 2)
351                {
352                  warning ("Invalid e-mail address when looking for " +
353                      "potential match: %s", fd_b.value);
354                  continue;
355                }
356
357              if (fd_a.value == fd_b.value)
358                {
359                  if (PotentialMatch.known_email_aliases.contains
360                      (email_split_a[0]) == true)
361                    {
362                      if (result < MatchResult.HIGH)
363                        {
364                          result = MatchResult.LOW;
365                        }
366                    }
367                  else
368                    {
369                      return MatchResult.HIGH;
370                    }
371                }
372              else
373                {
374                  string[] tokens_b =
375                    email_split_b[0].split_set (PotentialMatch._SEPARATORS);
376
377                  /* Do we have: first.middle.last@ ~= fml@ ? */
378                  if (this._check_initials_expansion (tokens_a, tokens_b))
379                    {
380                      result = MatchResult.MEDIUM;
381                    }
382                  /* So we have split the user part of the e-mail
383                   * address into tokens. Lets see if there is some
384                   * matches between tokens.
385                   * As in: first.middle.last@ ~= [first,middle,..]@  */
386                  else if (this._match_tokens (tokens_a, tokens_b))
387                    {
388                      result = MatchResult.MEDIUM;
389                    }
390               }
391            }
392        }
393
394      return result;
395    }
396
397  /* We are after:
398   * you.are.someone@ =~ yas@
399   */
400  private bool _check_initials_expansion (string[] tokens_a, string[] tokens_b)
401    {
402      if (tokens_a.length > tokens_b.length &&
403          tokens_b.length == 1)
404        {
405          return this._do_check_initials_expansion (tokens_a, tokens_b[0]);
406        }
407      else if (tokens_b.length > tokens_a.length &&
408          tokens_a.length == 1)
409        {
410          return this._do_check_initials_expansion (tokens_b, tokens_a[0]);
411        }
412      return false;
413    }
414
415  private bool _do_check_initials_expansion (string[] expanded_name,
416      string initials)
417    {
418      if (expanded_name.length != initials.length)
419        return false;
420
421      for (int i=0; i<expanded_name.length; i++)
422        {
423          if (expanded_name[i][0] != initials[i])
424            return false;
425        }
426
427      return true;
428    }
429
430  /*
431   * We should probably count how many tokens matched?
432   */
433  private bool _match_tokens (string[] tokens_a, string[] tokens_b)
434    {
435      /* To find matching items from 2 sets its more efficient
436       * to make the outer loop go with the smaller set. */
437      if (tokens_a.length > tokens_b.length)
438        return this._do_match_tokens (tokens_a, tokens_b);
439      else
440        return this._do_match_tokens (tokens_b, tokens_a);
441    }
442
443  private bool _do_match_tokens (string[] bigger_set, string[] smaller_set)
444    {
445      for (var i=0; i < smaller_set.length; i++)
446        {
447          for (var j=0; j < bigger_set.length; j++)
448            {
449              if (smaller_set[i] == bigger_set[j])
450                return true;
451            }
452        }
453
454      return false;
455    }
456
457  private MatchResult _inc_match_level (
458      MatchResult current_level, int times = 1)
459    {
460      MatchResult ret = current_level + times;
461      if (ret > MatchResult.MAX)
462        ret = MatchResult.MAX;
463
464      return ret;
465    }
466
467  private bool _look_alike_or_identical (string? a, string? b, out bool exact)
468    {
469      exact = false;
470      if (a == null || a == "" || b == null || b == "")
471        {
472          return false;
473        }
474
475      return_val_if_fail (a.validate (), false);
476      return_val_if_fail (b.validate (), false);
477
478      var a_stripped = this._strip_string ((!) a);
479      var b_stripped = this._strip_string ((!) b);
480
481      var jaro_dist = this._jaro_dist (a_stripped, b_stripped);
482
483      // a and b match exactly iff their Jaro distance is 1.
484      if (jaro_dist == 1.0)
485        {
486          exact = true;
487          return true;
488        }
489
490      // a and b look alike if their Jaro distance is over the threshold.
491      return (jaro_dist >= PotentialMatch._DIST_THRESHOLD);
492    }
493
494  private bool _look_alike (string? a, string? b)
495    {
496      if (a == null || a == "" || b == null || b == "")
497        {
498          return false;
499        }
500
501      return_val_if_fail (a.validate (), false);
502      return_val_if_fail (b.validate (), false);
503
504      var a_stripped = this._strip_string ((!) a);
505      var b_stripped = this._strip_string ((!) b);
506
507      // a and b look alike if their Jaro distance is over the threshold.
508      return (this._jaro_dist (a_stripped, b_stripped) >= PotentialMatch._DIST_THRESHOLD);
509    }
510
511  /* Based on:
512   *  http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
513   *
514   * d = 1/3 * ( m/|s1| + m/|s2| + (m - t)/m )
515   *
516   *   where
517   *
518   * m = matching characters
519   * t = number of transpositions
520   */
521  private double _jaro_dist (unichar[] s1, unichar[] s2)
522    {
523      double distance;
524      int max = s1.length > s2.length ? s1.length : s2.length;
525      int max_dist = (max / 2) - 1;
526      double t;
527      double m = (double) this._matches (s1, s2, max_dist, out t);
528      double len_s1 = (double) s1.length;
529      double len_s2 = (double) s2.length;
530      double a = m / len_s1;
531      double b = m / len_s2;
532      double c = 0;
533
534      if ((int) m > 0)
535        c = (m - t) / m;
536
537      distance = (1.0/3.0) * (a + b + c);
538
539      debug ("Jaro distance: %f (a = %f, b = %f, c = %f)", distance, a, b, c);
540
541      return distance;
542    }
543
544  /**
545   * stripped_char:
546   *
547   * Returns a stripped version of @ch, removing any case, accentuation
548   * mark, or any special mark on it.
549   *
550   * Copied from Empathy's libempathy-gtk/empathy-live-search.c.
551   *
552   * Copyright (C) 2010 Collabora Ltd.
553   * Copyright (C) 2007-2010 Nokia Corporation.
554   *
555   * Authors: Felix Kaser <felix.kaser@collabora.co.uk>
556   *          Xavier Claessens <xavier.claessens@collabora.co.uk>
557   *          Claudio Saavedra <csaavedra@igalia.com>
558   */
559  private unichar _stripped_char (unichar ch)
560    {
561      unichar retval[1] = { 0 };
562      var utype = ch.type ();
563
564      switch (utype)
565        {
566          case UnicodeType.CONTROL:
567          case UnicodeType.FORMAT:
568          case UnicodeType.UNASSIGNED:
569          case UnicodeType.NON_SPACING_MARK:
570          case UnicodeType.COMBINING_MARK:
571          case UnicodeType.ENCLOSING_MARK:
572            /* Ignore those */
573            break;
574          case UnicodeType.DECIMAL_NUMBER:
575          case UnicodeType.LETTER_NUMBER:
576          case UnicodeType.OTHER_NUMBER:
577          case UnicodeType.CONNECT_PUNCTUATION:
578          case UnicodeType.DASH_PUNCTUATION:
579          case UnicodeType.CLOSE_PUNCTUATION:
580          case UnicodeType.FINAL_PUNCTUATION:
581          case UnicodeType.INITIAL_PUNCTUATION:
582          case UnicodeType.OTHER_PUNCTUATION:
583          case UnicodeType.OPEN_PUNCTUATION:
584          case UnicodeType.CURRENCY_SYMBOL:
585          case UnicodeType.MODIFIER_SYMBOL:
586          case UnicodeType.MATH_SYMBOL:
587          case UnicodeType.OTHER_SYMBOL:
588          case UnicodeType.LINE_SEPARATOR:
589          case UnicodeType.PARAGRAPH_SEPARATOR:
590          case UnicodeType.SPACE_SEPARATOR:
591            /* Replace punctuation with spaces. */
592            retval[0] = ' ';
593            break;
594          case UnicodeType.PRIVATE_USE:
595          case UnicodeType.SURROGATE:
596          case UnicodeType.LOWERCASE_LETTER:
597          case UnicodeType.MODIFIER_LETTER:
598          case UnicodeType.OTHER_LETTER:
599          case UnicodeType.TITLECASE_LETTER:
600          case UnicodeType.UPPERCASE_LETTER:
601          default:
602            ch = ch.tolower ();
603            ch.fully_decompose (false, retval);
604            break;
605        }
606
607      return retval[0];
608    }
609
610  private unichar[] _strip_string (string s)
611    {
612      int next_idx = 0;
613      uint write_idx = 0;
614      unichar ch = 0;
615      unichar[] output = new unichar[s.length]; // this is a safe overestimate
616
617      while (s.get_next_char (ref next_idx, out ch))
618        {
619          ch = this._stripped_char (ch);
620          if (ch != 0)
621            {
622              output[write_idx++] = ch;
623            }
624        }
625
626      output.length = (int) write_idx;
627      return output;
628    }
629
630  /* Calculate matches and transpositions as defined by the Jaro distance.
631   */
632  private int _matches (unichar[] s1, unichar[] s2, int max_dist, out double t)
633    {
634      int matches = 0;
635      t = 0.0;
636      var len_s1 = s1.length;
637
638      unichar look_for = 0;
639
640      for (uint idx = 0; idx < len_s1 && (look_for = s1[idx]) != 0; idx++)
641        {
642          int contains = this._contains (s2, look_for, idx, max_dist);
643          if (contains >= 0)
644            {
645              matches++;
646              if (contains > 0)
647                t += 1.0;
648            }
649        }
650
651      debug ("%d matches and %f / 2 transpositions", matches, t);
652
653      t = t / 2.0;
654      return matches;
655    }
656
657  /* If haystack contains c in pos return 0, if it contains
658   * it within the bounds of max_dist return abs(pos-pos_found).
659   * If its not found, return -1.
660   *
661   * pos and max_dist are both in unichars.
662   *
663   * Note: haystack must have been validated using haystack.validate() before
664   * being passed to this method. */
665  private int _contains (unichar[] haystack, unichar c, uint pos, uint max_dist)
666    {
667      var haystack_len = haystack.length; /* in unichars */
668
669      if (pos < haystack_len && haystack[pos] == c)
670        return 0;
671
672      uint idx = ((int) pos - (int) max_dist).clamp (0, haystack_len - 1);
673      unichar ch = 0;
674
675      while (idx < pos + max_dist && idx < haystack_len &&
676          (ch = haystack[idx]) != 0)
677        {
678          if (ch == c)
679            return ((int) pos - (int) idx).abs ();
680
681          idx++;
682        }
683
684      return -1;
685    }
686}
687