1#
2# Gramps - a GTK+/GNOME based genealogy program
3#
4# Copyright (C) 2000-2007  Donald N. Allingham
5# Copyright (C) 2008       Brian G. Matherly
6# Copyright (C) 2010       Jakim Friant
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21#
22
23"""Tools/Database Processing/Find Possible Duplicate People"""
24
25#-------------------------------------------------------------------------
26#
27# GNOME libraries
28#
29#-------------------------------------------------------------------------
30from gi.repository import Gtk
31
32#-------------------------------------------------------------------------
33#
34# Gramps modules
35#
36#-------------------------------------------------------------------------
37from gramps.gen.const import URL_MANUAL_PAGE
38from gramps.gen.lib import Event, Person
39from gramps.gui.utils import ProgressMeter
40from gramps.gui.plug import tool
41from gramps.gen.soundex import soundex, compare
42from gramps.gen.display.name import displayer as name_displayer
43from gramps.gui.dialog import OkDialog
44from gramps.gui.listmodel import ListModel
45from gramps.gen.errors import WindowActiveError
46from gramps.gui.merge import MergePerson
47from gramps.gui.display import display_help
48from gramps.gui.managedwindow import ManagedWindow
49from gramps.gui.dialog import RunDatabaseRepair
50from gramps.gen.const import GRAMPS_LOCALE as glocale
51_ = glocale.translation.sgettext
52from gramps.gui.glade import Glade
53
54#-------------------------------------------------------------------------
55#
56# Constants
57#
58#-------------------------------------------------------------------------
# Match thresholds offered in the settings menu, keyed by the numeric
# threshold value and mapped to the translated label shown to the user.
_val2label = {}
_val2label[0.25] = _("Low")
_val2label[1.0] = _("Medium")
_val2label[2.0] = _("High")

# Page and section handed to display_help() by both windows of this tool.
WIKI_HELP_PAGE = '%s_-_Tools' % (URL_MANUAL_PAGE,)
WIKI_HELP_SEC = _('manual|Find_Possible_Duplicate_People')
67
68#-------------------------------------------------------------------------
69#
70#
71#
72#-------------------------------------------------------------------------
def is_initial(name):
    """Tell whether *name* looks like an initial rather than a full name.

    A one-character string counts as an initial when the character equals
    its own upper-cased form; a two-character string additionally needs a
    trailing period (e.g. ``"J."``).  Anything longer, and the empty
    string, is not an initial.

    :param name: a single word of a given name
    :type name: str
    :returns: True if *name* is an initial, False otherwise
    :rtype: bool
    """
    # Empty input has no first character to inspect; longer words are names.
    if not name or len(name) > 2:
        return False
    # Two characters only qualify in the "X." form.
    if len(name) == 2 and name[1] != '.':
        return False
    return name[0] == name[0].upper()
81
82#-------------------------------------------------------------------------
83#
84# The Actual tool.
85#
86#-------------------------------------------------------------------------
class DuplicatePeopleTool(tool.Tool, ManagedWindow):
    """Settings dialog and scoring engine for finding duplicate people.

    The dialog lets the user pick a match threshold and whether surnames
    are compared through SoundEx.  Pressing OK scores pairs of people that
    share a gender bucket and a surname key, and opens a
    DuplicatePeopleToolMatches window when at least one pair reaches the
    threshold.

    ``self.db``, ``self.options``, ``self.window`` and ``self.track`` are
    not assigned in this class; presumably they are provided by the
    tool.Tool / ManagedWindow base classes.
    """

    def __init__(self, dbstate, user, options_class, name, callback=None):
        """Build the settings dialog from the glade file and show it.

        :param dbstate: database state passed on to tool.Tool and to the
                        matches window
        :param user: object whose ``uistate`` attribute is used as UI state
        :param options_class: options class handed to tool.Tool
        :param name: tool name handed to tool.Tool
        :param callback: optional callable forwarded to the matches window
        """
        uistate = user.uistate

        tool.Tool.__init__(self, dbstate, options_class, name)
        ManagedWindow.__init__(self, uistate, [],
                                             self.__class__)
        self.dbstate = dbstate
        self.uistate = uistate
        self.map = {}            # person handle -> (other handle, score)
        self.list = []           # sorted keys of self.map
        # The next four attributes are not read anywhere in this class;
        # they are kept so external code relying on them keeps working.
        self.index = 0
        self.merger = None
        self.mergee = None
        self.removed = {}
        self.update = callback
        self.use_soundex = 1

        top = Glade(toplevel="finddupes", also_load=["liststore1"])

        # retrieve options
        threshold = self.options.handler.options_dict['threshold']
        use_soundex = self.options.handler.options_dict['soundex']

        levels = sorted(_val2label)
        my_menu = Gtk.ListStore(str, object)
        for val in levels:
            my_menu.append([_val2label[val], val])

        self.soundex_obj = top.get_object("soundex")
        self.soundex_obj.set_active(use_soundex)
        self.soundex_obj.show()

        self.menu = top.get_object("menu")
        self.menu.set_model(my_menu)
        # Preselect the threshold saved by the previous run; fall back to
        # the first entry when the stored value is not one of the choices.
        if threshold in levels:
            self.menu.set_active(levels.index(threshold))
        else:
            self.menu.set_active(0)

        window = top.toplevel
        self.set_window(window, top.get_object('title'),
                        _('Find Possible Duplicate People'))
        self.setup_configs('interface.duplicatepeopletool', 350, 220)

        # The glade file is shared with the matches window, so the signals
        # belonging to that window are bound to a no-op here.
        top.connect_signals({
            "on_do_merge_clicked"   : self.__dummy,
            "on_help_show_clicked"  : self.__dummy,
            "on_delete_show_event"  : self.__dummy,
            "on_merge_ok_clicked"   : self.on_merge_ok_clicked,
            "destroy_passed_object" : self.close,
            "on_help_clicked"       : self.on_help_clicked,
            "on_delete_merge_event" : self.close,
            "on_delete_event"       : self.close,
            })

        self.show()

    def build_menu_names(self, obj):
        """Return the two translated menu labels for this window."""
        return (_("Tool settings"),_("Find Duplicates tool"))

    def on_help_clicked(self, obj):
        """Display the relevant portion of Gramps manual"""

        display_help(WIKI_HELP_PAGE , WIKI_HELP_SEC)

    def ancestors_of(self, p1_id, id_list):
        """Collect the handles of a person and of that person's ancestors.

        Appends *p1_id* to *id_list*, then recurses into the father and
        mother of the person's main parents family.  Nothing happens when
        *p1_id* is empty or already present in *id_list*, which also stops
        the recursion on loops.

        :param p1_id: person handle to start from (may be empty)
        :param id_list: list that is extended in place
        """
        if (not p1_id) or (p1_id in id_list):
            return
        id_list.append(p1_id)
        p1 = self.db.get_person_from_handle(p1_id)
        f1_id = p1.get_main_parents_family_handle()
        if f1_id:
            f1 = self.db.get_family_from_handle(f1_id)
            self.ancestors_of(f1.get_father_handle(),id_list)
            self.ancestors_of(f1.get_mother_handle(),id_list)

    def on_merge_ok_clicked(self, obj):
        """Run the search with the chosen settings and show the outcome.

        An AttributeError raised during the search is reported through
        RunDatabaseRepair and aborts the run before options are saved.
        Otherwise the chosen options are saved and either a "no matches"
        dialog or the matches window is shown.
        """
        threshold = self.menu.get_model()[self.menu.get_active()][1]
        self.use_soundex = int(self.soundex_obj.get_active())
        try:
            self.find_potentials(threshold)
        except AttributeError as msg:
            RunDatabaseRepair(str(msg), parent=self.window)
            return

        self.options.handler.options_dict['threshold'] = threshold
        self.options.handler.options_dict['soundex'] = self.use_soundex
        # Save options
        self.options.handler.save_options()

        if not self.map:
            OkDialog(
                _("No matches found"),
                _("No potential duplicate people were found"),
                parent=self.window)
        else:
            try:
                DuplicatePeopleToolMatches(self.dbstate, self.uistate,
                                           self.track, self.list, self.map,
                                           self.update)
            except WindowActiveError:
                pass

    def find_potentials(self, thresh):
        """Fill ``self.map`` with the best candidate found for each person.

        After the call ``self.map`` maps a person handle to a
        ``(other_handle, score)`` tuple for every person that has at least
        one candidate scoring *thresh* or more, ``self.list`` holds the
        sorted keys of that map and ``self.length`` their number.

        :param thresh: minimum score a pair needs to be recorded
        """
        self.progress = ProgressMeter(_('Find Duplicates'),
                                      _('Looking for duplicate people'),
                                      parent=self.window)
        # Close the meter even when the scan raises: the caller catches
        # AttributeError and keeps the dialog alive, so a meter left open
        # would stay on screen.
        try:
            self._scan_for_matches(thresh)
        finally:
            self.progress.close()

    def _scan_for_matches(self, thresh):
        """Two-pass scan behind find_potentials (progress already set up)."""
        males = {}
        females = {}
        self.map = {}

        length = self.db.get_number_of_people()

        self.progress.set_pass(_('Pass 1: Building preliminary lists'),
                               length)

        # Pass 1: bucket handles by surname key; males go in one table,
        # everybody else in the other.
        for p1_id in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1_id)
            key = self.gen_key(get_surnames(p1.get_primary_name()))
            if p1.get_gender() == Person.MALE:
                males.setdefault(key, []).append(p1_id)
            else:
                females.setdefault(key, []).append(p1_id)

        self.progress.set_pass(_('Pass 2: Calculating potential matches'),
                               length)

        # Pass 2: score each person against the others in the same bucket.
        for p1key in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1key)

            key = self.gen_key(get_surnames(p1.get_primary_name()))
            if p1.get_gender() == Person.MALE:
                remaining = males[key]
            else:
                remaining = females[key]

            for p2key in remaining:
                if p1key == p2key:
                    continue
                # Skip a pair that is already recorded the other way round.
                if p2key in self.map and self.map[p2key][0] == p1key:
                    continue

                p2 = self.db.get_person_from_handle(p2key)
                chance = self.compare_people(p1,p2)
                if chance >= thresh:
                    best = self.map.get(p1key)
                    # Keep the strongest candidate; on a tie the first one
                    # found stays.
                    if best is None or chance > best[1]:
                        self.map[p1key] = (p2key,chance)

        self.list = sorted(self.map)
        self.length = len(self.list)

    def gen_key(self, val):
        """Return the bucket key for surname *val*.

        The key is the SoundEx code when SoundEx is enabled and *val*
        itself otherwise, or when soundex() raises UnicodeEncodeError.
        """
        if self.use_soundex:
            try:
                return soundex(val)
            except UnicodeEncodeError:
                return val
        else:
            return val

    def _event_from_ref(self, event_ref):
        """Return the event *event_ref* points to, or an empty Event."""
        if event_ref:
            return self.db.get_event_from_handle(event_ref.ref)
        return Event()

    def _primary_name_of(self, handle):
        """Return the primary name of the person *handle*, or None."""
        if handle:
            return get_name_obj(self.db.get_person_from_handle(handle))
        return None

    def compare_people(self, p1, p2):
        """Score how likely *p1* and *p2* are the same person.

        The score adds up, in this order: the primary-name match, the
        birth-date and death-date matches, the birth-place and death-place
        matches, the father-name and mother-name matches (only when both
        have a main parents family) and, for every pair of families of the
        two people, a partner match.

        :returns: the accumulated score, or -1 as soon as the name, a date,
                  a place or a parent name rules the pair out, or when one
                  person is found among the ancestors of the other
        """
        chance = self.name_match(p1.get_primary_name(),
                                 p2.get_primary_name())
        if chance == -1:
            return -1

        birth1 = self._event_from_ref(p1.get_birth_ref())
        death1 = self._event_from_ref(p1.get_death_ref())
        birth2 = self._event_from_ref(p2.get_birth_ref())
        death2 = self._event_from_ref(p2.get_death_ref())

        # Each criterion is evaluated lazily so that a veto stops the work.
        criteria = (
            (self.date_match, birth1.get_date_object, birth2.get_date_object),
            (self.date_match, death1.get_date_object, death2.get_date_object),
            (self.place_match, birth1.get_place_handle,
             birth2.get_place_handle),
            (self.place_match, death1.get_place_handle,
             death2.get_place_handle),
        )
        for matcher, first, second in criteria:
            value = matcher(first(), second())
            if value == -1:
                return -1
            chance += value

        # A person cannot be a duplicate of one of their own ancestors.
        ancestors = []
        self.ancestors_of(p1.get_handle(),ancestors)
        if p2.get_handle() in ancestors:
            return -1

        ancestors = []
        self.ancestors_of(p2.get_handle(),ancestors)
        if p1.get_handle() in ancestors:
            return -1

        f1_id = p1.get_main_parents_family_handle()
        f2_id = p2.get_main_parents_family_handle()

        if f1_id and f2_id:
            f1 = self.db.get_family_from_handle(f1_id)
            f2 = self.db.get_family_from_handle(f2_id)

            value = self.name_match(
                self._primary_name_of(f1.get_father_handle()),
                self._primary_name_of(f2.get_father_handle()))
            if value == -1:
                return -1
            chance += value

            value = self.name_match(
                self._primary_name_of(f1.get_mother_handle()),
                self._primary_name_of(f2.get_mother_handle()))
            if value == -1:
                return -1
            chance += value

        # Partners: the father of the family when p1 is female, the mother
        # otherwise.  A mismatch here adds nothing but does not veto.
        compare_fathers = p1.get_gender() == Person.FEMALE
        for f1_id in p1.get_family_handle_list():
            f1 = self.db.get_family_from_handle(f1_id)
            for f2_id in p2.get_family_handle_list():
                f2 = self.db.get_family_from_handle(f2_id)
                if compare_fathers:
                    partner1_id = f1.get_father_handle()
                    partner2_id = f2.get_father_handle()
                else:
                    partner1_id = f1.get_mother_handle()
                    partner2_id = f2.get_mother_handle()
                if not (partner1_id and partner2_id):
                    continue
                if partner1_id == partner2_id:
                    chance += 1
                    continue
                value = self.name_match(self._primary_name_of(partner1_id),
                                        self._primary_name_of(partner2_id))
                if value != -1:
                    chance += value
        return chance

    def name_compare(self, s1, s2):
        """Compare two strings, through SoundEx when it is enabled.

        Falls back to plain equality when SoundEx is disabled or when
        compare() raises UnicodeEncodeError.
        """
        if self.use_soundex:
            try:
                return compare(s1,s2)
            except UnicodeEncodeError:
                return s1 == s2
        else:
            return s1 == s2

    def date_match(self, date1, date2):
        """Score two dates.

        :returns: 0 when either date is empty, 1 when they are equal, the
                  result of range_compare() when either is compound, 0.75
                  when the years agree and the months agree or one month
                  is not valid, and -1 otherwise
        """
        if date1.is_empty() or date2.is_empty():
            return 0
        if date1.is_equal(date2):
            return 1

        if date1.is_compound() or date2.is_compound():
            return self.range_compare(date1,date2)

        if date1.get_year() == date2.get_year():
            if date1.get_month() == date2.get_month():
                return 0.75
            if not date1.get_month_valid() or not date2.get_month_valid():
                return 0.75
            else:
                return -1
        else:
            return -1

    def range_compare(self, date1, date2):
        """Score two dates of which at least one is compound.

        Only the first three items of each start/stop value are compared.

        :returns: 0.5 when the ranges overlap (both compound) or when the
                  start of the simple date lies inside the compound range,
                  and -1 otherwise
        """
        start_date_1 = date1.get_start_date()[0:3]
        start_date_2 = date2.get_start_date()[0:3]
        stop_date_1 = date1.get_stop_date()[0:3]
        stop_date_2 = date2.get_stop_date()[0:3]
        if date1.is_compound() and date2.is_compound():
            if (start_date_2 <= start_date_1 <= stop_date_2 or
                start_date_1 <= start_date_2 <= stop_date_1 or
                start_date_2 <= stop_date_1 <= stop_date_2 or
                start_date_1 <= stop_date_2 <= stop_date_1):
                return 0.5
            else:
                return -1
        elif date2.is_compound():
            if start_date_2 <= start_date_1 <= stop_date_2:
                return 0.5
            else:
                return -1
        else:
            if start_date_1 <= start_date_2 <= stop_date_1:
                return 0.5
            else:
                return -1

    def name_match(self, name, name1):
        """Score two name objects.

        :returns: 0 when either name is missing; -1 when the surnames do
                  not compare equal or when both suffixes are non-empty
                  and differ; 1 when the first names are identical;
                  otherwise the list_reduce() score of the
                  whitespace-split first names
        """

        if not name1 or not name:
            return 0

        srn1 = get_surnames(name)
        sfx1 = name.get_suffix()
        srn2 = get_surnames(name1)
        sfx2 = name1.get_suffix()

        if not self.name_compare(srn1,srn2):
            return -1
        if sfx1 != sfx2:
            if sfx1 != "" and sfx2 != "":
                return -1

        if name.get_first_name() == name1.get_first_name():
            return 1
        else:
            list1 = name.get_first_name().split()
            list2 = name1.get_first_name().split()

            # list_reduce() is always given the shorter list first.
            if len(list1) < len(list2):
                return self.list_reduce(list1,list2)
            else:
                return self.list_reduce(list2,list1)

    def place_match(self, p1_id, p2_id):
        """Score two places given by handle.

        :returns: 0 when either handle or either place title is empty, 1
                  when the handles or the titles are equal, otherwise a
                  word-by-word score capped at 1, or -1 when no word
                  matches
        """
        # Missing place information is neutral.  This test must come
        # before the equality shortcut, otherwise two events without any
        # place would be rated as a perfect place match.
        if not p1_id or not p2_id:
            return 0
        if p1_id == p2_id:
            return 1

        title1 = self.db.get_place_from_handle(p1_id).get_title()
        title2 = self.db.get_place_from_handle(p2_id).get_title()

        if not (title1 and title2):
            return 0
        if title1 == title2:
            return 1

        words1 = title1.replace(","," ").split()
        words2 = title2.replace(","," ").split()

        value = 0
        for word1 in words1:
            for word2 in words2:
                if word1 == word2:
                    value += 0.5
                elif word1[0] == word2[0] and self.name_compare(word1, word2):
                    value += 0.25
        return min(value,1) if value else -1

    def list_reduce(self, list1, list2):
        """Score two lists of given-name words against each other.

        Every pair of words adds 0.25 when one of them is an initial with
        the same first character as the other, 0.5 when the words are
        equal, and 0.25 when they share the first character and
        name_compare() accepts them.

        :returns: the sum capped at 1, or -1 when nothing matched
        """
        value = 0
        for name in list1:
            for name2 in list2:
                if is_initial(name) and name[0] == name2[0]:
                    value += 0.25
                elif is_initial(name2) and name2[0] == name[0]:
                    value += 0.25
                elif name == name2:
                    value += 0.5
                elif name[0] == name2[0] and self.name_compare(name, name2):
                    value += 0.25
        return min(value,1) if value else -1

    def __dummy(self, obj):
        """dummy callback, needed because a shared glade file is used for
        both toplevel windows and all signals must be handled.
        """
        pass
537
538
class DuplicatePeopleToolMatches(ManagedWindow):
    """Window listing the candidate pairs and starting merges on them.

    Each row shows a rating and the display names of the two people of one
    entry of the map produced by the search.  People merged away through
    this window, or deleted elsewhere, are remembered in ``self.dellist``
    and left out when the list is redrawn.
    """

    def __init__(self, dbstate, uistate, track, the_list, the_map, callback):
        """Build the window, fill the list and show it.

        :param dbstate: database state; its ``db`` attribute is used
        :param uistate: UI state handed to ManagedWindow and MergePerson
        :param track: window track handed to ManagedWindow
        :param the_list: keys of *the_map*; only its length is recorded
        :param the_map: person handle -> (other person handle, score)
        :param callback: callable invoked after a merge, or None
        """
        ManagedWindow.__init__(self,uistate,track,self.__class__)

        self.dellist = set()
        self.map = the_map
        self.length = len(the_list)
        self.update = callback
        self.db = dbstate.db
        self.dbstate = dbstate
        self.uistate = uistate

        top = Glade(toplevel="mergelist")
        window = top.toplevel
        self.set_window(window, top.get_object('title'),
                        _('Potential Merges'))
        self.setup_configs('interface.duplicatepeopletoolmatches', 500, 350)

        self.mlist = top.get_object("mlist")
        # The glade file is shared with the settings dialog, so the signals
        # belonging to that dialog are bound to a no-op here.
        top.connect_signals({
            "destroy_passed_object" : self.close,
            "on_do_merge_clicked"   : self.on_do_merge_clicked,
            "on_help_show_clicked"  : self.on_help_clicked,
            "on_delete_show_event"  : self.close,
            "on_merge_ok_clicked"   : self.__dummy,
            "on_help_clicked"       : self.__dummy,
            "on_delete_merge_event" : self.__dummy,
            "on_delete_event"       : self.__dummy,
            })
        # NOTE(review): this class never disconnects the signal itself --
        # confirm that closing the window takes care of it.
        self.db.connect("person-delete", self.person_delete)

        # Three visible columns plus a zero-width one; the fourth value of
        # each row is 100 minus the rating (presumably the sort key of the
        # Rating column -- TODO confirm against ListModel).
        mtitles = [
                (_('Rating'),3,75),
                (_('First Person'),1,200),
                (_('Second Person'),2,200),
                ('',-1,0)
                ]
        self.list = ListModel(self.mlist,mtitles,
                              event_func=self.on_do_merge_clicked)

        self.redraw()
        self.show()

    def build_menu_names(self, obj):
        """Return the two translated menu labels for this window."""
        return (_("Merge candidates"), _("Merge persons"))

    def on_help_clicked(self, obj):
        """Display the relevant portion of Gramps manual"""

        display_help(WIKI_HELP_PAGE , WIKI_HELP_SEC)

    def redraw(self):
        """Rebuild the list from the map, leaving out removed people.

        A pair is dropped when either handle is in ``self.dellist``, when
        both handles are the same, or when either person can no longer be
        fetched from the database.
        """
        pairs = []
        for p1key, (p2key, chance) in self.map.items():
            if p1key in self.dellist or p2key in self.dellist:
                continue
            if p1key == p2key:
                continue
            pairs.append((chance, p1key, p2key))

        self.list.clear()
        for chance, p1key, p2key in pairs:
            p1 = self.db.get_person_from_handle(p1key)
            p2 = self.db.get_person_from_handle(p2key)
            if not p1 or not p2:
                continue
            rating = "%5.2f" % chance
            inverse = "%5.2f" % (100-chance)
            pn1 = name_displayer.display(p1)
            pn2 = name_displayer.display(p2)
            self.list.add([rating, pn1, pn2, inverse],(p1key,p2key))

    def on_do_merge_clicked(self, obj):
        """Open the merge dialog for the selected pair, if there is one."""
        store, selected = self.list.selection.get_selected()
        if not selected:
            return

        (self.p1,self.p2) = self.list.get_object(selected)
        MergePerson(self.dbstate, self.uistate, self.track, self.p1, self.p2,
                    self.on_update, True)

    def on_update(self):
        """Record which person of the merged pair is gone and refresh.

        The second person is recorded as removed when the first one is
        still in the database, otherwise the first one is.
        """
        if self.db.has_person_handle(self.p1):
            titanic = self.p2
        else:
            titanic = self.p1
        self.dellist.add(titanic)
        # The callback is optional (the tool's default is None).
        if self.update:
            self.update()
        self.redraw()

    def update_and_destroy(self, obj):
        """Invoke the callback (when one was given) and close the window."""
        if self.update:
            self.update(1)
        self.close()

    def person_delete(self, handle_list):
        """ deal with person deletes outside of the tool """
        self.dellist.update(handle_list)
        self.redraw()

    def __dummy(self, obj):
        """dummy callback, needed because a shared glade file is used for
        both toplevel windows and all signals must be handled.
        """
        pass
647
648
649#-------------------------------------------------------------------------
650#
651#
652#
653#-------------------------------------------------------------------------
def name_of(p):
    """Return "<display name> (<handle>)" for person *p*.

    An empty string is returned when *p* is missing.
    """
    if p:
        label = name_displayer.display(p)
        handle = p.get_handle()
        return "%s (%s)" % (label, handle)
    return ""
658
def get_name_obj(person):
    """Return the primary name of *person*, or None when there is none."""
    return person.get_primary_name() if person else None
664
def get_surnames(name):
    """Return all surnames of *name* joined by single spaces."""
    parts = []
    for entry in name.get_surname_list():
        parts.append(entry.get_surname())
    return ' '.join(parts)
668
669#------------------------------------------------------------------------
670#
671#
672#
673#------------------------------------------------------------------------
class DuplicatePeopleToolOptions(tool.ToolOptions):
    """
    Option defaults and option help texts of the duplicate-people tool.
    """

    def __init__(self, name,person_id=None):
        """Initialise the base options, then install this tool's own."""
        tool.ToolOptions.__init__(self, name,person_id)

        # Help entry for the on/off SoundEx switch.
        soundex_help = (
            "=0/1",
            "Whether to use SoundEx codes",
            ["Do not use SoundEx","Use SoundEx"],
            True,
        )
        # Help entry for the numeric match threshold.
        threshold_help = (
            "=num",
            "Threshold for tolerance",
            "Floating point number",
        )

        # Defaults used until the user saves other values.
        self.options_dict = {'soundex': 1, 'threshold': 0.25}
        self.options_help = {
            'soundex': soundex_help,
            'threshold': threshold_help,
        }
694