1# 2# Gramps - a GTK+/GNOME based genealogy program 3# 4# Copyright (C) 2000-2007 Donald N. Allingham 5# Copyright (C) 2008 Brian G. Matherly 6# Copyright (C) 2010 Jakim Friant 7# 8# This program is free software; you can redistribute it and/or modify 9# it under the terms of the GNU General Public License as published by 10# the Free Software Foundation; either version 2 of the License, or 11# (at your option) any later version. 12# 13# This program is distributed in the hope that it will be useful, 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16# GNU General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with this program; if not, write to the Free Software 20# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 21# 22 23"""Tools/Database Processing/Find Possible Duplicate People""" 24 25#------------------------------------------------------------------------- 26# 27# GNOME libraries 28# 29#------------------------------------------------------------------------- 30from gi.repository import Gtk 31 32#------------------------------------------------------------------------- 33# 34# Gramps modules 35# 36#------------------------------------------------------------------------- 37from gramps.gen.const import URL_MANUAL_PAGE 38from gramps.gen.lib import Event, Person 39from gramps.gui.utils import ProgressMeter 40from gramps.gui.plug import tool 41from gramps.gen.soundex import soundex, compare 42from gramps.gen.display.name import displayer as name_displayer 43from gramps.gui.dialog import OkDialog 44from gramps.gui.listmodel import ListModel 45from gramps.gen.errors import WindowActiveError 46from gramps.gui.merge import MergePerson 47from gramps.gui.display import display_help 48from gramps.gui.managedwindow import ManagedWindow 49from gramps.gui.dialog import RunDatabaseRepair 
from gramps.gen.const import GRAMPS_LOCALE as glocale
_ = glocale.translation.sgettext
from gramps.gui.glade import Glade

#-------------------------------------------------------------------------
#
# Constants
#
#-------------------------------------------------------------------------
# Threshold value -> label shown in the "match threshold" menu.
_val2label = {
    0.25 : _("Low"),
    1.0  : _("Medium"),
    2.0  : _("High"),
    }

WIKI_HELP_PAGE = '%s_-_Tools' % URL_MANUAL_PAGE
WIKI_HELP_SEC = _('manual|Find_Possible_Duplicate_People')

#-------------------------------------------------------------------------
#
# Helpers
#
#-------------------------------------------------------------------------
def is_initial(name):
    """
    Return True if ``name`` looks like an initial.

    A one-character string counts when the character equals its own
    upper-case form; a two-character string counts when, in addition, the
    second character is a period.  Anything longer, or an empty string,
    is not an initial.
    """
    if not name or len(name) > 2:
        return False
    if len(name) == 2 and name[1] != '.':
        return False
    return name[0] == name[0].upper()

#-------------------------------------------------------------------------
#
# The Actual tool.
#
#-------------------------------------------------------------------------
class DuplicatePeopleTool(tool.Tool, ManagedWindow):
    """
    Settings dialog of the tool: lets the user pick a match threshold and
    whether to use SoundEx, then scans the database for likely duplicates.
    """

    def __init__(self, dbstate, user, options_class, name, callback=None):
        uistate = user.uistate

        tool.Tool.__init__(self, dbstate, options_class, name)
        ManagedWindow.__init__(self, uistate, [],
                               self.__class__)
        self.dbstate = dbstate
        self.uistate = uistate
        self.map = {}
        self.list = []
        self.index = 0
        self.merger = None
        self.mergee = None
        self.removed = {}
        self.update = callback
        self.use_soundex = 1

        top = Glade(toplevel="finddupes", also_load=["liststore1"])

        # retrieve options
        threshold = self.options.handler.options_dict['threshold']
        use_soundex = self.options.handler.options_dict['soundex']

        my_menu = Gtk.ListStore(str, object)
        thresholds = sorted(_val2label)
        for val in thresholds:
            my_menu.append([_val2label[val], val])

        self.soundex_obj = top.get_object("soundex")
        self.soundex_obj.set_active(use_soundex)
        self.soundex_obj.show()

        self.menu = top.get_object("menu")
        self.menu.set_model(my_menu)
        # Restore the threshold saved by the previous run; fall back to the
        # first entry when the stored value is not one of the menu choices.
        if threshold in thresholds:
            self.menu.set_active(thresholds.index(threshold))
        else:
            self.menu.set_active(0)

        window = top.toplevel
        self.set_window(window, top.get_object('title'),
                        _('Find Possible Duplicate People'))
        self.setup_configs('interface.duplicatepeopletool', 350, 220)

        top.connect_signals({
            "on_do_merge_clicked"   : self.__dummy,
            "on_help_show_clicked"  : self.__dummy,
            "on_delete_show_event"  : self.__dummy,
            "on_merge_ok_clicked"   : self.on_merge_ok_clicked,
            "destroy_passed_object" : self.close,
            "on_help_clicked"       : self.on_help_clicked,
            "on_delete_merge_event" : self.close,
            "on_delete_event"       : self.close,
            })

        self.show()

    def build_menu_names(self, obj):
        """Return the (submenu label, menu label) pair for this window."""
        return (_("Tool settings"), _("Find Duplicates tool"))

    def on_help_clicked(self, obj):
        """Display the relevant portion of Gramps manual"""

        display_help(WIKI_HELP_PAGE, WIKI_HELP_SEC)

    def ancestors_of(self, p1_id, id_list):
        """
        Append ``p1_id`` and the handles of all its ancestors (following the
        main parents family of each person) to ``id_list``.

        Handles already present in ``id_list`` are not visited again, which
        also stops the recursion on cyclic data.
        """
        if (not p1_id) or (p1_id in id_list):
            return
        id_list.append(p1_id)
        p1 = self.db.get_person_from_handle(p1_id)
        f1_id = p1.get_main_parents_family_handle()
        if f1_id:
            f1 = self.db.get_family_from_handle(f1_id)
            self.ancestors_of(f1.get_father_handle(), id_list)
            self.ancestors_of(f1.get_mother_handle(), id_list)

    def on_merge_ok_clicked(self, obj):
        """
        Run the search with the settings chosen in the dialog, save those
        settings, and open the results window if anything was found.
        """
        threshold = self.menu.get_model()[self.menu.get_active()][1]
        self.use_soundex = int(self.soundex_obj.get_active())
        try:
            self.find_potentials(threshold)
        except AttributeError as msg:
            # An AttributeError during the scan is reported to the user
            # through the database-repair dialog.
            RunDatabaseRepair(str(msg), parent=self.window)
            return

        self.options.handler.options_dict['threshold'] = threshold
        self.options.handler.options_dict['soundex'] = self.use_soundex
        # Save options
        self.options.handler.save_options()

        if not self.map:
            OkDialog(
                _("No matches found"),
                _("No potential duplicate people were found"),
                parent=self.window)
        else:
            try:
                DuplicatePeopleToolMatches(self.dbstate, self.uistate,
                                           self.track, self.list, self.map,
                                           self.update)
            except WindowActiveError:
                pass

    def find_potentials(self, thresh):
        """
        Scan all people and fill ``self.map`` with, for each person that has
        at least one candidate scoring ``thresh`` or more, the best candidate
        as ``(other_handle, score)``.  ``self.list`` receives the sorted keys
        of that map and ``self.length`` its size.
        """
        self.progress = ProgressMeter(_('Find Duplicates'),
                                      _('Looking for duplicate people'),
                                      parent=self.window)
        try:
            self._collect_matches(thresh)
            self.list = sorted(self.map)
            self.length = len(self.list)
        finally:
            # Always dismiss the progress window, including when the scan
            # raises and the caller reports the error.
            self.progress.close()

    def _collect_matches(self, thresh):
        """Two-pass scan behind find_potentials(); fills ``self.map``."""
        males = {}
        females = {}
        self.map = {}

        length = self.db.get_number_of_people()

        self.progress.set_pass(_('Pass 1: Building preliminary lists'),
                               length)

        # Group handles by surname key; males in one index, everybody else
        # in the other, so only same-bucket people are ever compared.
        for p1_id in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1_id)
            key = self.gen_key(get_surnames(p1.get_primary_name()))
            bucket = males if p1.get_gender() == Person.MALE else females
            bucket.setdefault(key, []).append(p1_id)

        self.progress.set_pass(_('Pass 2: Calculating potential matches'),
                               length)

        for p1key in self.db.iter_person_handles():
            self.progress.step()
            p1 = self.db.get_person_from_handle(p1key)

            key = self.gen_key(get_surnames(p1.get_primary_name()))
            bucket = males if p1.get_gender() == Person.MALE else females

            for p2key in bucket[key]:
                if p1key == p2key:
                    continue
                # Skip the pair if it was already recorded from p2's side.
                if p2key in self.map and self.map[p2key][0] == p1key:
                    continue

                p2 = self.db.get_person_from_handle(p2key)
                chance = self.compare_people(p1, p2)
                if chance < thresh:
                    continue

                # Keep the highest-scoring candidate for p1.
                best = self.map.get(p1key)
                if best is None or chance > best[1]:
                    self.map[p1key] = (p2key, chance)

    def gen_key(self, val):
        """
        Return the grouping key for surname ``val``: its soundex() code when
        SoundEx is enabled, otherwise (or if encoding fails) ``val`` itself.
        """
        if self.use_soundex:
            try:
                return soundex(val)
            except UnicodeEncodeError:
                return val
        else:
            return val

    def _event_or_blank(self, event_ref):
        """Return the event behind ``event_ref``, or an empty Event."""
        if event_ref:
            return self.db.get_event_from_handle(event_ref.ref)
        return Event()

    def _name_from_handle(self, handle):
        """Return the primary name of the person ``handle``, or None."""
        if handle:
            return get_name_obj(self.db.get_person_from_handle(handle))
        return None

    def compare_people(self, p1, p2):
        """
        Score how likely ``p1`` and ``p2`` are the same person.

        The score is the sum of the name, birth/death date, birth/death
        place, parents' names and spouses' partial scores.  Returns -1 as
        soon as one of the decisive comparisons rules the pair out, or when
        one person is an ancestor of the other.
        """
        name1 = p1.get_primary_name()
        name2 = p2.get_primary_name()

        chance = self.name_match(name1, name2)
        if chance == -1:
            return -1

        birth1 = self._event_or_blank(p1.get_birth_ref())
        death1 = self._event_or_blank(p1.get_death_ref())
        birth2 = self._event_or_blank(p2.get_birth_ref())
        death2 = self._event_or_blank(p2.get_death_ref())

        value = self.date_match(birth1.get_date_object(),
                                birth2.get_date_object())
        if value == -1:
            return -1
        chance += value

        value = self.date_match(death1.get_date_object(),
                                death2.get_date_object())
        if value == -1:
            return -1
        chance += value

        value = self.place_match(birth1.get_place_handle(),
                                 birth2.get_place_handle())
        if value == -1:
            return -1
        chance += value

        value = self.place_match(death1.get_place_handle(),
                                 death2.get_place_handle())
        if value == -1:
            return -1
        chance += value

        # A person cannot be a duplicate of one of their own ancestors.
        ancestors = []
        self.ancestors_of(p1.get_handle(), ancestors)
        if p2.get_handle() in ancestors:
            return -1

        ancestors = []
        self.ancestors_of(p2.get_handle(), ancestors)
        if p1.get_handle() in ancestors:
            return -1

        f1_id = p1.get_main_parents_family_handle()
        f2_id = p2.get_main_parents_family_handle()

        if f1_id and f2_id:
            f1 = self.db.get_family_from_handle(f1_id)
            f2 = self.db.get_family_from_handle(f2_id)

            value = self.name_match(
                self._name_from_handle(f1.get_father_handle()),
                self._name_from_handle(f2.get_father_handle()))
            if value == -1:
                return -1
            chance += value

            value = self.name_match(
                self._name_from_handle(f1.get_mother_handle()),
                self._name_from_handle(f2.get_mother_handle()))
            if value == -1:
                return -1
            chance += value

        # Compare the partners in every pairing of the two people's own
        # families; a mismatch here adds nothing but does not veto.
        is_female = p1.get_gender() == Person.FEMALE
        for f1_id in p1.get_family_handle_list():
            f1 = self.db.get_family_from_handle(f1_id)
            for f2_id in p2.get_family_handle_list():
                f2 = self.db.get_family_from_handle(f2_id)
                if is_female:
                    spouse1_id = f1.get_father_handle()
                    spouse2_id = f2.get_father_handle()
                else:
                    spouse1_id = f1.get_mother_handle()
                    spouse2_id = f2.get_mother_handle()
                if not (spouse1_id and spouse2_id):
                    continue
                if spouse1_id == spouse2_id:
                    chance += 1
                    continue
                value = self.name_match(self._name_from_handle(spouse1_id),
                                        self._name_from_handle(spouse2_id))
                if value != -1:
                    chance += value
        return chance

    def name_compare(self, s1, s2):
        """
        Compare two strings with the soundex module's compare() when SoundEx
        is enabled, otherwise (or if encoding fails) by plain equality.
        """
        if self.use_soundex:
            try:
                return compare(s1, s2)
            except UnicodeEncodeError:
                return s1 == s2
        else:
            return s1 == s2

    def date_match(self, date1, date2):
        """
        Score two dates: 0 if either is empty, 1 if equal, the result of
        range_compare() if either is compound, 0.75 if the years agree and
        the months agree or one month is not valid, otherwise -1.
        """
        if date1.is_empty() or date2.is_empty():
            return 0
        if date1.is_equal(date2):
            return 1

        if date1.is_compound() or date2.is_compound():
            return self.range_compare(date1, date2)

        if date1.get_year() != date2.get_year():
            return -1
        if date1.get_month() == date2.get_month():
            return 0.75
        if not date1.get_month_valid() or not date2.get_month_valid():
            return 0.75
        return -1

    def range_compare(self, date1, date2):
        """
        Score two dates of which at least one is compound: 0.5 when the
        ranges overlap (or the simple date falls inside the range), else -1.
        """
        # NOTE(review): the tuples are compared lexicographically on their
        # first three fields; confirm the field order gives a chronological
        # ordering.
        start_date_1 = date1.get_start_date()[0:3]
        start_date_2 = date2.get_start_date()[0:3]
        stop_date_1 = date1.get_stop_date()[0:3]
        stop_date_2 = date2.get_stop_date()[0:3]
        if date1.is_compound() and date2.is_compound():
            overlap = (start_date_2 <= start_date_1 <= stop_date_2 or
                       start_date_1 <= start_date_2 <= stop_date_1 or
                       start_date_2 <= stop_date_1 <= stop_date_2 or
                       start_date_1 <= stop_date_2 <= stop_date_1)
        elif date2.is_compound():
            overlap = start_date_2 <= start_date_1 <= stop_date_2
        else:
            overlap = start_date_1 <= start_date_2 <= stop_date_1
        return 0.5 if overlap else -1

    def name_match(self, name, name1):
        """
        Score two name objects: 0 if either is missing, -1 if the surnames
        differ or both carry different suffixes, 1 if the first names are
        identical, otherwise the word-by-word score from list_reduce().
        """
        if not name1 or not name:
            return 0

        srn1 = get_surnames(name)
        sfx1 = name.get_suffix()
        srn2 = get_surnames(name1)
        sfx2 = name1.get_suffix()

        if not self.name_compare(srn1, srn2):
            return -1
        if sfx1 != sfx2:
            if sfx1 != "" and sfx2 != "":
                return -1

        if name.get_first_name() == name1.get_first_name():
            return 1

        list1 = name.get_first_name().split()
        list2 = name1.get_first_name().split()

        # list_reduce() expects the shorter list first.
        if len(list1) < len(list2):
            return self.list_reduce(list1, list2)
        return self.list_reduce(list2, list1)

    def place_match(self, p1_id, p2_id):
        """
        Score two place handles: 1 if the handles or titles are equal, 0 if
        either title is missing, otherwise a word-by-word score capped at 1,
        or -1 when no word matches.
        """
        if p1_id == p2_id:
            return 1

        if not p1_id:
            name1 = ""
        else:
            p1 = self.db.get_place_from_handle(p1_id)
            name1 = p1.get_title()

        if not p2_id:
            name2 = ""
        else:
            p2 = self.db.get_place_from_handle(p2_id)
            name2 = p2.get_title()

        if not (name1 and name2):
            return 0
        if name1 == name2:
            return 1

        words1 = name1.replace(",", " ").split()
        words2 = name2.replace(",", " ").split()

        value = 0
        for word1 in words1:
            for word2 in words2:
                if word1 == word2:
                    value += 0.5
                elif word1[0] == word2[0] and self.name_compare(word1, word2):
                    value += 0.25
        return min(value, 1) if value else -1

    def list_reduce(self, list1, list2):
        """
        Score two lists of first-name words: 0.5 per identical pair, 0.25
        per pair that matches as an initial or via name_compare().  The
        total is capped at 1; -1 is returned when nothing matches.
        """
        value = 0
        for name in list1:
            for name2 in list2:
                if is_initial(name) and name[0] == name2[0]:
                    value += 0.25
                elif is_initial(name2) and name2[0] == name[0]:
                    value += 0.25
                elif name == name2:
                    value += 0.5
                elif name[0] == name2[0] and self.name_compare(name, name2):
                    value += 0.25
        return min(value, 1) if value else -1

    def __dummy(self, obj):
        """dummy callback, needed because a shared glade file is used for
        both toplevel windows and all signals must be handled.
        """
        pass


class DuplicatePeopleToolMatches(ManagedWindow):
    """
    Results window: lists the candidate pairs found by DuplicatePeopleTool
    and lets the user merge a selected pair.
    """

    def __init__(self, dbstate, uistate, track, the_list, the_map, callback):
        ManagedWindow.__init__(self, uistate, track, self.__class__)

        self.dellist = set()
        self.list = the_list
        self.map = the_map
        self.length = len(self.list)
        self.update = callback
        self.db = dbstate.db
        self.dbstate = dbstate
        self.uistate = uistate

        top = Glade(toplevel="mergelist")
        window = top.toplevel
        self.set_window(window, top.get_object('title'),
                        _('Potential Merges'))
        self.setup_configs('interface.duplicatepeopletoolmatches', 500, 350)

        self.mlist = top.get_object("mlist")
        top.connect_signals({
            "destroy_passed_object" : self.close,
            "on_do_merge_clicked"   : self.on_do_merge_clicked,
            "on_help_show_clicked"  : self.on_help_clicked,
            "on_delete_show_event"  : self.close,
            "on_merge_ok_clicked"   : self.__dummy,
            "on_help_clicked"       : self.__dummy,
            "on_delete_merge_event" : self.__dummy,
            "on_delete_event"       : self.__dummy,
            })
        self.db.connect("person-delete", self.person_delete)

        mtitles = [
            (_('Rating'), 3, 75),
            (_('First Person'), 1, 200),
            (_('Second Person'), 2, 200),
            ('', -1, 0)
            ]
        # From here on self.list is the ListModel backing the tree view.
        self.list = ListModel(self.mlist, mtitles,
                              event_func=self.on_do_merge_clicked)

        self.redraw()
        self.show()

    def build_menu_names(self, obj):
        """Return the (submenu label, menu label) pair for this window."""
        return (_("Merge candidates"), _("Merge persons"))

    def on_help_clicked(self, obj):
        """Display the relevant portion of Gramps manual"""

        display_help(WIKI_HELP_PAGE, WIKI_HELP_SEC)

    def redraw(self):
        """
        Rebuild the displayed list from ``self.map``, leaving out pairs that
        involve a handle in ``self.dellist`` or a person no longer found.
        """
        pairs = []
        for p1key, (p2key, c) in self.map.items():
            if p1key in self.dellist or p2key in self.dellist:
                continue
            if p1key == p2key:
                continue
            pairs.append((c, p1key, p2key))

        self.list.clear()
        for (c, p1key, p2key) in pairs:
            c1 = "%5.2f" % c
            c2 = "%5.2f" % (100-c)
            p1 = self.db.get_person_from_handle(p1key)
            p2 = self.db.get_person_from_handle(p2key)
            if not p1 or not p2:
                continue
            pn1 = name_displayer.display(p1)
            pn2 = name_displayer.display(p2)
            self.list.add([c1, pn1, pn2, c2], (p1key, p2key))

    def on_do_merge_clicked(self, obj):
        """Open the merge dialog for the currently selected pair, if any."""
        store, selected = self.list.selection.get_selected()
        if not selected:
            return

        (self.p1, self.p2) = self.list.get_object(selected)
        MergePerson(self.dbstate, self.uistate, self.track, self.p1, self.p2,
                    self.on_update, True)

    def on_update(self):
        """
        Called after a merge: record which of the two people no longer
        exists, notify the caller's callback (if any) and refresh the list.
        """
        if self.db.has_person_handle(self.p1):
            titanic = self.p2
        else:
            titanic = self.p1
        self.dellist.add(titanic)
        # The tool may have been started without a callback.
        if self.update:
            self.update()
        self.redraw()

    def update_and_destroy(self, obj):
        """Notify the caller's callback (if any) and close the window."""
        if self.update:
            self.update(1)
        self.close()

    def person_delete(self, handle_list):
        """ deal with person deletes outside of the tool """
        self.dellist.update(handle_list)
        self.redraw()

    def __dummy(self, obj):
        """dummy callback, needed because a shared glade file is used for
        both toplevel windows and all signals must be handled.
        """
        pass


#-------------------------------------------------------------------------
#
# Name helpers
#
#-------------------------------------------------------------------------
def name_of(p):
    """Return "<display name> (<handle>)" for person ``p``, or "" if None."""
    if not p:
        return ""
    return "%s (%s)" % (name_displayer.display(p), p.get_handle())

def get_name_obj(person):
    """Return the primary name of ``person``, or None if there is none."""
    if person:
        return person.get_primary_name()
    else:
        return None

def get_surnames(name):
    """Construct a full surname of the surnames"""
    return ' '.join([surn.get_surname() for surn in name.get_surname_list()])

#------------------------------------------------------------------------
#
# Tool options
#
#------------------------------------------------------------------------
class DuplicatePeopleToolOptions(tool.ToolOptions):
    """
    Defines options and provides handling interface.
    """

    def __init__(self, name, person_id=None):
        tool.ToolOptions.__init__(self, name, person_id)

        # Options specific for this report
        self.options_dict = {
            'soundex'   : 1,
            'threshold' : 0.25,
        }
        self.options_help = {
            'soundex'   : ("=0/1", "Whether to use SoundEx codes",
                           ["Do not use SoundEx", "Use SoundEx"],
                           True),
            'threshold' : ("=num", "Threshold for tolerance",
                           "Floating point number")
            }