1 2=head1 NAME 3 4Lingua::EN::MatchNames - Smart matching for human names. 5 6=head1 SYNOPSIS 7 8 use Lingua::EN::MatchNames; 9 10 $score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 ); 11 12=head1 DESCRIPTION 13 14You have two databases of person records that need to be synchronized or matched up, 15but they use different keys--maybe one uses SSN and the other uses employee id. 16The only fields you have to match on are first and last name. 17 18That's what this module is for. 19 20Just feed the first and last names to the C<name_eq()> function, and it returns 21C<undef> for no possible match, and a percentage of certainty (rank) otherwise. 22The ranking system isn't very scientific, and gender isn't considered, though 23it probably should be. 24 25The C<name_eq()> function, checks for: 26 27=over 4 28 29=item * inconsistent case (MacHenry = Machenry = MACHENRY) 30 31=item * inconsistent symbols (O'Brien = Obrien = O BRIEN) 32 33=item * misspellings (Grene = Green) 34 35=item * last name hyphenation (Smith-Curry = Curry) 36 37=item * similar phonetics (Hanson = Hansen) 38 39=item * nicknames (Midge = Peggy = Margaret) 40 41=item * extraneous initials (H. Ross = Ross) 42 43=item * extraneous suffixes (Reed, Jr. = Reed II = Reed) 44 45=item * and more... 46 47=back 48 49=head2 Preliminary Tests: 50 51 Homer Simpson HOMER SIMPOSN: 77 52 Marge Simpson MIDGE SIMPSON: 81 53 Brian Lalonde BRYAN LA LONDE: 82 54 Brian Lalonde RYAN LALAND: 72 55 Peggy MacHenry Midge Machenry: 81 56 Liz Grene Elizabeth Green: 72 57 Chuck Reed, Jr. Charles Reed II: 82 58 Kathy O'Brien Catherine Obrien: 81 59 Lizzie Hanson Lisa Hanson: 91 60 H. Ross Perot Ross PEROT: 88 61 Kathy Smith-Curry KATIE CURRY: 81 62 Dina Johnson-Warner Dinah J-Warner: 80 63 Leela Miles-Conrad Leela MilesConrad: 86 64 C. Renee Smythe Cathy Smythe: 71 65 Victoria (Honey) Rider HONEY RIDER: 88 66 Bart Simpson El Barto Simpson: 80 67 Bart Simpson Lisa Simpson: (no match) 68 Arthur Dent Zaphod Beeblebrox: (no match) 69 70=head1 WARNING 71 72The scoring in this version is utterly arbitrary. 73I made all of the numbers up. 74The certainty percentages should be OK relative to each other, but 75would be better if someone could give me some statistical data. 76 77Be sure and B<test> this against your data first! 78Your data may not look like my test data. 79 80And although I hope this is useful to many, I do not provide any 81kind of warranty (expressed or implied), and do not suggest the 82suitability of this module to any particular purpose. 83This module probably should not be used for life support or military 84purposes, and it B<must> not be used for unsolicited commercial email 85or other bulk advertising. 86 87=head1 REPOSITORY 88 89L<https://github.com/brianary/Lingua-EN-MatchNames> 90 91=head1 AUTHOR 92 93Brian Lalonde, E<lt>brian@webcoder.infoE<gt> 94 95=head1 REQUIREMENTS 96 97Lingua::EN::NameParse, 98Lingua::EN::Nickname, 99Parse::RecDescent, 100String::Approx, 101Text::Metaphone, 102Text::Soundex 103 104=head1 SEE ALSO 105 106perl(1), 107L<Lingua::EN::NameParse>, 108L<Lingua::EN::Nickname>, 109L<String::Approx>, 110L<Text::Metaphone>, 111L<Text::Soundex> 112 113=cut 114 115package Lingua::EN::MatchNames; 116require Exporter; 117use Carp; 118use Lingua::EN::NameParse; 119use Lingua::EN::Nickname; 120use String::Approx 'amatch'; 121use Text::Metaphone; 122use Text::Soundex; 123use strict; 124use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 125use vars qw($debug); 126 127$VERSION= '1.36'; 128@ISA= qw(Exporter); 129@EXPORT= qw(name_eq); 130@EXPORT_OK= qw(fname_eq lname_eq); 131%EXPORT_TAGS= 132( 133 ALL => [ @EXPORT, @EXPORT_OK ], 134); 135 136sub _nparse($) 137{ 138 local $_= shift; 139 my $nparse= new Lingua::EN::NameParse( auto_clean => 1, force_case => 1 ) 140 or carp "Unable to set up name parser.\n$!\n"; 141 $nparse->parse($_); 142 my %name= $nparse->components; 143 return($name{given_name_1},$name{surname_1}. 144 ( $name{surname_2} ? '-'.$name{surname_2} : '' )); 145} 146 147sub fname_eq 148{ 149 my($name0,$name1,$match)= @_; 150 return unless $name0 and $name1; 151 return 100 if $name0 eq $name1; 152 # recurse offset nicknames 153 if($name0=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name1,$1); } 154 if($name0=~ m/"(\w+)"/) { return $match if $match= fname_eq($name1,$1); } 155 if($name1=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name0,$1); } 156 if($name1=~ m/"(\w+)"/) { return $match if $match= fname_eq($name0,$1); } 157 # strip leading/trailing initial(s) (98%) 158 $name0=~ s/\W*\b\w\b\W*//g; 159 $name1=~ s/\W*\b\w\b\W*//g; 160 return 98 if $name0 eq $name1; 161 # recurse separate parts 162 if($name0=~ /\W/) 163 { # split parts, find best match 164 my($match)= sort { $b <=> $a } map {fname_eq($name1,$_)} split /\W+/, $name0; 165 return $match if $match; 166 } 167 elsif($name1=~ /\W/) 168 { # split parts, find best match 169 my($match)= sort { $b <=> $a } map {fname_eq($name0,$_)} split /\W+/, $name1; 170 return $match if $match; 171 } 172 # all caps, no symbols (95%) 173 ($name0= uc $name0)=~ y/A-Z//cd; 174 ($name1= uc $name1)=~ y/A-Z//cd; 175 return 95 if $name0 eq $name1; 176 # nickname (80%) 177 return int 0.8*$match if $match= nickname_eq($name0,$name1); 178 # fuzzy approx (15%) 179 return 35 if amatch($name0,$name1) and amatch($name1,$name0); 180 # simple trucation 181 return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/; 182 # a single initial 183 ($name0,$name1)= @_; 184 for($name0=~ m/\b(\w)\b/) { return 5 if $name1=~ /^$_/i; } 185 for($name1=~ m/\b(\w)\b/) { return 5 if $name0=~ /^$_/i; } 186 return; 187} 188 189sub lname_eq 190{ 191 my($name0,$name1)= @_; 192 return unless $name0 and $name1; 193 return 100 if $name0 eq $name1; 194 # strip trailing suffixes (95%) 195 $name0=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//; 196 $name1=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//; 197 return 95 if $name0 eq $name1; 198 # recurse hyphenated components 199 if($name0=~ /-/) 200 { # split hyphenation on hyphen ONLY 201 my($match)= sort { $b <=> $a } map {lname_eq($name1,$_)} split /-/, $name0; 202 return $match if $match; 203 } 204 elsif($name1=~ /-/) 205 { # split hyphenation on hyphen ONLY 206 my($match)= sort { $b <=> $a } map {lname_eq($name0,$_)} split /-/, $name1; 207 return $match if $match; 208 } 209 # all caps, no symbols (85%) 210 ($name0= uc $name0)=~ y/A-Z//cd; 211 ($name1= uc $name1)=~ y/A-Z//cd; 212 return 85 if $name0 eq $name1; 213 # metaphone (70%) 214 return 70 if Metaphone($name0) eq Metaphone($name1); 215 # soundex (40%) 216 return 40 if soundex($name0) eq soundex($name1); 217 # fuzzy approx (15%) 218 return 25 if amatch($name0,$name1) and amatch($name1,$name0); 219 # nonstandard 'hyphenation'/simple truncation 220 ($name0,$name1)= map {(my$n=$_)=~s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;$n=~y/A-Za-z\-//cd;$n} @_; 221 return int 0.8*lname_eq($name0,$name1) if $name0=~ s/(\B[A-Z][a-z]+)/-$1/g 222 or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g; 223 return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i; 224 return; 225} 226 227sub name_eq 228{ 229 my($nomF0,$nomL0,$nomF1,$nomL1,$Frank,$Lrank)= 230 ( @_ < 4 ? (_nparse($_[0]),_nparse($_[1])) : @_ ); 231 return unless $Lrank= lname_eq $nomL0, $nomL1; 232 return unless $Frank= fname_eq $nomF0, $nomF1; 233 return int $Lrank*0.7 + $Frank*0.3; # another ratio I just made up 234} 235 2361 237