1
2=head1 NAME
3
4Lingua::EN::MatchNames - Smart matching for human names.
5
6=head1 SYNOPSIS
7
8  use Lingua::EN::MatchNames;
9
10  $score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 );
11
12=head1 DESCRIPTION
13
14You have two databases of person records that need to be synchronized or matched up,
15but they use different keys--maybe one uses SSN and the other uses employee id.
16The only fields you have to match on are first and last name.
17
18That's what this module is for.
19
20Just feed the first and last names to the C<name_eq()> function, and it returns
21C<undef> for no possible match, and a percentage of certainty (rank) otherwise.
22The ranking system isn't very scientific, and gender isn't considered, though
23it probably should be.
24
25The C<name_eq()> function, checks for:
26
27=over 4
28
29=item * inconsistent case (MacHenry = Machenry = MACHENRY)
30
31=item * inconsistent symbols (O'Brien = Obrien = O BRIEN)
32
33=item * misspellings (Grene = Green)
34
35=item * last name hyphenation (Smith-Curry = Curry)
36
37=item * similar phonetics (Hanson = Hansen)
38
39=item * nicknames (Midge = Peggy = Margaret)
40
41=item * extraneous initials (H. Ross = Ross)
42
43=item * extraneous suffixes (Reed, Jr. = Reed II = Reed)
44
45=item * and more...
46
47=back
48
49=head2 Preliminary Tests:
50
51  Homer Simpson HOMER SIMPOSN: 77
52  Marge Simpson MIDGE SIMPSON: 81
53  Brian Lalonde BRYAN LA LONDE: 82
54  Brian Lalonde RYAN LALAND: 72
55  Peggy MacHenry Midge Machenry: 81
56  Liz Grene Elizabeth Green: 72
57  Chuck Reed, Jr. Charles Reed II: 82
58  Kathy O'Brien Catherine Obrien: 81
59  Lizzie Hanson Lisa Hanson: 91
60  H. Ross Perot Ross PEROT: 88
61  Kathy Smith-Curry KATIE CURRY: 81
62  Dina Johnson-Warner Dinah J-Warner: 80
63  Leela Miles-Conrad Leela MilesConrad: 86
64  C. Renee Smythe Cathy Smythe: 71
65  Victoria (Honey) Rider HONEY RIDER: 88
66  Bart Simpson El Barto Simpson: 80
67  Bart Simpson Lisa Simpson: (no match)
68  Arthur Dent Zaphod Beeblebrox: (no match)
69
70=head1 WARNING
71
72The scoring in this version is utterly arbitrary.
73I made all of the numbers up.
74The certainty percentages should be OK relative to each other, but
75would be better if someone could give me some statistical data.
76
77Be sure and B<test> this against your data first!
78Your data may not look like my test data.
79
80And although I hope this is useful to many, I do not provide any
81kind of warranty (expressed or implied), and do not suggest the
82suitability of this module to any particular purpose.
83This module probably should not be used for life support or military
84purposes, and it B<must> not be used for unsolicited commercial email
85or other bulk advertising.
86
87=head1 REPOSITORY
88
89L<https://github.com/brianary/Lingua-EN-MatchNames>
90
91=head1 AUTHOR
92
93Brian Lalonde, E<lt>brian@webcoder.infoE<gt>
94
95=head1 REQUIREMENTS
96
97Lingua::EN::NameParse,
98Lingua::EN::Nickname,
99Parse::RecDescent,
100String::Approx,
101Text::Metaphone,
102Text::Soundex
103
104=head1 SEE ALSO
105
106perl(1),
107L<Lingua::EN::NameParse>,
108L<Lingua::EN::Nickname>,
109L<String::Approx>,
110L<Text::Metaphone>,
111L<Text::Soundex>
112
113=cut
114
115package Lingua::EN::MatchNames;
116require Exporter;
117use Carp;
118use Lingua::EN::NameParse;
119use Lingua::EN::Nickname;
120use String::Approx 'amatch';
121use Text::Metaphone;
122use Text::Soundex;
123use strict;
124use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
125use vars qw($debug);
126
127$VERSION=     '1.36';
128@ISA=         qw(Exporter);
129@EXPORT=      qw(name_eq);
130@EXPORT_OK=   qw(fname_eq lname_eq);
131%EXPORT_TAGS=
132(
133  ALL => [ @EXPORT, @EXPORT_OK ],
134);
135
136sub _nparse($)
137{
138  local $_= shift;
139  my $nparse= new Lingua::EN::NameParse( auto_clean => 1, force_case => 1 )
140    or carp "Unable to set up name parser.\n$!\n";
141  $nparse->parse($_);
142  my %name= $nparse->components;
143  return($name{given_name_1},$name{surname_1}.
144    ( $name{surname_2} ? '-'.$name{surname_2} : '' ));
145}
146
147sub fname_eq
148{
149  my($name0,$name1,$match)= @_;
150  return unless $name0 and $name1;
151  return 100 if $name0 eq $name1;
152  # recurse offset nicknames
153  if($name0=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name1,$1); }
154  if($name0=~ m/"(\w+)"/)   { return $match if $match= fname_eq($name1,$1); }
155  if($name1=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name0,$1); }
156  if($name1=~ m/"(\w+)"/)   { return $match if $match= fname_eq($name0,$1); }
157  # strip leading/trailing initial(s) (98%)
158  $name0=~ s/\W*\b\w\b\W*//g;
159  $name1=~ s/\W*\b\w\b\W*//g;
160  return 98 if $name0 eq $name1;
161  # recurse separate parts
162  if($name0=~ /\W/)
163  { # split parts, find best match
164    my($match)= sort { $b <=> $a } map {fname_eq($name1,$_)} split /\W+/, $name0;
165    return $match if $match;
166  }
167  elsif($name1=~ /\W/)
168  { # split parts, find best match
169    my($match)= sort { $b <=> $a } map {fname_eq($name0,$_)} split /\W+/, $name1;
170    return $match if $match;
171  }
172  # all caps, no symbols (95%)
173  ($name0= uc $name0)=~ y/A-Z//cd;
174  ($name1= uc $name1)=~ y/A-Z//cd;
175  return 95 if $name0 eq $name1;
176  # nickname (80%)
177  return int 0.8*$match if $match= nickname_eq($name0,$name1);
178  # fuzzy approx (15%)
179  return 35 if amatch($name0,$name1) and amatch($name1,$name0);
180  # simple trucation
181  return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/;
182  # a single initial
183  ($name0,$name1)= @_;
184  for($name0=~ m/\b(\w)\b/) { return 5 if $name1=~ /^$_/i; }
185  for($name1=~ m/\b(\w)\b/) { return 5 if $name0=~ /^$_/i; }
186  return;
187}
188
189sub lname_eq
190{
191  my($name0,$name1)= @_;
192  return unless $name0 and $name1;
193  return 100 if $name0 eq $name1;
194  # strip trailing suffixes (95%)
195  $name0=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
196  $name1=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
197  return 95 if $name0 eq $name1;
198  # recurse hyphenated components
199  if($name0=~ /-/)
200  { # split hyphenation on hyphen ONLY
201    my($match)= sort { $b <=> $a } map {lname_eq($name1,$_)} split /-/, $name0;
202    return $match if $match;
203  }
204  elsif($name1=~ /-/)
205  { # split hyphenation on hyphen ONLY
206    my($match)= sort { $b <=> $a } map {lname_eq($name0,$_)} split /-/, $name1;
207    return $match if $match;
208  }
209  # all caps, no symbols (85%)
210  ($name0= uc $name0)=~ y/A-Z//cd;
211  ($name1= uc $name1)=~ y/A-Z//cd;
212  return 85 if $name0 eq $name1;
213  # metaphone (70%)
214  return 70 if Metaphone($name0) eq Metaphone($name1);
215  # soundex (40%)
216  return 40 if soundex($name0) eq soundex($name1);
217  # fuzzy approx (15%)
218  return 25 if amatch($name0,$name1) and amatch($name1,$name0);
219  # nonstandard 'hyphenation'/simple truncation
220  ($name0,$name1)= map {(my$n=$_)=~s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;$n=~y/A-Za-z\-//cd;$n} @_;
221  return int 0.8*lname_eq($name0,$name1) if $name0=~ s/(\B[A-Z][a-z]+)/-$1/g
222    or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g;
223  return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i;
224  return;
225}
226
227sub name_eq
228{
229  my($nomF0,$nomL0,$nomF1,$nomL1,$Frank,$Lrank)=
230    ( @_ < 4 ? (_nparse($_[0]),_nparse($_[1])) : @_ );
231  return unless $Lrank= lname_eq $nomL0, $nomL1;
232  return unless $Frank= fname_eq $nomF0, $nomF1;
233  return int $Lrank*0.7 + $Frank*0.3; # another ratio I just made up
234}
235
2361
237