1 /*
2 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3 *
4 * contrib/fuzzystrmatch/dmetaphone.c
5 *
6 * Double Metaphone computes 2 "sounds like" strings - a primary and an
7 * alternate. In most cases they are the same, but for foreign names
8 * especially they can be a bit different, depending on pronunciation.
9 *
10 * Information on using Double Metaphone can be found at
11 * http://www.codeproject.com/string/dmetaphone1.asp
12 * and the original article describing it can be found at
13 * http://drdobbs.com/184401251
14 *
15 * For PostgreSQL we provide 2 functions - one for the primary and one for
16 * the alternate. That way the functions are pure text->text mappings that
17 * are useful in functional indexes. These are 'dmetaphone' for the
18 * primary and 'dmetaphone_alt' for the alternate.
19 *
20 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21 * functions looks like this:
22 *
23 * CREATE FUNCTION dmetaphone (text) RETURNS text
24 * LANGUAGE C IMMUTABLE STRICT
25 * AS '$libdir/dmetaphone', 'dmetaphone';
26 *
27 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28 * LANGUAGE C IMMUTABLE STRICT
29 * AS '$libdir/dmetaphone', 'dmetaphone_alt';
30 *
31 * Note that you have to declare the functions IMMUTABLE if you want to
32 * use them in functional indexes, and you have to declare them as STRICT
33 * as they do not check for NULL input, and will segfault if given NULL input.
34 * (See below for alternative ) Declaring them as STRICT means PostgreSQL
35 * will never call them with NULL, but instead assume the result is NULL,
36 * which is what we (I) want.
37 *
38 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39 * will detect NULL input and return NULL. The you don't have to declare them
40 * as STRICT.
41 *
42 * There is a small inefficiency here - each function call actually computes
43 * both the primary and the alternate and then throws away the one it doesn't
44 * need. That's the way the perl module was written, because perl can handle
45 * a list return more easily than we can in PostgreSQL. The result has been
46 * fast enough for my needs, but it could maybe be optimized a bit to remove
47 * that behaviour.
48 *
49 */
50
51
52 /***************************** COPYRIGHT NOTICES ***********************
53
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from http://www.cpan.org.
56 It bears this copyright notice:
57
58
59 Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60 All rights reserved.
61
62 This code is based heavily on the C++ implementation by
63 Lawrence Philips and incorporates several bug fixes courtesy
64 of Kevin Atkinson <kevina@users.sourceforge.net>.
65
66 This module is free software; you may redistribute it and/or
67 modify it under the same terms as Perl itself.
68
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered this copyright:
71
72 Copyright 2003, North Carolina State Highway Patrol.
73 All rights reserved.
74
75 Permission to use, copy, modify, and distribute this software and its
76 documentation for any purpose, without fee, and without a written agreement
77 is hereby granted, provided that the above copyright notice and this
78 paragraph and the following two paragraphs appear in all copies.
79
80 IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81 PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82 INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83 DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85
86 THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89 HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90 HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91 MODIFICATIONS.
92
93 ***********************************************************************/
94
95
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
98
99 #include "postgres.h"
100
101 #include "utils/builtins.h"
102
103 /* turn off assertions for embedded function */
104 #define NDEBUG
105
106 #else /* DMETAPHONE_MAIN */
107
108 /* we need these if we didn't get them from postgres.h */
109 #include <stdio.h>
110 #include <stdlib.h>
111 #include <string.h>
112 #include <stdarg.h>
113
114 #endif /* DMETAPHONE_MAIN */
115
116 #include <assert.h>
117 #include <ctype.h>
118
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *, char **);
121
122 #ifndef DMETAPHONE_MAIN
123
124 /*
125 * The PostgreSQL visible dmetaphone function.
126 */
127
128 PG_FUNCTION_INFO_V1(dmetaphone);
129
130 Datum
dmetaphone(PG_FUNCTION_ARGS)131 dmetaphone(PG_FUNCTION_ARGS)
132 {
133 text *arg;
134 char *aptr,
135 *codes[2],
136 *code;
137
138 #ifdef DMETAPHONE_NOSTRICT
139 if (PG_ARGISNULL(0))
140 PG_RETURN_NULL();
141 #endif
142 arg = PG_GETARG_TEXT_PP(0);
143 aptr = text_to_cstring(arg);
144
145 DoubleMetaphone(aptr, codes);
146 code = codes[0];
147 if (!code)
148 code = "";
149
150 PG_RETURN_TEXT_P(cstring_to_text(code));
151 }
152
153 /*
154 * The PostgreSQL visible dmetaphone_alt function.
155 */
156
157 PG_FUNCTION_INFO_V1(dmetaphone_alt);
158
159 Datum
dmetaphone_alt(PG_FUNCTION_ARGS)160 dmetaphone_alt(PG_FUNCTION_ARGS)
161 {
162 text *arg;
163 char *aptr,
164 *codes[2],
165 *code;
166
167 #ifdef DMETAPHONE_NOSTRICT
168 if (PG_ARGISNULL(0))
169 PG_RETURN_NULL();
170 #endif
171 arg = PG_GETARG_TEXT_PP(0);
172 aptr = text_to_cstring(arg);
173
174 DoubleMetaphone(aptr, codes);
175 code = codes[1];
176 if (!code)
177 code = "";
178
179 PG_RETURN_TEXT_P(cstring_to_text(code));
180 }
181
182
183 /* here is where we start the code imported from the perl module */
184
185 /* all memory handling is done with these macros */
186
187 #define META_MALLOC(v,n,t) \
188 (v = (t*)palloc(((n)*sizeof(t))))
189
190 #define META_REALLOC(v,n,t) \
191 (v = (t*)repalloc((v),((n)*sizeof(t))))
192
193 /*
194 * Don't do pfree - it seems to cause a segv sometimes - which might have just
195 * been caused by reloading the module in development.
196 * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197 * in a case like this.
198 */
199
200 #define META_FREE(x) ((void)true) /* pfree((x)) */
201 #else /* not defined DMETAPHONE_MAIN */
202
203 /* use the standard malloc library when not running in PostgreSQL */
204
205 #define META_MALLOC(v,n,t) \
206 (v = (t*)malloc(((n)*sizeof(t))))
207
208 #define META_REALLOC(v,n,t) \
209 (v = (t*)realloc((v),((n)*sizeof(t))))
210
211 #define META_FREE(x) free((x))
212 #endif /* defined DMETAPHONE_MAIN */
213
214
215
216 /* this typedef was originally in the perl module's .h file */
217
218 typedef struct
219 {
220 char *str;
221 int length;
222 int bufsize;
223 int free_string_on_destroy;
224 }
225
226 metastring;
227
228 /*
229 * remaining perl module funcs unchanged except for declaring them static
230 * and reformatting to PostgreSQL indentation and to fit in 80 cols.
231 *
232 */
233
234 static metastring *
NewMetaString(char * init_str)235 NewMetaString(char *init_str)
236 {
237 metastring *s;
238 char empty_string[] = "";
239
240 META_MALLOC(s, 1, metastring);
241 assert(s != NULL);
242
243 if (init_str == NULL)
244 init_str = empty_string;
245 s->length = strlen(init_str);
246 /* preallocate a bit more for potential growth */
247 s->bufsize = s->length + 7;
248
249 META_MALLOC(s->str, s->bufsize, char);
250 assert(s->str != NULL);
251
252 memcpy(s->str, init_str, s->length + 1);
253 s->free_string_on_destroy = 1;
254
255 return s;
256 }
257
258
259 static void
DestroyMetaString(metastring * s)260 DestroyMetaString(metastring *s)
261 {
262 if (s == NULL)
263 return;
264
265 if (s->free_string_on_destroy && (s->str != NULL))
266 META_FREE(s->str);
267
268 META_FREE(s);
269 }
270
271
272 static void
IncreaseBuffer(metastring * s,int chars_needed)273 IncreaseBuffer(metastring *s, int chars_needed)
274 {
275 META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276 assert(s->str != NULL);
277 s->bufsize = s->bufsize + chars_needed + 10;
278 }
279
280
281 static void
MakeUpper(metastring * s)282 MakeUpper(metastring *s)
283 {
284 char *i;
285
286 for (i = s->str; *i; i++)
287 *i = toupper((unsigned char) *i);
288 }
289
290
291 static int
IsVowel(metastring * s,int pos)292 IsVowel(metastring *s, int pos)
293 {
294 char c;
295
296 if ((pos < 0) || (pos >= s->length))
297 return 0;
298
299 c = *(s->str + pos);
300 if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301 (c == 'U') || (c == 'Y'))
302 return 1;
303
304 return 0;
305 }
306
307
308 static int
SlavoGermanic(metastring * s)309 SlavoGermanic(metastring *s)
310 {
311 if ((char *) strstr(s->str, "W"))
312 return 1;
313 else if ((char *) strstr(s->str, "K"))
314 return 1;
315 else if ((char *) strstr(s->str, "CZ"))
316 return 1;
317 else if ((char *) strstr(s->str, "WITZ"))
318 return 1;
319 else
320 return 0;
321 }
322
323
324 static char
GetAt(metastring * s,int pos)325 GetAt(metastring *s, int pos)
326 {
327 if ((pos < 0) || (pos >= s->length))
328 return '\0';
329
330 return ((char) *(s->str + pos));
331 }
332
333
334 static void
SetAt(metastring * s,int pos,char c)335 SetAt(metastring *s, int pos, char c)
336 {
337 if ((pos < 0) || (pos >= s->length))
338 return;
339
340 *(s->str + pos) = c;
341 }
342
343
344 /*
345 Caveats: the START value is 0 based
346 */
347 static int
StringAt(metastring * s,int start,int length,...)348 StringAt(metastring *s, int start, int length,...)
349 {
350 char *test;
351 char *pos;
352 va_list ap;
353
354 if ((start < 0) || (start >= s->length))
355 return 0;
356
357 pos = (s->str + start);
358 va_start(ap, length);
359
360 do
361 {
362 test = va_arg(ap, char *);
363 if (*test && (strncmp(pos, test, length) == 0))
364 {
365 va_end(ap);
366 return 1;
367 }
368 }
369 while (strcmp(test, "") != 0);
370
371 va_end(ap);
372
373 return 0;
374 }
375
376
377 static void
MetaphAdd(metastring * s,char * new_str)378 MetaphAdd(metastring *s, char *new_str)
379 {
380 int add_length;
381
382 if (new_str == NULL)
383 return;
384
385 add_length = strlen(new_str);
386 if ((s->length + add_length) > (s->bufsize - 1))
387 IncreaseBuffer(s, add_length);
388
389 strcat(s->str, new_str);
390 s->length += add_length;
391 }
392
393
394 static void
DoubleMetaphone(char * str,char ** codes)395 DoubleMetaphone(char *str, char **codes)
396 {
397 int length;
398 metastring *original;
399 metastring *primary;
400 metastring *secondary;
401 int current;
402 int last;
403
404 current = 0;
405 /* we need the real length and last prior to padding */
406 length = strlen(str);
407 last = length - 1;
408 original = NewMetaString(str);
409 /* Pad original so we can index beyond end */
410 MetaphAdd(original, " ");
411
412 primary = NewMetaString("");
413 secondary = NewMetaString("");
414 primary->free_string_on_destroy = 0;
415 secondary->free_string_on_destroy = 0;
416
417 MakeUpper(original);
418
419 /* skip these when at start of word */
420 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421 current += 1;
422
423 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424 if (GetAt(original, 0) == 'X')
425 {
426 MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
427 MetaphAdd(secondary, "S");
428 current += 1;
429 }
430
431 /* main loop */
432 while ((primary->length < 4) || (secondary->length < 4))
433 {
434 if (current >= length)
435 break;
436
437 switch (GetAt(original, current))
438 {
439 case 'A':
440 case 'E':
441 case 'I':
442 case 'O':
443 case 'U':
444 case 'Y':
445 if (current == 0)
446 {
447 /* all init vowels now map to 'A' */
448 MetaphAdd(primary, "A");
449 MetaphAdd(secondary, "A");
450 }
451 current += 1;
452 break;
453
454 case 'B':
455
456 /* "-mb", e.g", "dumb", already skipped over... */
457 MetaphAdd(primary, "P");
458 MetaphAdd(secondary, "P");
459
460 if (GetAt(original, current + 1) == 'B')
461 current += 2;
462 else
463 current += 1;
464 break;
465
466 case '\xc7': /* C with cedilla */
467 MetaphAdd(primary, "S");
468 MetaphAdd(secondary, "S");
469 current += 1;
470 break;
471
472 case 'C':
473 /* various germanic */
474 if ((current > 1)
475 && !IsVowel(original, current - 2)
476 && StringAt(original, (current - 1), 3, "ACH", "")
477 && ((GetAt(original, current + 2) != 'I')
478 && ((GetAt(original, current + 2) != 'E')
479 || StringAt(original, (current - 2), 6, "BACHER",
480 "MACHER", ""))))
481 {
482 MetaphAdd(primary, "K");
483 MetaphAdd(secondary, "K");
484 current += 2;
485 break;
486 }
487
488 /* special case 'caesar' */
489 if ((current == 0)
490 && StringAt(original, current, 6, "CAESAR", ""))
491 {
492 MetaphAdd(primary, "S");
493 MetaphAdd(secondary, "S");
494 current += 2;
495 break;
496 }
497
498 /* italian 'chianti' */
499 if (StringAt(original, current, 4, "CHIA", ""))
500 {
501 MetaphAdd(primary, "K");
502 MetaphAdd(secondary, "K");
503 current += 2;
504 break;
505 }
506
507 if (StringAt(original, current, 2, "CH", ""))
508 {
509 /* find 'michael' */
510 if ((current > 0)
511 && StringAt(original, current, 4, "CHAE", ""))
512 {
513 MetaphAdd(primary, "K");
514 MetaphAdd(secondary, "X");
515 current += 2;
516 break;
517 }
518
519 /* greek roots e.g. 'chemistry', 'chorus' */
520 if ((current == 0)
521 && (StringAt(original, (current + 1), 5,
522 "HARAC", "HARIS", "")
523 || StringAt(original, (current + 1), 3, "HOR",
524 "HYM", "HIA", "HEM", ""))
525 && !StringAt(original, 0, 5, "CHORE", ""))
526 {
527 MetaphAdd(primary, "K");
528 MetaphAdd(secondary, "K");
529 current += 2;
530 break;
531 }
532
533 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534 if (
535 (StringAt(original, 0, 4, "VAN ", "VON ", "")
536 || StringAt(original, 0, 3, "SCH", ""))
537 /* 'architect but not 'arch', 'orchestra', 'orchid' */
538 || StringAt(original, (current - 2), 6, "ORCHES",
539 "ARCHIT", "ORCHID", "")
540 || StringAt(original, (current + 2), 1, "T", "S",
541 "")
542 || ((StringAt(original, (current - 1), 1,
543 "A", "O", "U", "E", "")
544 || (current == 0))
545
546 /*
547 * e.g., 'wachtler', 'wechsler', but not 'tichner'
548 */
549 && StringAt(original, (current + 2), 1, "L", "R",
550 "N", "M", "B", "H", "F", "V", "W",
551 " ", "")))
552 {
553 MetaphAdd(primary, "K");
554 MetaphAdd(secondary, "K");
555 }
556 else
557 {
558 if (current > 0)
559 {
560 if (StringAt(original, 0, 2, "MC", ""))
561 {
562 /* e.g., "McHugh" */
563 MetaphAdd(primary, "K");
564 MetaphAdd(secondary, "K");
565 }
566 else
567 {
568 MetaphAdd(primary, "X");
569 MetaphAdd(secondary, "K");
570 }
571 }
572 else
573 {
574 MetaphAdd(primary, "X");
575 MetaphAdd(secondary, "X");
576 }
577 }
578 current += 2;
579 break;
580 }
581 /* e.g, 'czerny' */
582 if (StringAt(original, current, 2, "CZ", "")
583 && !StringAt(original, (current - 2), 4, "WICZ", ""))
584 {
585 MetaphAdd(primary, "S");
586 MetaphAdd(secondary, "X");
587 current += 2;
588 break;
589 }
590
591 /* e.g., 'focaccia' */
592 if (StringAt(original, (current + 1), 3, "CIA", ""))
593 {
594 MetaphAdd(primary, "X");
595 MetaphAdd(secondary, "X");
596 current += 3;
597 break;
598 }
599
600 /* double 'C', but not if e.g. 'McClellan' */
601 if (StringAt(original, current, 2, "CC", "")
602 && !((current == 1) && (GetAt(original, 0) == 'M')))
603 {
604 /* 'bellocchio' but not 'bacchus' */
605 if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
606 && !StringAt(original, (current + 2), 2, "HU", ""))
607 {
608 /* 'accident', 'accede' 'succeed' */
609 if (
610 ((current == 1)
611 && (GetAt(original, current - 1) == 'A'))
612 || StringAt(original, (current - 1), 5, "UCCEE",
613 "UCCES", ""))
614 {
615 MetaphAdd(primary, "KS");
616 MetaphAdd(secondary, "KS");
617 /* 'bacci', 'bertucci', other italian */
618 }
619 else
620 {
621 MetaphAdd(primary, "X");
622 MetaphAdd(secondary, "X");
623 }
624 current += 3;
625 break;
626 }
627 else
628 { /* Pierce's rule */
629 MetaphAdd(primary, "K");
630 MetaphAdd(secondary, "K");
631 current += 2;
632 break;
633 }
634 }
635
636 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
637 {
638 MetaphAdd(primary, "K");
639 MetaphAdd(secondary, "K");
640 current += 2;
641 break;
642 }
643
644 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
645 {
646 /* italian vs. english */
647 if (StringAt
648 (original, current, 3, "CIO", "CIE", "CIA", ""))
649 {
650 MetaphAdd(primary, "S");
651 MetaphAdd(secondary, "X");
652 }
653 else
654 {
655 MetaphAdd(primary, "S");
656 MetaphAdd(secondary, "S");
657 }
658 current += 2;
659 break;
660 }
661
662 /* else */
663 MetaphAdd(primary, "K");
664 MetaphAdd(secondary, "K");
665
666 /* name sent in 'mac caffrey', 'mac gregor */
667 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
668 current += 3;
669 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
670 && !StringAt(original, (current + 1), 2,
671 "CE", "CI", ""))
672 current += 2;
673 else
674 current += 1;
675 break;
676
677 case 'D':
678 if (StringAt(original, current, 2, "DG", ""))
679 {
680 if (StringAt(original, (current + 2), 1,
681 "I", "E", "Y", ""))
682 {
683 /* e.g. 'edge' */
684 MetaphAdd(primary, "J");
685 MetaphAdd(secondary, "J");
686 current += 3;
687 break;
688 }
689 else
690 {
691 /* e.g. 'edgar' */
692 MetaphAdd(primary, "TK");
693 MetaphAdd(secondary, "TK");
694 current += 2;
695 break;
696 }
697 }
698
699 if (StringAt(original, current, 2, "DT", "DD", ""))
700 {
701 MetaphAdd(primary, "T");
702 MetaphAdd(secondary, "T");
703 current += 2;
704 break;
705 }
706
707 /* else */
708 MetaphAdd(primary, "T");
709 MetaphAdd(secondary, "T");
710 current += 1;
711 break;
712
713 case 'F':
714 if (GetAt(original, current + 1) == 'F')
715 current += 2;
716 else
717 current += 1;
718 MetaphAdd(primary, "F");
719 MetaphAdd(secondary, "F");
720 break;
721
722 case 'G':
723 if (GetAt(original, current + 1) == 'H')
724 {
725 if ((current > 0) && !IsVowel(original, current - 1))
726 {
727 MetaphAdd(primary, "K");
728 MetaphAdd(secondary, "K");
729 current += 2;
730 break;
731 }
732
733 if (current < 3)
734 {
735 /* 'ghislane', ghiradelli */
736 if (current == 0)
737 {
738 if (GetAt(original, current + 2) == 'I')
739 {
740 MetaphAdd(primary, "J");
741 MetaphAdd(secondary, "J");
742 }
743 else
744 {
745 MetaphAdd(primary, "K");
746 MetaphAdd(secondary, "K");
747 }
748 current += 2;
749 break;
750 }
751 }
752
753 /*
754 * Parker's rule (with some further refinements) - e.g.,
755 * 'hugh'
756 */
757 if (
758 ((current > 1)
759 && StringAt(original, (current - 2), 1,
760 "B", "H", "D", ""))
761 /* e.g., 'bough' */
762 || ((current > 2)
763 && StringAt(original, (current - 3), 1,
764 "B", "H", "D", ""))
765 /* e.g., 'broughton' */
766 || ((current > 3)
767 && StringAt(original, (current - 4), 1,
768 "B", "H", "")))
769 {
770 current += 2;
771 break;
772 }
773 else
774 {
775 /*
776 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
777 * 'rough', 'tough'
778 */
779 if ((current > 2)
780 && (GetAt(original, current - 1) == 'U')
781 && StringAt(original, (current - 3), 1, "C",
782 "G", "L", "R", "T", ""))
783 {
784 MetaphAdd(primary, "F");
785 MetaphAdd(secondary, "F");
786 }
787 else if ((current > 0)
788 && GetAt(original, current - 1) != 'I')
789 {
790
791
792 MetaphAdd(primary, "K");
793 MetaphAdd(secondary, "K");
794 }
795
796 current += 2;
797 break;
798 }
799 }
800
801 if (GetAt(original, current + 1) == 'N')
802 {
803 if ((current == 1) && IsVowel(original, 0)
804 && !SlavoGermanic(original))
805 {
806 MetaphAdd(primary, "KN");
807 MetaphAdd(secondary, "N");
808 }
809 else
810 /* not e.g. 'cagney' */
811 if (!StringAt(original, (current + 2), 2, "EY", "")
812 && (GetAt(original, current + 1) != 'Y')
813 && !SlavoGermanic(original))
814 {
815 MetaphAdd(primary, "N");
816 MetaphAdd(secondary, "KN");
817 }
818 else
819 {
820 MetaphAdd(primary, "KN");
821 MetaphAdd(secondary, "KN");
822 }
823 current += 2;
824 break;
825 }
826
827 /* 'tagliaro' */
828 if (StringAt(original, (current + 1), 2, "LI", "")
829 && !SlavoGermanic(original))
830 {
831 MetaphAdd(primary, "KL");
832 MetaphAdd(secondary, "L");
833 current += 2;
834 break;
835 }
836
837 /* -ges-,-gep-,-gel-, -gie- at beginning */
838 if ((current == 0)
839 && ((GetAt(original, current + 1) == 'Y')
840 || StringAt(original, (current + 1), 2, "ES", "EP",
841 "EB", "EL", "EY", "IB", "IL", "IN", "IE",
842 "EI", "ER", "")))
843 {
844 MetaphAdd(primary, "K");
845 MetaphAdd(secondary, "J");
846 current += 2;
847 break;
848 }
849
850 /* -ger-, -gy- */
851 if (
852 (StringAt(original, (current + 1), 2, "ER", "")
853 || (GetAt(original, current + 1) == 'Y'))
854 && !StringAt(original, 0, 6,
855 "DANGER", "RANGER", "MANGER", "")
856 && !StringAt(original, (current - 1), 1, "E", "I", "")
857 && !StringAt(original, (current - 1), 3, "RGY", "OGY",
858 ""))
859 {
860 MetaphAdd(primary, "K");
861 MetaphAdd(secondary, "J");
862 current += 2;
863 break;
864 }
865
866 /* italian e.g, 'biaggi' */
867 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
868 || StringAt(original, (current - 1), 4,
869 "AGGI", "OGGI", ""))
870 {
871 /* obvious germanic */
872 if (
873 (StringAt(original, 0, 4, "VAN ", "VON ", "")
874 || StringAt(original, 0, 3, "SCH", ""))
875 || StringAt(original, (current + 1), 2, "ET", ""))
876 {
877 MetaphAdd(primary, "K");
878 MetaphAdd(secondary, "K");
879 }
880 else
881 {
882 /* always soft if french ending */
883 if (StringAt
884 (original, (current + 1), 4, "IER ", ""))
885 {
886 MetaphAdd(primary, "J");
887 MetaphAdd(secondary, "J");
888 }
889 else
890 {
891 MetaphAdd(primary, "J");
892 MetaphAdd(secondary, "K");
893 }
894 }
895 current += 2;
896 break;
897 }
898
899 if (GetAt(original, current + 1) == 'G')
900 current += 2;
901 else
902 current += 1;
903 MetaphAdd(primary, "K");
904 MetaphAdd(secondary, "K");
905 break;
906
907 case 'H':
908 /* only keep if first & before vowel or btw. 2 vowels */
909 if (((current == 0) || IsVowel(original, current - 1))
910 && IsVowel(original, current + 1))
911 {
912 MetaphAdd(primary, "H");
913 MetaphAdd(secondary, "H");
914 current += 2;
915 }
916 else
917 /* also takes care of 'HH' */
918 current += 1;
919 break;
920
921 case 'J':
922 /* obvious spanish, 'jose', 'san jacinto' */
923 if (StringAt(original, current, 4, "JOSE", "")
924 || StringAt(original, 0, 4, "SAN ", ""))
925 {
926 if (((current == 0)
927 && (GetAt(original, current + 4) == ' '))
928 || StringAt(original, 0, 4, "SAN ", ""))
929 {
930 MetaphAdd(primary, "H");
931 MetaphAdd(secondary, "H");
932 }
933 else
934 {
935 MetaphAdd(primary, "J");
936 MetaphAdd(secondary, "H");
937 }
938 current += 1;
939 break;
940 }
941
942 if ((current == 0)
943 && !StringAt(original, current, 4, "JOSE", ""))
944 {
945 MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
946 MetaphAdd(secondary, "A");
947 }
948 else
949 {
950 /* spanish pron. of e.g. 'bajador' */
951 if (IsVowel(original, current - 1)
952 && !SlavoGermanic(original)
953 && ((GetAt(original, current + 1) == 'A')
954 || (GetAt(original, current + 1) == 'O')))
955 {
956 MetaphAdd(primary, "J");
957 MetaphAdd(secondary, "H");
958 }
959 else
960 {
961 if (current == last)
962 {
963 MetaphAdd(primary, "J");
964 MetaphAdd(secondary, "");
965 }
966 else
967 {
968 if (!StringAt(original, (current + 1), 1, "L", "T",
969 "K", "S", "N", "M", "B", "Z", "")
970 && !StringAt(original, (current - 1), 1,
971 "S", "K", "L", ""))
972 {
973 MetaphAdd(primary, "J");
974 MetaphAdd(secondary, "J");
975 }
976 }
977 }
978 }
979
980 if (GetAt(original, current + 1) == 'J') /* it could happen! */
981 current += 2;
982 else
983 current += 1;
984 break;
985
986 case 'K':
987 if (GetAt(original, current + 1) == 'K')
988 current += 2;
989 else
990 current += 1;
991 MetaphAdd(primary, "K");
992 MetaphAdd(secondary, "K");
993 break;
994
995 case 'L':
996 if (GetAt(original, current + 1) == 'L')
997 {
998 /* spanish e.g. 'cabrillo', 'gallegos' */
999 if (((current == (length - 3))
1000 && StringAt(original, (current - 1), 4, "ILLO",
1001 "ILLA", "ALLE", ""))
1002 || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
1003 || StringAt(original, last, 1, "A", "O", ""))
1004 && StringAt(original, (current - 1), 4,
1005 "ALLE", "")))
1006 {
1007 MetaphAdd(primary, "L");
1008 MetaphAdd(secondary, "");
1009 current += 2;
1010 break;
1011 }
1012 current += 2;
1013 }
1014 else
1015 current += 1;
1016 MetaphAdd(primary, "L");
1017 MetaphAdd(secondary, "L");
1018 break;
1019
1020 case 'M':
1021 if ((StringAt(original, (current - 1), 3, "UMB", "")
1022 && (((current + 1) == last)
1023 || StringAt(original, (current + 2), 2, "ER", "")))
1024 /* 'dumb','thumb' */
1025 || (GetAt(original, current + 1) == 'M'))
1026 current += 2;
1027 else
1028 current += 1;
1029 MetaphAdd(primary, "M");
1030 MetaphAdd(secondary, "M");
1031 break;
1032
1033 case 'N':
1034 if (GetAt(original, current + 1) == 'N')
1035 current += 2;
1036 else
1037 current += 1;
1038 MetaphAdd(primary, "N");
1039 MetaphAdd(secondary, "N");
1040 break;
1041
1042 case '\xd1': /* N with tilde */
1043 current += 1;
1044 MetaphAdd(primary, "N");
1045 MetaphAdd(secondary, "N");
1046 break;
1047
1048 case 'P':
1049 if (GetAt(original, current + 1) == 'H')
1050 {
1051 MetaphAdd(primary, "F");
1052 MetaphAdd(secondary, "F");
1053 current += 2;
1054 break;
1055 }
1056
1057 /* also account for "campbell", "raspberry" */
1058 if (StringAt(original, (current + 1), 1, "P", "B", ""))
1059 current += 2;
1060 else
1061 current += 1;
1062 MetaphAdd(primary, "P");
1063 MetaphAdd(secondary, "P");
1064 break;
1065
1066 case 'Q':
1067 if (GetAt(original, current + 1) == 'Q')
1068 current += 2;
1069 else
1070 current += 1;
1071 MetaphAdd(primary, "K");
1072 MetaphAdd(secondary, "K");
1073 break;
1074
1075 case 'R':
1076 /* french e.g. 'rogier', but exclude 'hochmeier' */
1077 if ((current == last)
1078 && !SlavoGermanic(original)
1079 && StringAt(original, (current - 2), 2, "IE", "")
1080 && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1081 {
1082 MetaphAdd(primary, "");
1083 MetaphAdd(secondary, "R");
1084 }
1085 else
1086 {
1087 MetaphAdd(primary, "R");
1088 MetaphAdd(secondary, "R");
1089 }
1090
1091 if (GetAt(original, current + 1) == 'R')
1092 current += 2;
1093 else
1094 current += 1;
1095 break;
1096
1097 case 'S':
1098 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1099 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1100 {
1101 current += 1;
1102 break;
1103 }
1104
1105 /* special case 'sugar-' */
1106 if ((current == 0)
1107 && StringAt(original, current, 5, "SUGAR", ""))
1108 {
1109 MetaphAdd(primary, "X");
1110 MetaphAdd(secondary, "S");
1111 current += 1;
1112 break;
1113 }
1114
1115 if (StringAt(original, current, 2, "SH", ""))
1116 {
1117 /* germanic */
1118 if (StringAt
1119 (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1120 "HOLZ", ""))
1121 {
1122 MetaphAdd(primary, "S");
1123 MetaphAdd(secondary, "S");
1124 }
1125 else
1126 {
1127 MetaphAdd(primary, "X");
1128 MetaphAdd(secondary, "X");
1129 }
1130 current += 2;
1131 break;
1132 }
1133
1134 /* italian & armenian */
1135 if (StringAt(original, current, 3, "SIO", "SIA", "")
1136 || StringAt(original, current, 4, "SIAN", ""))
1137 {
1138 if (!SlavoGermanic(original))
1139 {
1140 MetaphAdd(primary, "S");
1141 MetaphAdd(secondary, "X");
1142 }
1143 else
1144 {
1145 MetaphAdd(primary, "S");
1146 MetaphAdd(secondary, "S");
1147 }
1148 current += 3;
1149 break;
1150 }
1151
1152 /*
1153 * german & anglicisations, e.g. 'smith' match 'schmidt',
1154 * 'snider' match 'schneider' also, -sz- in slavic language
1155 * although in hungarian it is pronounced 's'
1156 */
1157 if (((current == 0)
1158 && StringAt(original, (current + 1), 1,
1159 "M", "N", "L", "W", ""))
1160 || StringAt(original, (current + 1), 1, "Z", ""))
1161 {
1162 MetaphAdd(primary, "S");
1163 MetaphAdd(secondary, "X");
1164 if (StringAt(original, (current + 1), 1, "Z", ""))
1165 current += 2;
1166 else
1167 current += 1;
1168 break;
1169 }
1170
1171 if (StringAt(original, current, 2, "SC", ""))
1172 {
1173 /* Schlesinger's rule */
1174 if (GetAt(original, current + 2) == 'H')
1175 {
1176 /* dutch origin, e.g. 'school', 'schooner' */
1177 if (StringAt(original, (current + 3), 2,
1178 "OO", "ER", "EN",
1179 "UY", "ED", "EM", ""))
1180 {
1181 /* 'schermerhorn', 'schenker' */
1182 if (StringAt(original, (current + 3), 2,
1183 "ER", "EN", ""))
1184 {
1185 MetaphAdd(primary, "X");
1186 MetaphAdd(secondary, "SK");
1187 }
1188 else
1189 {
1190 MetaphAdd(primary, "SK");
1191 MetaphAdd(secondary, "SK");
1192 }
1193 current += 3;
1194 break;
1195 }
1196 else
1197 {
1198 if ((current == 0) && !IsVowel(original, 3)
1199 && (GetAt(original, 3) != 'W'))
1200 {
1201 MetaphAdd(primary, "X");
1202 MetaphAdd(secondary, "S");
1203 }
1204 else
1205 {
1206 MetaphAdd(primary, "X");
1207 MetaphAdd(secondary, "X");
1208 }
1209 current += 3;
1210 break;
1211 }
1212 }
1213
1214 if (StringAt(original, (current + 2), 1,
1215 "I", "E", "Y", ""))
1216 {
1217 MetaphAdd(primary, "S");
1218 MetaphAdd(secondary, "S");
1219 current += 3;
1220 break;
1221 }
1222 /* else */
1223 MetaphAdd(primary, "SK");
1224 MetaphAdd(secondary, "SK");
1225 current += 3;
1226 break;
1227 }
1228
1229 /* french e.g. 'resnais', 'artois' */
1230 if ((current == last)
1231 && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1232 {
1233 MetaphAdd(primary, "");
1234 MetaphAdd(secondary, "S");
1235 }
1236 else
1237 {
1238 MetaphAdd(primary, "S");
1239 MetaphAdd(secondary, "S");
1240 }
1241
1242 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1243 current += 2;
1244 else
1245 current += 1;
1246 break;
1247
1248 case 'T':
1249 if (StringAt(original, current, 4, "TION", ""))
1250 {
1251 MetaphAdd(primary, "X");
1252 MetaphAdd(secondary, "X");
1253 current += 3;
1254 break;
1255 }
1256
1257 if (StringAt(original, current, 3, "TIA", "TCH", ""))
1258 {
1259 MetaphAdd(primary, "X");
1260 MetaphAdd(secondary, "X");
1261 current += 3;
1262 break;
1263 }
1264
1265 if (StringAt(original, current, 2, "TH", "")
1266 || StringAt(original, current, 3, "TTH", ""))
1267 {
1268 /* special case 'thomas', 'thames' or germanic */
1269 if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1270 || StringAt(original, 0, 4, "VAN ", "VON ", "")
1271 || StringAt(original, 0, 3, "SCH", ""))
1272 {
1273 MetaphAdd(primary, "T");
1274 MetaphAdd(secondary, "T");
1275 }
1276 else
1277 {
1278 MetaphAdd(primary, "0");
1279 MetaphAdd(secondary, "T");
1280 }
1281 current += 2;
1282 break;
1283 }
1284
1285 if (StringAt(original, (current + 1), 1, "T", "D", ""))
1286 current += 2;
1287 else
1288 current += 1;
1289 MetaphAdd(primary, "T");
1290 MetaphAdd(secondary, "T");
1291 break;
1292
1293 case 'V':
1294 if (GetAt(original, current + 1) == 'V')
1295 current += 2;
1296 else
1297 current += 1;
1298 MetaphAdd(primary, "F");
1299 MetaphAdd(secondary, "F");
1300 break;
1301
1302 case 'W':
1303 /* can also be in middle of word */
1304 if (StringAt(original, current, 2, "WR", ""))
1305 {
1306 MetaphAdd(primary, "R");
1307 MetaphAdd(secondary, "R");
1308 current += 2;
1309 break;
1310 }
1311
1312 if ((current == 0)
1313 && (IsVowel(original, current + 1)
1314 || StringAt(original, current, 2, "WH", "")))
1315 {
1316 /* Wasserman should match Vasserman */
1317 if (IsVowel(original, current + 1))
1318 {
1319 MetaphAdd(primary, "A");
1320 MetaphAdd(secondary, "F");
1321 }
1322 else
1323 {
1324 /* need Uomo to match Womo */
1325 MetaphAdd(primary, "A");
1326 MetaphAdd(secondary, "A");
1327 }
1328 }
1329
1330 /* Arnow should match Arnoff */
1331 if (((current == last) && IsVowel(original, current - 1))
1332 || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1333 "OWSKI", "OWSKY", "")
1334 || StringAt(original, 0, 3, "SCH", ""))
1335 {
1336 MetaphAdd(primary, "");
1337 MetaphAdd(secondary, "F");
1338 current += 1;
1339 break;
1340 }
1341
1342 /* polish e.g. 'filipowicz' */
1343 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1344 {
1345 MetaphAdd(primary, "TS");
1346 MetaphAdd(secondary, "FX");
1347 current += 4;
1348 break;
1349 }
1350
1351 /* else skip it */
1352 current += 1;
1353 break;
1354
1355 case 'X':
1356 /* french e.g. breaux */
1357 if (!((current == last)
1358 && (StringAt(original, (current - 3), 3,
1359 "IAU", "EAU", "")
1360 || StringAt(original, (current - 2), 2,
1361 "AU", "OU", ""))))
1362 {
1363 MetaphAdd(primary, "KS");
1364 MetaphAdd(secondary, "KS");
1365 }
1366
1367
1368 if (StringAt(original, (current + 1), 1, "C", "X", ""))
1369 current += 2;
1370 else
1371 current += 1;
1372 break;
1373
1374 case 'Z':
1375 /* chinese pinyin e.g. 'zhao' */
1376 if (GetAt(original, current + 1) == 'H')
1377 {
1378 MetaphAdd(primary, "J");
1379 MetaphAdd(secondary, "J");
1380 current += 2;
1381 break;
1382 }
1383 else if (StringAt(original, (current + 1), 2,
1384 "ZO", "ZI", "ZA", "")
1385 || (SlavoGermanic(original)
1386 && ((current > 0)
1387 && GetAt(original, current - 1) != 'T')))
1388 {
1389 MetaphAdd(primary, "S");
1390 MetaphAdd(secondary, "TS");
1391 }
1392 else
1393 {
1394 MetaphAdd(primary, "S");
1395 MetaphAdd(secondary, "S");
1396 }
1397
1398 if (GetAt(original, current + 1) == 'Z')
1399 current += 2;
1400 else
1401 current += 1;
1402 break;
1403
1404 default:
1405 current += 1;
1406 }
1407
1408 /*
1409 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1410 * secondary->str);
1411 */
1412 }
1413
1414
1415 if (primary->length > 4)
1416 SetAt(primary, 4, '\0');
1417
1418 if (secondary->length > 4)
1419 SetAt(secondary, 4, '\0');
1420
1421 *codes = primary->str;
1422 *++codes = secondary->str;
1423
1424 DestroyMetaString(original);
1425 DestroyMetaString(primary);
1426 DestroyMetaString(secondary);
1427 }
1428
1429 #ifdef DMETAPHONE_MAIN
1430
1431 /* just for testing - not part of the perl code */
1432
main(int argc,char ** argv)1433 main(int argc, char **argv)
1434 {
1435 char *codes[2];
1436
1437 if (argc > 1)
1438 {
1439 DoubleMetaphone(argv[1], codes);
1440 printf("%s|%s\n", codes[0], codes[1]);
1441 }
1442 }
1443
1444 #endif
1445