1 /*
2  * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3  *
4  * contrib/fuzzystrmatch/dmetaphone.c
5  *
6  * Double Metaphone computes 2 "sounds like" strings - a primary and an
7  * alternate. In most cases they are the same, but for foreign names
8  * especially they can be a bit different, depending on pronunciation.
9  *
10  * Information on using Double Metaphone can be found at
11  *	 http://www.codeproject.com/string/dmetaphone1.asp
12  * and the original article describing it can be found at
13  *	 http://drdobbs.com/184401251
14  *
15  * For PostgreSQL we provide 2 functions - one for the primary and one for
16  * the alternate. That way the functions are pure text->text mappings that
17  * are useful in functional indexes. These are 'dmetaphone' for the
18  * primary and 'dmetaphone_alt' for the alternate.
19  *
20  * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21  * functions looks like this:
22  *
23  * CREATE FUNCTION dmetaphone (text) RETURNS text
24  *	  LANGUAGE C IMMUTABLE STRICT
25  *	  AS '$libdir/dmetaphone', 'dmetaphone';
26  *
27  * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28  *	  LANGUAGE C IMMUTABLE STRICT
29  *	  AS '$libdir/dmetaphone', 'dmetaphone_alt';
30  *
31  * Note that you have to declare the functions IMMUTABLE if you want to
32  * use them in functional indexes, and you have to declare them as STRICT
33  * as they do not check for NULL input, and will segfault if given NULL input.
34  * (See below for alternative ) Declaring them as STRICT means PostgreSQL
35  * will never call them with NULL, but instead assume the result is NULL,
36  * which is what we (I) want.
37  *
38  * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39  * will detect NULL input and return NULL. The you don't have to declare them
40  * as STRICT.
41  *
42  * There is a small inefficiency here - each function call actually computes
43  * both the primary and the alternate and then throws away the one it doesn't
44  * need. That's the way the perl module was written, because perl can handle
45  * a list return more easily than we can in PostgreSQL. The result has been
46  * fast enough for my needs, but it could maybe be optimized a bit to remove
47  * that behaviour.
48  *
49  */
50 
51 
52 /***************************** COPYRIGHT NOTICES ***********************
53 
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from http://www.cpan.org.
56 It bears this copyright notice:
57 
58 
59   Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60   All rights reserved.
61 
62   This code is based heavily on the C++ implementation by
63   Lawrence Philips and incorporates several bug fixes courtesy
64   of Kevin Atkinson <kevina@users.sourceforge.net>.
65 
66   This module is free software; you may redistribute it and/or
67   modify it under the same terms as Perl itself.
68 
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered this copyright:
71 
72   Copyright 2003, North Carolina State Highway Patrol.
73   All rights reserved.
74 
75   Permission to use, copy, modify, and distribute this software and its
76   documentation for any purpose, without fee, and without a written agreement
77   is hereby granted, provided that the above copyright notice and this
78   paragraph and the following two paragraphs appear in all copies.
79 
80   IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81   PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82   INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83   DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 
86   THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED
89   HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90   HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91   MODIFICATIONS.
92 
93 ***********************************************************************/
94 
95 
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
98 
99 #include "postgres.h"
100 
101 #include "utils/builtins.h"
102 
103 /* turn off assertions for embedded function */
104 #define NDEBUG
105 
106 #else							/* DMETAPHONE_MAIN */
107 
108 /* we need these if we didn't get them from postgres.h */
109 #include <stdio.h>
110 #include <stdlib.h>
111 #include <string.h>
112 #include <stdarg.h>
113 
114 #endif							/* DMETAPHONE_MAIN */
115 
116 #include <assert.h>
117 #include <ctype.h>
118 
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *, char **);
121 
122 #ifndef DMETAPHONE_MAIN
123 
124 /*
125  * The PostgreSQL visible dmetaphone function.
126  */
127 
128 PG_FUNCTION_INFO_V1(dmetaphone);
129 
130 Datum
dmetaphone(PG_FUNCTION_ARGS)131 dmetaphone(PG_FUNCTION_ARGS)
132 {
133 	text	   *arg;
134 	char	   *aptr,
135 			   *codes[2],
136 			   *code;
137 
138 #ifdef DMETAPHONE_NOSTRICT
139 	if (PG_ARGISNULL(0))
140 		PG_RETURN_NULL();
141 #endif
142 	arg = PG_GETARG_TEXT_PP(0);
143 	aptr = text_to_cstring(arg);
144 
145 	DoubleMetaphone(aptr, codes);
146 	code = codes[0];
147 	if (!code)
148 		code = "";
149 
150 	PG_RETURN_TEXT_P(cstring_to_text(code));
151 }
152 
153 /*
154  * The PostgreSQL visible dmetaphone_alt function.
155  */
156 
157 PG_FUNCTION_INFO_V1(dmetaphone_alt);
158 
159 Datum
dmetaphone_alt(PG_FUNCTION_ARGS)160 dmetaphone_alt(PG_FUNCTION_ARGS)
161 {
162 	text	   *arg;
163 	char	   *aptr,
164 			   *codes[2],
165 			   *code;
166 
167 #ifdef DMETAPHONE_NOSTRICT
168 	if (PG_ARGISNULL(0))
169 		PG_RETURN_NULL();
170 #endif
171 	arg = PG_GETARG_TEXT_PP(0);
172 	aptr = text_to_cstring(arg);
173 
174 	DoubleMetaphone(aptr, codes);
175 	code = codes[1];
176 	if (!code)
177 		code = "";
178 
179 	PG_RETURN_TEXT_P(cstring_to_text(code));
180 }
181 
182 
183 /* here is where we start the code imported from the perl module */
184 
185 /* all memory handling is done with these macros */
186 
187 #define META_MALLOC(v,n,t) \
188 		  (v = (t*)palloc(((n)*sizeof(t))))
189 
190 #define META_REALLOC(v,n,t) \
191 					  (v = (t*)repalloc((v),((n)*sizeof(t))))
192 
193 /*
194  * Don't do pfree - it seems to cause a segv sometimes - which might have just
195  * been caused by reloading the module in development.
196  * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197  * in a case like this.
198  */
199 
200 #define META_FREE(x) ((void)true)	/* pfree((x)) */
201 #else							/* not defined DMETAPHONE_MAIN */
202 
203 /* use the standard malloc library when not running in PostgreSQL */
204 
205 #define META_MALLOC(v,n,t) \
206 		  (v = (t*)malloc(((n)*sizeof(t))))
207 
208 #define META_REALLOC(v,n,t) \
209 					  (v = (t*)realloc((v),((n)*sizeof(t))))
210 
211 #define META_FREE(x) free((x))
212 #endif							/* defined DMETAPHONE_MAIN */
213 
214 
215 
216 /* this typedef was originally in the perl module's .h file */
217 
218 typedef struct
219 {
220 	char	   *str;
221 	int			length;
222 	int			bufsize;
223 	int			free_string_on_destroy;
224 }
225 
226 metastring;
227 
228 /*
229  * remaining perl module funcs unchanged except for declaring them static
230  * and reformatting to PostgreSQL indentation and to fit in 80 cols.
231  *
232  */
233 
234 static metastring *
NewMetaString(char * init_str)235 NewMetaString(char *init_str)
236 {
237 	metastring *s;
238 	char		empty_string[] = "";
239 
240 	META_MALLOC(s, 1, metastring);
241 	assert(s != NULL);
242 
243 	if (init_str == NULL)
244 		init_str = empty_string;
245 	s->length = strlen(init_str);
246 	/* preallocate a bit more for potential growth */
247 	s->bufsize = s->length + 7;
248 
249 	META_MALLOC(s->str, s->bufsize, char);
250 	assert(s->str != NULL);
251 
252 	memcpy(s->str, init_str, s->length + 1);
253 	s->free_string_on_destroy = 1;
254 
255 	return s;
256 }
257 
258 
259 static void
DestroyMetaString(metastring * s)260 DestroyMetaString(metastring *s)
261 {
262 	if (s == NULL)
263 		return;
264 
265 	if (s->free_string_on_destroy && (s->str != NULL))
266 		META_FREE(s->str);
267 
268 	META_FREE(s);
269 }
270 
271 
272 static void
IncreaseBuffer(metastring * s,int chars_needed)273 IncreaseBuffer(metastring *s, int chars_needed)
274 {
275 	META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276 	assert(s->str != NULL);
277 	s->bufsize = s->bufsize + chars_needed + 10;
278 }
279 
280 
281 static void
MakeUpper(metastring * s)282 MakeUpper(metastring *s)
283 {
284 	char	   *i;
285 
286 	for (i = s->str; *i; i++)
287 		*i = toupper((unsigned char) *i);
288 }
289 
290 
291 static int
IsVowel(metastring * s,int pos)292 IsVowel(metastring *s, int pos)
293 {
294 	char		c;
295 
296 	if ((pos < 0) || (pos >= s->length))
297 		return 0;
298 
299 	c = *(s->str + pos);
300 	if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301 		(c == 'U') || (c == 'Y'))
302 		return 1;
303 
304 	return 0;
305 }
306 
307 
308 static int
SlavoGermanic(metastring * s)309 SlavoGermanic(metastring *s)
310 {
311 	if ((char *) strstr(s->str, "W"))
312 		return 1;
313 	else if ((char *) strstr(s->str, "K"))
314 		return 1;
315 	else if ((char *) strstr(s->str, "CZ"))
316 		return 1;
317 	else if ((char *) strstr(s->str, "WITZ"))
318 		return 1;
319 	else
320 		return 0;
321 }
322 
323 
324 static char
GetAt(metastring * s,int pos)325 GetAt(metastring *s, int pos)
326 {
327 	if ((pos < 0) || (pos >= s->length))
328 		return '\0';
329 
330 	return ((char) *(s->str + pos));
331 }
332 
333 
334 static void
SetAt(metastring * s,int pos,char c)335 SetAt(metastring *s, int pos, char c)
336 {
337 	if ((pos < 0) || (pos >= s->length))
338 		return;
339 
340 	*(s->str + pos) = c;
341 }
342 
343 
344 /*
345    Caveats: the START value is 0 based
346 */
347 static int
StringAt(metastring * s,int start,int length,...)348 StringAt(metastring *s, int start, int length,...)
349 {
350 	char	   *test;
351 	char	   *pos;
352 	va_list		ap;
353 
354 	if ((start < 0) || (start >= s->length))
355 		return 0;
356 
357 	pos = (s->str + start);
358 	va_start(ap, length);
359 
360 	do
361 	{
362 		test = va_arg(ap, char *);
363 		if (*test && (strncmp(pos, test, length) == 0))
364 		{
365 			va_end(ap);
366 			return 1;
367 		}
368 	}
369 	while (strcmp(test, "") != 0);
370 
371 	va_end(ap);
372 
373 	return 0;
374 }
375 
376 
377 static void
MetaphAdd(metastring * s,char * new_str)378 MetaphAdd(metastring *s, char *new_str)
379 {
380 	int			add_length;
381 
382 	if (new_str == NULL)
383 		return;
384 
385 	add_length = strlen(new_str);
386 	if ((s->length + add_length) > (s->bufsize - 1))
387 		IncreaseBuffer(s, add_length);
388 
389 	strcat(s->str, new_str);
390 	s->length += add_length;
391 }
392 
393 
394 static void
DoubleMetaphone(char * str,char ** codes)395 DoubleMetaphone(char *str, char **codes)
396 {
397 	int			length;
398 	metastring *original;
399 	metastring *primary;
400 	metastring *secondary;
401 	int			current;
402 	int			last;
403 
404 	current = 0;
405 	/* we need the real length and last prior to padding */
406 	length = strlen(str);
407 	last = length - 1;
408 	original = NewMetaString(str);
409 	/* Pad original so we can index beyond end */
410 	MetaphAdd(original, "     ");
411 
412 	primary = NewMetaString("");
413 	secondary = NewMetaString("");
414 	primary->free_string_on_destroy = 0;
415 	secondary->free_string_on_destroy = 0;
416 
417 	MakeUpper(original);
418 
419 	/* skip these when at start of word */
420 	if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421 		current += 1;
422 
423 	/* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424 	if (GetAt(original, 0) == 'X')
425 	{
426 		MetaphAdd(primary, "S");	/* 'Z' maps to 'S' */
427 		MetaphAdd(secondary, "S");
428 		current += 1;
429 	}
430 
431 	/* main loop */
432 	while ((primary->length < 4) || (secondary->length < 4))
433 	{
434 		if (current >= length)
435 			break;
436 
437 		switch (GetAt(original, current))
438 		{
439 			case 'A':
440 			case 'E':
441 			case 'I':
442 			case 'O':
443 			case 'U':
444 			case 'Y':
445 				if (current == 0)
446 				{
447 					/* all init vowels now map to 'A' */
448 					MetaphAdd(primary, "A");
449 					MetaphAdd(secondary, "A");
450 				}
451 				current += 1;
452 				break;
453 
454 			case 'B':
455 
456 				/* "-mb", e.g", "dumb", already skipped over... */
457 				MetaphAdd(primary, "P");
458 				MetaphAdd(secondary, "P");
459 
460 				if (GetAt(original, current + 1) == 'B')
461 					current += 2;
462 				else
463 					current += 1;
464 				break;
465 
466 			case '\xc7':		/* C with cedilla */
467 				MetaphAdd(primary, "S");
468 				MetaphAdd(secondary, "S");
469 				current += 1;
470 				break;
471 
472 			case 'C':
473 				/* various germanic */
474 				if ((current > 1)
475 					&& !IsVowel(original, current - 2)
476 					&& StringAt(original, (current - 1), 3, "ACH", "")
477 					&& ((GetAt(original, current + 2) != 'I')
478 						&& ((GetAt(original, current + 2) != 'E')
479 							|| StringAt(original, (current - 2), 6, "BACHER",
480 										"MACHER", ""))))
481 				{
482 					MetaphAdd(primary, "K");
483 					MetaphAdd(secondary, "K");
484 					current += 2;
485 					break;
486 				}
487 
488 				/* special case 'caesar' */
489 				if ((current == 0)
490 					&& StringAt(original, current, 6, "CAESAR", ""))
491 				{
492 					MetaphAdd(primary, "S");
493 					MetaphAdd(secondary, "S");
494 					current += 2;
495 					break;
496 				}
497 
498 				/* italian 'chianti' */
499 				if (StringAt(original, current, 4, "CHIA", ""))
500 				{
501 					MetaphAdd(primary, "K");
502 					MetaphAdd(secondary, "K");
503 					current += 2;
504 					break;
505 				}
506 
507 				if (StringAt(original, current, 2, "CH", ""))
508 				{
509 					/* find 'michael' */
510 					if ((current > 0)
511 						&& StringAt(original, current, 4, "CHAE", ""))
512 					{
513 						MetaphAdd(primary, "K");
514 						MetaphAdd(secondary, "X");
515 						current += 2;
516 						break;
517 					}
518 
519 					/* greek roots e.g. 'chemistry', 'chorus' */
520 					if ((current == 0)
521 						&& (StringAt(original, (current + 1), 5,
522 									 "HARAC", "HARIS", "")
523 							|| StringAt(original, (current + 1), 3, "HOR",
524 										"HYM", "HIA", "HEM", ""))
525 						&& !StringAt(original, 0, 5, "CHORE", ""))
526 					{
527 						MetaphAdd(primary, "K");
528 						MetaphAdd(secondary, "K");
529 						current += 2;
530 						break;
531 					}
532 
533 					/* germanic, greek, or otherwise 'ch' for 'kh' sound */
534 					if (
535 						(StringAt(original, 0, 4, "VAN ", "VON ", "")
536 						 || StringAt(original, 0, 3, "SCH", ""))
537 					/* 'architect but not 'arch', 'orchestra', 'orchid' */
538 						|| StringAt(original, (current - 2), 6, "ORCHES",
539 									"ARCHIT", "ORCHID", "")
540 						|| StringAt(original, (current + 2), 1, "T", "S",
541 									"")
542 						|| ((StringAt(original, (current - 1), 1,
543 									  "A", "O", "U", "E", "")
544 							 || (current == 0))
545 
546 					/*
547 					 * e.g., 'wachtler', 'wechsler', but not 'tichner'
548 					 */
549 							&& StringAt(original, (current + 2), 1, "L", "R",
550 										"N", "M", "B", "H", "F", "V", "W",
551 										" ", "")))
552 					{
553 						MetaphAdd(primary, "K");
554 						MetaphAdd(secondary, "K");
555 					}
556 					else
557 					{
558 						if (current > 0)
559 						{
560 							if (StringAt(original, 0, 2, "MC", ""))
561 							{
562 								/* e.g., "McHugh" */
563 								MetaphAdd(primary, "K");
564 								MetaphAdd(secondary, "K");
565 							}
566 							else
567 							{
568 								MetaphAdd(primary, "X");
569 								MetaphAdd(secondary, "K");
570 							}
571 						}
572 						else
573 						{
574 							MetaphAdd(primary, "X");
575 							MetaphAdd(secondary, "X");
576 						}
577 					}
578 					current += 2;
579 					break;
580 				}
581 				/* e.g, 'czerny' */
582 				if (StringAt(original, current, 2, "CZ", "")
583 					&& !StringAt(original, (current - 2), 4, "WICZ", ""))
584 				{
585 					MetaphAdd(primary, "S");
586 					MetaphAdd(secondary, "X");
587 					current += 2;
588 					break;
589 				}
590 
591 				/* e.g., 'focaccia' */
592 				if (StringAt(original, (current + 1), 3, "CIA", ""))
593 				{
594 					MetaphAdd(primary, "X");
595 					MetaphAdd(secondary, "X");
596 					current += 3;
597 					break;
598 				}
599 
600 				/* double 'C', but not if e.g. 'McClellan' */
601 				if (StringAt(original, current, 2, "CC", "")
602 					&& !((current == 1) && (GetAt(original, 0) == 'M')))
603 				{
604 					/* 'bellocchio' but not 'bacchus' */
605 					if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
606 						&& !StringAt(original, (current + 2), 2, "HU", ""))
607 					{
608 						/* 'accident', 'accede' 'succeed' */
609 						if (
610 							((current == 1)
611 							 && (GetAt(original, current - 1) == 'A'))
612 							|| StringAt(original, (current - 1), 5, "UCCEE",
613 										"UCCES", ""))
614 						{
615 							MetaphAdd(primary, "KS");
616 							MetaphAdd(secondary, "KS");
617 							/* 'bacci', 'bertucci', other italian */
618 						}
619 						else
620 						{
621 							MetaphAdd(primary, "X");
622 							MetaphAdd(secondary, "X");
623 						}
624 						current += 3;
625 						break;
626 					}
627 					else
628 					{			/* Pierce's rule */
629 						MetaphAdd(primary, "K");
630 						MetaphAdd(secondary, "K");
631 						current += 2;
632 						break;
633 					}
634 				}
635 
636 				if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
637 				{
638 					MetaphAdd(primary, "K");
639 					MetaphAdd(secondary, "K");
640 					current += 2;
641 					break;
642 				}
643 
644 				if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
645 				{
646 					/* italian vs. english */
647 					if (StringAt
648 						(original, current, 3, "CIO", "CIE", "CIA", ""))
649 					{
650 						MetaphAdd(primary, "S");
651 						MetaphAdd(secondary, "X");
652 					}
653 					else
654 					{
655 						MetaphAdd(primary, "S");
656 						MetaphAdd(secondary, "S");
657 					}
658 					current += 2;
659 					break;
660 				}
661 
662 				/* else */
663 				MetaphAdd(primary, "K");
664 				MetaphAdd(secondary, "K");
665 
666 				/* name sent in 'mac caffrey', 'mac gregor */
667 				if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
668 					current += 3;
669 				else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
670 						 && !StringAt(original, (current + 1), 2,
671 									  "CE", "CI", ""))
672 					current += 2;
673 				else
674 					current += 1;
675 				break;
676 
677 			case 'D':
678 				if (StringAt(original, current, 2, "DG", ""))
679 				{
680 					if (StringAt(original, (current + 2), 1,
681 								 "I", "E", "Y", ""))
682 					{
683 						/* e.g. 'edge' */
684 						MetaphAdd(primary, "J");
685 						MetaphAdd(secondary, "J");
686 						current += 3;
687 						break;
688 					}
689 					else
690 					{
691 						/* e.g. 'edgar' */
692 						MetaphAdd(primary, "TK");
693 						MetaphAdd(secondary, "TK");
694 						current += 2;
695 						break;
696 					}
697 				}
698 
699 				if (StringAt(original, current, 2, "DT", "DD", ""))
700 				{
701 					MetaphAdd(primary, "T");
702 					MetaphAdd(secondary, "T");
703 					current += 2;
704 					break;
705 				}
706 
707 				/* else */
708 				MetaphAdd(primary, "T");
709 				MetaphAdd(secondary, "T");
710 				current += 1;
711 				break;
712 
713 			case 'F':
714 				if (GetAt(original, current + 1) == 'F')
715 					current += 2;
716 				else
717 					current += 1;
718 				MetaphAdd(primary, "F");
719 				MetaphAdd(secondary, "F");
720 				break;
721 
722 			case 'G':
723 				if (GetAt(original, current + 1) == 'H')
724 				{
725 					if ((current > 0) && !IsVowel(original, current - 1))
726 					{
727 						MetaphAdd(primary, "K");
728 						MetaphAdd(secondary, "K");
729 						current += 2;
730 						break;
731 					}
732 
733 					if (current < 3)
734 					{
735 						/* 'ghislane', ghiradelli */
736 						if (current == 0)
737 						{
738 							if (GetAt(original, current + 2) == 'I')
739 							{
740 								MetaphAdd(primary, "J");
741 								MetaphAdd(secondary, "J");
742 							}
743 							else
744 							{
745 								MetaphAdd(primary, "K");
746 								MetaphAdd(secondary, "K");
747 							}
748 							current += 2;
749 							break;
750 						}
751 					}
752 
753 					/*
754 					 * Parker's rule (with some further refinements) - e.g.,
755 					 * 'hugh'
756 					 */
757 					if (
758 						((current > 1)
759 						 && StringAt(original, (current - 2), 1,
760 									 "B", "H", "D", ""))
761 					/* e.g., 'bough' */
762 						|| ((current > 2)
763 							&& StringAt(original, (current - 3), 1,
764 										"B", "H", "D", ""))
765 					/* e.g., 'broughton' */
766 						|| ((current > 3)
767 							&& StringAt(original, (current - 4), 1,
768 										"B", "H", "")))
769 					{
770 						current += 2;
771 						break;
772 					}
773 					else
774 					{
775 						/*
776 						 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
777 						 * 'rough', 'tough'
778 						 */
779 						if ((current > 2)
780 							&& (GetAt(original, current - 1) == 'U')
781 							&& StringAt(original, (current - 3), 1, "C",
782 										"G", "L", "R", "T", ""))
783 						{
784 							MetaphAdd(primary, "F");
785 							MetaphAdd(secondary, "F");
786 						}
787 						else if ((current > 0)
788 								 && GetAt(original, current - 1) != 'I')
789 						{
790 
791 
792 							MetaphAdd(primary, "K");
793 							MetaphAdd(secondary, "K");
794 						}
795 
796 						current += 2;
797 						break;
798 					}
799 				}
800 
801 				if (GetAt(original, current + 1) == 'N')
802 				{
803 					if ((current == 1) && IsVowel(original, 0)
804 						&& !SlavoGermanic(original))
805 					{
806 						MetaphAdd(primary, "KN");
807 						MetaphAdd(secondary, "N");
808 					}
809 					else
810 						/* not e.g. 'cagney' */
811 						if (!StringAt(original, (current + 2), 2, "EY", "")
812 							&& (GetAt(original, current + 1) != 'Y')
813 							&& !SlavoGermanic(original))
814 					{
815 						MetaphAdd(primary, "N");
816 						MetaphAdd(secondary, "KN");
817 					}
818 					else
819 					{
820 						MetaphAdd(primary, "KN");
821 						MetaphAdd(secondary, "KN");
822 					}
823 					current += 2;
824 					break;
825 				}
826 
827 				/* 'tagliaro' */
828 				if (StringAt(original, (current + 1), 2, "LI", "")
829 					&& !SlavoGermanic(original))
830 				{
831 					MetaphAdd(primary, "KL");
832 					MetaphAdd(secondary, "L");
833 					current += 2;
834 					break;
835 				}
836 
837 				/* -ges-,-gep-,-gel-, -gie- at beginning */
838 				if ((current == 0)
839 					&& ((GetAt(original, current + 1) == 'Y')
840 						|| StringAt(original, (current + 1), 2, "ES", "EP",
841 									"EB", "EL", "EY", "IB", "IL", "IN", "IE",
842 									"EI", "ER", "")))
843 				{
844 					MetaphAdd(primary, "K");
845 					MetaphAdd(secondary, "J");
846 					current += 2;
847 					break;
848 				}
849 
850 				/* -ger-,  -gy- */
851 				if (
852 					(StringAt(original, (current + 1), 2, "ER", "")
853 					 || (GetAt(original, current + 1) == 'Y'))
854 					&& !StringAt(original, 0, 6,
855 								 "DANGER", "RANGER", "MANGER", "")
856 					&& !StringAt(original, (current - 1), 1, "E", "I", "")
857 					&& !StringAt(original, (current - 1), 3, "RGY", "OGY",
858 								 ""))
859 				{
860 					MetaphAdd(primary, "K");
861 					MetaphAdd(secondary, "J");
862 					current += 2;
863 					break;
864 				}
865 
866 				/* italian e.g, 'biaggi' */
867 				if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
868 					|| StringAt(original, (current - 1), 4,
869 								"AGGI", "OGGI", ""))
870 				{
871 					/* obvious germanic */
872 					if (
873 						(StringAt(original, 0, 4, "VAN ", "VON ", "")
874 						 || StringAt(original, 0, 3, "SCH", ""))
875 						|| StringAt(original, (current + 1), 2, "ET", ""))
876 					{
877 						MetaphAdd(primary, "K");
878 						MetaphAdd(secondary, "K");
879 					}
880 					else
881 					{
882 						/* always soft if french ending */
883 						if (StringAt
884 							(original, (current + 1), 4, "IER ", ""))
885 						{
886 							MetaphAdd(primary, "J");
887 							MetaphAdd(secondary, "J");
888 						}
889 						else
890 						{
891 							MetaphAdd(primary, "J");
892 							MetaphAdd(secondary, "K");
893 						}
894 					}
895 					current += 2;
896 					break;
897 				}
898 
899 				if (GetAt(original, current + 1) == 'G')
900 					current += 2;
901 				else
902 					current += 1;
903 				MetaphAdd(primary, "K");
904 				MetaphAdd(secondary, "K");
905 				break;
906 
907 			case 'H':
908 				/* only keep if first & before vowel or btw. 2 vowels */
909 				if (((current == 0) || IsVowel(original, current - 1))
910 					&& IsVowel(original, current + 1))
911 				{
912 					MetaphAdd(primary, "H");
913 					MetaphAdd(secondary, "H");
914 					current += 2;
915 				}
916 				else
917 					/* also takes care of 'HH' */
918 					current += 1;
919 				break;
920 
921 			case 'J':
922 				/* obvious spanish, 'jose', 'san jacinto' */
923 				if (StringAt(original, current, 4, "JOSE", "")
924 					|| StringAt(original, 0, 4, "SAN ", ""))
925 				{
926 					if (((current == 0)
927 						 && (GetAt(original, current + 4) == ' '))
928 						|| StringAt(original, 0, 4, "SAN ", ""))
929 					{
930 						MetaphAdd(primary, "H");
931 						MetaphAdd(secondary, "H");
932 					}
933 					else
934 					{
935 						MetaphAdd(primary, "J");
936 						MetaphAdd(secondary, "H");
937 					}
938 					current += 1;
939 					break;
940 				}
941 
942 				if ((current == 0)
943 					&& !StringAt(original, current, 4, "JOSE", ""))
944 				{
945 					MetaphAdd(primary, "J");	/* Yankelovich/Jankelowicz */
946 					MetaphAdd(secondary, "A");
947 				}
948 				else
949 				{
950 					/* spanish pron. of e.g. 'bajador' */
951 					if (IsVowel(original, current - 1)
952 						&& !SlavoGermanic(original)
953 						&& ((GetAt(original, current + 1) == 'A')
954 							|| (GetAt(original, current + 1) == 'O')))
955 					{
956 						MetaphAdd(primary, "J");
957 						MetaphAdd(secondary, "H");
958 					}
959 					else
960 					{
961 						if (current == last)
962 						{
963 							MetaphAdd(primary, "J");
964 							MetaphAdd(secondary, "");
965 						}
966 						else
967 						{
968 							if (!StringAt(original, (current + 1), 1, "L", "T",
969 										  "K", "S", "N", "M", "B", "Z", "")
970 								&& !StringAt(original, (current - 1), 1,
971 											 "S", "K", "L", ""))
972 							{
973 								MetaphAdd(primary, "J");
974 								MetaphAdd(secondary, "J");
975 							}
976 						}
977 					}
978 				}
979 
980 				if (GetAt(original, current + 1) == 'J')	/* it could happen! */
981 					current += 2;
982 				else
983 					current += 1;
984 				break;
985 
986 			case 'K':
987 				if (GetAt(original, current + 1) == 'K')
988 					current += 2;
989 				else
990 					current += 1;
991 				MetaphAdd(primary, "K");
992 				MetaphAdd(secondary, "K");
993 				break;
994 
995 			case 'L':
996 				if (GetAt(original, current + 1) == 'L')
997 				{
998 					/* spanish e.g. 'cabrillo', 'gallegos' */
999 					if (((current == (length - 3))
1000 						 && StringAt(original, (current - 1), 4, "ILLO",
1001 									 "ILLA", "ALLE", ""))
1002 						|| ((StringAt(original, (last - 1), 2, "AS", "OS", "")
1003 							 || StringAt(original, last, 1, "A", "O", ""))
1004 							&& StringAt(original, (current - 1), 4,
1005 										"ALLE", "")))
1006 					{
1007 						MetaphAdd(primary, "L");
1008 						MetaphAdd(secondary, "");
1009 						current += 2;
1010 						break;
1011 					}
1012 					current += 2;
1013 				}
1014 				else
1015 					current += 1;
1016 				MetaphAdd(primary, "L");
1017 				MetaphAdd(secondary, "L");
1018 				break;
1019 
1020 			case 'M':
1021 				if ((StringAt(original, (current - 1), 3, "UMB", "")
1022 					 && (((current + 1) == last)
1023 						 || StringAt(original, (current + 2), 2, "ER", "")))
1024 				/* 'dumb','thumb' */
1025 					|| (GetAt(original, current + 1) == 'M'))
1026 					current += 2;
1027 				else
1028 					current += 1;
1029 				MetaphAdd(primary, "M");
1030 				MetaphAdd(secondary, "M");
1031 				break;
1032 
1033 			case 'N':
1034 				if (GetAt(original, current + 1) == 'N')
1035 					current += 2;
1036 				else
1037 					current += 1;
1038 				MetaphAdd(primary, "N");
1039 				MetaphAdd(secondary, "N");
1040 				break;
1041 
1042 			case '\xd1':		/* N with tilde */
1043 				current += 1;
1044 				MetaphAdd(primary, "N");
1045 				MetaphAdd(secondary, "N");
1046 				break;
1047 
1048 			case 'P':
1049 				if (GetAt(original, current + 1) == 'H')
1050 				{
1051 					MetaphAdd(primary, "F");
1052 					MetaphAdd(secondary, "F");
1053 					current += 2;
1054 					break;
1055 				}
1056 
1057 				/* also account for "campbell", "raspberry" */
1058 				if (StringAt(original, (current + 1), 1, "P", "B", ""))
1059 					current += 2;
1060 				else
1061 					current += 1;
1062 				MetaphAdd(primary, "P");
1063 				MetaphAdd(secondary, "P");
1064 				break;
1065 
1066 			case 'Q':
1067 				if (GetAt(original, current + 1) == 'Q')
1068 					current += 2;
1069 				else
1070 					current += 1;
1071 				MetaphAdd(primary, "K");
1072 				MetaphAdd(secondary, "K");
1073 				break;
1074 
1075 			case 'R':
1076 				/* french e.g. 'rogier', but exclude 'hochmeier' */
1077 				if ((current == last)
1078 					&& !SlavoGermanic(original)
1079 					&& StringAt(original, (current - 2), 2, "IE", "")
1080 					&& !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1081 				{
1082 					MetaphAdd(primary, "");
1083 					MetaphAdd(secondary, "R");
1084 				}
1085 				else
1086 				{
1087 					MetaphAdd(primary, "R");
1088 					MetaphAdd(secondary, "R");
1089 				}
1090 
1091 				if (GetAt(original, current + 1) == 'R')
1092 					current += 2;
1093 				else
1094 					current += 1;
1095 				break;
1096 
1097 			case 'S':
1098 				/* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1099 				if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1100 				{
1101 					current += 1;
1102 					break;
1103 				}
1104 
1105 				/* special case 'sugar-' */
1106 				if ((current == 0)
1107 					&& StringAt(original, current, 5, "SUGAR", ""))
1108 				{
1109 					MetaphAdd(primary, "X");
1110 					MetaphAdd(secondary, "S");
1111 					current += 1;
1112 					break;
1113 				}
1114 
1115 				if (StringAt(original, current, 2, "SH", ""))
1116 				{
1117 					/* germanic */
1118 					if (StringAt
1119 						(original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1120 						 "HOLZ", ""))
1121 					{
1122 						MetaphAdd(primary, "S");
1123 						MetaphAdd(secondary, "S");
1124 					}
1125 					else
1126 					{
1127 						MetaphAdd(primary, "X");
1128 						MetaphAdd(secondary, "X");
1129 					}
1130 					current += 2;
1131 					break;
1132 				}
1133 
1134 				/* italian & armenian */
1135 				if (StringAt(original, current, 3, "SIO", "SIA", "")
1136 					|| StringAt(original, current, 4, "SIAN", ""))
1137 				{
1138 					if (!SlavoGermanic(original))
1139 					{
1140 						MetaphAdd(primary, "S");
1141 						MetaphAdd(secondary, "X");
1142 					}
1143 					else
1144 					{
1145 						MetaphAdd(primary, "S");
1146 						MetaphAdd(secondary, "S");
1147 					}
1148 					current += 3;
1149 					break;
1150 				}
1151 
1152 				/*
1153 				 * german & anglicisations, e.g. 'smith' match 'schmidt',
1154 				 * 'snider' match 'schneider' also, -sz- in slavic language
1155 				 * although in hungarian it is pronounced 's'
1156 				 */
1157 				if (((current == 0)
1158 					 && StringAt(original, (current + 1), 1,
1159 								 "M", "N", "L", "W", ""))
1160 					|| StringAt(original, (current + 1), 1, "Z", ""))
1161 				{
1162 					MetaphAdd(primary, "S");
1163 					MetaphAdd(secondary, "X");
1164 					if (StringAt(original, (current + 1), 1, "Z", ""))
1165 						current += 2;
1166 					else
1167 						current += 1;
1168 					break;
1169 				}
1170 
1171 				if (StringAt(original, current, 2, "SC", ""))
1172 				{
1173 					/* Schlesinger's rule */
1174 					if (GetAt(original, current + 2) == 'H')
1175 					{
1176 						/* dutch origin, e.g. 'school', 'schooner' */
1177 						if (StringAt(original, (current + 3), 2,
1178 									 "OO", "ER", "EN",
1179 									 "UY", "ED", "EM", ""))
1180 						{
1181 							/* 'schermerhorn', 'schenker' */
1182 							if (StringAt(original, (current + 3), 2,
1183 										 "ER", "EN", ""))
1184 							{
1185 								MetaphAdd(primary, "X");
1186 								MetaphAdd(secondary, "SK");
1187 							}
1188 							else
1189 							{
1190 								MetaphAdd(primary, "SK");
1191 								MetaphAdd(secondary, "SK");
1192 							}
1193 							current += 3;
1194 							break;
1195 						}
1196 						else
1197 						{
1198 							if ((current == 0) && !IsVowel(original, 3)
1199 								&& (GetAt(original, 3) != 'W'))
1200 							{
1201 								MetaphAdd(primary, "X");
1202 								MetaphAdd(secondary, "S");
1203 							}
1204 							else
1205 							{
1206 								MetaphAdd(primary, "X");
1207 								MetaphAdd(secondary, "X");
1208 							}
1209 							current += 3;
1210 							break;
1211 						}
1212 					}
1213 
1214 					if (StringAt(original, (current + 2), 1,
1215 								 "I", "E", "Y", ""))
1216 					{
1217 						MetaphAdd(primary, "S");
1218 						MetaphAdd(secondary, "S");
1219 						current += 3;
1220 						break;
1221 					}
1222 					/* else */
1223 					MetaphAdd(primary, "SK");
1224 					MetaphAdd(secondary, "SK");
1225 					current += 3;
1226 					break;
1227 				}
1228 
1229 				/* french e.g. 'resnais', 'artois' */
1230 				if ((current == last)
1231 					&& StringAt(original, (current - 2), 2, "AI", "OI", ""))
1232 				{
1233 					MetaphAdd(primary, "");
1234 					MetaphAdd(secondary, "S");
1235 				}
1236 				else
1237 				{
1238 					MetaphAdd(primary, "S");
1239 					MetaphAdd(secondary, "S");
1240 				}
1241 
1242 				if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1243 					current += 2;
1244 				else
1245 					current += 1;
1246 				break;
1247 
1248 			case 'T':
1249 				if (StringAt(original, current, 4, "TION", ""))
1250 				{
1251 					MetaphAdd(primary, "X");
1252 					MetaphAdd(secondary, "X");
1253 					current += 3;
1254 					break;
1255 				}
1256 
1257 				if (StringAt(original, current, 3, "TIA", "TCH", ""))
1258 				{
1259 					MetaphAdd(primary, "X");
1260 					MetaphAdd(secondary, "X");
1261 					current += 3;
1262 					break;
1263 				}
1264 
1265 				if (StringAt(original, current, 2, "TH", "")
1266 					|| StringAt(original, current, 3, "TTH", ""))
1267 				{
1268 					/* special case 'thomas', 'thames' or germanic */
1269 					if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1270 						|| StringAt(original, 0, 4, "VAN ", "VON ", "")
1271 						|| StringAt(original, 0, 3, "SCH", ""))
1272 					{
1273 						MetaphAdd(primary, "T");
1274 						MetaphAdd(secondary, "T");
1275 					}
1276 					else
1277 					{
1278 						MetaphAdd(primary, "0");
1279 						MetaphAdd(secondary, "T");
1280 					}
1281 					current += 2;
1282 					break;
1283 				}
1284 
1285 				if (StringAt(original, (current + 1), 1, "T", "D", ""))
1286 					current += 2;
1287 				else
1288 					current += 1;
1289 				MetaphAdd(primary, "T");
1290 				MetaphAdd(secondary, "T");
1291 				break;
1292 
1293 			case 'V':
1294 				if (GetAt(original, current + 1) == 'V')
1295 					current += 2;
1296 				else
1297 					current += 1;
1298 				MetaphAdd(primary, "F");
1299 				MetaphAdd(secondary, "F");
1300 				break;
1301 
1302 			case 'W':
1303 				/* can also be in middle of word */
1304 				if (StringAt(original, current, 2, "WR", ""))
1305 				{
1306 					MetaphAdd(primary, "R");
1307 					MetaphAdd(secondary, "R");
1308 					current += 2;
1309 					break;
1310 				}
1311 
1312 				if ((current == 0)
1313 					&& (IsVowel(original, current + 1)
1314 						|| StringAt(original, current, 2, "WH", "")))
1315 				{
1316 					/* Wasserman should match Vasserman */
1317 					if (IsVowel(original, current + 1))
1318 					{
1319 						MetaphAdd(primary, "A");
1320 						MetaphAdd(secondary, "F");
1321 					}
1322 					else
1323 					{
1324 						/* need Uomo to match Womo */
1325 						MetaphAdd(primary, "A");
1326 						MetaphAdd(secondary, "A");
1327 					}
1328 				}
1329 
1330 				/* Arnow should match Arnoff */
1331 				if (((current == last) && IsVowel(original, current - 1))
1332 					|| StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1333 								"OWSKI", "OWSKY", "")
1334 					|| StringAt(original, 0, 3, "SCH", ""))
1335 				{
1336 					MetaphAdd(primary, "");
1337 					MetaphAdd(secondary, "F");
1338 					current += 1;
1339 					break;
1340 				}
1341 
1342 				/* polish e.g. 'filipowicz' */
1343 				if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1344 				{
1345 					MetaphAdd(primary, "TS");
1346 					MetaphAdd(secondary, "FX");
1347 					current += 4;
1348 					break;
1349 				}
1350 
1351 				/* else skip it */
1352 				current += 1;
1353 				break;
1354 
1355 			case 'X':
1356 				/* french e.g. breaux */
1357 				if (!((current == last)
1358 					  && (StringAt(original, (current - 3), 3,
1359 								   "IAU", "EAU", "")
1360 						  || StringAt(original, (current - 2), 2,
1361 									  "AU", "OU", ""))))
1362 				{
1363 					MetaphAdd(primary, "KS");
1364 					MetaphAdd(secondary, "KS");
1365 				}
1366 
1367 
1368 				if (StringAt(original, (current + 1), 1, "C", "X", ""))
1369 					current += 2;
1370 				else
1371 					current += 1;
1372 				break;
1373 
1374 			case 'Z':
1375 				/* chinese pinyin e.g. 'zhao' */
1376 				if (GetAt(original, current + 1) == 'H')
1377 				{
1378 					MetaphAdd(primary, "J");
1379 					MetaphAdd(secondary, "J");
1380 					current += 2;
1381 					break;
1382 				}
1383 				else if (StringAt(original, (current + 1), 2,
1384 								  "ZO", "ZI", "ZA", "")
1385 						 || (SlavoGermanic(original)
1386 							 && ((current > 0)
1387 								 && GetAt(original, current - 1) != 'T')))
1388 				{
1389 					MetaphAdd(primary, "S");
1390 					MetaphAdd(secondary, "TS");
1391 				}
1392 				else
1393 				{
1394 					MetaphAdd(primary, "S");
1395 					MetaphAdd(secondary, "S");
1396 				}
1397 
1398 				if (GetAt(original, current + 1) == 'Z')
1399 					current += 2;
1400 				else
1401 					current += 1;
1402 				break;
1403 
1404 			default:
1405 				current += 1;
1406 		}
1407 
1408 		/*
1409 		 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1410 		 * secondary->str);
1411 		 */
1412 	}
1413 
1414 
1415 	if (primary->length > 4)
1416 		SetAt(primary, 4, '\0');
1417 
1418 	if (secondary->length > 4)
1419 		SetAt(secondary, 4, '\0');
1420 
1421 	*codes = primary->str;
1422 	*++codes = secondary->str;
1423 
1424 	DestroyMetaString(original);
1425 	DestroyMetaString(primary);
1426 	DestroyMetaString(secondary);
1427 }
1428 
1429 #ifdef DMETAPHONE_MAIN
1430 
1431 /* just for testing - not part of the perl code */
1432 
main(int argc,char ** argv)1433 main(int argc, char **argv)
1434 {
1435 	char	   *codes[2];
1436 
1437 	if (argc > 1)
1438 	{
1439 		DoubleMetaphone(argv[1], codes);
1440 		printf("%s|%s\n", codes[0], codes[1]);
1441 	}
1442 }
1443 
1444 #endif
1445