1 /* COPYRIGHT NOTICE
2 *
3 * This code was pulled directly from the Text-DoubleMetaphone perl package,
4 * version 0.07
5 *
6 * The README mentions that the copyright is:
7 *
8 * Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
9 * All rights reserved.
10
11 * This code is based heavily on the C++ implementation by
12 * Lawrence Philips and incorporates several bug fixes courtesy
13 * of Kevin Atkinson <kevina@users.sourceforge.net>.
14 *
15 * This module is free software; you may redistribute it and/or
16 * modify it under the same terms as Perl itself.
17 */
18
19 #include <stdio.h>
20 #include <ctype.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdarg.h>
24 #include <assert.h>
25 #include "double_metaphone.h"
26
27 #include "rmalloc.h"
28
/*
 * If META_USE_PERL_MALLOC is defined we use Perl's memory routines;
 * otherwise the rm_malloc / rm_realloc / rm_free routines are used.
 */
#ifdef META_USE_PERL_MALLOC

#include "EXTERN.h"
#include "perl.h"
/*
 * Allocation wrappers used throughout this file.
 *   v - pointer lvalue that receives the (re)allocated block
 *   n - number of elements
 *   t - element type
 */
#define META_MALLOC(v, n, t) New(1, v, n, t)
#define META_REALLOC(v, n, t) Renew(v, n, t)
#define META_FREE(x) Safefree((x))

#else

/*
 * Same wrappers on top of the rm_* allocator.
 *   v - pointer lvalue that receives the (re)allocated block
 *   n - number of elements
 *   t - element type
 * Every macro argument is parenthesised so that any lvalue expression can be
 * passed as v without precedence surprises.
 */
#define META_MALLOC(v, n, t) ((v) = (t *)rm_malloc((n) * sizeof(t)))
#define META_REALLOC(v, n, t) ((v) = (t *)rm_realloc((v), (n) * sizeof(t)))
#define META_FREE(x) rm_free((x))

#endif /* META_USE_PERL_MALLOC */
47
NewMetaString(const char * init_str)48 static metastring *NewMetaString(const char *init_str) {
49 metastring *s;
50 char empty_string[] = "";
51
52 META_MALLOC(s, 1, metastring);
53 assert(s != NULL);
54
55 if (init_str == NULL) init_str = empty_string;
56 s->length = strlen(init_str);
57 /* preallocate a bit more for potential growth */
58 s->bufsize = s->length + 7;
59
60 META_MALLOC(s->str, s->bufsize, char);
61 assert(s->str != NULL);
62
63 strncpy(s->str, init_str, s->length + 1);
64 s->free_string_on_destroy = 1;
65
66 return s;
67 }
68
/*
 * Release a metastring created by NewMetaString().
 *
 * The character buffer is freed only when the object still owns it
 * (free_string_on_destroy is non-zero) and it is non-NULL; the metastring
 * struct itself is always freed. Passing NULL is a no-op.
 */
static void DestroyMetaString(metastring *s) {
  if (s != NULL) {
    int owns_buffer = s->free_string_on_destroy && (s->str != NULL);

    if (owns_buffer) {
      META_FREE(s->str);
    }
    META_FREE(s);
  }
}
76
/*
 * Grow the buffer of s by chars_needed bytes plus 10 bytes of slack.
 *
 * The string contents and length are unchanged; only str (possibly moved by
 * the reallocation) and bufsize are updated. Allocation failure is only
 * detected through assert().
 */
static void IncreaseBuffer(metastring *s, int chars_needed) {
  s->bufsize += chars_needed + 10;
  META_REALLOC(s->str, s->bufsize, char);
  assert(s->str != NULL);
}
82
/*
 * Upper-case the contents of s in place, byte by byte, using toupper().
 *
 * Each byte is converted to unsigned char before the call: passing a negative
 * char value (any non-ASCII byte on platforms where char is signed) to a
 * <ctype.h> function is undefined behaviour.
 */
static void MakeUpper(metastring *s) {
  char *p;

  for (p = s->str; *p != '\0'; p++) {
    *p = (char)toupper((unsigned char)*p);
  }
}
90
/*
 * Return 1 if the character at index pos of s is one of A, E, I, O, U or Y,
 * and 0 otherwise. Only upper-case letters are recognised. An index outside
 * [0, s->length) yields 0.
 */
static int IsVowel(metastring *s, int pos) {
  if (pos < 0 || pos >= s->length) return 0;

  switch (s->str[pos]) {
    case 'A':
    case 'E':
    case 'I':
    case 'O':
    case 'U':
    case 'Y':
      return 1;
    default:
      return 0;
  }
}
101
/*
 * Return 1 if s contains any of the substrings "W", "K", "CZ" or "WITZ"
 * (the markers this algorithm uses for Slavic/Germanic spellings), else 0.
 *
 * The "WITZ" test can never be the deciding one because any such string also
 * contains "W"; it is kept to mirror the rule list.
 */
static int SlavoGermanic(metastring *s) {
  const char *word = s->str;

  return strstr(word, "W") != NULL || strstr(word, "K") != NULL ||
         strstr(word, "CZ") != NULL || strstr(word, "WITZ") != NULL;
}
114
/*
 * Return the length recorded in s (the length field, not a fresh strlen()).
 */
static int GetLength(metastring *s) {
  return s->length;
}
118
/*
 * Bounds-checked character read: returns the character at index pos of s,
 * or '\0' when pos lies outside [0, s->length).
 */
static char GetAt(metastring *s, int pos) {
  int in_range = (pos >= 0) && (pos < s->length);

  return in_range ? s->str[pos] : '\0';
}
124
/*
 * Bounds-checked character write: stores c at index pos of s. Does nothing
 * when pos lies outside [0, s->length). The length field is not changed,
 * even when c is '\0'.
 */
static void SetAt(metastring *s, int pos, char c) {
  if (pos >= 0 && pos < s->length) {
    s->str[pos] = c;
  }
}
130
131 /*
132 Caveats: the START value is 0 based
133 */
StringAt(metastring * s,int start,int length,...)134 static int StringAt(metastring *s, int start, int length, ...) {
135 char *test;
136 char *pos;
137 va_list ap;
138
139 if ((start < 0) || (start >= s->length)) return 0;
140
141 pos = (s->str + start);
142 va_start(ap, length);
143
144 do {
145 test = va_arg(ap, char *);
146 if (*test && (strncmp(pos, test, length) == 0)) return 1;
147 } while (strcmp(test, ""));
148
149 va_end(ap);
150
151 return 0;
152 }
153
/*
 * Append new_str to s, growing the buffer first when the result would not
 * fit. A NULL new_str is ignored; an empty one leaves s unchanged.
 */
static void MetaphAdd(metastring *s, const char *new_str) {
  int extra;

  if (new_str != NULL) {
    extra = strlen(new_str);

    /* one byte of the buffer is reserved for the terminating NUL */
    if (s->length + extra > s->bufsize - 1) {
      IncreaseBuffer(s, extra);
    }

    strcat(s->str, new_str);
    s->length += extra;
  }
}
167
DoubleMetaphone(const char * str,char ** primary_pp,char ** secondary_pp)168 void DoubleMetaphone(const char *str, char **primary_pp, char **secondary_pp) {
169 int length;
170 metastring *original;
171 metastring *primary;
172 metastring *secondary;
173 int current;
174 int last;
175
176 current = 0;
177 /* we need the real length and last prior to padding */
178 length = strlen(str);
179 last = length - 1;
180 original = NewMetaString(str);
181 /* Pad original so we can index beyond end */
182 MetaphAdd(original, " ");
183
184 primary = NewMetaString("");
185 secondary = NewMetaString("");
186
187 MakeUpper(original);
188
189 /* skip these when at start of word */
190 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", "")) current += 1;
191
192 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
193 if (GetAt(original, 0) == 'X') {
194 MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
195 MetaphAdd(secondary, "S");
196 current += 1;
197 }
198
199 /* main loop */
200 while ((primary->length < 4) || (secondary->length < 4)) {
201 if (current >= length) break;
202
203 switch (GetAt(original, current)) {
204 case 'A':
205 case 'E':
206 case 'I':
207 case 'O':
208 case 'U':
209 case 'Y':
210 if (current == 0) {
211 /* all init vowels now map to 'A' */
212 MetaphAdd(primary, "A");
213 MetaphAdd(secondary, "A");
214 }
215 current += 1;
216 break;
217
218 case 'B':
219
220 /* "-mb", e.g", "dumb", already skipped over... */
221 MetaphAdd(primary, "P");
222 MetaphAdd(secondary, "P");
223
224 if (GetAt(original, current + 1) == 'B')
225 current += 2;
226 else
227 current += 1;
228 break;
229
230 #if 0 // This is 2018 and nobody is using Latin1
231 case 'Ç':
232 MetaphAdd(primary, "S");
233 MetaphAdd(secondary, "S");
234 current += 1;
235 break;
236 #endif
237
238 case 'C':
239 /* various germanic */
240 if ((current > 1) && !IsVowel(original, current - 2) &&
241 StringAt(original, (current - 1), 3, "ACH", "") &&
242 ((GetAt(original, current + 2) != 'I') &&
243 ((GetAt(original, current + 2) != 'E') ||
244 StringAt(original, (current - 2), 6, "BACHER", "MACHER", "")))) {
245 MetaphAdd(primary, "K");
246 MetaphAdd(secondary, "K");
247 current += 2;
248 break;
249 }
250
251 /* special case 'caesar' */
252 if ((current == 0) && StringAt(original, current, 6, "CAESAR", "")) {
253 MetaphAdd(primary, "S");
254 MetaphAdd(secondary, "S");
255 current += 2;
256 break;
257 }
258
259 /* italian 'chianti' */
260 if (StringAt(original, current, 4, "CHIA", "")) {
261 MetaphAdd(primary, "K");
262 MetaphAdd(secondary, "K");
263 current += 2;
264 break;
265 }
266
267 if (StringAt(original, current, 2, "CH", "")) {
268 /* find 'michael' */
269 if ((current > 0) && StringAt(original, current, 4, "CHAE", "")) {
270 MetaphAdd(primary, "K");
271 MetaphAdd(secondary, "X");
272 current += 2;
273 break;
274 }
275
276 /* greek roots e.g. 'chemistry', 'chorus' */
277 if ((current == 0) &&
278 (StringAt(original, (current + 1), 5, "HARAC", "HARIS", "") ||
279 StringAt(original, (current + 1), 3, "HOR", "HYM", "HIA", "HEM", "")) &&
280 !StringAt(original, 0, 5, "CHORE", "")) {
281 MetaphAdd(primary, "K");
282 MetaphAdd(secondary, "K");
283 current += 2;
284 break;
285 }
286
287 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
288 if ((StringAt(original, 0, 4, "VAN ", "VON ", "") || StringAt(original, 0, 3, "SCH", ""))
289 /* 'architect but not 'arch', 'orchestra', 'orchid' */
290 || StringAt(original, (current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "") ||
291 StringAt(original, (current + 2), 1, "T", "S", "") ||
292 ((StringAt(original, (current - 1), 1, "A", "O", "U", "E", "") || (current == 0))
293 /* e.g., 'wachtler', 'wechsler', but not 'tichner' */
294 && StringAt(original, (current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W",
295 " ", ""))) {
296 MetaphAdd(primary, "K");
297 MetaphAdd(secondary, "K");
298 } else {
299 if (current > 0) {
300 if (StringAt(original, 0, 2, "MC", "")) {
301 /* e.g., "McHugh" */
302 MetaphAdd(primary, "K");
303 MetaphAdd(secondary, "K");
304 } else {
305 MetaphAdd(primary, "X");
306 MetaphAdd(secondary, "K");
307 }
308 } else {
309 MetaphAdd(primary, "X");
310 MetaphAdd(secondary, "X");
311 }
312 }
313 current += 2;
314 break;
315 }
316 /* e.g, 'czerny' */
317 if (StringAt(original, current, 2, "CZ", "") &&
318 !StringAt(original, (current - 2), 4, "WICZ", "")) {
319 MetaphAdd(primary, "S");
320 MetaphAdd(secondary, "X");
321 current += 2;
322 break;
323 }
324
325 /* e.g., 'focaccia' */
326 if (StringAt(original, (current + 1), 3, "CIA", "")) {
327 MetaphAdd(primary, "X");
328 MetaphAdd(secondary, "X");
329 current += 3;
330 break;
331 }
332
333 /* double 'C', but not if e.g. 'McClellan' */
334 if (StringAt(original, current, 2, "CC", "") &&
335 !((current == 1) && (GetAt(original, 0) == 'M'))) {
336 /* 'bellocchio' but not 'bacchus' */
337 if (StringAt(original, (current + 2), 1, "I", "E", "H", "") &&
338 !StringAt(original, (current + 2), 2, "HU", "")) {
339 /* 'accident', 'accede' 'succeed' */
340 if (((current == 1) && (GetAt(original, current - 1) == 'A')) ||
341 StringAt(original, (current - 1), 5, "UCCEE", "UCCES", "")) {
342 MetaphAdd(primary, "KS");
343 MetaphAdd(secondary, "KS");
344 /* 'bacci', 'bertucci', other italian */
345 } else {
346 MetaphAdd(primary, "X");
347 MetaphAdd(secondary, "X");
348 }
349 current += 3;
350 break;
351 } else { /* Pierce's rule */
352 MetaphAdd(primary, "K");
353 MetaphAdd(secondary, "K");
354 current += 2;
355 break;
356 }
357 }
358
359 if (StringAt(original, current, 2, "CK", "CG", "CQ", "")) {
360 MetaphAdd(primary, "K");
361 MetaphAdd(secondary, "K");
362 current += 2;
363 break;
364 }
365
366 if (StringAt(original, current, 2, "CI", "CE", "CY", "")) {
367 /* italian vs. english */
368 if (StringAt(original, current, 3, "CIO", "CIE", "CIA", "")) {
369 MetaphAdd(primary, "S");
370 MetaphAdd(secondary, "X");
371 } else {
372 MetaphAdd(primary, "S");
373 MetaphAdd(secondary, "S");
374 }
375 current += 2;
376 break;
377 }
378
379 /* else */
380 MetaphAdd(primary, "K");
381 MetaphAdd(secondary, "K");
382
383 /* name sent in 'mac caffrey', 'mac gregor */
384 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
385 current += 3;
386 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "") &&
387 !StringAt(original, (current + 1), 2, "CE", "CI", ""))
388 current += 2;
389 else
390 current += 1;
391 break;
392
393 case 'D':
394 if (StringAt(original, current, 2, "DG", "")) {
395 if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) {
396 /* e.g. 'edge' */
397 MetaphAdd(primary, "J");
398 MetaphAdd(secondary, "J");
399 current += 3;
400 break;
401 } else {
402 /* e.g. 'edgar' */
403 MetaphAdd(primary, "TK");
404 MetaphAdd(secondary, "TK");
405 current += 2;
406 break;
407 }
408 }
409
410 if (StringAt(original, current, 2, "DT", "DD", "")) {
411 MetaphAdd(primary, "T");
412 MetaphAdd(secondary, "T");
413 current += 2;
414 break;
415 }
416
417 /* else */
418 MetaphAdd(primary, "T");
419 MetaphAdd(secondary, "T");
420 current += 1;
421 break;
422
423 case 'F':
424 if (GetAt(original, current + 1) == 'F')
425 current += 2;
426 else
427 current += 1;
428 MetaphAdd(primary, "F");
429 MetaphAdd(secondary, "F");
430 break;
431
432 case 'G':
433 if (GetAt(original, current + 1) == 'H') {
434 if ((current > 0) && !IsVowel(original, current - 1)) {
435 MetaphAdd(primary, "K");
436 MetaphAdd(secondary, "K");
437 current += 2;
438 break;
439 }
440
441 if (current < 3) {
442 /* 'ghislane', ghiradelli */
443 if (current == 0) {
444 if (GetAt(original, current + 2) == 'I') {
445 MetaphAdd(primary, "J");
446 MetaphAdd(secondary, "J");
447 } else {
448 MetaphAdd(primary, "K");
449 MetaphAdd(secondary, "K");
450 }
451 current += 2;
452 break;
453 }
454 }
455 /* Parker's rule (with some further refinements) - e.g., 'hugh' */
456 if (((current > 1) && StringAt(original, (current - 2), 1, "B", "H", "D", ""))
457 /* e.g., 'bough' */
458 || ((current > 2) && StringAt(original, (current - 3), 1, "B", "H", "D", ""))
459 /* e.g., 'broughton' */
460 || ((current > 3) && StringAt(original, (current - 4), 1, "B", "H", ""))) {
461 current += 2;
462 break;
463 } else {
464 /* e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' */
465 if ((current > 2) && (GetAt(original, current - 1) == 'U') &&
466 StringAt(original, (current - 3), 1, "C", "G", "L", "R", "T", "")) {
467 MetaphAdd(primary, "F");
468 MetaphAdd(secondary, "F");
469 } else if ((current > 0) && GetAt(original, current - 1) != 'I') {
470
471 MetaphAdd(primary, "K");
472 MetaphAdd(secondary, "K");
473 }
474
475 current += 2;
476 break;
477 }
478 }
479
480 if (GetAt(original, current + 1) == 'N') {
481 if ((current == 1) && IsVowel(original, 0) && !SlavoGermanic(original)) {
482 MetaphAdd(primary, "KN");
483 MetaphAdd(secondary, "N");
484 } else
485 /* not e.g. 'cagney' */
486 if (!StringAt(original, (current + 2), 2, "EY", "") &&
487 (GetAt(original, current + 1) != 'Y') && !SlavoGermanic(original)) {
488 MetaphAdd(primary, "N");
489 MetaphAdd(secondary, "KN");
490 } else {
491 MetaphAdd(primary, "KN");
492 MetaphAdd(secondary, "KN");
493 }
494 current += 2;
495 break;
496 }
497
498 /* 'tagliaro' */
499 if (StringAt(original, (current + 1), 2, "LI", "") && !SlavoGermanic(original)) {
500 MetaphAdd(primary, "KL");
501 MetaphAdd(secondary, "L");
502 current += 2;
503 break;
504 }
505
506 /* -ges-,-gep-,-gel-, -gie- at beginning */
507 if ((current == 0) && ((GetAt(original, current + 1) == 'Y') ||
508 StringAt(original, (current + 1), 2, "ES", "EP", "EB", "EL", "EY",
509 "IB", "IL", "IN", "IE", "EI", "ER", ""))) {
510 MetaphAdd(primary, "K");
511 MetaphAdd(secondary, "J");
512 current += 2;
513 break;
514 }
515
516 /* -ger-, -gy- */
517 if ((StringAt(original, (current + 1), 2, "ER", "") ||
518 (GetAt(original, current + 1) == 'Y')) &&
519 !StringAt(original, 0, 6, "DANGER", "RANGER", "MANGER", "") &&
520 !StringAt(original, (current - 1), 1, "E", "I", "") &&
521 !StringAt(original, (current - 1), 3, "RGY", "OGY", "")) {
522 MetaphAdd(primary, "K");
523 MetaphAdd(secondary, "J");
524 current += 2;
525 break;
526 }
527
528 /* italian e.g, 'biaggi' */
529 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "") ||
530 StringAt(original, (current - 1), 4, "AGGI", "OGGI", "")) {
531 /* obvious germanic */
532 if ((StringAt(original, 0, 4, "VAN ", "VON ", "") ||
533 StringAt(original, 0, 3, "SCH", "")) ||
534 StringAt(original, (current + 1), 2, "ET", "")) {
535 MetaphAdd(primary, "K");
536 MetaphAdd(secondary, "K");
537 } else {
538 /* always soft if french ending */
539 if (StringAt(original, (current + 1), 4, "IER ", "")) {
540 MetaphAdd(primary, "J");
541 MetaphAdd(secondary, "J");
542 } else {
543 MetaphAdd(primary, "J");
544 MetaphAdd(secondary, "K");
545 }
546 }
547 current += 2;
548 break;
549 }
550
551 if (GetAt(original, current + 1) == 'G')
552 current += 2;
553 else
554 current += 1;
555 MetaphAdd(primary, "K");
556 MetaphAdd(secondary, "K");
557 break;
558
559 case 'H':
560 /* only keep if first & before vowel or btw. 2 vowels */
561 if (((current == 0) || IsVowel(original, current - 1)) && IsVowel(original, current + 1)) {
562 MetaphAdd(primary, "H");
563 MetaphAdd(secondary, "H");
564 current += 2;
565 } else /* also takes care of 'HH' */
566 current += 1;
567 break;
568
569 case 'J':
570 /* obvious spanish, 'jose', 'san jacinto' */
571 if (StringAt(original, current, 4, "JOSE", "") || StringAt(original, 0, 4, "SAN ", "")) {
572 if (((current == 0) && (GetAt(original, current + 4) == ' ')) ||
573 StringAt(original, 0, 4, "SAN ", "")) {
574 MetaphAdd(primary, "H");
575 MetaphAdd(secondary, "H");
576 } else {
577 MetaphAdd(primary, "J");
578 MetaphAdd(secondary, "H");
579 }
580 current += 1;
581 break;
582 }
583
584 if ((current == 0) && !StringAt(original, current, 4, "JOSE", "")) {
585 MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
586 MetaphAdd(secondary, "A");
587 } else {
588 /* spanish pron. of e.g. 'bajador' */
589 if (IsVowel(original, current - 1) && !SlavoGermanic(original) &&
590 ((GetAt(original, current + 1) == 'A') || (GetAt(original, current + 1) == 'O'))) {
591 MetaphAdd(primary, "J");
592 MetaphAdd(secondary, "H");
593 } else {
594 if (current == last) {
595 MetaphAdd(primary, "J");
596 MetaphAdd(secondary, "");
597 } else {
598 if (!StringAt(original, (current + 1), 1, "L", "T", "K", "S", "N", "M", "B", "Z",
599 "") &&
600 !StringAt(original, (current - 1), 1, "S", "K", "L", "")) {
601 MetaphAdd(primary, "J");
602 MetaphAdd(secondary, "J");
603 }
604 }
605 }
606 }
607
608 if (GetAt(original, current + 1) == 'J') /* it could happen! */
609 current += 2;
610 else
611 current += 1;
612 break;
613
614 case 'K':
615 if (GetAt(original, current + 1) == 'K')
616 current += 2;
617 else
618 current += 1;
619 MetaphAdd(primary, "K");
620 MetaphAdd(secondary, "K");
621 break;
622
623 case 'L':
624 if (GetAt(original, current + 1) == 'L') {
625 /* spanish e.g. 'cabrillo', 'gallegos' */
626 if (((current == (length - 3)) &&
627 StringAt(original, (current - 1), 4, "ILLO", "ILLA", "ALLE", "")) ||
628 ((StringAt(original, (last - 1), 2, "AS", "OS", "") ||
629 StringAt(original, last, 1, "A", "O", "")) &&
630 StringAt(original, (current - 1), 4, "ALLE", ""))) {
631 MetaphAdd(primary, "L");
632 MetaphAdd(secondary, "");
633 current += 2;
634 break;
635 }
636 current += 2;
637 } else
638 current += 1;
639 MetaphAdd(primary, "L");
640 MetaphAdd(secondary, "L");
641 break;
642
643 case 'M':
644 if ((StringAt(original, (current - 1), 3, "UMB", "") &&
645 (((current + 1) == last) || StringAt(original, (current + 2), 2, "ER", "")))
646 /* 'dumb','thumb' */
647 || (GetAt(original, current + 1) == 'M'))
648 current += 2;
649 else
650 current += 1;
651 MetaphAdd(primary, "M");
652 MetaphAdd(secondary, "M");
653 break;
654
655 case 'N':
656 if (GetAt(original, current + 1) == 'N')
657 current += 2;
658 else
659 current += 1;
660 MetaphAdd(primary, "N");
661 MetaphAdd(secondary, "N");
662 break;
663
664 #if 0 // UTF8, not Latin1
665 case 'Ñ':
666 current += 1;
667 MetaphAdd(primary, "N");
668 MetaphAdd(secondary, "N");
669 break;
670 #endif
671
672 case 'P':
673 if (GetAt(original, current + 1) == 'H') {
674 MetaphAdd(primary, "F");
675 MetaphAdd(secondary, "F");
676 current += 2;
677 break;
678 }
679
680 /* also account for "campbell", "raspberry" */
681 if (StringAt(original, (current + 1), 1, "P", "B", ""))
682 current += 2;
683 else
684 current += 1;
685 MetaphAdd(primary, "P");
686 MetaphAdd(secondary, "P");
687 break;
688
689 case 'Q':
690 if (GetAt(original, current + 1) == 'Q')
691 current += 2;
692 else
693 current += 1;
694 MetaphAdd(primary, "K");
695 MetaphAdd(secondary, "K");
696 break;
697
698 case 'R':
699 /* french e.g. 'rogier', but exclude 'hochmeier' */
700 if ((current == last) && !SlavoGermanic(original) &&
701 StringAt(original, (current - 2), 2, "IE", "") &&
702 !StringAt(original, (current - 4), 2, "ME", "MA", "")) {
703 MetaphAdd(primary, "");
704 MetaphAdd(secondary, "R");
705 } else {
706 MetaphAdd(primary, "R");
707 MetaphAdd(secondary, "R");
708 }
709
710 if (GetAt(original, current + 1) == 'R')
711 current += 2;
712 else
713 current += 1;
714 break;
715
716 case 'S':
717 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
718 if (StringAt(original, (current - 1), 3, "ISL", "YSL", "")) {
719 current += 1;
720 break;
721 }
722
723 /* special case 'sugar-' */
724 if ((current == 0) && StringAt(original, current, 5, "SUGAR", "")) {
725 MetaphAdd(primary, "X");
726 MetaphAdd(secondary, "S");
727 current += 1;
728 break;
729 }
730
731 if (StringAt(original, current, 2, "SH", "")) {
732 /* germanic */
733 if (StringAt(original, (current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", "")) {
734 MetaphAdd(primary, "S");
735 MetaphAdd(secondary, "S");
736 } else {
737 MetaphAdd(primary, "X");
738 MetaphAdd(secondary, "X");
739 }
740 current += 2;
741 break;
742 }
743
744 /* italian & armenian */
745 if (StringAt(original, current, 3, "SIO", "SIA", "") ||
746 StringAt(original, current, 4, "SIAN", "")) {
747 if (!SlavoGermanic(original)) {
748 MetaphAdd(primary, "S");
749 MetaphAdd(secondary, "X");
750 } else {
751 MetaphAdd(primary, "S");
752 MetaphAdd(secondary, "S");
753 }
754 current += 3;
755 break;
756 }
757
758 /* german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
759 also, -sz- in slavic language altho in hungarian it is pronounced 's' */
760 if (((current == 0) && StringAt(original, (current + 1), 1, "M", "N", "L", "W", "")) ||
761 StringAt(original, (current + 1), 1, "Z", "")) {
762 MetaphAdd(primary, "S");
763 MetaphAdd(secondary, "X");
764 if (StringAt(original, (current + 1), 1, "Z", ""))
765 current += 2;
766 else
767 current += 1;
768 break;
769 }
770
771 if (StringAt(original, current, 2, "SC", "")) {
772 /* Schlesinger's rule */
773 if (GetAt(original, current + 2) == 'H') /* dutch origin, e.g. 'school', 'schooner' */ {
774 if (StringAt(original, (current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", "")) {
775 /* 'schermerhorn', 'schenker' */
776 if (StringAt(original, (current + 3), 2, "ER", "EN", "")) {
777 MetaphAdd(primary, "X");
778 MetaphAdd(secondary, "SK");
779 } else {
780 MetaphAdd(primary, "SK");
781 MetaphAdd(secondary, "SK");
782 }
783 current += 3;
784 break;
785 } else {
786 if ((current == 0) && !IsVowel(original, 3) && (GetAt(original, 3) != 'W')) {
787 MetaphAdd(primary, "X");
788 MetaphAdd(secondary, "S");
789 } else {
790 MetaphAdd(primary, "X");
791 MetaphAdd(secondary, "X");
792 }
793 current += 3;
794 break;
795 }
796 }
797
798 if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) {
799 MetaphAdd(primary, "S");
800 MetaphAdd(secondary, "S");
801 current += 3;
802 break;
803 }
804 /* else */
805 MetaphAdd(primary, "SK");
806 MetaphAdd(secondary, "SK");
807 current += 3;
808 break;
809 }
810
811 /* french e.g. 'resnais', 'artois' */
812 if ((current == last) && StringAt(original, (current - 2), 2, "AI", "OI", "")) {
813 MetaphAdd(primary, "");
814 MetaphAdd(secondary, "S");
815 } else {
816 MetaphAdd(primary, "S");
817 MetaphAdd(secondary, "S");
818 }
819
820 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
821 current += 2;
822 else
823 current += 1;
824 break;
825
826 case 'T':
827 if (StringAt(original, current, 4, "TION", "")) {
828 MetaphAdd(primary, "X");
829 MetaphAdd(secondary, "X");
830 current += 3;
831 break;
832 }
833
834 if (StringAt(original, current, 3, "TIA", "TCH", "")) {
835 MetaphAdd(primary, "X");
836 MetaphAdd(secondary, "X");
837 current += 3;
838 break;
839 }
840
841 if (StringAt(original, current, 2, "TH", "") || StringAt(original, current, 3, "TTH", "")) {
842 /* special case 'thomas', 'thames' or germanic */
843 if (StringAt(original, (current + 2), 2, "OM", "AM", "") ||
844 StringAt(original, 0, 4, "VAN ", "VON ", "") || StringAt(original, 0, 3, "SCH", "")) {
845 MetaphAdd(primary, "T");
846 MetaphAdd(secondary, "T");
847 } else {
848 MetaphAdd(primary, "0"); /* yes, zero */
849 MetaphAdd(secondary, "T");
850 }
851 current += 2;
852 break;
853 }
854
855 if (StringAt(original, (current + 1), 1, "T", "D", ""))
856 current += 2;
857 else
858 current += 1;
859 MetaphAdd(primary, "T");
860 MetaphAdd(secondary, "T");
861 break;
862
863 case 'V':
864 if (GetAt(original, current + 1) == 'V')
865 current += 2;
866 else
867 current += 1;
868 MetaphAdd(primary, "F");
869 MetaphAdd(secondary, "F");
870 break;
871
872 case 'W':
873 /* can also be in middle of word */
874 if (StringAt(original, current, 2, "WR", "")) {
875 MetaphAdd(primary, "R");
876 MetaphAdd(secondary, "R");
877 current += 2;
878 break;
879 }
880
881 if ((current == 0) &&
882 (IsVowel(original, current + 1) || StringAt(original, current, 2, "WH", ""))) {
883 /* Wasserman should match Vasserman */
884 if (IsVowel(original, current + 1)) {
885 MetaphAdd(primary, "A");
886 MetaphAdd(secondary, "F");
887 } else {
888 /* need Uomo to match Womo */
889 MetaphAdd(primary, "A");
890 MetaphAdd(secondary, "A");
891 }
892 }
893
894 /* Arnow should match Arnoff */
895 if (((current == last) && IsVowel(original, current - 1)) ||
896 StringAt(original, (current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "") ||
897 StringAt(original, 0, 3, "SCH", "")) {
898 MetaphAdd(primary, "");
899 MetaphAdd(secondary, "F");
900 current += 1;
901 break;
902 }
903
904 /* polish e.g. 'filipowicz' */
905 if (StringAt(original, current, 4, "WICZ", "WITZ", "")) {
906 MetaphAdd(primary, "TS");
907 MetaphAdd(secondary, "FX");
908 current += 4;
909 break;
910 }
911
912 /* else skip it */
913 current += 1;
914 break;
915
916 case 'X':
917 /* french e.g. breaux */
918 if (!((current == last) && (StringAt(original, (current - 3), 3, "IAU", "EAU", "") ||
919 StringAt(original, (current - 2), 2, "AU", "OU", "")))) {
920 MetaphAdd(primary, "KS");
921 MetaphAdd(secondary, "KS");
922 }
923
924 if (StringAt(original, (current + 1), 1, "C", "X", ""))
925 current += 2;
926 else
927 current += 1;
928 break;
929
930 case 'Z':
931 /* chinese pinyin e.g. 'zhao' */
932 if (GetAt(original, current + 1) == 'H') {
933 MetaphAdd(primary, "J");
934 MetaphAdd(secondary, "J");
935 current += 2;
936 break;
937 } else if (StringAt(original, (current + 1), 2, "ZO", "ZI", "ZA", "") ||
938 (SlavoGermanic(original) &&
939 ((current > 0) && GetAt(original, current - 1) != 'T'))) {
940 MetaphAdd(primary, "S");
941 MetaphAdd(secondary, "TS");
942 } else {
943 MetaphAdd(primary, "S");
944 MetaphAdd(secondary, "S");
945 }
946
947 if (GetAt(original, current + 1) == 'Z')
948 current += 2;
949 else
950 current += 1;
951 break;
952
953 default:
954 current += 1;
955 }
956 /* printf("PRIMARY: %s\n", primary->str);
957 printf("SECONDARY: %s\n", secondary->str); */
958 }
959
960 if (primary->length > 4) SetAt(primary, 4, '\0');
961
962 if (secondary->length > 4) SetAt(secondary, 4, '\0');
963 if (primary_pp) {
964 if (primary->length > 0) {
965 *primary_pp = primary->str;
966 primary->free_string_on_destroy = 0;
967 }
968 }
969 if (secondary_pp) {
970 if (secondary->length > 0) {
971 *secondary_pp = secondary->str;
972 secondary->free_string_on_destroy = 0;
973 }
974 }
975
976 DestroyMetaString(original);
977 DestroyMetaString(primary);
978 DestroyMetaString(secondary);
979 }
980