1 /*
2  * fuzzystrmatch.c
3  *
4  * Functions for "fuzzy" comparison of strings
5  *
6  * Joe Conway <mail@joeconway.com>
7  *
8  * contrib/fuzzystrmatch/fuzzystrmatch.c
9  * Copyright (c) 2001-2016, PostgreSQL Global Development Group
10  * ALL RIGHTS RESERVED;
11  *
12  * metaphone()
13  * -----------
14  * Modified for PostgreSQL by Joe Conway.
15  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
16  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
17  * Metaphone was originally created by Lawrence Philips and presented in article
18  * in "Computer Language" December 1990 issue.
19  *
20  * Permission to use, copy, modify, and distribute this software and its
21  * documentation for any purpose, without fee, and without a written agreement
22  * is hereby granted, provided that the above copyright notice and this
23  * paragraph and the following two paragraphs appear in all copies.
24  *
25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
26  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
27  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
28  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
33  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
34  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
35  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
36  *
37  */
38 
39 #include "postgres.h"
40 
41 #include <ctype.h>
42 
43 #include "mb/pg_wchar.h"
44 #include "utils/builtins.h"
45 
46 PG_MODULE_MAGIC;
47 
/*
 * Soundex
 *
 * _soundex() is the internal worker shared by the SQL-callable soundex()
 * and difference() functions; it is declared here and defined further down.
 *
 * SOUNDEX_LEN is the number of characters in a Soundex code, not counting
 * the terminating NUL: output buffers are declared with SOUNDEX_LEN + 1
 * bytes.
 */
static void _soundex(const char *instr, char *outstr);

#define SOUNDEX_LEN 4
54 
/*
 * Soundex digit for every ASCII letter, indexed by (upper-case letter - 'A').
 *
 *									ABCDEFGHIJKLMNOPQRSTUVWXYZ */
static const char *soundex_table = "01230120022455012623010202";

/*
 * soundex_code
 *
 * Translate one character into its Soundex digit.  The character is folded
 * to upper case first; if the result is not in 'A'..'Z' (digits, punctuation,
 * non-ASCII letters) the folded character itself is returned instead of a
 * digit.
 */
static char
soundex_code(char letter)
{
	char		upper = toupper((unsigned char) letter);

	/* Only plain ASCII letters have an entry in the table */
	if (upper < 'A' || upper > 'Z')
		return upper;

	return soundex_table[upper - 'A'];
}
67 
/*
 * Metaphone
 *
 * MAX_METAPHONE_STRLEN is the upper bound, in bytes, that metaphone()
 * enforces on both its input string and its requested output length.
 */
#define MAX_METAPHONE_STRLEN		255
72 
73 /*
74  * Original code by Michael G Schwern starts here.
75  * Code slightly modified for use as PostgreSQL function.
76  */
77 
78 
79 /**************************************************************************
80 	metaphone -- Breaks english phrases down into their phonemes.
81 
82 	Input
83 		word			--	An english word to be phonized
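		max_phonemes	--	How many phonemes to calculate.  Must be > 0 here:
							_metaphone() raises an error for any other value,
							so the original "0 means phonize the entire
							phrase" mode is not reachable in this version.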
86 		phoned_word		--	The final phonized word.  (We'll allocate the
87 							memory.)
88 	Output
89 		error	--	A simple error flag, returns TRUE or FALSE
90 
91 	NOTES:	ALL non-alpha characters are ignored, this includes whitespace,
92 	although non-alpha characters will break up phonemes.
93 ****************************************************************************/
94 
95 
96 /**************************************************************************
97 	my constants -- constants I like
98 
99 	Probably redundant.
100 
101 ***************************************************************************/
102 
/*
 * Result codes for _metaphone().  Note that META_ERROR and META_FAILURE are
 * the same value (FALSE), so callers can only distinguish "success" from
 * "not success".
 */
#define META_ERROR			FALSE
#define META_SUCCESS		TRUE
#define META_FAILURE		FALSE


/*	I add modifications to the traditional metaphone algorithm that you
	might find in books.  Define this if you want metaphone to behave
	traditionally.  It is explicitly undefined here, so the code guarded by
	"#ifndef USE_TRADITIONAL_METAPHONE" is what gets compiled. */
#undef USE_TRADITIONAL_METAPHONE

/*
 * Special encodings: output characters that stand for a sound rather than
 * for themselves.  'sh' is written as the letter X and 'th' as the digit
 * zero (not the letter O).
 */
#define  SH		'X'
#define  TH		'0'

/* Forward declarations of the metaphone helpers defined later in the file */
static char Lookahead(char *word, int how_far);
static int	_metaphone(char *word, int max_phonemes, char **phoned_word);
120 /* Metachar.h ... little bits about characters for metaphone */
121 
122 
123 /*-- Character encoding array & accessing macros --*/
124 /* Stolen directly out of the book... */
/*
 * Per-letter flag bits, indexed by (upper-case letter - 'A').  The bits are
 * tested by the isvowel/NOCHANGE/AFFECTH/MAKESOFT/NOGHTOF macros:
 * 1 = vowel, 2 = passed through unchanged, 4 = forms a diphthong before H,
 * 8 = makes C and G soft, 16 = prevents GH from becoming F.
 */
static const char _codes[26] = {
	/* a   b   c   d   e   f   g */
	   1, 16,  4, 16,  9,  2,  4,
	/* h   i   j   k   l   m   n */
	  16,  9,  2,  0,  2,  2,  2,
	/* o   p   q   r   s   t   u */
	   1,  4,  0,  2,  4,  4,  1,
	/* v   w   x   y   z */
	   0,  0,  0,  8,  0
};

/*
 * getcode
 *
 * Return the flag bits for character c, or 0 if c is not a letter or is a
 * letter outside the ASCII range A-Z.
 */
static int
getcode(char c)
{
	char		upper;

	if (!isalpha((unsigned char) c))
		return 0;

	upper = toupper((unsigned char) c);

	/* Locale-specific letters beyond A-Z have no table entry */
	if (upper < 'A' || upper > 'Z')
		return 0;

	return _codes[upper - 'A'];
}
142 
/*
 * Character-class tests built on getcode().  Each one yields the matching
 * flag bit itself (1, 2, 4, 8 or 16) when the character is in the class and
 * 0 otherwise, so use the result only as a truth value.
 */
#define isvowel(c)	(getcode(c) & 1)	/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (getcode(c) & 2)	/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)	(getcode(c) & 4)	/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (getcode(c) & 8)	/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)	(getcode(c) & 16)	/* BDH */
156 
157 PG_FUNCTION_INFO_V1(levenshtein_with_costs);
158 Datum
levenshtein_with_costs(PG_FUNCTION_ARGS)159 levenshtein_with_costs(PG_FUNCTION_ARGS)
160 {
161 	text	   *src = PG_GETARG_TEXT_PP(0);
162 	text	   *dst = PG_GETARG_TEXT_PP(1);
163 	int			ins_c = PG_GETARG_INT32(2);
164 	int			del_c = PG_GETARG_INT32(3);
165 	int			sub_c = PG_GETARG_INT32(4);
166 	const char *s_data;
167 	const char *t_data;
168 	int			s_bytes,
169 				t_bytes;
170 
171 	/* Extract a pointer to the actual character data */
172 	s_data = VARDATA_ANY(src);
173 	t_data = VARDATA_ANY(dst);
174 	/* Determine length of each string in bytes */
175 	s_bytes = VARSIZE_ANY_EXHDR(src);
176 	t_bytes = VARSIZE_ANY_EXHDR(dst);
177 
178 	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
179 									   ins_c, del_c, sub_c, false));
180 }
181 
182 
183 PG_FUNCTION_INFO_V1(levenshtein);
184 Datum
levenshtein(PG_FUNCTION_ARGS)185 levenshtein(PG_FUNCTION_ARGS)
186 {
187 	text	   *src = PG_GETARG_TEXT_PP(0);
188 	text	   *dst = PG_GETARG_TEXT_PP(1);
189 	const char *s_data;
190 	const char *t_data;
191 	int			s_bytes,
192 				t_bytes;
193 
194 	/* Extract a pointer to the actual character data */
195 	s_data = VARDATA_ANY(src);
196 	t_data = VARDATA_ANY(dst);
197 	/* Determine length of each string in bytes */
198 	s_bytes = VARSIZE_ANY_EXHDR(src);
199 	t_bytes = VARSIZE_ANY_EXHDR(dst);
200 
201 	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
202 									   1, 1, 1, false));
203 }
204 
205 
206 PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
207 Datum
levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)208 levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
209 {
210 	text	   *src = PG_GETARG_TEXT_PP(0);
211 	text	   *dst = PG_GETARG_TEXT_PP(1);
212 	int			ins_c = PG_GETARG_INT32(2);
213 	int			del_c = PG_GETARG_INT32(3);
214 	int			sub_c = PG_GETARG_INT32(4);
215 	int			max_d = PG_GETARG_INT32(5);
216 	const char *s_data;
217 	const char *t_data;
218 	int			s_bytes,
219 				t_bytes;
220 
221 	/* Extract a pointer to the actual character data */
222 	s_data = VARDATA_ANY(src);
223 	t_data = VARDATA_ANY(dst);
224 	/* Determine length of each string in bytes */
225 	s_bytes = VARSIZE_ANY_EXHDR(src);
226 	t_bytes = VARSIZE_ANY_EXHDR(dst);
227 
228 	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
229 												  t_data, t_bytes,
230 												  ins_c, del_c, sub_c,
231 												  max_d, false));
232 }
233 
234 
235 PG_FUNCTION_INFO_V1(levenshtein_less_equal);
236 Datum
levenshtein_less_equal(PG_FUNCTION_ARGS)237 levenshtein_less_equal(PG_FUNCTION_ARGS)
238 {
239 	text	   *src = PG_GETARG_TEXT_PP(0);
240 	text	   *dst = PG_GETARG_TEXT_PP(1);
241 	int			max_d = PG_GETARG_INT32(2);
242 	const char *s_data;
243 	const char *t_data;
244 	int			s_bytes,
245 				t_bytes;
246 
247 	/* Extract a pointer to the actual character data */
248 	s_data = VARDATA_ANY(src);
249 	t_data = VARDATA_ANY(dst);
250 	/* Determine length of each string in bytes */
251 	s_bytes = VARSIZE_ANY_EXHDR(src);
252 	t_bytes = VARSIZE_ANY_EXHDR(dst);
253 
254 	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
255 												  t_data, t_bytes,
256 												  1, 1, 1,
257 												  max_d, false));
258 }
259 
260 
261 /*
262  * Calculates the metaphone of an input string.
263  * Returns number of characters requested
264  * (suggested value is 4)
265  */
266 PG_FUNCTION_INFO_V1(metaphone);
267 Datum
metaphone(PG_FUNCTION_ARGS)268 metaphone(PG_FUNCTION_ARGS)
269 {
270 	char	   *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
271 	size_t		str_i_len = strlen(str_i);
272 	int			reqlen;
273 	char	   *metaph;
274 	int			retval;
275 
276 	/* return an empty string if we receive one */
277 	if (!(str_i_len > 0))
278 		PG_RETURN_TEXT_P(cstring_to_text(""));
279 
280 	if (str_i_len > MAX_METAPHONE_STRLEN)
281 		ereport(ERROR,
282 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
283 				 errmsg("argument exceeds the maximum length of %d bytes",
284 						MAX_METAPHONE_STRLEN)));
285 
286 	reqlen = PG_GETARG_INT32(1);
287 	if (reqlen > MAX_METAPHONE_STRLEN)
288 		ereport(ERROR,
289 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
290 				 errmsg("output exceeds the maximum length of %d bytes",
291 						MAX_METAPHONE_STRLEN)));
292 
293 	if (!(reqlen > 0))
294 		ereport(ERROR,
295 				(errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
296 				 errmsg("output cannot be empty string")));
297 
298 
299 	retval = _metaphone(str_i, reqlen, &metaph);
300 	if (retval == META_SUCCESS)
301 		PG_RETURN_TEXT_P(cstring_to_text(metaph));
302 	else
303 	{
304 		/* internal error */
305 		elog(ERROR, "metaphone: failure");
306 		/* keep the compiler quiet */
307 		PG_RETURN_NULL();
308 	}
309 }
310 
311 
312 /*
313  * Original code by Michael G Schwern starts here.
314  * Code slightly modified for use as PostgreSQL
315  * function (palloc, etc).
316  */
317 
318 /* I suppose I could have been using a character pointer instead of
319  * accessing the array directly... */
320 
/*
 * Letter-access helpers for _metaphone().  They expand to expressions over
 * the caller's local variables "word" (the input string) and "w_idx" (the
 * current position in it), and every one of them folds the letter it
 * returns to upper case with toupper().
 */

/*
 * Look at the next letter in the word.  Reads word[w_idx+1], so it is only
 * meaningful while the current letter is not the terminating NUL.
 */
#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper((unsigned char) word[w_idx]))
/* Go N letters back; yields '\0' when fewer than N letters precede w_idx. */
#define Look_Back_Letter(n) \
	(w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the very start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter \
	(Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
/* Look N letters down via Lookahead(), which stops at the end of the string. */
#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
334 
335 
/*
 * Lookahead
 *
 * Return the character how_far positions past the start of word, without
 * ever reading beyond the terminating NUL: if the string ends first, '\0'
 * is returned.  A how_far of zero or less returns the first character.
 */
static char
Lookahead(char *word, int how_far)
{
	char	   *cursor = word;
	int			remaining = how_far;

	/* Step forward until we have gone far enough or hit the terminator */
	while (remaining > 0 && *cursor != '\0')
	{
		cursor++;
		remaining--;
	}

	return *cursor;
}
351 
352 
/*
 * Output helpers for _metaphone().  They expand to code that uses the
 * caller's locals "phoned_word" (pointer to the output buffer pointer) and
 * "p_idx" (number of characters written so far).  None of them checks the
 * buffer size; the caller is responsible for staying within it.
 */

/* phonize one letter: append c to the output and advance p_idx */
#define Phonize(c)	do {(*phoned_word)[p_idx++] = c;} while (0)
/* Slap a null character on the end of the phoned word */
#define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
/* How long is the phoned word? */
#define Phone_Len	(p_idx)

/* Note if a letter is a 'break' in the word, i.e. anything non-alphabetic */
#define Isbreak(c)	(!isalpha((unsigned char) (c)))
362 
363 
/*
 * _metaphone
 *
 * Compute the metaphone code of "word" and store a pointer to the result, a
 * freshly palloc'd NUL-terminated string, into *phoned_word.
 *
 *	word			-- input string; must be non-NULL and non-empty
 *	max_phonemes	-- maximum number of output characters; must be > 0
 *	phoned_word		-- OUT: receives the palloc'd result
 *
 * Returns META_SUCCESS on every normal return.  Invalid arguments are not
 * reported through the return value but with elog(ERROR).
 *
 * The Curr_Letter/Next_Letter/Prev_Letter/... and Phonize/End_Phoned_Word/
 * Phone_Len macros used throughout expand to expressions over the locals
 * word, w_idx, p_idx and phoned_word, so those names must stay as they are.
 */
static int
_metaphone(char *word,			/* IN */
		   int max_phonemes,
		   char **phoned_word)	/* OUT */
{
	int			w_idx = 0;		/* point in the phonization we're at. */
	int			p_idx = 0;		/* end of the phoned phrase */

	/*-- Parameter checks --*/

	/*
	 * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
	 */

	/* Zero or negative phoneme length is meaningless */
	if (!(max_phonemes > 0))
		/* internal error */
		elog(ERROR, "metaphone: Requested output length must be > 0");

	/* Empty/null string is meaningless */
	if ((word == NULL) || !(strlen(word) > 0))
		/* internal error */
		elog(ERROR, "metaphone: Input string length must be > 0");

	/*-- Allocate memory for our phoned_phrase --*/

	/*
	 * The max_phonemes == 0 ("no limit") branch cannot be taken as the code
	 * stands, because values <= 0 were rejected above; the same applies to
	 * the "max_phonemes == 0 ||" tests further down.
	 */
	if (max_phonemes == 0)
	{							/* Assume largest possible */
		*phoned_word = palloc(sizeof(char) * strlen(word) +1);
	}
	else
	{
		/* room for max_phonemes output characters plus the terminator */
		*phoned_word = palloc(sizeof(char) * max_phonemes + 1);
	}

	/*-- The first phoneme has to be processed specially. --*/
	/* Find our first letter */
	for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
	{
		/* On the off chance we were given nothing but crap... */
		if (Curr_Letter == '\0')
		{
			/* no letters at all: the result is the empty string */
			End_Phoned_Word;
			return META_SUCCESS;	/* For testing */
		}
	}

	/*
	 * Special rules for the first letter.  Branches that emit something
	 * advance w_idx past the letters they consumed; branches that emit
	 * nothing leave w_idx alone, so the same letter is then handled by the
	 * general loop below.
	 */
	switch (Curr_Letter)
	{
			/* AE becomes E */
		case 'A':
			if (Next_Letter == 'E')
			{
				Phonize('E');
				w_idx += 2;
			}
			/* Remember, preserve vowels at the beginning */
			else
			{
				Phonize('A');
				w_idx++;
			}
			break;
			/* [GKP]N becomes N */
		case 'G':
		case 'K':
		case 'P':
			if (Next_Letter == 'N')
			{
				Phonize('N');
				w_idx += 2;
			}
			break;

			/*
			 * WH becomes H, WR becomes R; W is kept if followed by a vowel
			 * (the vowel itself is skipped along with the W)
			 */
		case 'W':
			if (Next_Letter == 'H' ||
				Next_Letter == 'R')
			{
				Phonize(Next_Letter);
				w_idx += 2;
			}
			else if (isvowel(Next_Letter))
			{
				Phonize('W');
				w_idx += 2;
			}
			/* else ignore */
			break;
			/* X becomes S */
		case 'X':
			Phonize('S');
			w_idx++;
			break;
			/* Vowels are kept */

			/*
			 * We did A already case 'A': case 'a':
			 */
		case 'E':
		case 'I':
		case 'O':
		case 'U':
			Phonize(Curr_Letter);
			w_idx++;
			break;
		default:
			/* do nothing */
			break;
	}



	/*
	 * On to the metaphoning.  Stop at the end of the input or as soon as
	 * max_phonemes characters have been produced.
	 */
	for (; Curr_Letter != '\0' &&
		 (max_phonemes == 0 || Phone_Len < max_phonemes);
		 w_idx++)
	{
		/*
		 * How many letters to skip because an earlier encoding handled
		 * multiple letters
		 */
		unsigned short int skip_letter = 0;


		/*
		 * THOUGHT:  It would be nice if, rather than having things like...
		 * well, SCI.  For SCI you encode the S, then have to remember to skip
		 * the C.  So the phonome SCI invades both S and C.  It would be
		 * better, IMHO, to skip the C from the S part of the encoding. Hell,
		 * I'm trying it.
		 */

		/* Ignore non-alphas */
		if (!isalpha((unsigned char) (Curr_Letter)))
			continue;

		/* Drop duplicates, except CC */
		if (Curr_Letter == Prev_Letter &&
			Curr_Letter != 'C')
			continue;

		/*
		 * Vowels are not listed below: after the first letter they fall
		 * through to the default case and produce no output.
		 */
		switch (Curr_Letter)
		{
				/* B -> B unless in MB */
			case 'B':
				if (Prev_Letter != 'M')
					Phonize('B');
				break;

				/*
				 * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
				 * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
				 * SCE-, -SCY- (handled in S) else K
				 */
			case 'C':
				if (MAKESOFT(Next_Letter))
				{				/* C[IEY] */
					if (After_Next_Letter == 'A' &&
						Next_Letter == 'I')
					{			/* CIA */
						Phonize(SH);
					}
					/* SC[IEY] */
					else if (Prev_Letter == 'S')
					{
						/* Dropped */
					}
					else
						Phonize('S');
				}
				else if (Next_Letter == 'H')
				{
#ifndef USE_TRADITIONAL_METAPHONE
					if (After_Next_Letter == 'R' ||
						Prev_Letter == 'S')
					{			/* Christ, School */
						Phonize('K');
					}
					else
						Phonize(SH);
#else
					Phonize(SH);
#endif
					/* the H has been consumed together with the C */
					skip_letter++;
				}
				else
					Phonize('K');
				break;

				/*
				 * J if in -DGE-, -DGI- or -DGY- else T
				 */
			case 'D':
				if (Next_Letter == 'G' &&
					MAKESOFT(After_Next_Letter))
				{
					Phonize('J');
					skip_letter++;
				}
				else
					Phonize('T');
				break;

				/*
				 * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
				 * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
				 * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
				 * else K
				 */
			case 'G':
				if (Next_Letter == 'H')
				{
					if (!(NOGHTOF(Look_Back_Letter(3)) ||
						  Look_Back_Letter(4) == 'H'))
					{
						Phonize('F');
						skip_letter++;
					}
					else
					{
						/* silent */
					}
				}
				else if (Next_Letter == 'N')
				{
					if (Isbreak(After_Next_Letter) ||
						(After_Next_Letter == 'E' &&
						 Look_Ahead_Letter(3) == 'D'))
					{
						/* dropped */
					}
					else
						Phonize('K');
				}
				else if (MAKESOFT(Next_Letter) &&
						 Prev_Letter != 'G')
					Phonize('J');
				else
					Phonize('K');
				break;
				/* H if before a vowel and not after C,G,P,S,T */
			case 'H':
				if (isvowel(Next_Letter) &&
					!AFFECTH(Prev_Letter))
					Phonize('H');
				break;

				/*
				 * dropped if after C else K
				 */
			case 'K':
				if (Prev_Letter != 'C')
					Phonize('K');
				break;

				/*
				 * F if before H else P.  The H is not skipped here; it is
				 * dropped by the 'H' case because P is an AFFECTH letter.
				 */
			case 'P':
				if (Next_Letter == 'H')
					Phonize('F');
				else
					Phonize('P');
				break;

				/*
				 * K
				 */
			case 'Q':
				Phonize('K');
				break;

				/*
				 * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
				 */
			case 'S':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' ||
					 After_Next_Letter == 'A'))
					Phonize(SH);
				else if (Next_Letter == 'H')
				{
					Phonize(SH);
					skip_letter++;
				}
#ifndef USE_TRADITIONAL_METAPHONE
				else if (Next_Letter == 'C' &&
						 Look_Ahead_Letter(2) == 'H' &&
						 Look_Ahead_Letter(3) == 'W')
				{
					/* SCHW: skip the C and the H, leaving W for next round */
					Phonize(SH);
					skip_letter += 2;
				}
#endif
				else
					Phonize('S');
				break;

				/*
				 * 'sh' in -TIA- or -TIO- else 'th' before H else T
				 */
			case 'T':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' ||
					 After_Next_Letter == 'A'))
					Phonize(SH);
				else if (Next_Letter == 'H')
				{
					Phonize(TH);
					skip_letter++;
				}
				else
					Phonize('T');
				break;
				/* F */
			case 'V':
				Phonize('F');
				break;
				/* W before a vowel, else dropped */
			case 'W':
				if (isvowel(Next_Letter))
					Phonize('W');
				break;

				/*
				 * KS.  This is the only case that emits two characters, so
				 * the length limit is re-checked before the second one to
				 * stay inside the output buffer.
				 */
			case 'X':
				Phonize('K');
				if (max_phonemes == 0 || Phone_Len < max_phonemes)
					Phonize('S');
				break;
				/* Y if followed by a vowel */
			case 'Y':
				if (isvowel(Next_Letter))
					Phonize('Y');
				break;
				/* S */
			case 'Z':
				Phonize('S');
				break;
				/* No transformation */
			case 'F':
			case 'J':
			case 'L':
			case 'M':
			case 'N':
			case 'R':
				Phonize(Curr_Letter);
				break;
			default:
				/* nothing */
				break;
		}						/* END SWITCH */

		/* step over any extra letters consumed by the case above */
		w_idx += skip_letter;
	}							/* END FOR */

	End_Phoned_Word;

	return (META_SUCCESS);
}	/* END _metaphone */
725 
726 
727 /*
728  * SQL function: soundex(text) returns text
729  */
730 PG_FUNCTION_INFO_V1(soundex);
731 
732 Datum
soundex(PG_FUNCTION_ARGS)733 soundex(PG_FUNCTION_ARGS)
734 {
735 	char		outstr[SOUNDEX_LEN + 1];
736 	char	   *arg;
737 
738 	arg = text_to_cstring(PG_GETARG_TEXT_P(0));
739 
740 	_soundex(arg, outstr);
741 
742 	PG_RETURN_TEXT_P(cstring_to_text(outstr));
743 }
744 
745 static void
_soundex(const char * instr,char * outstr)746 _soundex(const char *instr, char *outstr)
747 {
748 	int			count;
749 
750 	AssertArg(instr);
751 	AssertArg(outstr);
752 
753 	outstr[SOUNDEX_LEN] = '\0';
754 
755 	/* Skip leading non-alphabetic characters */
756 	while (!isalpha((unsigned char) instr[0]) && instr[0])
757 		++instr;
758 
759 	/* No string left */
760 	if (!instr[0])
761 	{
762 		outstr[0] = (char) 0;
763 		return;
764 	}
765 
766 	/* Take the first letter as is */
767 	*outstr++ = (char) toupper((unsigned char) *instr++);
768 
769 	count = 1;
770 	while (*instr && count < SOUNDEX_LEN)
771 	{
772 		if (isalpha((unsigned char) *instr) &&
773 			soundex_code(*instr) != soundex_code(*(instr - 1)))
774 		{
775 			*outstr = soundex_code(instr[0]);
776 			if (*outstr != '0')
777 			{
778 				++outstr;
779 				++count;
780 			}
781 		}
782 		++instr;
783 	}
784 
785 	/* Fill with 0's */
786 	while (count < SOUNDEX_LEN)
787 	{
788 		*outstr = '0';
789 		++outstr;
790 		++count;
791 	}
792 }
793 
794 PG_FUNCTION_INFO_V1(difference);
795 
796 Datum
difference(PG_FUNCTION_ARGS)797 difference(PG_FUNCTION_ARGS)
798 {
799 	char		sndx1[SOUNDEX_LEN + 1],
800 				sndx2[SOUNDEX_LEN + 1];
801 	int			i,
802 				result;
803 
804 	_soundex(text_to_cstring(PG_GETARG_TEXT_P(0)), sndx1);
805 	_soundex(text_to_cstring(PG_GETARG_TEXT_P(1)), sndx2);
806 
807 	result = 0;
808 	for (i = 0; i < SOUNDEX_LEN; i++)
809 	{
810 		if (sndx1[i] == sndx2[i])
811 			result++;
812 	}
813 
814 	PG_RETURN_INT32(result);
815 }
816