1 /* phonetic.c - routines to do phonetic matching */
2 /* $OpenLDAP$ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4  *
5  * Copyright 1998-2021 The OpenLDAP Foundation.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted only as authorized by the OpenLDAP
10  * Public License.
11  *
12  * A copy of this license is available in the file LICENSE in the
13  * top-level directory of the distribution or, alternatively, at
14  * <http://www.OpenLDAP.org/license.html>.
15  */
16 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
17  * All rights reserved.
18  *
19  * Redistribution and use in source and binary forms are permitted
20  * provided that this notice is preserved and that due credit is given
21  * to the University of Michigan at Ann Arbor. The name of the University
22  * may not be used to endorse or promote products derived from this
23  * software without specific prior written permission. This software
24  * is provided ``as is'' without express or implied warranty.
25  */
26 
27 #include "portable.h"
28 
29 #include <stdio.h>
30 
31 #include <ac/ctype.h>
32 #include <ac/string.h>
33 #include <ac/socket.h>
34 #include <ac/time.h>
35 
36 #include "slap.h"
37 
38 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
39 #define SLAPD_METAPHONE
40 #endif
41 
42 #define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
43 			 ispunct((unsigned char) (x)) || \
44 			 isdigit((unsigned char) (x)) || (x) == '\0')
45 
46 #if 0
47 static char *
48 first_word( char *s )
49 {
50 	if ( s == NULL ) {
51 		return( NULL );
52 	}
53 
54 	while ( iswordbreak( *s ) ) {
55 		if ( *s == '\0' ) {
56 			return( NULL );
57 		} else {
58 			s++;
59 		}
60 	}
61 
62 	return( s );
63 }
64 
65 static char *
66 next_word( char *s )
67 {
68 	if ( s == NULL ) {
69 		return( NULL );
70 	}
71 
72 	while ( ! iswordbreak( *s ) ) {
73 		s++;
74 	}
75 
76 	while ( iswordbreak( *s ) ) {
77 		if ( *s == '\0' ) {
78 			return( NULL );
79 		} else {
80 			s++;
81 		}
82 	}
83 
84 	return( s );
85 }
86 
87 static char *
88 word_dup( char *w )
89 {
90 	char	*s, *ret;
91 	char	save;
92 
93 	for ( s = w; !iswordbreak( *s ); s++ )
94 		;	/* NULL */
95 	save = *s;
96 	*s = '\0';
97 	ret = ch_strdup( w );
98 	*s = save;
99 
100 	return( ret );
101 }
102 #endif /* 0 */
103 
104 #ifndef MAXPHONEMELEN
105 #define MAXPHONEMELEN	4
106 #endif
107 
108 #if defined(SLAPD_PHONETIC)
109 
110 /* lifted from isode-8.0 */
111 char *
phonetic(char * s)112 phonetic( char *s )
113 {
114         char	code, adjacent, ch;
115 	char	*p;
116         int	i;
117 	char	phoneme[MAXPHONEMELEN + 1];
118 
119         p = s;
120         if (  p == NULL || *p == '\0' ) {
121                 return( NULL );
122         }
123 
124         adjacent = '0';
125 	phoneme[0] = TOUPPER((unsigned char)*p);
126 
127 	phoneme[1]  = '\0';
128         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
129 		ch = TOUPPER ((unsigned char)*p);
130 
131                 code = '0';
132 
133                 switch (ch) {
134                 case 'B':
135                 case 'F':
136 		case 'P':
137                 case 'V':
138                         code = (adjacent != '1') ? '1' : '0';
139                         break;
140                 case 'S':
141                 case 'C':
142                 case 'G':
143                 case 'J':
144                 case 'K':
145                 case 'Q':
146                 case 'X':
147                 case 'Z':
148                         code = (adjacent != '2') ? '2' : '0';
149                         break;
150                 case 'D':
151                 case 'T':
152                         code = (adjacent != '3') ? '3' : '0';
153                         break;
154                 case 'L':
155                         code = (adjacent != '4') ? '4' : '0';
156                         break;
157                 case 'M':
158                 case 'N':
159                         code = (adjacent != '5') ? '5' : '0';
160                         break;
161                 case 'R':
162                         code = (adjacent != '6') ? '6' : '0';
163                         break;
164                 default:
165                         adjacent = '0';
166                 }
167 
168                 if ( i == 0 ) {
169 			adjacent = code;
170 			i++;
171 		} else if ( code != '0' ) {
172 			if ( i == MAXPHONEMELEN )
173 				break;
174                         adjacent = phoneme[i] = code;
175                         i++;
176                 }
177         }
178 
179 	if ( i > 0 )
180 		phoneme[i] = '\0';
181 
182         return( ch_strdup( phoneme ) );
183 }
184 
185 #elif defined(SLAPD_METAPHONE)
186 
187 /*
188  * Metaphone was originally developed by Lawrence Philips and
189  * published in the "Computer Language" magazine in 1990.
190  */
191 /*
192  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
193  * author Gary A. Parker, with changes by Bernard Tiffany of the
194  * University of Michigan, and more changes by Tim Howes of the
195  * University of Michigan.
196  */
197 
198 /* Character coding array */
199 static const char  vsvfn[26] = {
200 	   1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
201 	/* A   B  C   D  E  F  G   H  I  J  K  L  M  */
202 	   2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
203 	/* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
204 
205 /* Macros to access character coding array */
206 #define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)	/* AEIOU */
207 #define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)	/* FJLMNR */
208 #define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)	/* CGPST */
209 #define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)	/* EIY */
210 #define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16)	/* BDH */
211 
212 char *
phonetic(char * Word)213 phonetic( char *Word )
214 {
215 	char           *n, *n_start, *n_end;	/* pointers to string */
216 	char           *metaph_end;	/* pointers to metaph */
217 	char            ntrans[40];	/* word with uppercase letters */
218 	int             KSflag;	/* state flag for X -> KS */
219 	char		buf[MAXPHONEMELEN + 2];
220 	char		*Metaph;
221 
222 	/*
223 	 * Copy Word to internal buffer, dropping non-alphabetic characters
224 	 * and converting to upper case
225 	 */
226 
227 	for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
228 	    n < n_end; Word++) {
229 		if (isalpha((unsigned char)*Word))
230 			*n++ = TOUPPER((unsigned char)*Word);
231 	}
232 	Metaph = buf;
233 	*Metaph = '\0';
234 	if (n == ntrans + 4) {
235 		return( ch_strdup( buf ) );		/* Return if null */
236 	}
237 	n_end = n;		/* Set n_end to end of string */
238 
239 	/* ntrans[0] will always be == 0 */
240 	ntrans[0] = '\0';
241 	ntrans[1] = '\0';
242 	ntrans[2] = '\0';
243 	ntrans[3] = '\0';
244 	*n++ = 0;
245 	*n++ = 0;
246 	*n++ = 0;
247 	*n = 0;			/* Pad with nulls */
248 	n = ntrans + 4;		/* Assign pointer to start */
249 
250 	/* Check for PN, KN, GN, AE, WR, WH, and X at start */
251 	switch (*n) {
252 	case 'P':
253 	case 'K':
254 	case 'G':
255 		/* 'PN', 'KN', 'GN' becomes 'N' */
256 		if (*(n + 1) == 'N')
257 			*n++ = 0;
258 		break;
259 	case 'A':
260 		/* 'AE' becomes 'E' */
261 		if (*(n + 1) == 'E')
262 			*n++ = 0;
263 		break;
264 	case 'W':
265 		/* 'WR' becomes 'R', and 'WH' to 'H' */
266 		if (*(n + 1) == 'R')
267 			*n++ = 0;
268 		else if (*(n + 1) == 'H') {
269 			*(n + 1) = *n;
270 			*n++ = 0;
271 		}
272 		break;
273 	case 'X':
274 		/* 'X' becomes 'S' */
275 		*n = 'S';
276 		break;
277 	}
278 
279 	/*
280 	 * Now, loop step through string, stopping at end of string or when
281 	 * the computed 'metaph' is MAXPHONEMELEN characters long
282 	 */
283 
284 	KSflag = 0;		/* state flag for KS translation */
285 	for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
286 	     n <= n_end && Metaph < metaph_end; n++) {
287 		if (KSflag) {
288 			KSflag = 0;
289 			*Metaph++ = 'S';
290 		} else {
291 			/* Drop duplicates except for CC */
292 			if (*(n - 1) == *n && *n != 'C')
293 				continue;
294 			/* Check for F J L M N R or first letter vowel */
295 			if (same(*n) || (n == n_start && vowel(*n)))
296 				*Metaph++ = *n;
297 			else
298 				switch (*n) {
299 				case 'B':
300 
301 					/*
302 					 * B unless in -MB
303 					 */
304 					if (n == (n_end - 1) && *(n - 1) != 'M')
305 						*Metaph++ = *n;
306 					break;
307 				case 'C':
308 
309 					/*
310 					 * X if in -CIA-, -CH- else S if in
311 					 * -CI-, -CE-, -CY- else dropped if
312 					 * in -SCI-, -SCE-, -SCY- else K
313 					 */
314 					if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
315 						if (*(n + 1) == 'I' && *(n + 2) == 'A')
316 							*Metaph++ = 'X';
317 						else if (frontv(*(n + 1)))
318 							*Metaph++ = 'S';
319 						else if (*(n + 1) == 'H')
320 							*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
321 							 || *(n - 1) == 'S')
322 							    ? (char) 'K' : (char) 'X';
323 						else
324 							*Metaph++ = 'K';
325 					}
326 					break;
327 				case 'D':
328 
329 					/*
330 					 * J if in DGE or DGI or DGY else T
331 					 */
332 					*Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
333 					    ? (char) 'J' : (char) 'T';
334 					break;
335 				case 'G':
336 
337 					/*
338 					 * F if in -GH and not B--GH, D--GH,
339 					 * -H--GH, -H---GH else dropped if
340 					 * -GNED, -GN, -DGE-, -DGI-, -DGY-
341 					 * else J if in -GE-, -GI-, -GY- and
342 					 * not GG else K
343 					 */
344 					if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
345 					    (*(n + 1) != 'N' || ((n + 1) < n_end &&
346 								 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
347 					    (*(n - 1) != 'D' || !frontv(*(n + 1))))
348 						*Metaph++ = (frontv(*(n + 1)) &&
349 							     *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
350 					else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
351 						 *(n - 4) != 'H')
352 						*Metaph++ = 'F';
353 					break;
354 				case 'H':
355 
356 					/*
357 					 * H if before a vowel and not after
358 					 * C, G, P, S, T else dropped
359 					 */
360 					if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
361 							   vowel(*(n + 1))))
362 						*Metaph++ = 'H';
363 					break;
364 				case 'K':
365 
366 					/*
367 					 * dropped if after C else K
368 					 */
369 					if (*(n - 1) != 'C')
370 						*Metaph++ = 'K';
371 					break;
372 				case 'P':
373 
374 					/*
375 					 * F if before H, else P
376 					 */
377 					*Metaph++ = *(n + 1) == 'H' ?
378 					    (char) 'F' : (char) 'P';
379 					break;
380 				case 'Q':
381 
382 					/*
383 					 * K
384 					 */
385 					*Metaph++ = 'K';
386 					break;
387 				case 'S':
388 
389 					/*
390 					 * X in -SH-, -SIO- or -SIA- else S
391 					 */
392 					*Metaph++ = (*(n + 1) == 'H' ||
393 						     (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
394 							  *(n + 2) == 'A')))
395 					    ? (char) 'X' : (char) 'S';
396 					break;
397 				case 'T':
398 
399 					/*
400 					 * X in -TIA- or -TIO- else 0 (zero)
401 					 * before H else dropped if in -TCH-
402 					 * else T
403 					 */
404 					if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
405 							   *(n + 2) == 'A'))
406 						*Metaph++ = 'X';
407 					else if (*(n + 1) == 'H')
408 						*Metaph++ = '0';
409 					else if (*(n + 1) != 'C' || *(n + 2) != 'H')
410 						*Metaph++ = 'T';
411 					break;
412 				case 'V':
413 
414 					/*
415 					 * F
416 					 */
417 					*Metaph++ = 'F';
418 					break;
419 				case 'W':
420 
421 					/*
422 					 * W after a vowel, else dropped
423 					 */
424 				case 'Y':
425 
426 					/*
427 					 * Y unless followed by a vowel
428 					 */
429 					if (vowel(*(n + 1)))
430 						*Metaph++ = *n;
431 					break;
432 				case 'X':
433 
434 					/*
435 					 * KS
436 					 */
437 					if (n == n_start)
438 						*Metaph++ = 'S';
439 					else {
440 						*Metaph++ = 'K';	/* Insert K, then S */
441 						KSflag = 1;
442 					}
443 					break;
444 				case 'Z':
445 
446 					/*
447 					 * S
448 					 */
449 					*Metaph++ = 'S';
450 					break;
451 				}
452 		}
453 	}
454 
455 	*Metaph = 0;		/* Null terminate */
456 	return( ch_strdup( buf ) );
457 }
458 
459 #endif /* SLAPD_METAPHONE */
460