1 /*	$NetBSD: phonetic.c,v 1.1.1.3 2010/12/12 15:22:35 adam Exp $	*/
2 
3 /* phonetic.c - routines to do phonetic matching */
4 /* OpenLDAP: pkg/ldap/servers/slapd/phonetic.c,v 1.22.2.5 2010/04/13 20:23:17 kurt Exp */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2010 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available in the file LICENSE in the
15  * top-level directory of the distribution or, alternatively, at
16  * <http://www.OpenLDAP.org/license.html>.
17  */
18 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
19  * All rights reserved.
20  *
21  * Redistribution and use in source and binary forms are permitted
22  * provided that this notice is preserved and that due credit is given
23  * to the University of Michigan at Ann Arbor. The name of the University
24  * may not be used to endorse or promote products derived from this
25  * software without specific prior written permission. This software
26  * is provided ``as is'' without express or implied warranty.
27  */
28 
29 #include "portable.h"
30 
31 #include <stdio.h>
32 
33 #include <ac/ctype.h>
34 #include <ac/string.h>
35 #include <ac/socket.h>
36 #include <ac/time.h>
37 
38 #include "slap.h"
39 
40 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
41 #define SLAPD_METAPHONE
42 #endif
43 
44 #define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
45 			 ispunct((unsigned char) (x)) || \
46 			 isdigit((unsigned char) (x)) || (x) == '\0')
47 
48 #if 0
49 static char *
50 first_word( char *s )
51 {
52 	if ( s == NULL ) {
53 		return( NULL );
54 	}
55 
56 	while ( iswordbreak( *s ) ) {
57 		if ( *s == '\0' ) {
58 			return( NULL );
59 		} else {
60 			s++;
61 		}
62 	}
63 
64 	return( s );
65 }
66 
67 static char *
68 next_word( char *s )
69 {
70 	if ( s == NULL ) {
71 		return( NULL );
72 	}
73 
74 	while ( ! iswordbreak( *s ) ) {
75 		s++;
76 	}
77 
78 	while ( iswordbreak( *s ) ) {
79 		if ( *s == '\0' ) {
80 			return( NULL );
81 		} else {
82 			s++;
83 		}
84 	}
85 
86 	return( s );
87 }
88 
89 static char *
90 word_dup( char *w )
91 {
92 	char	*s, *ret;
93 	char	save;
94 
95 	for ( s = w; !iswordbreak( *s ); s++ )
96 		;	/* NULL */
97 	save = *s;
98 	*s = '\0';
99 	ret = ch_strdup( w );
100 	*s = save;
101 
102 	return( ret );
103 }
104 #endif /* 0 */
105 
106 #ifndef MAXPHONEMELEN
107 #define MAXPHONEMELEN	4
108 #endif
109 
110 #if defined(SLAPD_PHONETIC)
111 
112 /* lifted from isode-8.0 */
113 char *
114 phonetic( char *s )
115 {
116         char	code, adjacent, ch;
117 	char	*p;
118         int	i;
119 	char	phoneme[MAXPHONEMELEN + 1];
120 
121         p = s;
122         if (  p == NULL || *p == '\0' ) {
123                 return( NULL );
124         }
125 
126         adjacent = '0';
127 	phoneme[0] = TOUPPER((unsigned char)*p);
128 
129 	phoneme[1]  = '\0';
130         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
131 		ch = TOUPPER ((unsigned char)*p);
132 
133                 code = '0';
134 
135                 switch (ch) {
136                 case 'B':
137                 case 'F':
138 		case 'P':
139                 case 'V':
140                         code = (adjacent != '1') ? '1' : '0';
141                         break;
142                 case 'S':
143                 case 'C':
144                 case 'G':
145                 case 'J':
146                 case 'K':
147                 case 'Q':
148                 case 'X':
149                 case 'Z':
150                         code = (adjacent != '2') ? '2' : '0';
151                         break;
152                 case 'D':
153                 case 'T':
154                         code = (adjacent != '3') ? '3' : '0';
155                         break;
156                 case 'L':
157                         code = (adjacent != '4') ? '4' : '0';
158                         break;
159                 case 'M':
160                 case 'N':
161                         code = (adjacent != '5') ? '5' : '0';
162                         break;
163                 case 'R':
164                         code = (adjacent != '6') ? '6' : '0';
165                         break;
166                 default:
167                         adjacent = '0';
168                 }
169 
170                 if ( i == 0 ) {
171 			adjacent = code;
172 			i++;
173 		} else if ( code != '0' ) {
174 			if ( i == MAXPHONEMELEN )
175 				break;
176                         adjacent = phoneme[i] = code;
177                         i++;
178                 }
179         }
180 
181 	if ( i > 0 )
182 		phoneme[i] = '\0';
183 
184         return( ch_strdup( phoneme ) );
185 }
186 
187 #elif defined(SLAPD_METAPHONE)
188 
189 /*
190  * Metaphone was originally developed by Lawrence Philips and
191  * published in the "Computer Language" magazine in 1990.
192  */
193 /*
194  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
195  * author Gary A. Parker, with changes by Bernard Tiffany of the
196  * University of Michigan, and more changes by Tim Howes of the
197  * University of Michigan.
198  */
199 
200 /* Character coding array */
201 static const char  vsvfn[26] = {
202 	   1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
203 	/* A   B  C   D  E  F  G   H  I  J  K  L  M  */
204 	   2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
205 	/* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
206 
207 /* Macros to access character coding array */
208 #define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)	/* AEIOU */
209 #define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)	/* FJLMNR */
210 #define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)	/* CGPST */
211 #define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)	/* EIY */
212 #define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16)	/* BDH */
213 
214 char *
215 phonetic( char *Word )
216 {
217 	char           *n, *n_start, *n_end;	/* pointers to string */
218 	char           *metaph_end;	/* pointers to metaph */
219 	char            ntrans[40];	/* word with uppercase letters */
220 	int             KSflag;	/* state flag for X -> KS */
221 	char		buf[MAXPHONEMELEN + 2];
222 	char		*Metaph;
223 
224 	/*
225 	 * Copy Word to internal buffer, dropping non-alphabetic characters
226 	 * and converting to upper case
227 	 */
228 
229 	for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
230 	    n < n_end; Word++) {
231 		if (isalpha((unsigned char)*Word))
232 			*n++ = TOUPPER((unsigned char)*Word);
233 	}
234 	Metaph = buf;
235 	*Metaph = '\0';
236 	if (n == ntrans + 4) {
237 		return( ch_strdup( buf ) );		/* Return if null */
238 	}
239 	n_end = n;		/* Set n_end to end of string */
240 
241 	/* ntrans[0] will always be == 0 */
242 	ntrans[0] = '\0';
243 	ntrans[1] = '\0';
244 	ntrans[2] = '\0';
245 	ntrans[3] = '\0';
246 	*n++ = 0;
247 	*n++ = 0;
248 	*n++ = 0;
249 	*n = 0;			/* Pad with nulls */
250 	n = ntrans + 4;		/* Assign pointer to start */
251 
252 	/* Check for PN, KN, GN, AE, WR, WH, and X at start */
253 	switch (*n) {
254 	case 'P':
255 	case 'K':
256 	case 'G':
257 		/* 'PN', 'KN', 'GN' becomes 'N' */
258 		if (*(n + 1) == 'N')
259 			*n++ = 0;
260 		break;
261 	case 'A':
262 		/* 'AE' becomes 'E' */
263 		if (*(n + 1) == 'E')
264 			*n++ = 0;
265 		break;
266 	case 'W':
267 		/* 'WR' becomes 'R', and 'WH' to 'H' */
268 		if (*(n + 1) == 'R')
269 			*n++ = 0;
270 		else if (*(n + 1) == 'H') {
271 			*(n + 1) = *n;
272 			*n++ = 0;
273 		}
274 		break;
275 	case 'X':
276 		/* 'X' becomes 'S' */
277 		*n = 'S';
278 		break;
279 	}
280 
281 	/*
282 	 * Now, loop step through string, stopping at end of string or when
283 	 * the computed 'metaph' is MAXPHONEMELEN characters long
284 	 */
285 
286 	KSflag = 0;		/* state flag for KS translation */
287 	for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
288 	     n <= n_end && Metaph < metaph_end; n++) {
289 		if (KSflag) {
290 			KSflag = 0;
291 			*Metaph++ = 'S';
292 		} else {
293 			/* Drop duplicates except for CC */
294 			if (*(n - 1) == *n && *n != 'C')
295 				continue;
296 			/* Check for F J L M N R or first letter vowel */
297 			if (same(*n) || (n == n_start && vowel(*n)))
298 				*Metaph++ = *n;
299 			else
300 				switch (*n) {
301 				case 'B':
302 
303 					/*
304 					 * B unless in -MB
305 					 */
306 					if (n == (n_end - 1) && *(n - 1) != 'M')
307 						*Metaph++ = *n;
308 					break;
309 				case 'C':
310 
311 					/*
312 					 * X if in -CIA-, -CH- else S if in
313 					 * -CI-, -CE-, -CY- else dropped if
314 					 * in -SCI-, -SCE-, -SCY- else K
315 					 */
316 					if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
317 						if (*(n + 1) == 'I' && *(n + 2) == 'A')
318 							*Metaph++ = 'X';
319 						else if (frontv(*(n + 1)))
320 							*Metaph++ = 'S';
321 						else if (*(n + 1) == 'H')
322 							*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
323 							 || *(n - 1) == 'S')
324 							    ? (char) 'K' : (char) 'X';
325 						else
326 							*Metaph++ = 'K';
327 					}
328 					break;
329 				case 'D':
330 
331 					/*
332 					 * J if in DGE or DGI or DGY else T
333 					 */
334 					*Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
335 					    ? (char) 'J' : (char) 'T';
336 					break;
337 				case 'G':
338 
339 					/*
340 					 * F if in -GH and not B--GH, D--GH,
341 					 * -H--GH, -H---GH else dropped if
342 					 * -GNED, -GN, -DGE-, -DGI-, -DGY-
343 					 * else J if in -GE-, -GI-, -GY- and
344 					 * not GG else K
345 					 */
346 					if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
347 					    (*(n + 1) != 'N' || ((n + 1) < n_end &&
348 								 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
349 					    (*(n - 1) != 'D' || !frontv(*(n + 1))))
350 						*Metaph++ = (frontv(*(n + 1)) &&
351 							     *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
352 					else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
353 						 *(n - 4) != 'H')
354 						*Metaph++ = 'F';
355 					break;
356 				case 'H':
357 
358 					/*
359 					 * H if before a vowel and not after
360 					 * C, G, P, S, T else dropped
361 					 */
362 					if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
363 							   vowel(*(n + 1))))
364 						*Metaph++ = 'H';
365 					break;
366 				case 'K':
367 
368 					/*
369 					 * dropped if after C else K
370 					 */
371 					if (*(n - 1) != 'C')
372 						*Metaph++ = 'K';
373 					break;
374 				case 'P':
375 
376 					/*
377 					 * F if before H, else P
378 					 */
379 					*Metaph++ = *(n + 1) == 'H' ?
380 					    (char) 'F' : (char) 'P';
381 					break;
382 				case 'Q':
383 
384 					/*
385 					 * K
386 					 */
387 					*Metaph++ = 'K';
388 					break;
389 				case 'S':
390 
391 					/*
392 					 * X in -SH-, -SIO- or -SIA- else S
393 					 */
394 					*Metaph++ = (*(n + 1) == 'H' ||
395 						     (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
396 							  *(n + 2) == 'A')))
397 					    ? (char) 'X' : (char) 'S';
398 					break;
399 				case 'T':
400 
401 					/*
402 					 * X in -TIA- or -TIO- else 0 (zero)
403 					 * before H else dropped if in -TCH-
404 					 * else T
405 					 */
406 					if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
407 							   *(n + 2) == 'A'))
408 						*Metaph++ = 'X';
409 					else if (*(n + 1) == 'H')
410 						*Metaph++ = '0';
411 					else if (*(n + 1) != 'C' || *(n + 2) != 'H')
412 						*Metaph++ = 'T';
413 					break;
414 				case 'V':
415 
416 					/*
417 					 * F
418 					 */
419 					*Metaph++ = 'F';
420 					break;
421 				case 'W':
422 
423 					/*
424 					 * W after a vowel, else dropped
425 					 */
426 				case 'Y':
427 
428 					/*
429 					 * Y unless followed by a vowel
430 					 */
431 					if (vowel(*(n + 1)))
432 						*Metaph++ = *n;
433 					break;
434 				case 'X':
435 
436 					/*
437 					 * KS
438 					 */
439 					if (n == n_start)
440 						*Metaph++ = 'S';
441 					else {
442 						*Metaph++ = 'K';	/* Insert K, then S */
443 						KSflag = 1;
444 					}
445 					break;
446 				case 'Z':
447 
448 					/*
449 					 * S
450 					 */
451 					*Metaph++ = 'S';
452 					break;
453 				}
454 		}
455 	}
456 
457 	*Metaph = 0;		/* Null terminate */
458 	return( ch_strdup( buf ) );
459 }
460 
461 #endif /* SLAPD_METAPHONE */
462