1 /*	$NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $	*/
2 
3 /* phonetic.c - routines to do phonetic matching */
4 /* $OpenLDAP$ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2021 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available in the file LICENSE in the
15  * top-level directory of the distribution or, alternatively, at
16  * <http://www.OpenLDAP.org/license.html>.
17  */
18 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
19  * All rights reserved.
20  *
21  * Redistribution and use in source and binary forms are permitted
22  * provided that this notice is preserved and that due credit is given
23  * to the University of Michigan at Ann Arbor. The name of the University
24  * may not be used to endorse or promote products derived from this
25  * software without specific prior written permission. This software
26  * is provided ``as is'' without express or implied warranty.
27  */
28 
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $");
31 
32 #include "portable.h"
33 
34 #include <stdio.h>
35 
36 #include <ac/ctype.h>
37 #include <ac/string.h>
38 #include <ac/socket.h>
39 #include <ac/time.h>
40 
41 #include "slap.h"
42 
43 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
44 #define SLAPD_METAPHONE
45 #endif
46 
47 #define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
48 			 ispunct((unsigned char) (x)) || \
49 			 isdigit((unsigned char) (x)) || (x) == '\0')
50 
51 #if 0
52 static char *
53 first_word( char *s )
54 {
55 	if ( s == NULL ) {
56 		return( NULL );
57 	}
58 
59 	while ( iswordbreak( *s ) ) {
60 		if ( *s == '\0' ) {
61 			return( NULL );
62 		} else {
63 			s++;
64 		}
65 	}
66 
67 	return( s );
68 }
69 
70 static char *
71 next_word( char *s )
72 {
73 	if ( s == NULL ) {
74 		return( NULL );
75 	}
76 
77 	while ( ! iswordbreak( *s ) ) {
78 		s++;
79 	}
80 
81 	while ( iswordbreak( *s ) ) {
82 		if ( *s == '\0' ) {
83 			return( NULL );
84 		} else {
85 			s++;
86 		}
87 	}
88 
89 	return( s );
90 }
91 
92 static char *
93 word_dup( char *w )
94 {
95 	char	*s, *ret;
96 	char	save;
97 
98 	for ( s = w; !iswordbreak( *s ); s++ )
99 		;	/* NULL */
100 	save = *s;
101 	*s = '\0';
102 	ret = ch_strdup( w );
103 	*s = save;
104 
105 	return( ret );
106 }
107 #endif /* 0 */
108 
109 #ifndef MAXPHONEMELEN
110 #define MAXPHONEMELEN	4
111 #endif
112 
113 #if defined(SLAPD_PHONETIC)
114 
115 /* lifted from isode-8.0 */
116 char *
phonetic(char * s)117 phonetic( char *s )
118 {
119         char	code, adjacent, ch;
120 	char	*p;
121         int	i;
122 	char	phoneme[MAXPHONEMELEN + 1];
123 
124         p = s;
125         if (  p == NULL || *p == '\0' ) {
126                 return( NULL );
127         }
128 
129         adjacent = '0';
130 	phoneme[0] = TOUPPER((unsigned char)*p);
131 
132 	phoneme[1]  = '\0';
133         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
134 		ch = TOUPPER ((unsigned char)*p);
135 
136                 code = '0';
137 
138                 switch (ch) {
139                 case 'B':
140                 case 'F':
141 		case 'P':
142                 case 'V':
143                         code = (adjacent != '1') ? '1' : '0';
144                         break;
145                 case 'S':
146                 case 'C':
147                 case 'G':
148                 case 'J':
149                 case 'K':
150                 case 'Q':
151                 case 'X':
152                 case 'Z':
153                         code = (adjacent != '2') ? '2' : '0';
154                         break;
155                 case 'D':
156                 case 'T':
157                         code = (adjacent != '3') ? '3' : '0';
158                         break;
159                 case 'L':
160                         code = (adjacent != '4') ? '4' : '0';
161                         break;
162                 case 'M':
163                 case 'N':
164                         code = (adjacent != '5') ? '5' : '0';
165                         break;
166                 case 'R':
167                         code = (adjacent != '6') ? '6' : '0';
168                         break;
169                 default:
170                         adjacent = '0';
171                 }
172 
173                 if ( i == 0 ) {
174 			adjacent = code;
175 			i++;
176 		} else if ( code != '0' ) {
177 			if ( i == MAXPHONEMELEN )
178 				break;
179                         adjacent = phoneme[i] = code;
180                         i++;
181                 }
182         }
183 
184 	if ( i > 0 )
185 		phoneme[i] = '\0';
186 
187         return( ch_strdup( phoneme ) );
188 }
189 
190 #elif defined(SLAPD_METAPHONE)
191 
192 /*
193  * Metaphone was originally developed by Lawrence Philips and
194  * published in the "Computer Language" magazine in 1990.
195  */
196 /*
197  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
198  * author Gary A. Parker, with changes by Bernard Tiffany of the
199  * University of Michigan, and more changes by Tim Howes of the
200  * University of Michigan.
201  */
202 
203 /* Character coding array */
204 static const char  vsvfn[26] = {
205 	   1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
206 	/* A   B  C   D  E  F  G   H  I  J  K  L  M  */
207 	   2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
208 	/* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
209 
210 /* Macros to access character coding array */
211 #define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)	/* AEIOU */
212 #define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)	/* FJLMNR */
213 #define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)	/* CGPST */
214 #define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)	/* EIY */
215 #define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16)	/* BDH */
216 
217 char *
phonetic(char * Word)218 phonetic( char *Word )
219 {
220 	char           *n, *n_start, *n_end;	/* pointers to string */
221 	char           *metaph_end;	/* pointers to metaph */
222 	char            ntrans[40];	/* word with uppercase letters */
223 	int             KSflag;	/* state flag for X -> KS */
224 	char		buf[MAXPHONEMELEN + 2];
225 	char		*Metaph;
226 
227 	/*
228 	 * Copy Word to internal buffer, dropping non-alphabetic characters
229 	 * and converting to upper case
230 	 */
231 
232 	for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
233 	    n < n_end; Word++) {
234 		if (isalpha((unsigned char)*Word))
235 			*n++ = TOUPPER((unsigned char)*Word);
236 	}
237 	Metaph = buf;
238 	*Metaph = '\0';
239 	if (n == ntrans + 4) {
240 		return( ch_strdup( buf ) );		/* Return if null */
241 	}
242 	n_end = n;		/* Set n_end to end of string */
243 
244 	/* ntrans[0] will always be == 0 */
245 	ntrans[0] = '\0';
246 	ntrans[1] = '\0';
247 	ntrans[2] = '\0';
248 	ntrans[3] = '\0';
249 	*n++ = 0;
250 	*n++ = 0;
251 	*n++ = 0;
252 	*n = 0;			/* Pad with nulls */
253 	n = ntrans + 4;		/* Assign pointer to start */
254 
255 	/* Check for PN, KN, GN, AE, WR, WH, and X at start */
256 	switch (*n) {
257 	case 'P':
258 	case 'K':
259 	case 'G':
260 		/* 'PN', 'KN', 'GN' becomes 'N' */
261 		if (*(n + 1) == 'N')
262 			*n++ = 0;
263 		break;
264 	case 'A':
265 		/* 'AE' becomes 'E' */
266 		if (*(n + 1) == 'E')
267 			*n++ = 0;
268 		break;
269 	case 'W':
270 		/* 'WR' becomes 'R', and 'WH' to 'H' */
271 		if (*(n + 1) == 'R')
272 			*n++ = 0;
273 		else if (*(n + 1) == 'H') {
274 			*(n + 1) = *n;
275 			*n++ = 0;
276 		}
277 		break;
278 	case 'X':
279 		/* 'X' becomes 'S' */
280 		*n = 'S';
281 		break;
282 	}
283 
284 	/*
285 	 * Now, loop step through string, stopping at end of string or when
286 	 * the computed 'metaph' is MAXPHONEMELEN characters long
287 	 */
288 
289 	KSflag = 0;		/* state flag for KS translation */
290 	for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
291 	     n <= n_end && Metaph < metaph_end; n++) {
292 		if (KSflag) {
293 			KSflag = 0;
294 			*Metaph++ = 'S';
295 		} else {
296 			/* Drop duplicates except for CC */
297 			if (*(n - 1) == *n && *n != 'C')
298 				continue;
299 			/* Check for F J L M N R or first letter vowel */
300 			if (same(*n) || (n == n_start && vowel(*n)))
301 				*Metaph++ = *n;
302 			else
303 				switch (*n) {
304 				case 'B':
305 
306 					/*
307 					 * B unless in -MB
308 					 */
309 					if (n == (n_end - 1) && *(n - 1) != 'M')
310 						*Metaph++ = *n;
311 					break;
312 				case 'C':
313 
314 					/*
315 					 * X if in -CIA-, -CH- else S if in
316 					 * -CI-, -CE-, -CY- else dropped if
317 					 * in -SCI-, -SCE-, -SCY- else K
318 					 */
319 					if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
320 						if (*(n + 1) == 'I' && *(n + 2) == 'A')
321 							*Metaph++ = 'X';
322 						else if (frontv(*(n + 1)))
323 							*Metaph++ = 'S';
324 						else if (*(n + 1) == 'H')
325 							*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
326 							 || *(n - 1) == 'S')
327 							    ? (char) 'K' : (char) 'X';
328 						else
329 							*Metaph++ = 'K';
330 					}
331 					break;
332 				case 'D':
333 
334 					/*
335 					 * J if in DGE or DGI or DGY else T
336 					 */
337 					*Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
338 					    ? (char) 'J' : (char) 'T';
339 					break;
340 				case 'G':
341 
342 					/*
343 					 * F if in -GH and not B--GH, D--GH,
344 					 * -H--GH, -H---GH else dropped if
345 					 * -GNED, -GN, -DGE-, -DGI-, -DGY-
346 					 * else J if in -GE-, -GI-, -GY- and
347 					 * not GG else K
348 					 */
349 					if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
350 					    (*(n + 1) != 'N' || ((n + 1) < n_end &&
351 								 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
352 					    (*(n - 1) != 'D' || !frontv(*(n + 1))))
353 						*Metaph++ = (frontv(*(n + 1)) &&
354 							     *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
355 					else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
356 						 *(n - 4) != 'H')
357 						*Metaph++ = 'F';
358 					break;
359 				case 'H':
360 
361 					/*
362 					 * H if before a vowel and not after
363 					 * C, G, P, S, T else dropped
364 					 */
365 					if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
366 							   vowel(*(n + 1))))
367 						*Metaph++ = 'H';
368 					break;
369 				case 'K':
370 
371 					/*
372 					 * dropped if after C else K
373 					 */
374 					if (*(n - 1) != 'C')
375 						*Metaph++ = 'K';
376 					break;
377 				case 'P':
378 
379 					/*
380 					 * F if before H, else P
381 					 */
382 					*Metaph++ = *(n + 1) == 'H' ?
383 					    (char) 'F' : (char) 'P';
384 					break;
385 				case 'Q':
386 
387 					/*
388 					 * K
389 					 */
390 					*Metaph++ = 'K';
391 					break;
392 				case 'S':
393 
394 					/*
395 					 * X in -SH-, -SIO- or -SIA- else S
396 					 */
397 					*Metaph++ = (*(n + 1) == 'H' ||
398 						     (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
399 							  *(n + 2) == 'A')))
400 					    ? (char) 'X' : (char) 'S';
401 					break;
402 				case 'T':
403 
404 					/*
405 					 * X in -TIA- or -TIO- else 0 (zero)
406 					 * before H else dropped if in -TCH-
407 					 * else T
408 					 */
409 					if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
410 							   *(n + 2) == 'A'))
411 						*Metaph++ = 'X';
412 					else if (*(n + 1) == 'H')
413 						*Metaph++ = '0';
414 					else if (*(n + 1) != 'C' || *(n + 2) != 'H')
415 						*Metaph++ = 'T';
416 					break;
417 				case 'V':
418 
419 					/*
420 					 * F
421 					 */
422 					*Metaph++ = 'F';
423 					break;
424 				case 'W':
425 
426 					/*
427 					 * W after a vowel, else dropped
428 					 */
429 				case 'Y':
430 
431 					/*
432 					 * Y unless followed by a vowel
433 					 */
434 					if (vowel(*(n + 1)))
435 						*Metaph++ = *n;
436 					break;
437 				case 'X':
438 
439 					/*
440 					 * KS
441 					 */
442 					if (n == n_start)
443 						*Metaph++ = 'S';
444 					else {
445 						*Metaph++ = 'K';	/* Insert K, then S */
446 						KSflag = 1;
447 					}
448 					break;
449 				case 'Z':
450 
451 					/*
452 					 * S
453 					 */
454 					*Metaph++ = 'S';
455 					break;
456 				}
457 		}
458 	}
459 
460 	*Metaph = 0;		/* Null terminate */
461 	return( ch_strdup( buf ) );
462 }
463 
464 #endif /* SLAPD_METAPHONE */
465