1 /* $NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $ */
2
3 /* phonetic.c - routines to do phonetic matching */
4 /* $OpenLDAP$ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6 *
7 * Copyright 1998-2021 The OpenLDAP Foundation.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted only as authorized by the OpenLDAP
12 * Public License.
13 *
14 * A copy of this license is available in the file LICENSE in the
15 * top-level directory of the distribution or, alternatively, at
16 * <http://www.OpenLDAP.org/license.html>.
17 */
18 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
19 * All rights reserved.
20 *
21 * Redistribution and use in source and binary forms are permitted
22 * provided that this notice is preserved and that due credit is given
23 * to the University of Michigan at Ann Arbor. The name of the University
24 * may not be used to endorse or promote products derived from this
25 * software without specific prior written permission. This software
26 * is provided ``as is'' without express or implied warranty.
27 */
28
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: phonetic.c,v 1.3 2021/08/14 16:14:58 christos Exp $");
31
32 #include "portable.h"
33
34 #include <stdio.h>
35
36 #include <ac/ctype.h>
37 #include <ac/string.h>
38 #include <ac/socket.h>
39 #include <ac/time.h>
40
41 #include "slap.h"
42
43 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
44 #define SLAPD_METAPHONE
45 #endif
46
47 #define iswordbreak(x) (!isascii(x) || isspace((unsigned char) (x)) || \
48 ispunct((unsigned char) (x)) || \
49 isdigit((unsigned char) (x)) || (x) == '\0')
50
51 #if 0
52 static char *
53 first_word( char *s )
54 {
55 if ( s == NULL ) {
56 return( NULL );
57 }
58
59 while ( iswordbreak( *s ) ) {
60 if ( *s == '\0' ) {
61 return( NULL );
62 } else {
63 s++;
64 }
65 }
66
67 return( s );
68 }
69
70 static char *
71 next_word( char *s )
72 {
73 if ( s == NULL ) {
74 return( NULL );
75 }
76
77 while ( ! iswordbreak( *s ) ) {
78 s++;
79 }
80
81 while ( iswordbreak( *s ) ) {
82 if ( *s == '\0' ) {
83 return( NULL );
84 } else {
85 s++;
86 }
87 }
88
89 return( s );
90 }
91
92 static char *
93 word_dup( char *w )
94 {
95 char *s, *ret;
96 char save;
97
98 for ( s = w; !iswordbreak( *s ); s++ )
99 ; /* NULL */
100 save = *s;
101 *s = '\0';
102 ret = ch_strdup( w );
103 *s = save;
104
105 return( ret );
106 }
107 #endif /* 0 */
108
109 #ifndef MAXPHONEMELEN
110 #define MAXPHONEMELEN 4
111 #endif
112
113 #if defined(SLAPD_PHONETIC)
114
115 /* lifted from isode-8.0 */
116 char *
phonetic(char * s)117 phonetic( char *s )
118 {
119 char code, adjacent, ch;
120 char *p;
121 int i;
122 char phoneme[MAXPHONEMELEN + 1];
123
124 p = s;
125 if ( p == NULL || *p == '\0' ) {
126 return( NULL );
127 }
128
129 adjacent = '0';
130 phoneme[0] = TOUPPER((unsigned char)*p);
131
132 phoneme[1] = '\0';
133 for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
134 ch = TOUPPER ((unsigned char)*p);
135
136 code = '0';
137
138 switch (ch) {
139 case 'B':
140 case 'F':
141 case 'P':
142 case 'V':
143 code = (adjacent != '1') ? '1' : '0';
144 break;
145 case 'S':
146 case 'C':
147 case 'G':
148 case 'J':
149 case 'K':
150 case 'Q':
151 case 'X':
152 case 'Z':
153 code = (adjacent != '2') ? '2' : '0';
154 break;
155 case 'D':
156 case 'T':
157 code = (adjacent != '3') ? '3' : '0';
158 break;
159 case 'L':
160 code = (adjacent != '4') ? '4' : '0';
161 break;
162 case 'M':
163 case 'N':
164 code = (adjacent != '5') ? '5' : '0';
165 break;
166 case 'R':
167 code = (adjacent != '6') ? '6' : '0';
168 break;
169 default:
170 adjacent = '0';
171 }
172
173 if ( i == 0 ) {
174 adjacent = code;
175 i++;
176 } else if ( code != '0' ) {
177 if ( i == MAXPHONEMELEN )
178 break;
179 adjacent = phoneme[i] = code;
180 i++;
181 }
182 }
183
184 if ( i > 0 )
185 phoneme[i] = '\0';
186
187 return( ch_strdup( phoneme ) );
188 }
189
190 #elif defined(SLAPD_METAPHONE)
191
192 /*
193 * Metaphone was originally developed by Lawrence Philips and
194 * published in the "Computer Language" magazine in 1990.
195 */
196 /*
197 * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
198 * author Gary A. Parker, with changes by Bernard Tiffany of the
199 * University of Michigan, and more changes by Tim Howes of the
200 * University of Michigan.
201 */
202
203 /* Character coding array */
204 static const char vsvfn[26] = {
205 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
206 /* A B C D E F G H I J K L M */
207 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
208 /* N O P Q R S T U V W X Y Z */
209
210 /* Macros to access character coding array */
211 #define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */
212 #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
213 #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
214 #define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */
215 #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
216
217 char *
phonetic(char * Word)218 phonetic( char *Word )
219 {
220 char *n, *n_start, *n_end; /* pointers to string */
221 char *metaph_end; /* pointers to metaph */
222 char ntrans[40]; /* word with uppercase letters */
223 int KSflag; /* state flag for X -> KS */
224 char buf[MAXPHONEMELEN + 2];
225 char *Metaph;
226
227 /*
228 * Copy Word to internal buffer, dropping non-alphabetic characters
229 * and converting to upper case
230 */
231
232 for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
233 n < n_end; Word++) {
234 if (isalpha((unsigned char)*Word))
235 *n++ = TOUPPER((unsigned char)*Word);
236 }
237 Metaph = buf;
238 *Metaph = '\0';
239 if (n == ntrans + 4) {
240 return( ch_strdup( buf ) ); /* Return if null */
241 }
242 n_end = n; /* Set n_end to end of string */
243
244 /* ntrans[0] will always be == 0 */
245 ntrans[0] = '\0';
246 ntrans[1] = '\0';
247 ntrans[2] = '\0';
248 ntrans[3] = '\0';
249 *n++ = 0;
250 *n++ = 0;
251 *n++ = 0;
252 *n = 0; /* Pad with nulls */
253 n = ntrans + 4; /* Assign pointer to start */
254
255 /* Check for PN, KN, GN, AE, WR, WH, and X at start */
256 switch (*n) {
257 case 'P':
258 case 'K':
259 case 'G':
260 /* 'PN', 'KN', 'GN' becomes 'N' */
261 if (*(n + 1) == 'N')
262 *n++ = 0;
263 break;
264 case 'A':
265 /* 'AE' becomes 'E' */
266 if (*(n + 1) == 'E')
267 *n++ = 0;
268 break;
269 case 'W':
270 /* 'WR' becomes 'R', and 'WH' to 'H' */
271 if (*(n + 1) == 'R')
272 *n++ = 0;
273 else if (*(n + 1) == 'H') {
274 *(n + 1) = *n;
275 *n++ = 0;
276 }
277 break;
278 case 'X':
279 /* 'X' becomes 'S' */
280 *n = 'S';
281 break;
282 }
283
284 /*
285 * Now, loop step through string, stopping at end of string or when
286 * the computed 'metaph' is MAXPHONEMELEN characters long
287 */
288
289 KSflag = 0; /* state flag for KS translation */
290 for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
291 n <= n_end && Metaph < metaph_end; n++) {
292 if (KSflag) {
293 KSflag = 0;
294 *Metaph++ = 'S';
295 } else {
296 /* Drop duplicates except for CC */
297 if (*(n - 1) == *n && *n != 'C')
298 continue;
299 /* Check for F J L M N R or first letter vowel */
300 if (same(*n) || (n == n_start && vowel(*n)))
301 *Metaph++ = *n;
302 else
303 switch (*n) {
304 case 'B':
305
306 /*
307 * B unless in -MB
308 */
309 if (n == (n_end - 1) && *(n - 1) != 'M')
310 *Metaph++ = *n;
311 break;
312 case 'C':
313
314 /*
315 * X if in -CIA-, -CH- else S if in
316 * -CI-, -CE-, -CY- else dropped if
317 * in -SCI-, -SCE-, -SCY- else K
318 */
319 if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
320 if (*(n + 1) == 'I' && *(n + 2) == 'A')
321 *Metaph++ = 'X';
322 else if (frontv(*(n + 1)))
323 *Metaph++ = 'S';
324 else if (*(n + 1) == 'H')
325 *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
326 || *(n - 1) == 'S')
327 ? (char) 'K' : (char) 'X';
328 else
329 *Metaph++ = 'K';
330 }
331 break;
332 case 'D':
333
334 /*
335 * J if in DGE or DGI or DGY else T
336 */
337 *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
338 ? (char) 'J' : (char) 'T';
339 break;
340 case 'G':
341
342 /*
343 * F if in -GH and not B--GH, D--GH,
344 * -H--GH, -H---GH else dropped if
345 * -GNED, -GN, -DGE-, -DGI-, -DGY-
346 * else J if in -GE-, -GI-, -GY- and
347 * not GG else K
348 */
349 if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
350 (*(n + 1) != 'N' || ((n + 1) < n_end &&
351 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
352 (*(n - 1) != 'D' || !frontv(*(n + 1))))
353 *Metaph++ = (frontv(*(n + 1)) &&
354 *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
355 else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
356 *(n - 4) != 'H')
357 *Metaph++ = 'F';
358 break;
359 case 'H':
360
361 /*
362 * H if before a vowel and not after
363 * C, G, P, S, T else dropped
364 */
365 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
366 vowel(*(n + 1))))
367 *Metaph++ = 'H';
368 break;
369 case 'K':
370
371 /*
372 * dropped if after C else K
373 */
374 if (*(n - 1) != 'C')
375 *Metaph++ = 'K';
376 break;
377 case 'P':
378
379 /*
380 * F if before H, else P
381 */
382 *Metaph++ = *(n + 1) == 'H' ?
383 (char) 'F' : (char) 'P';
384 break;
385 case 'Q':
386
387 /*
388 * K
389 */
390 *Metaph++ = 'K';
391 break;
392 case 'S':
393
394 /*
395 * X in -SH-, -SIO- or -SIA- else S
396 */
397 *Metaph++ = (*(n + 1) == 'H' ||
398 (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
399 *(n + 2) == 'A')))
400 ? (char) 'X' : (char) 'S';
401 break;
402 case 'T':
403
404 /*
405 * X in -TIA- or -TIO- else 0 (zero)
406 * before H else dropped if in -TCH-
407 * else T
408 */
409 if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
410 *(n + 2) == 'A'))
411 *Metaph++ = 'X';
412 else if (*(n + 1) == 'H')
413 *Metaph++ = '0';
414 else if (*(n + 1) != 'C' || *(n + 2) != 'H')
415 *Metaph++ = 'T';
416 break;
417 case 'V':
418
419 /*
420 * F
421 */
422 *Metaph++ = 'F';
423 break;
424 case 'W':
425
426 /*
427 * W after a vowel, else dropped
428 */
429 case 'Y':
430
431 /*
432 * Y unless followed by a vowel
433 */
434 if (vowel(*(n + 1)))
435 *Metaph++ = *n;
436 break;
437 case 'X':
438
439 /*
440 * KS
441 */
442 if (n == n_start)
443 *Metaph++ = 'S';
444 else {
445 *Metaph++ = 'K'; /* Insert K, then S */
446 KSflag = 1;
447 }
448 break;
449 case 'Z':
450
451 /*
452 * S
453 */
454 *Metaph++ = 'S';
455 break;
456 }
457 }
458 }
459
460 *Metaph = 0; /* Null terminate */
461 return( ch_strdup( buf ) );
462 }
463
464 #endif /* SLAPD_METAPHONE */
465