1 /* STUFF TO DO
2  * - Make max_phonemes useful
3  */
4 
5 #include "metaphone.h"
6 #include "metachar.h"
7 
8 #include <stdlib.h>
9 #include <string.h>
10 #include <ctype.h>
11 
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */
14 
/*
 * Letter-peeking helpers.  They expect two names to be in scope where they
 * are used: `word` (the NUL-terminated input string) and `w_idx` (the
 * current index into it).  Every result is run through toupper().
 *
 * Characters are cast to unsigned char before being handed to toupper():
 * passing a negative plain-char value to a <ctype.h> function is undefined
 * behaviour.  Macro arguments are parenthesized so that expressions such as
 * Look_Back_Letter(i + 1) expand correctly.
 */

/* Look at the next letter in the word */
#define Next_Letter (toupper((unsigned char)word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper((unsigned char)word[w_idx]))
/* Go N letters back; yields '\0' if that would be before the start. */
#define Look_Back_Letter(n) \
    (w_idx >= (n) ? toupper((unsigned char)word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' when we are at the very start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  Next_Letter is tested first so that we never
 * read beyond the terminating NUL. */
#define After_Next_Letter \
    (Next_Letter != '\0' ? toupper((unsigned char)word[w_idx+2]) : '\0')
/* Look N letters ahead; Lookahead() stops at the end of the string. */
#define Look_Ahead_Letter(n) \
    (toupper((unsigned char)Lookahead(word+w_idx, (n))))
27 
28 
/* Safely peek ahead in a string.
 *
 * Returns the character `how_far` positions past `word`.  If the string
 * ends before that position is reached, the terminating '\0' is returned
 * instead, so the caller can never read past the end of the string.
 * A `how_far` of zero or less simply returns the first character.
 */
char Lookahead(char * word, int how_far) {
    const char *cursor = word;
    int remaining = how_far;

    /* Step forward one character at a time, stopping early at the NUL. */
    while (remaining > 0 && *cursor != '\0') {
        cursor++;
        remaining--;
    }

    return *cursor;
}
42 
43 
/* Phonize one letter: append `c` to the output.  Expects `phoned_word`
 * (the output buffer) and `p_idx` (index of the next free slot) to be in
 * scope.  Wrapped in do { } while (0) so that `Phonize(x);` is exactly one
 * statement and remains valid inside an unbraced if/else. */
#define Phonize(c)  do { phoned_word[p_idx++] = (c); } while (0)
/* How long is the phoned word? */
#define Phone_Len   (p_idx)

/* Note if a letter is a 'break' in the word, i.e. anything non-alphabetic */
#define Isbreak(c)  (!isalpha(c))
51 
52 
metaphone(char * word,size_t max_phonemes)53 char *metaphone ( char *word, size_t max_phonemes ) {
54     int w_idx   = 0;    /* point in the phonization we're at. */
55     int p_idx   = 0;    /* end of the phoned phrase */
56     char *phoned_word;
57 
58     /* Assume largest possible if we're given no limit */
59     if( max_phonemes == 0 )
60         max_phonemes = strlen(word);
61 
62     /* It's +2 because X -> KS can result in the phoned word being
63        one larger than the original word.
64     */
65     phoned_word = calloc( max_phonemes + 2, sizeof(char) );
66 
67     /*-- The first phoneme has to be processed specially. --*/
68     /* Find our first letter */
69     for( ;  !isalpha(Curr_Letter);  w_idx++ ) {
70         /* On the off chance we were given nothing but crap... */
71         if( Curr_Letter == '\0' ) {
72             return phoned_word;
73         }
74     }
75 
76     switch (Curr_Letter) {
77         /* AE becomes E */
78         case 'A':
79             if( Next_Letter == 'E' ) {
80                 Phonize('E');
81                 w_idx+=2;
82             }
83             /* Remember, preserve vowels at the beginning */
84             else {
85                 Phonize('A');
86                 w_idx++;
87             }
88             break;
89         /* [GKP]N becomes N */
90         case 'G':
91         case 'K':
92         case 'P':
93             if( Next_Letter == 'N' ) {
94                 Phonize('N');
95                 w_idx+=2;
96             }
97             break;
98         /* WH becomes H,
99            WR becomes R
100            W if followed by a vowel */
101         case 'W':
102             if( Next_Letter == 'H' ||
103                 Next_Letter == 'R' )
104             {
105               Phonize(Next_Letter);
106               w_idx+=2;
107             }
108             else if ( isvowel(Next_Letter) ) {
109               Phonize('W');
110               w_idx+=2;
111             }
112             /* else ignore */
113             break;
114         /* X becomes S */
115         case 'X':
116             Phonize('S');
117             w_idx++;
118             break;
119         /* Vowels are kept */
120         /* We did A already
121         case 'A':
122         case 'a':
123         */
124         case 'E':
125         case 'I':
126         case 'O':
127         case 'U':
128             Phonize(Curr_Letter);
129             w_idx++;
130             break;
131         default:
132             /* do nothing */
133             break;
134     }
135 
136 
137 
138     /* On to the metaphoning */
139     for(;
140         Curr_Letter != '\0'     &&
141         (max_phonemes == 0 || Phone_Len < max_phonemes);
142         w_idx++
143     ) {
144         /* How many letters to skip because an eariler encoding handled
145          * multiple letters */
146         unsigned short int skip_letter = 0;
147 
148 
149         /* THOUGHT:  It would be nice if, rather than having things like...
150          * well, SCI.  For SCI you encode the S, then have to remember
151          * to skip the C.  So the phonome SCI invades both S and C.  It would
152          * be better, IMHO, to skip the C from the S part of the encoding.
153          * Hell, I'm trying it.
154          */
155 
156         /* Ignore non-alphas */
157         if( !isalpha(Curr_Letter) )
158             continue;
159 
160         /* Drop duplicates, except CC */
161         if( Curr_Letter == Prev_Letter &&
162             Curr_Letter != 'C' )
163             continue;
164 
165         switch (Curr_Letter) {
166             /* B -> B unless in MB */
167             case 'B':
168                 if( Prev_Letter != 'M' )
169                     Phonize('B');
170                 break;
171             /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
172              * (SCHW is handled in S)
173              *  S if -CI-, -CE- or -CY-
174              *  dropped if -SCI-, SCE-, -SCY- (handed in S)
175              *  else K
176              */
177             case 'C':
178                 if( MAKESOFT(Next_Letter) ) {   /* C[IEY] */
179                     if( After_Next_Letter == 'A' &&
180                         Next_Letter == 'I' ) { /* CIA */
181                         Phonize(SH);
182                     }
183                     /* SC[IEY] */
184                     else if ( Prev_Letter == 'S' ) {
185                       /* Dropped */
186                     }
187                     else {
188                       Phonize('S');
189                     }
190                 }
191                 else if ( Next_Letter == 'H' ) {
192 #ifndef USE_TRADITIONAL_METAPHONE
193                     if( After_Next_Letter == 'R' ||
194                         Prev_Letter == 'S' ) { /* Christ, School */
195                         Phonize('K');
196                     }
197                     else {
198                         Phonize(SH);
199                     }
200 #else
201                     Phonize(SH);
202 #endif
203                     skip_letter++;
204                 }
205                 else {
206                     Phonize('K');
207                 }
208                 break;
209             /* J if in -DGE-, -DGI- or -DGY-
210              * else T
211              */
212             case 'D':
213                 if( Next_Letter == 'G' &&
214                     MAKESOFT(After_Next_Letter) ) {
215                     Phonize('J');
216                     skip_letter++;
217                 }
218                 else
219                     Phonize('T');
220                 break;
221             /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
222              * else dropped if -GNED, -GN,
223              * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
224              * else J if in -GE-, -GI, -GY and not GG
225              * else K
226              */
227             case 'G':
228                 if( Next_Letter == 'H' ) {
229                     if( !( NOGHTOF(Look_Back_Letter(3)) ||
230                            Look_Back_Letter(4) == 'H' ) ) {
231                         Phonize('F');
232                         skip_letter++;
233                     }
234                     else {
235                         /* silent */
236                     }
237                 }
238                 else if( Next_Letter == 'N' ) {
239                     if( Isbreak(After_Next_Letter) ||
240                         ( After_Next_Letter == 'E' &&
241                           Look_Ahead_Letter(3) == 'D' ) ) {
242                         /* dropped */
243                     }
244                     else
245                         Phonize('K');
246                 }
247                 else if( MAKESOFT(Next_Letter) &&
248                          Prev_Letter != 'G' ) {
249                     Phonize('J');
250                 }
251                 else {
252                     Phonize('K');
253                 }
254                 break;
255             /* H if before a vowel and not after C,G,P,S,T */
256             case 'H':
257                 if( isvowel(Next_Letter) &&
258                     !AFFECTH(Prev_Letter) )
259                     Phonize('H');
260                 break;
261             /* dropped if after C
262              * else K
263              */
264             case 'K':
265                 if( Prev_Letter != 'C' )
266                     Phonize('K');
267                 break;
268             /* F if before H
269              * else P
270              */
271             case 'P':
272                 if( Next_Letter == 'H' ) {
273                     Phonize('F');
274                 }
275                 else {
276                     Phonize('P');
277                 }
278                 break;
279             /* K
280              */
281             case 'Q':
282                 Phonize('K');
283                 break;
284             /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
285              * else S
286              */
287             case 'S':
288                 if( Next_Letter == 'I' &&
289                     ( After_Next_Letter == 'O' ||
290                       After_Next_Letter == 'A' ) ) {
291                     Phonize(SH);
292                 }
293                 else if ( Next_Letter == 'H' ) {
294                     Phonize(SH);
295                     skip_letter++;
296                 }
297 #ifndef USE_TRADITIONAL_METAPHONE
298                 else if ( Next_Letter == 'C' &&
299                       Look_Ahead_Letter(2) == 'H' &&
300                       Look_Ahead_Letter(3) == 'W' ) {
301                     Phonize(SH);
302                     skip_letter += 2;
303                 }
304 #endif
305                 else {
306                     Phonize('S');
307                 }
308                 break;
309             /* 'sh' in -TIA- or -TIO-
310              * else 'th' before H
311              * else T
312              */
313             case 'T':
314                 if( Next_Letter == 'I' &&
315                     ( After_Next_Letter == 'O' ||
316                       After_Next_Letter == 'A' ) ) {
317                     Phonize(SH);
318                 }
319                 else if ( Next_Letter == 'H' ) {
320                     Phonize(TH);
321                     skip_letter++;
322                 }
323                 else {
324                     Phonize('T');
325                 }
326                 break;
327             /* F */
328             case 'V':
329                 Phonize('F');
330                 break;
331             /* W before a vowel, else dropped */
332             case 'W':
333                 if( isvowel(Next_Letter) )
334                     Phonize('W');
335                 break;
336             /* KS */
337             case 'X':
338                 Phonize('K');
339                 Phonize('S');
340                 break;
341             /* Y if followed by a vowel */
342             case 'Y':
343                 if( isvowel(Next_Letter) )
344                     Phonize('Y');
345                 break;
346             /* S */
347             case 'Z':
348                 Phonize('S');
349                 break;
350             /* No transformation */
351             case 'F':
352             case 'J':
353             case 'L':
354             case 'M':
355             case 'N':
356             case 'R':
357                 Phonize(Curr_Letter);
358                 break;
359             default:
360                 /* nothing */
361                 break;
362         }
363 
364         w_idx += skip_letter;
365     }
366 
367     return phoned_word;
368 }
369