1 /* STUFF TO DO
2 * - Make max_phonemes useful
3 */
4
5 #include "metaphone.h"
6 #include "metachar.h"
7
8 #include <stdlib.h>
9 #include <string.h>
10 #include <ctype.h>
11
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */
14
/*
 * Letter-peeking helpers. They expand in a scope that provides `word`
 * (the NUL-terminated input string) and `w_idx` (the current index into
 * it). Every character is converted to unsigned char before it reaches
 * toupper(): handing a negative plain char to a <ctype.h> function is
 * undefined behaviour.
 */

/* Look at the next letter in the word */
#define Next_Letter (toupper((unsigned char)word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper((unsigned char)word[w_idx]))
/* Go N letters back; yields '\0' when that would fall before the start. */
#define Look_Back_Letter(n) \
    (w_idx >= (n) ? toupper((unsigned char)word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' when we are on the first character. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter \
    (Next_Letter != '\0' ? toupper((unsigned char)word[w_idx+2]) : '\0')
/* Look n letters ahead through Lookahead(), which stops at the terminator. */
#define Look_Ahead_Letter(n) \
    (toupper((unsigned char)Lookahead(word+w_idx, (n))))
27
28
/*
 * Lookahead - safely peek how_far characters past the start of word.
 *
 * Steps forward at most how_far positions but never past the NUL
 * terminator, so the result is '\0' whenever fewer than how_far characters
 * remain. A how_far of zero or less returns the first character.
 */
char Lookahead(char * word, int how_far) {
    const char *cursor = word;
    int remaining = how_far;

    /* Edge forward until we run out of steps or out of string. */
    while (*cursor != '\0' && remaining > 0) {
        cursor++;
        remaining--;
    }

    return *cursor;
}
42
43
/* Phonize one letter: append it to the output. Expands in a scope that
 * provides `phoned_word` and `p_idx`. Wrapped in do/while(0) so the macro
 * behaves as a single statement, including in an unbraced if/else. */
#define Phonize(c) do { phoned_word[p_idx++] = (c); } while (0)
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Note if a letter is a 'break' in the word (anything non-alphabetic) */
#define Isbreak(c) (!isalpha(c))
51
52
metaphone(char * word,size_t max_phonemes)53 char *metaphone ( char *word, size_t max_phonemes ) {
54 int w_idx = 0; /* point in the phonization we're at. */
55 int p_idx = 0; /* end of the phoned phrase */
56 char *phoned_word;
57
58 /* Assume largest possible if we're given no limit */
59 if( max_phonemes == 0 )
60 max_phonemes = strlen(word);
61
62 /* It's +2 because X -> KS can result in the phoned word being
63 one larger than the original word.
64 */
65 phoned_word = calloc( max_phonemes + 2, sizeof(char) );
66
67 /*-- The first phoneme has to be processed specially. --*/
68 /* Find our first letter */
69 for( ; !isalpha(Curr_Letter); w_idx++ ) {
70 /* On the off chance we were given nothing but crap... */
71 if( Curr_Letter == '\0' ) {
72 return phoned_word;
73 }
74 }
75
76 switch (Curr_Letter) {
77 /* AE becomes E */
78 case 'A':
79 if( Next_Letter == 'E' ) {
80 Phonize('E');
81 w_idx+=2;
82 }
83 /* Remember, preserve vowels at the beginning */
84 else {
85 Phonize('A');
86 w_idx++;
87 }
88 break;
89 /* [GKP]N becomes N */
90 case 'G':
91 case 'K':
92 case 'P':
93 if( Next_Letter == 'N' ) {
94 Phonize('N');
95 w_idx+=2;
96 }
97 break;
98 /* WH becomes H,
99 WR becomes R
100 W if followed by a vowel */
101 case 'W':
102 if( Next_Letter == 'H' ||
103 Next_Letter == 'R' )
104 {
105 Phonize(Next_Letter);
106 w_idx+=2;
107 }
108 else if ( isvowel(Next_Letter) ) {
109 Phonize('W');
110 w_idx+=2;
111 }
112 /* else ignore */
113 break;
114 /* X becomes S */
115 case 'X':
116 Phonize('S');
117 w_idx++;
118 break;
119 /* Vowels are kept */
120 /* We did A already
121 case 'A':
122 case 'a':
123 */
124 case 'E':
125 case 'I':
126 case 'O':
127 case 'U':
128 Phonize(Curr_Letter);
129 w_idx++;
130 break;
131 default:
132 /* do nothing */
133 break;
134 }
135
136
137
138 /* On to the metaphoning */
139 for(;
140 Curr_Letter != '\0' &&
141 (max_phonemes == 0 || Phone_Len < max_phonemes);
142 w_idx++
143 ) {
144 /* How many letters to skip because an eariler encoding handled
145 * multiple letters */
146 unsigned short int skip_letter = 0;
147
148
149 /* THOUGHT: It would be nice if, rather than having things like...
150 * well, SCI. For SCI you encode the S, then have to remember
151 * to skip the C. So the phonome SCI invades both S and C. It would
152 * be better, IMHO, to skip the C from the S part of the encoding.
153 * Hell, I'm trying it.
154 */
155
156 /* Ignore non-alphas */
157 if( !isalpha(Curr_Letter) )
158 continue;
159
160 /* Drop duplicates, except CC */
161 if( Curr_Letter == Prev_Letter &&
162 Curr_Letter != 'C' )
163 continue;
164
165 switch (Curr_Letter) {
166 /* B -> B unless in MB */
167 case 'B':
168 if( Prev_Letter != 'M' )
169 Phonize('B');
170 break;
171 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
172 * (SCHW is handled in S)
173 * S if -CI-, -CE- or -CY-
174 * dropped if -SCI-, SCE-, -SCY- (handed in S)
175 * else K
176 */
177 case 'C':
178 if( MAKESOFT(Next_Letter) ) { /* C[IEY] */
179 if( After_Next_Letter == 'A' &&
180 Next_Letter == 'I' ) { /* CIA */
181 Phonize(SH);
182 }
183 /* SC[IEY] */
184 else if ( Prev_Letter == 'S' ) {
185 /* Dropped */
186 }
187 else {
188 Phonize('S');
189 }
190 }
191 else if ( Next_Letter == 'H' ) {
192 #ifndef USE_TRADITIONAL_METAPHONE
193 if( After_Next_Letter == 'R' ||
194 Prev_Letter == 'S' ) { /* Christ, School */
195 Phonize('K');
196 }
197 else {
198 Phonize(SH);
199 }
200 #else
201 Phonize(SH);
202 #endif
203 skip_letter++;
204 }
205 else {
206 Phonize('K');
207 }
208 break;
209 /* J if in -DGE-, -DGI- or -DGY-
210 * else T
211 */
212 case 'D':
213 if( Next_Letter == 'G' &&
214 MAKESOFT(After_Next_Letter) ) {
215 Phonize('J');
216 skip_letter++;
217 }
218 else
219 Phonize('T');
220 break;
221 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
222 * else dropped if -GNED, -GN,
223 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
224 * else J if in -GE-, -GI, -GY and not GG
225 * else K
226 */
227 case 'G':
228 if( Next_Letter == 'H' ) {
229 if( !( NOGHTOF(Look_Back_Letter(3)) ||
230 Look_Back_Letter(4) == 'H' ) ) {
231 Phonize('F');
232 skip_letter++;
233 }
234 else {
235 /* silent */
236 }
237 }
238 else if( Next_Letter == 'N' ) {
239 if( Isbreak(After_Next_Letter) ||
240 ( After_Next_Letter == 'E' &&
241 Look_Ahead_Letter(3) == 'D' ) ) {
242 /* dropped */
243 }
244 else
245 Phonize('K');
246 }
247 else if( MAKESOFT(Next_Letter) &&
248 Prev_Letter != 'G' ) {
249 Phonize('J');
250 }
251 else {
252 Phonize('K');
253 }
254 break;
255 /* H if before a vowel and not after C,G,P,S,T */
256 case 'H':
257 if( isvowel(Next_Letter) &&
258 !AFFECTH(Prev_Letter) )
259 Phonize('H');
260 break;
261 /* dropped if after C
262 * else K
263 */
264 case 'K':
265 if( Prev_Letter != 'C' )
266 Phonize('K');
267 break;
268 /* F if before H
269 * else P
270 */
271 case 'P':
272 if( Next_Letter == 'H' ) {
273 Phonize('F');
274 }
275 else {
276 Phonize('P');
277 }
278 break;
279 /* K
280 */
281 case 'Q':
282 Phonize('K');
283 break;
284 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
285 * else S
286 */
287 case 'S':
288 if( Next_Letter == 'I' &&
289 ( After_Next_Letter == 'O' ||
290 After_Next_Letter == 'A' ) ) {
291 Phonize(SH);
292 }
293 else if ( Next_Letter == 'H' ) {
294 Phonize(SH);
295 skip_letter++;
296 }
297 #ifndef USE_TRADITIONAL_METAPHONE
298 else if ( Next_Letter == 'C' &&
299 Look_Ahead_Letter(2) == 'H' &&
300 Look_Ahead_Letter(3) == 'W' ) {
301 Phonize(SH);
302 skip_letter += 2;
303 }
304 #endif
305 else {
306 Phonize('S');
307 }
308 break;
309 /* 'sh' in -TIA- or -TIO-
310 * else 'th' before H
311 * else T
312 */
313 case 'T':
314 if( Next_Letter == 'I' &&
315 ( After_Next_Letter == 'O' ||
316 After_Next_Letter == 'A' ) ) {
317 Phonize(SH);
318 }
319 else if ( Next_Letter == 'H' ) {
320 Phonize(TH);
321 skip_letter++;
322 }
323 else {
324 Phonize('T');
325 }
326 break;
327 /* F */
328 case 'V':
329 Phonize('F');
330 break;
331 /* W before a vowel, else dropped */
332 case 'W':
333 if( isvowel(Next_Letter) )
334 Phonize('W');
335 break;
336 /* KS */
337 case 'X':
338 Phonize('K');
339 Phonize('S');
340 break;
341 /* Y if followed by a vowel */
342 case 'Y':
343 if( isvowel(Next_Letter) )
344 Phonize('Y');
345 break;
346 /* S */
347 case 'Z':
348 Phonize('S');
349 break;
350 /* No transformation */
351 case 'F':
352 case 'J':
353 case 'L':
354 case 'M':
355 case 'N':
356 case 'R':
357 Phonize(Curr_Letter);
358 break;
359 default:
360 /* nothing */
361 break;
362 }
363
364 w_idx += skip_letter;
365 }
366
367 return phoned_word;
368 }
369