1 // Mozilla has modified this file - see https://hg.mozilla.org/ for details.
2 /*
3  * Licensed to the Apache Software Foundation (ASF) under one or more
4  * contributor license agreements.  See the NOTICE file distributed with
5  * this work for additional information regarding copyright ownership.
6  * The ASF licenses this file to You under the Apache License, Version 2.0
7  * (the "License"); you may not use this file except in compliance with
8  * the License.  You may obtain a copy of the License at
9  *
10  *      http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 package org.mozilla.apache.commons.codec.language;
20 
21 import org.mozilla.apache.commons.codec.EncoderException;
22 import org.mozilla.apache.commons.codec.StringEncoder;
23 
24 /**
25  * Encodes a string into a Metaphone value.
26  * <p>
27  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
28  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
29  * </p>
30  * <p>
31  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
32  * 39.</CITE>
33  * </p>
34  * <p>
35  * Note, that this does not match the algorithm that ships with PHP, or the algorithm
36  * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
37  * They have had undocumented changes from the originally published algorithm.
38  * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
39  * </p>
40  *
41  * @author Apache Software Foundation
42  * @version $Id: Metaphone.java 1080867 2011-03-12 06:06:46Z ggregory $
43  */
44 public class Metaphone implements StringEncoder {
45 
46     /**
47      * Five values in the English language
48      */
49     private static final String VOWELS = "AEIOU" ;
50 
51     /**
52      * Variable used in Metaphone algorithm
53      */
54     private static final String FRONTV = "EIY"   ;
55 
56     /**
57      * Variable used in Metaphone algorithm
58      */
59     private static final String VARSON = "CSPTG" ;
60 
61     /**
62      * The max code length for metaphone is 4
63      */
64     private int maxCodeLen = 4 ;
65 
66     /**
67      * Creates an instance of the Metaphone encoder
68      */
Metaphone()69     public Metaphone() {
70         super();
71     }
72 
73     /**
74      * Find the metaphone value of a String. This is similar to the
75      * soundex algorithm, but better at finding similar sounding words.
76      * All input is converted to upper case.
77      * Limitations: Input format is expected to be a single ASCII word
78      * with only characters in the A - Z range, no punctuation or numbers.
79      *
80      * @param txt String to find the metaphone code for
81      * @return A metaphone code corresponding to the String supplied
82      */
metaphone(String txt)83     public String metaphone(String txt) {
84         boolean hard = false ;
85         if ((txt == null) || (txt.length() == 0)) {
86             return "" ;
87         }
88         // single character is itself
89         if (txt.length() == 1) {
90             return txt.toUpperCase(java.util.Locale.ENGLISH) ;
91         }
92 
93         char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ;
94 
95         StringBuffer local = new StringBuffer(40); // manipulate
96         StringBuffer code = new StringBuffer(10) ; //   output
97         // handle initial 2 characters exceptions
98         switch(inwd[0]) {
99         case 'K' :
100         case 'G' :
101         case 'P' : /* looking for KN, etc*/
102             if (inwd[1] == 'N') {
103                 local.append(inwd, 1, inwd.length - 1);
104             } else {
105                 local.append(inwd);
106             }
107             break;
108         case 'A': /* looking for AE */
109             if (inwd[1] == 'E') {
110                 local.append(inwd, 1, inwd.length - 1);
111             } else {
112                 local.append(inwd);
113             }
114             break;
115         case 'W' : /* looking for WR or WH */
116             if (inwd[1] == 'R') {   // WR -> R
117                 local.append(inwd, 1, inwd.length - 1);
118                 break ;
119             }
120             if (inwd[1] == 'H') {
121                 local.append(inwd, 1, inwd.length - 1);
122                 local.setCharAt(0, 'W'); // WH -> W
123             } else {
124                 local.append(inwd);
125             }
126             break;
127         case 'X' : /* initial X becomes S */
128             inwd[0] = 'S';
129             local.append(inwd);
130             break ;
131         default :
132             local.append(inwd);
133         } // now local has working string with initials fixed
134 
135         int wdsz = local.length();
136         int n = 0 ;
137 
138         while ((code.length() < this.getMaxCodeLen()) &&
139                (n < wdsz) ) { // max code size of 4 works well
140             char symb = local.charAt(n) ;
141             // remove duplicate letters except C
142             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
143                 n++ ;
144             } else { // not dup
145                 switch(symb) {
146                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
147                     if (n == 0) {
148                         code.append(symb);
149                     }
150                     break ; // only use vowel if leading char
151                 case 'B' :
152                     if ( isPreviousChar(local, n, 'M') &&
153                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
154                         break;
155                     }
156                     code.append(symb);
157                     break;
158                 case 'C' : // lots of C special cases
159                     /* discard if SCI, SCE or SCY */
160                     if ( isPreviousChar(local, n, 'S') &&
161                          !isLastChar(wdsz, n) &&
162                          (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) {
163                         break;
164                     }
165                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
166                         code.append('X');
167                         break;
168                     }
169                     if (!isLastChar(wdsz, n) &&
170                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
171                         code.append('S');
172                         break; // CI,CE,CY -> S
173                     }
174                     if (isPreviousChar(local, n, 'S') &&
175                         isNextChar(local, n, 'H') ) { // SCH->sk
176                         code.append('K') ;
177                         break ;
178                     }
179                     if (isNextChar(local, n, 'H')) { // detect CH
180                         if ((n == 0) &&
181                             (wdsz >= 3) &&
182                             isVowel(local,2) ) { // CH consonant -> K consonant
183                             code.append('K');
184                         } else {
185                             code.append('X'); // CHvowel -> X
186                         }
187                     } else {
188                         code.append('K');
189                     }
190                     break ;
191                 case 'D' :
192                     if (!isLastChar(wdsz, n + 1) &&
193                         isNextChar(local, n, 'G') &&
194                         (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
195                         code.append('J'); n += 2 ;
196                     } else {
197                         code.append('T');
198                     }
199                     break ;
200                 case 'G' : // GH silent at end or before consonant
201                     if (isLastChar(wdsz, n + 1) &&
202                         isNextChar(local, n, 'H')) {
203                         break;
204                     }
205                     if (!isLastChar(wdsz, n + 1) &&
206                         isNextChar(local,n,'H') &&
207                         !isVowel(local,n+2)) {
208                         break;
209                     }
210                     if ((n > 0) &&
211                         ( regionMatch(local, n, "GN") ||
212                           regionMatch(local, n, "GNED") ) ) {
213                         break; // silent G
214                     }
215                     if (isPreviousChar(local, n, 'G')) {
216                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
217                         hard = true ;
218                     } else {
219                         hard = false ;
220                     }
221                     if (!isLastChar(wdsz, n) &&
222                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0) &&
223                         (!hard)) {
224                         code.append('J');
225                     } else {
226                         code.append('K');
227                     }
228                     break ;
229                 case 'H':
230                     if (isLastChar(wdsz, n)) {
231                         break ; // terminal H
232                     }
233                     if ((n > 0) &&
234                         (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
235                         break;
236                     }
237                     if (isVowel(local,n+1)) {
238                         code.append('H'); // Hvowel
239                     }
240                     break;
241                 case 'F':
242                 case 'J' :
243                 case 'L' :
244                 case 'M':
245                 case 'N' :
246                 case 'R' :
247                     code.append(symb);
248                     break;
249                 case 'K' :
250                     if (n > 0) { // not initial
251                         if (!isPreviousChar(local, n, 'C')) {
252                             code.append(symb);
253                         }
254                     } else {
255                         code.append(symb); // initial K
256                     }
257                     break ;
258                 case 'P' :
259                     if (isNextChar(local,n,'H')) {
260                         // PH -> F
261                         code.append('F');
262                     } else {
263                         code.append(symb);
264                     }
265                     break ;
266                 case 'Q' :
267                     code.append('K');
268                     break;
269                 case 'S' :
270                     if (regionMatch(local,n,"SH") ||
271                         regionMatch(local,n,"SIO") ||
272                         regionMatch(local,n,"SIA")) {
273                         code.append('X');
274                     } else {
275                         code.append('S');
276                     }
277                     break;
278                 case 'T' :
279                     if (regionMatch(local,n,"TIA") ||
280                         regionMatch(local,n,"TIO")) {
281                         code.append('X');
282                         break;
283                     }
284                     if (regionMatch(local,n,"TCH")) {
285                         // Silent if in "TCH"
286                         break;
287                     }
288                     // substitute numeral 0 for TH (resembles theta after all)
289                     if (regionMatch(local,n,"TH")) {
290                         code.append('0');
291                     } else {
292                         code.append('T');
293                     }
294                     break ;
295                 case 'V' :
296                     code.append('F'); break ;
297                 case 'W' : case 'Y' : // silent if not followed by vowel
298                     if (!isLastChar(wdsz,n) &&
299                         isVowel(local,n+1)) {
300                         code.append(symb);
301                     }
302                     break ;
303                 case 'X' :
304                     code.append('K'); code.append('S');
305                     break ;
306                 case 'Z' :
307                     code.append('S'); break ;
308                 } // end switch
309                 n++ ;
310             } // end else from symb != 'C'
311             if (code.length() > this.getMaxCodeLen()) {
312                 code.setLength(this.getMaxCodeLen());
313             }
314         }
315         return code.toString();
316     }
317 
isVowel(StringBuffer string, int index)318     private boolean isVowel(StringBuffer string, int index) {
319         return VOWELS.indexOf(string.charAt(index)) >= 0;
320     }
321 
isPreviousChar(StringBuffer string, int index, char c)322     private boolean isPreviousChar(StringBuffer string, int index, char c) {
323         boolean matches = false;
324         if( index > 0 &&
325             index < string.length() ) {
326             matches = string.charAt(index - 1) == c;
327         }
328         return matches;
329     }
330 
isNextChar(StringBuffer string, int index, char c)331     private boolean isNextChar(StringBuffer string, int index, char c) {
332         boolean matches = false;
333         if( index >= 0 &&
334             index < string.length() - 1 ) {
335             matches = string.charAt(index + 1) == c;
336         }
337         return matches;
338     }
339 
regionMatch(StringBuffer string, int index, String test)340     private boolean regionMatch(StringBuffer string, int index, String test) {
341         boolean matches = false;
342         if( index >= 0 &&
343             (index + test.length() - 1) < string.length() ) {
344             String substring = string.substring( index, index + test.length());
345             matches = substring.equals( test );
346         }
347         return matches;
348     }
349 
isLastChar(int wdsz, int n)350     private boolean isLastChar(int wdsz, int n) {
351         return n + 1 == wdsz;
352     }
353 
354 
355     /**
356      * Encodes an Object using the metaphone algorithm.  This method
357      * is provided in order to satisfy the requirements of the
358      * Encoder interface, and will throw an EncoderException if the
359      * supplied object is not of type java.lang.String.
360      *
361      * @param pObject Object to encode
362      * @return An object (or type java.lang.String) containing the
363      *         metaphone code which corresponds to the String supplied.
364      * @throws EncoderException if the parameter supplied is not
365      *                          of type java.lang.String
366      */
encode(Object pObject)367     public Object encode(Object pObject) throws EncoderException {
368         if (!(pObject instanceof String)) {
369             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
370         }
371         return metaphone((String) pObject);
372     }
373 
374     /**
375      * Encodes a String using the Metaphone algorithm.
376      *
377      * @param pString String object to encode
378      * @return The metaphone code corresponding to the String supplied
379      */
encode(String pString)380     public String encode(String pString) {
381         return metaphone(pString);
382     }
383 
384     /**
385      * Tests is the metaphones of two strings are identical.
386      *
387      * @param str1 First of two strings to compare
388      * @param str2 Second of two strings to compare
389      * @return <code>true</code> if the metaphones of these strings are identical,
390      *        <code>false</code> otherwise.
391      */
isMetaphoneEqual(String str1, String str2)392     public boolean isMetaphoneEqual(String str1, String str2) {
393         return metaphone(str1).equals(metaphone(str2));
394     }
395 
396     /**
397      * Returns the maxCodeLen.
398      * @return int
399      */
getMaxCodeLen()400     public int getMaxCodeLen() { return this.maxCodeLen; }
401 
402     /**
403      * Sets the maxCodeLen.
404      * @param maxCodeLen The maxCodeLen to set
405      */
setMaxCodeLen(int maxCodeLen)406     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
407 
408 }
409