1 // Mozilla has modified this file - see https://hg.mozilla.org/ for details. 2 /* 3 * Licensed to the Apache Software Foundation (ASF) under one or more 4 * contributor license agreements. See the NOTICE file distributed with 5 * this work for additional information regarding copyright ownership. 6 * The ASF licenses this file to You under the Apache License, Version 2.0 7 * (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 package org.mozilla.apache.commons.codec.language; 20 21 import org.mozilla.apache.commons.codec.EncoderException; 22 import org.mozilla.apache.commons.codec.StringEncoder; 23 24 /** 25 * Encodes a string into a Metaphone value. 26 * <p> 27 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 28 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 29 * </p> 30 * <p> 31 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p 32 * 39.</CITE> 33 * </p> 34 * <p> 35 * Note, that this does not match the algorithm that ships with PHP, or the algorithm 36 * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>. 37 * They have had undocumented changes from the originally published algorithm. 38 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 39 * </p> 40 * 41 * @author Apache Software Foundation 42 * @version $Id: Metaphone.java 1080867 2011-03-12 06:06:46Z ggregory $ 43 */ 44 public class Metaphone implements StringEncoder { 45 46 /** 47 * Five values in the English language 48 */ 49 private static final String VOWELS = "AEIOU" ; 50 51 /** 52 * Variable used in Metaphone algorithm 53 */ 54 private static final String FRONTV = "EIY" ; 55 56 /** 57 * Variable used in Metaphone algorithm 58 */ 59 private static final String VARSON = "CSPTG" ; 60 61 /** 62 * The max code length for metaphone is 4 63 */ 64 private int maxCodeLen = 4 ; 65 66 /** 67 * Creates an instance of the Metaphone encoder 68 */ Metaphone()69 public Metaphone() { 70 super(); 71 } 72 73 /** 74 * Find the metaphone value of a String. This is similar to the 75 * soundex algorithm, but better at finding similar sounding words. 76 * All input is converted to upper case. 77 * Limitations: Input format is expected to be a single ASCII word 78 * with only characters in the A - Z range, no punctuation or numbers. 79 * 80 * @param txt String to find the metaphone code for 81 * @return A metaphone code corresponding to the String supplied 82 */ metaphone(String txt)83 public String metaphone(String txt) { 84 boolean hard = false ; 85 if ((txt == null) || (txt.length() == 0)) { 86 return "" ; 87 } 88 // single character is itself 89 if (txt.length() == 1) { 90 return txt.toUpperCase(java.util.Locale.ENGLISH) ; 91 } 92 93 char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ; 94 95 StringBuffer local = new StringBuffer(40); // manipulate 96 StringBuffer code = new StringBuffer(10) ; // output 97 // handle initial 2 characters exceptions 98 switch(inwd[0]) { 99 case 'K' : 100 case 'G' : 101 case 'P' : /* looking for KN, etc*/ 102 if (inwd[1] == 'N') { 103 local.append(inwd, 1, inwd.length - 1); 104 } else { 105 local.append(inwd); 106 } 107 break; 108 case 'A': /* looking for AE */ 109 if (inwd[1] == 'E') { 110 local.append(inwd, 1, inwd.length - 1); 111 } else { 112 local.append(inwd); 113 } 114 break; 115 case 'W' : /* looking for WR or WH */ 116 if (inwd[1] == 'R') { // WR -> R 117 local.append(inwd, 1, inwd.length - 1); 118 break ; 119 } 120 if (inwd[1] == 'H') { 121 local.append(inwd, 1, inwd.length - 1); 122 local.setCharAt(0, 'W'); // WH -> W 123 } else { 124 local.append(inwd); 125 } 126 break; 127 case 'X' : /* initial X becomes S */ 128 inwd[0] = 'S'; 129 local.append(inwd); 130 break ; 131 default : 132 local.append(inwd); 133 } // now local has working string with initials fixed 134 135 int wdsz = local.length(); 136 int n = 0 ; 137 138 while ((code.length() < this.getMaxCodeLen()) && 139 (n < wdsz) ) { // max code size of 4 works well 140 char symb = local.charAt(n) ; 141 // remove duplicate letters except C 142 if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { 143 n++ ; 144 } else { // not dup 145 switch(symb) { 146 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : 147 if (n == 0) { 148 code.append(symb); 149 } 150 break ; // only use vowel if leading char 151 case 'B' : 152 if ( isPreviousChar(local, n, 'M') && 153 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 154 break; 155 } 156 code.append(symb); 157 break; 158 case 'C' : // lots of C special cases 159 /* discard if SCI, SCE or SCY */ 160 if ( isPreviousChar(local, n, 'S') && 161 !isLastChar(wdsz, n) && 162 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 163 break; 164 } 165 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 166 code.append('X'); 167 break; 168 } 169 if (!isLastChar(wdsz, n) && 170 (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) { 171 code.append('S'); 172 break; // CI,CE,CY -> S 173 } 174 if (isPreviousChar(local, n, 'S') && 175 isNextChar(local, n, 'H') ) { // SCH->sk 176 code.append('K') ; 177 break ; 178 } 179 if (isNextChar(local, n, 'H')) { // detect CH 180 if ((n == 0) && 181 (wdsz >= 3) && 182 isVowel(local,2) ) { // CH consonant -> K consonant 183 code.append('K'); 184 } else { 185 code.append('X'); // CHvowel -> X 186 } 187 } else { 188 code.append('K'); 189 } 190 break ; 191 case 'D' : 192 if (!isLastChar(wdsz, n + 1) && 193 isNextChar(local, n, 'G') && 194 (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 195 code.append('J'); n += 2 ; 196 } else { 197 code.append('T'); 198 } 199 break ; 200 case 'G' : // GH silent at end or before consonant 201 if (isLastChar(wdsz, n + 1) && 202 isNextChar(local, n, 'H')) { 203 break; 204 } 205 if (!isLastChar(wdsz, n + 1) && 206 isNextChar(local,n,'H') && 207 !isVowel(local,n+2)) { 208 break; 209 } 210 if ((n > 0) && 211 ( regionMatch(local, n, "GN") || 212 regionMatch(local, n, "GNED") ) ) { 213 break; // silent G 214 } 215 if (isPreviousChar(local, n, 'G')) { 216 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 217 hard = true ; 218 } else { 219 hard = false ; 220 } 221 if (!isLastChar(wdsz, n) && 222 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 223 (!hard)) { 224 code.append('J'); 225 } else { 226 code.append('K'); 227 } 228 break ; 229 case 'H': 230 if (isLastChar(wdsz, n)) { 231 break ; // terminal H 232 } 233 if ((n > 0) && 234 (VARSON.indexOf(local.charAt(n - 1)) >= 0)) { 235 break; 236 } 237 if (isVowel(local,n+1)) { 238 code.append('H'); // Hvowel 239 } 240 break; 241 case 'F': 242 case 'J' : 243 case 'L' : 244 case 'M': 245 case 'N' : 246 case 'R' : 247 code.append(symb); 248 break; 249 case 'K' : 250 if (n > 0) { // not initial 251 if (!isPreviousChar(local, n, 'C')) { 252 code.append(symb); 253 } 254 } else { 255 code.append(symb); // initial K 256 } 257 break ; 258 case 'P' : 259 if (isNextChar(local,n,'H')) { 260 // PH -> F 261 code.append('F'); 262 } else { 263 code.append(symb); 264 } 265 break ; 266 case 'Q' : 267 code.append('K'); 268 break; 269 case 'S' : 270 if (regionMatch(local,n,"SH") || 271 regionMatch(local,n,"SIO") || 272 regionMatch(local,n,"SIA")) { 273 code.append('X'); 274 } else { 275 code.append('S'); 276 } 277 break; 278 case 'T' : 279 if (regionMatch(local,n,"TIA") || 280 regionMatch(local,n,"TIO")) { 281 code.append('X'); 282 break; 283 } 284 if (regionMatch(local,n,"TCH")) { 285 // Silent if in "TCH" 286 break; 287 } 288 // substitute numeral 0 for TH (resembles theta after all) 289 if (regionMatch(local,n,"TH")) { 290 code.append('0'); 291 } else { 292 code.append('T'); 293 } 294 break ; 295 case 'V' : 296 code.append('F'); break ; 297 case 'W' : case 'Y' : // silent if not followed by vowel 298 if (!isLastChar(wdsz,n) && 299 isVowel(local,n+1)) { 300 code.append(symb); 301 } 302 break ; 303 case 'X' : 304 code.append('K'); code.append('S'); 305 break ; 306 case 'Z' : 307 code.append('S'); break ; 308 } // end switch 309 n++ ; 310 } // end else from symb != 'C' 311 if (code.length() > this.getMaxCodeLen()) { 312 code.setLength(this.getMaxCodeLen()); 313 } 314 } 315 return code.toString(); 316 } 317 isVowel(StringBuffer string, int index)318 private boolean isVowel(StringBuffer string, int index) { 319 return VOWELS.indexOf(string.charAt(index)) >= 0; 320 } 321 isPreviousChar(StringBuffer string, int index, char c)322 private boolean isPreviousChar(StringBuffer string, int index, char c) { 323 boolean matches = false; 324 if( index > 0 && 325 index < string.length() ) { 326 matches = string.charAt(index - 1) == c; 327 } 328 return matches; 329 } 330 isNextChar(StringBuffer string, int index, char c)331 private boolean isNextChar(StringBuffer string, int index, char c) { 332 boolean matches = false; 333 if( index >= 0 && 334 index < string.length() - 1 ) { 335 matches = string.charAt(index + 1) == c; 336 } 337 return matches; 338 } 339 regionMatch(StringBuffer string, int index, String test)340 private boolean regionMatch(StringBuffer string, int index, String test) { 341 boolean matches = false; 342 if( index >= 0 && 343 (index + test.length() - 1) < string.length() ) { 344 String substring = string.substring( index, index + test.length()); 345 matches = substring.equals( test ); 346 } 347 return matches; 348 } 349 isLastChar(int wdsz, int n)350 private boolean isLastChar(int wdsz, int n) { 351 return n + 1 == wdsz; 352 } 353 354 355 /** 356 * Encodes an Object using the metaphone algorithm. This method 357 * is provided in order to satisfy the requirements of the 358 * Encoder interface, and will throw an EncoderException if the 359 * supplied object is not of type java.lang.String. 360 * 361 * @param pObject Object to encode 362 * @return An object (or type java.lang.String) containing the 363 * metaphone code which corresponds to the String supplied. 364 * @throws EncoderException if the parameter supplied is not 365 * of type java.lang.String 366 */ encode(Object pObject)367 public Object encode(Object pObject) throws EncoderException { 368 if (!(pObject instanceof String)) { 369 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 370 } 371 return metaphone((String) pObject); 372 } 373 374 /** 375 * Encodes a String using the Metaphone algorithm. 376 * 377 * @param pString String object to encode 378 * @return The metaphone code corresponding to the String supplied 379 */ encode(String pString)380 public String encode(String pString) { 381 return metaphone(pString); 382 } 383 384 /** 385 * Tests is the metaphones of two strings are identical. 386 * 387 * @param str1 First of two strings to compare 388 * @param str2 Second of two strings to compare 389 * @return <code>true</code> if the metaphones of these strings are identical, 390 * <code>false</code> otherwise. 391 */ isMetaphoneEqual(String str1, String str2)392 public boolean isMetaphoneEqual(String str1, String str2) { 393 return metaphone(str1).equals(metaphone(str2)); 394 } 395 396 /** 397 * Returns the maxCodeLen. 398 * @return int 399 */ getMaxCodeLen()400 public int getMaxCodeLen() { return this.maxCodeLen; } 401 402 /** 403 * Sets the maxCodeLen. 404 * @param maxCodeLen The maxCodeLen to set 405 */ setMaxCodeLen(int maxCodeLen)406 public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 407 408 } 409