package org.unicode.cldr.tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Pair;

import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;

/**
 * Takes a list of mappings (tab delimited) from source to target and produces a
 * transliterator.
 *
 * <p>The tool reads a raw word-to-IPA list, keeps only the words whose pronunciation the
 * core rule set does not already produce (or nearly produce), and writes the core rules
 * plus one exception rule per remaining word to {@code en-IPA.txt}.
 *
 * @author markdavis
 *         http://en.wikipedia.org/wiki/English_phonology
 */
public class MakeTransliterator {
    // DEBUGGING
    static int forceSeparateIfShorter = 4; // 4

    private static final String CHECK_BASE = null; // "vessel";
    private static final String CHECK_BUILT = null; // "vessel";

    private static final String TEST_STRING = "territories";
    private static final boolean SHOW_OVERRIDES = true;

    /** When true, dumps every source and target character that was stored. */
    private static final boolean SHOW_CHARACTER_SETS = false;

    private static final int MINIMUM_FREQUENCY = 9999;

    static boolean isIPA = true;
    static boolean onlyToTarget = true;

    // others

    static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);

    static Collator col = Collator.getInstance(ULocale.ROOT);

    static String cldrDataDir = "C:\\cvsdata\\unicode\\cldr\\tools\\java\\org\\unicode\\cldr\\util\\data\\transforms\\";

    /** Set once reflection on {@code Transliterator.DEBUG} has failed, so the warning is printed only once. */
    private static boolean translitDebugUnavailable;

    /**
     * Entry point: builds the English-to-IPA rule file from the raw data files in
     * {@link #cldrDataDir}.
     *
     * @param args ignored
     * @throws IOException if any of the input or output files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        setTranslitDebug(true);

        Locale fil = new Locale("fil");
        System.out.println(fil);
        fil = new Locale("fil", "US");
        System.out.println(fil);

        String sourceFile = cldrDataDir + "internal_raw_IPA.txt";
        String targetFile = cldrDataDir + "en-IPA.txt";
        String targetCountFile = cldrDataDir + "en-IPA_count.txt";
        String skippedLinesFile = "C:\\DATA\\GEN\\SkippedIPA.txt";

        // try-with-resources so the log is flushed and closed on every exit path,
        // including the CHECK_BASE / CHECK_BUILT early returns and exceptions.
        try (PrintWriter skippedOut = FileUtilities.openUTF8Writer("", skippedLinesFile)) {
            run(skippedOut, sourceFile, targetFile, targetCountFile);
        }
    }

    /**
     * Performs the whole build: loads the core rules and overrides, reads the raw mappings,
     * derives the exception rules and writes the result files.
     *
     * @param skippedOut log for skipped lines and diagnostics; not closed here
     * @param sourceFile path of the raw word/pronunciation list
     * @param targetFile path the generated rules are written to
     * @param targetCountFile path the per-rule frequency list is written to
     */
    private static void run(PrintWriter skippedOut, String sourceFile, String targetFile, String targetCountFile)
        throws IOException {
        // String coreRules = getCoreTransliterator();
        String fixBadIpaRules = createFromFile(cldrDataDir + "internal_fixBadIpa.txt", null, null);
        fixBadIpa = Transliterator.createFromRules("foo", fixBadIpaRules, Transliterator.FORWARD);

        Map<String, String> overrides = getOverrides();

        String coreForeRules = createFromFile(cldrDataDir + "internal_baseEnglishToIpa.txt", null, null);
        coreBase = Transliterator.createFromRules("foo", coreForeRules, Transliterator.FORWARD);
        if (CHECK_BASE != null) {
            setTranslitDebug(true);
            System.out.println(coreBase.transliterate(CHECK_BASE));
            return;
        }

        if (CHECK_BUILT != null) {
            String foo = createFromFile(cldrDataDir + "en-IPA.txt", null, null);
            Transliterator fooTrans = Transliterator.createFromRules("foo", foo, Transliterator.FORWARD);

            setTranslitDebug(true);
            System.out.println(fooTrans.transliterate(CHECK_BUILT));
            return;
        }

        String coreBackRules = createFromFile(cldrDataDir + "internal_English-IPA-backwards.txt", null, null);
        checkCoreReversibility(skippedOut, coreForeRules, coreBackRules);
        String coreRules = coreForeRules + coreBackRules;
        System.out.println(coreRules);

        // C:\DATA\GEN\mergedIPA2.txt
        // we have to have items in order. Longest forms need to come first, on both
        // sides.
        Relation<String, Pair<String, Long>> store = Relation.of(
            new TreeMap<String, Set<Pair<String, Long>>>(MyComparator),
            TreeSet.class);

        targetCharacters = new UnicodeSet();
        sourceCharacters = new UnicodeSet();
        allowedSourceCharacters = new UnicodeSet(
            "[[:Letter:]\u2019]").freeze();
        allowedTargetCharacters = new UnicodeSet(
            "[\u00E6 \u0251 b d\u00F0 e \u0259 \u025B f-i \u026A j-n \u014B o p r s \u0283 t u \u028A v w z \u0292 \u03B8]")
                .freeze();
        countSkipped = 0;
        totalFrequency = 0;
        skippedFrequency = 0;

        readMappings(sourceFile, overrides, skippedOut, store);
        addMissingOverrides(overrides, skippedOut, store);

        System.out.println("total count: " + nf.format(store.size()));
        System.out.println("skipped count: " + nf.format(countSkipped));

        System.out.println("total frequency-weighted: " + nf.format(totalFrequency));
        System.out.println("skipped frequency-weighted: " + nf.format(skippedFrequency));

        if (SHOW_CHARACTER_SETS) {
            System.out.println(CldrUtility.LINE_SEPARATOR + "Source Characters ");
            showSet(sourceCharacters);
            System.out.println(CldrUtility.LINE_SEPARATOR + "Target Characters ");
            showSet(targetCharacters);
        }

        // Set<String> seenSource = new HashSet<String>();
        // Set<String> seenTarget = new HashSet<String>();

        int countAdded = 0;
        int countTotal = 0;
        long frequencyAdded = 0;
        long frequencySkipped = 0;

        Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
        // build up the transliterator one length at a time.
        List<String> newRules = new ArrayList<>();
        StringBuilder buffer = new StringBuilder();

        int lastSourceLength = 1;

        Relation<Long, String> count_failures = Relation.of(new TreeMap<Long, Set<String>>(), TreeSet.class);

        sourceLoop: for (String source : store.keySet()) {
            if (TEST_STRING != null && source.equals(TEST_STRING)) {
                System.out.println(source + "\t" + store.getAll(source));
            }
            countTotal++;
            // whenever the source changes in length, rebuild the transliterator
            if (source.length() != lastSourceLength && source.length() >= forceSeparateIfShorter) {
                System.out.println("Building transliterator for length " + lastSourceLength + " : "
                    + newRules.size());
                System.out.flush();
                skippedOut.flush();
                String rules = buildRules(coreRules, newRules, buffer);
                // System.out.println(rules);
                base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD);

                lastSourceLength = source.length();
            }
            Set<Pair<String, Long>> targetSet = store.getAll(source);
            // see if any of the mappings fall out
            String targetUsingCore = base.transliterate(source);

            String bestTarget = null;
            int bestDistance = 999;
            long frequency = 0;
            for (Pair<String, Long> targetPair : targetSet) {
                String target = targetPair.getFirst();
                if (target.length() == 0) {
                    throw new IllegalArgumentException(source + " → " + target);
                }
                frequency = targetPair.getSecond();

                if (targetUsingCore.equals(target)) {
                    // we have a match! skip this source
                    skippedOut.println("# skipping " + source + " → " + target + " ;");
                    frequencySkipped += frequency;
                    continue sourceLoop;
                }
                if (mostlyEqual(source, target, targetUsingCore)) {
                    // we have a match! skip this source
                    skippedOut.println("# skipping " + source + " → " + target + " ; # close enough to "
                        + targetUsingCore);
                    frequencySkipped += frequency;
                    continue sourceLoop;
                }
                int distance = distance(source, target, targetUsingCore);
                if (bestDistance > distance) {
                    bestTarget = target;
                    bestDistance = distance;
                }
            }
            // if we get to here, we have a new rule.
            if (bestTarget != null) {
                boolean forceSeparate = needsSeparateRule(source, bestTarget);
                String targetUsingBaseCore = coreBase.transliterate(source);

                // $x is the non-letter class declared by buildRules; it anchors the rule at a
                // word start, and also at the word end when the rule must match the whole word.
                String ruleSource = forceSeparate ? "$x{" + source + "}$x" : "$x{" + source;

                // strange hack
                // NOTE(review): ruleSource always begins with "$x{", so this test can never be
                // true as written; left untouched because quoting the wrapped form would change
                // the rule's meaning. Confirm whether the hack is still needed.
                String hackSource = ruleSource.startsWith("use") ? "'" + ruleSource + "'" : ruleSource;
                newRules.add(hackSource + " → " + bestTarget + " ; # " + targetUsingCore
                    + (targetUsingBaseCore.equals(targetUsingCore) ? "" : "\t\t" + targetUsingBaseCore)
                    + CldrUtility.LINE_SEPARATOR);
                skippedOut.println("# couldn't replace " + ruleSource + " → " + bestTarget + " ; # "
                    + targetUsingCore);
                // NOTE(review): frequency here is that of the last pair iterated, which is not
                // necessarily the pair that supplied bestTarget - confirm this is intended.
                count_failures.put(-frequency, ruleSource + " → " + bestTarget + " ; # " + targetUsingCore);
                countAdded++;
                frequencyAdded += frequency;
            }
        }

        String rules = buildRules(coreRules, newRules, buffer);
        Transliterator.createFromRules("foo", rules, Transliterator.FORWARD); // verify that it builds

        try (PrintWriter out = FileUtilities.openUTF8Writer("", targetFile)) {
            out.println(rules);
        }

        try (PrintWriter out = FileUtilities.openUTF8Writer("", targetCountFile)) {
            for (long count : count_failures.keySet()) {
                for (String line : count_failures.getAll(count)) {
                    out.println(count + "\t" + line);
                }
            }
        }

        // if (false) {
        //
        // // now write out the transliterator file
        // PrintWriter out = FileUtilities.openUTF8Writer("", targetFile);
        // for (String source : store.keySet()) {
        // Set<String> targetSet = store.getAll(source);
        // for (String target : targetSet) {
        // if (seenSource.contains(source)) {
        // if (onlyToTarget) {
        // // nothing
        // } else if (seenTarget.contains(target)) {
        // skippedOut.println("# " + source + " → " + target + " ;");
        // countSkipped++;
        // } else {
        // out.println(source + " ← " + target + " ;");
        // countSourceFromTarget++;
        // }
        // } else if (onlyToTarget || seenTarget.contains(target)) {
        // out.println(source + " → " + target + " ;");
        // countSourceToTarget++;
        // } else {
        // out.println(source + " ↔ " + target + " ;");
        // countSourceAndTarget++;
        // }
        // seenSource.add(source);
        // seenTarget.add(target);
        // }
        // }
        // out.close();
        // }
        System.out.println("countTotal: " + nf.format(countTotal));
        System.out.println("countAdded: " + nf.format(countAdded));
        System.out.println("countSkipped: " + nf.format(countTotal - countAdded));
        System.out.println("frequencyTotal: " + nf.format(frequencyAdded + frequencySkipped));
        System.out.println("frequencyAdded: " + nf.format(frequencyAdded));
        System.out.println("frequencySkipped: " + nf.format(frequencySkipped));
    }

    /**
     * Reads the raw mapping file line by line and stores every acceptable
     * source/target pair in {@code store}.
     *
     * <p>Each line is split on tabs or commas. The first piece is the word; a piece starting
     * with {@code %} sets the frequency for the pieces that follow it; every other piece is a
     * candidate pronunciation. A word with an override takes the override instead of the
     * listed pronunciations.
     *
     * @param sourceFile path of the raw mapping file
     * @param overrides word to pronunciation overrides; an empty value suppresses the word
     * @param skippedOut log for skipped lines
     * @param store receives the accepted mappings
     */
    private static void readMappings(String sourceFile, Map<String, String> overrides, PrintWriter skippedOut,
        Relation<String, Pair<String, Long>> store) throws IOException {
        int targetField = isIPA ? 2 : 1;

        try (BufferedReader in = FileUtilities.openUTF8Reader("", sourceFile)) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.startsWith("\uFEFF")) {
                    line = line.substring(1);
                }
                String originalLine = line;
                int commentCharPosition = line.indexOf('#');
                if (commentCharPosition >= 0) {
                    line = line.substring(0, commentCharPosition);
                }
                line = line.trim();
                frequency = -1;
                String[] pieces = line.split(" *[\\t,] *");
                if (pieces.length <= targetField) {
                    // skippedOut.println(originalLine + "\tno phonetics");
                    // countSkipped++;
                    continue; // no phonetics
                }
                String source = pieces[0];
                if (TEST_STRING != null && source.equals(TEST_STRING)) {
                    System.out.println(line); // for debugging
                }

                // Fix Source
                source = source.replace("'", "’");
                source = UCharacter.toLowerCase(ULocale.ENGLISH, source);
                if (source.endsWith(".")) {
                    source = source.substring(0, source.length() - 1);
                }
                if (source.contains(" ") || source.contains("-")) {
                    skippedOut.println(originalLine + "\tspace or hyphen");
                    countSkipped++;
                    // NOTE(review): frequency is still the -1 placeholder here because the
                    // %frequency piece has not been parsed yet - confirm this is intended.
                    skippedFrequency += frequency;
                    continue;
                }

                //String bestTarget = null;

                String override = overrides.get(source);
                String spelling = spellout.transliterate(source);

                for (int i = 1; i < pieces.length; ++i) {
                    String target = pieces[i];
                    if (target.startsWith("%")) {
                        frequency = Long.parseLong(target.substring(1));
                        continue;
                    }

                    if (override != null) {
                        if (SHOW_OVERRIDES) {
                            System.out.println("Overriding\t" + source + " → ! " + target + " → " + override);
                        }
                        if (override.length() != 0) {
                            if (TEST_STRING != null && source.equals(TEST_STRING)) {
                                setTranslitDebug(true);
                            }
                            target = fixBadIpa.transliterate(override);
                            setTranslitDebug(false);
                            addSourceTarget(skippedOut, source, target, originalLine, store);
                        }
                        break; // the override replaces every listed pronunciation
                    }

                    if (frequency < MINIMUM_FREQUENCY) {
                        // skippedOut.println(originalLine + "\tno frequency");
                        countSkipped++;
                        continue;
                    }

                    target = UCharacter.toLowerCase(ULocale.ENGLISH, target);
                    target = target.replace(" ", ""); // remove extra spaces

                    if (target.startsWith("-") || target.endsWith("-")) {
                        continue;
                    }

                    String oldTarget = target;
                    target = fixBadIpa.transliterate(target);

                    if (target.equals(spelling)) {
                        skippedOut.println(originalLine
                            + "\tspellout");
                        countSkipped++;
                        continue;
                    }

                    if (!target.equals(oldTarget)) {
                        skippedOut.println("\t### fixed IPA:\t" + source + "\t" + target
                            + "\twas: " + oldTarget);
                    }

                    addSourceTarget(skippedOut, source, target, originalLine, store);
                }
            }
        }
    }

    /**
     * Adds every non-empty override whose word is not yet a key of {@code store}.
     *
     * @param overrides word to pronunciation overrides
     * @param skippedOut log for rejected entries
     * @param store receives the additional mappings
     */
    private static void addMissingOverrides(Map<String, String> overrides, PrintWriter skippedOut,
        Relation<String, Pair<String, Long>> store) {
        for (Map.Entry<String, String> entry : overrides.entrySet()) {
            String word = entry.getKey();
            if (store.containsKey(word)) {
                continue;
            }
            String target = entry.getValue();
            if (target.length() != 0) {
                if (SHOW_OVERRIDES) {
                    System.out.println("New overrides:\t" + word + " → " + target);
                }
                addSourceTarget(skippedOut, word, target, "overrides", store);
            }
        }
    }

    /**
     * Decides whether the rule for {@code source} must match the whole word only.
     *
     * @return true if the word is shorter than {@link #forceSeparateIfShorter}, if the target is
     *         more than 1.5 times as long as the word, if the target equals the letter-by-letter
     *         spell-out, or if the word ends in "e"
     */
    private static boolean needsSeparateRule(String source, String bestTarget) {
        if (source.length() < forceSeparateIfShorter || bestTarget.length() * 2 > source.length() * 3) {
            return true;
        }
        if (bestTarget.equals(spellout.transliterate(source))) {
            return true;
        }
        // if it is likely that the word can have an extra letter added that changes the pronunciation
        // force it to be separate
        return source.endsWith("e");
    }

    /**
     * Best-effort switch for the public static {@code DEBUG} field of {@link Transliterator},
     * set reflectively. If the field cannot be set, a single warning is printed and later
     * calls do nothing.
     *
     * @param newSetting value to store in {@code Transliterator.DEBUG}
     */
    private static void setTranslitDebug(boolean newSetting) {
        if (translitDebugUnavailable) {
            return;
        }
        // Transliterator.DEBUG = newSetting;
        try {
            Field debug = Transliterator.class.getField("DEBUG");
            debug.setBoolean(Transliterator.class, newSetting);
        } catch (ReflectiveOperationException | RuntimeException e) {
            translitDebugUnavailable = true;
            System.err.println("Cannot set Transliterator.DEBUG: " + e);
        }
    }

    /**
     * Stores one source/target mapping together with the current {@link #frequency}, after
     * checking both strings against the allowed character sets. Rejected mappings are logged
     * and added to the skipped counters instead.
     *
     * @param skippedOut log for mappings with disallowed source characters
     * @param source the word
     * @param target its pronunciation
     * @param originalLine text used to identify the mapping in diagnostics
     * @param store receives the mapping when it is accepted
     */
    private static void addSourceTarget(PrintWriter skippedOut, String source, String target, String originalLine,
        Relation<String, Pair<String, Long>> store) {
        if (source.equals("teh")) {
            System.out.println("debug"); // debugging hook
        }
        if (!allowedSourceCharacters.containsAll(source)) {
            skippedOut.println(originalLine
                + "\t# Strange source values:\t"
                + source
                + "\t"
                + new UnicodeSet().addAll(source)
                    .removeAll(allowedSourceCharacters).toPattern(false));
            countSkipped++;
            skippedFrequency += frequency;
            return;
        }
        if (!allowedTargetCharacters.containsAll(target)) {
            System.out.println(originalLine
                + "\t# Strange target values:\t"
                + target
                + "\t"
                + new UnicodeSet().addAll(target)
                    .removeAll(allowedTargetCharacters).toPattern(false));
            countSkipped++;
            skippedFrequency += frequency;
            return;
        }

        sourceCharacters.addAll(source);
        targetCharacters.addAll(target);
        store.put(source, new Pair<>(target, frequency));
        totalFrequency += frequency;
    }

    /**
     * Logs, for a fixed list of sample words, the forward transliteration and the result of
     * running the backward rules over it, so the round trip can be inspected by eye.
     *
     * @param skippedOut log the three columns are written to; flushed before returning
     * @param coreRules forward rules
     * @param coreBackRules rules compiled in the reverse direction
     */
    private static void checkCoreReversibility(PrintWriter skippedOut, String coreRules, String coreBackRules) {
        Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
        Transliterator back = Transliterator.createFromRules("foo2", coreBackRules, Transliterator.REVERSE);
        String[] tests = "bat bait bet beet bit bite bot boat but bute bout boot book boy pat bat vat fat mat tat dat thew father nat sat zoo ash asia gate cat late rate hate yet rang chat jet"
            .split("\\s");
        for (String test : tests) {
            String test2 = base.transliterate(test);
            String test3 = back.transliterate(test2);
            skippedOut.println(test + "\t " + test2 + "\t " + test3);
        }
        skippedOut.flush();
    }

    /**
     * Assembles the complete rule text: a fixed header, then {@code newRules} in reverse
     * order of insertion, then the core rules.
     *
     * @param coreRules rules appended after all generated rules
     * @param newRules generated rules, each already terminated by a line separator
     * @param buffer scratch buffer; cleared before use
     * @return the assembled rule text
     */
    private static String buildRules(String coreRules, List<String> newRules, StringBuilder buffer) {
        // Transliterator base;
        // build backwards!!
        buffer.setLength(0);
        buffer.append(
            "# Author: M Davis" + CldrUtility.LINE_SEPARATOR +
                "# Email: mark.davis@icu-project.org" + CldrUtility.LINE_SEPARATOR +
                "# Description: English to IPA" + CldrUtility.LINE_SEPARATOR +
                // "$nletter {([A-Z]+)} $nletter > &en-IPA/spellout($1) ; " + Utility.LINE_SEPARATOR +
                ":: lower(); " + CldrUtility.LINE_SEPARATOR +
                "$x = [:^letter:] ;" + CldrUtility.LINE_SEPARATOR);
        for (int i = newRules.size() - 1; i >= 0; --i) {
            buffer.append(newRules.get(i));
        }
        buffer.append(coreRules);
        // System.out.println(buffer);
        return buffer.toString();
    }

    /**
     * Prints one line per code point of the set: hex value, the character, and its name.
     *
     * @param sourceCharacters the set to list
     */
    private static void showSet(UnicodeSet sourceCharacters) {
        for (UnicodeSetIterator it = new UnicodeSetIterator(sourceCharacters); it.next();) {
            System.out.println(com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t("
                + UTF16.valueOf(it.codepoint) + ")\t"
                + UCharacter.getName(it.codepoint));
        }
    }

    public static UnicodeSet vowels = new UnicodeSet("[aeiou æ ɑ ə ɛ ɪ ʊ â î ô]").freeze();
    public static UnicodeSet short_vowels = new UnicodeSet("[ɑ æ ə ɛ ɪ ʊ]").freeze();

    // Scratch sets reused by distance(); shared mutable state.
    static UnicodeSet targetChars = new UnicodeSet();
    static UnicodeSet targetCoreChars = new UnicodeSet();
    static UnicodeSet tempDiff = new UnicodeSet();

    /** Collapses each of three diphthongs to a single placeholder vowel before character sets are compared. */
    static Transliterator distinguishLongVowels = Transliterator.createFromRules("faa",
        "ɑʊ > â ;" +
            "ɑɪ > î ;" +
            "oɪ > ô ;",
        Transliterator.FORWARD);

    /**
     * Scores how far {@code target} is from {@code targetUsingCore}; smaller is closer.
     *
     * @param source the word (only passed on to {@code mostlyEqual})
     * @param target a listed pronunciation
     * @param targetUsingCore the pronunciation produced by the current rules
     * @return 0 if equal; 1 if mostly equal; 3 if both use the same set of characters; 5 if the
     *         sets agree once short vowels, or all vowels, are removed; otherwise 7 plus the
     *         number of remaining characters found on only one side
     */
    private static int distance(String source, String target, String targetUsingCore) {
        if (target.equals(targetUsingCore)) return 0;
        if (mostlyEqual(source, target, targetUsingCore)) return 1;
        // first compare the consonants. Count each difference as 3
        String zappedTarget = distinguishLongVowels.transliterate(target);
        String zappedCoreTarget = distinguishLongVowels.transliterate(targetUsingCore);

        targetChars.clear().addAll(zappedTarget); //
        targetCoreChars.clear().addAll(zappedCoreTarget);
        if (targetChars.equals(targetCoreChars)) {
            return 3;
        }
        targetChars.removeAll(short_vowels);
        targetCoreChars.removeAll(short_vowels);
        if (targetChars.equals(targetCoreChars)) {
            return 5;
        }

        targetChars.removeAll(vowels);
        targetCoreChars.removeAll(vowels);
        if (targetChars.equals(targetCoreChars)) {
            return 5;
        }

        tempDiff.clear().addAll(targetChars).removeAll(targetCoreChars);
        int result = 7 + tempDiff.size();
        tempDiff.clear().addAll(targetCoreChars).removeAll(targetChars);
        result += tempDiff.size();
        return result;
    }

    /** Rewrites diphthongs and vowel+r sequences so that mostlyEqual does not treat them as plain short vowels. */
    static final Transliterator skeletonize = Transliterator.createFromRules("faa",
        "ɑʊ > âʊ ;" +
            "ɑɪ > âi ;" +
            "oɪ > oi ;" +
            "ɑr > âr ;" +
            "ær > er ;" +
            "ɛr > er ;" +
            "ɪr > ir ;" +
            "ʊr > ur ;",
        Transliterator.FORWARD);

    /**
     * Return true if the strings are essentially the same.
     * After skeletonizing, the strings must agree position by position, except that a schwa
     * may stand against any short vowel and ɪ may stand against ɛ, in either order.
     *
     * @param inSource the word; not used by the comparison
     * @param inTarget a listed pronunciation
     * @param inTargetUsingCore the pronunciation produced by the current rules
     * @return true if the two pronunciations differ only in the tolerated ways
     */
    private static boolean mostlyEqual(String inSource, String inTarget, String inTargetUsingCore) {

        if (inTarget.length() != inTargetUsingCore.length()) return false;

        // transform these -- simplest that way
        String target = skeletonize.transliterate(inTarget);
        String targetUsingCore = skeletonize.transliterate(inTargetUsingCore);

        // Guard the positional comparison below in case the transform changed one length.
        if (target.length() != targetUsingCore.length()) return false;

        for (int i = 0; i < target.length(); ++i) {
            char ca = target.charAt(i);
            char cb = targetUsingCore.charAt(i);
            if (ca != cb) {
                // disregard differences with short vowels
                if (ca == 'ə' && short_vowels.contains(cb) || short_vowels.contains(ca) && cb == 'ə') {
                    continue;
                }
                // ɪ versus ɛ, in either order (the second test used to repeat the first)
                if (ca == 'ɪ' && cb == 'ɛ' || ca == 'ɛ' && cb == 'ɪ') {
                    continue;
                }
                return false;
            }
        }
        return true; // return diffCount == 0 ? true : diffCount < vowelCount;
    }

    /** Maps each lowercase letter a-z to the pronunciation of its letter name. */
    static Transliterator spellout = Transliterator.createFromRules("foo",
        "a > e ;"
            + "b > bi ;"
            + "c > si ;"
            + "d > di ;"
            + "e > i ;"
            + "f > ɛf ;"
            + "g > dʒi ;"
            + "h > etʃ ;"
            + "i > ɑɪ ;"
            + "j > dʒe ;"
            + "k > ke ;"
            + "l > ɛl ;"
            + "m > ɛm ;"
            + "n > ɛn ;"
            + "o > o ;"
            + "p > pi ;"
            + "q > kwu ;"
            + "r > ɑr ;"
            + "s > ɛs ;"
            + "t > ti ;"
            + "u > ju ;"
            + "v > vi ;"
            + "w > dəbjə ;"
            + "x > ɛks ;"
            + "y > wɑɪ ;"
            + "z > zi ;",
        Transliterator.FORWARD);

    /**
     * Returns items sorted alphabetically, shortest first: by length, then by the root
     * collator, then by code unit order so that distinct strings never compare as equal.
     */
    static Comparator<String> MyComparator = new Comparator<String>() {

        @Override
        public int compare(String as, String bs) {
            if (as.length() < bs.length())
                return -1;
            if (as.length() > bs.length())
                return 1;
            int result = col.compare(as, bs);
            if (result != 0) {
                return result;
            }
            return as.compareTo(bs);
        }

    };

    // static String dataDir = "C:\\cvsdata\\unicode\\ucd\\unicodetools\\dictionary\\Data\\";
    // private static String getCoreTransliterator() throws IOException {
    //
    // String accentRules = createFromFile(dataDir + "accentRules.txt", null, null);
    //
    // Transliterator doAccentRules = Transliterator.createFromRules("foo", accentRules, Transliterator.FORWARD);
    //
    // String markedToIpa = createFromFile(dataDir + "IPARules.txt", doAccentRules, null);
    // System.out.println(markedToIpa);
    // Transliterator doMarkedToIpa = Transliterator.createFromRules("foo", markedToIpa, Transliterator.FORWARD);
    //
    // String trial = "ạ>æ";
    // String result = doMarkedToIpa.transliterate(trial);
    // System.out.println("****" + result);
    //
    // String englishToIpaBase = createFromFile(dataDir + "reduceRules.txt", doAccentRules, doMarkedToIpa);
    //
    // System.out.println(englishToIpaBase);
    //
    // //Transform file name into id
    //
    // return englishToIpaBase;
    // }

    /**
     * Reads a UTF-8 text file into one string, optionally passing every line through up to
     * two transliterators first. A leading byte order mark is removed from any line that
     * starts with one, and each line is terminated with {@code CldrUtility.LINE_SEPARATOR}.
     *
     * @param fileName path of the file to read
     * @param pretrans first transform applied to each line, or null for none
     * @param pretrans2 second transform applied to each line, or null for none
     * @return the (transformed) file contents
     * @throws IOException if the file cannot be opened or read
     */
    public static String createFromFile(String fileName, Transliterator pretrans, Transliterator pretrans2)
        throws IOException {
        StringBuilder buffer = new StringBuilder();
        try (BufferedReader fli = FileUtilities.openUTF8Reader("", fileName)) {
            String line;
            while ((line = fli.readLine()) != null) {
                if (line.startsWith("\uFEFF")) line = line.substring(1);
                if (pretrans != null) {
                    line = pretrans.transliterate(line);
                }
                if (pretrans2 != null) {
                    line = pretrans2.transliterate(line);
                }

                buffer.append(line);
                buffer.append(CldrUtility.LINE_SEPARATOR); // separate with whitespace
            }
        }
        return buffer.toString();
    }

    /** Maximum number of override entries that getOverrides() will load. */
    static int LIMIT = Integer.MAX_VALUE;
    private static Transliterator fixBadIpa;
    private static UnicodeSet targetCharacters;
    private static UnicodeSet sourceCharacters;
    private static UnicodeSet allowedSourceCharacters;
    private static UnicodeSet allowedTargetCharacters;
    private static int countSkipped;
    private static long skippedFrequency;
    private static long frequency;
    private static long totalFrequency;
    private static Transliterator coreBase;

    /**
     * Loads {@code internal_overrides.txt} from {@link #cldrDataDir}.
     *
     * <p>Each non-blank line is either {@code word → ipa} or a bare word. A bare word is stored
     * with an empty value. The pronunciation is passed through the {@code fixBadIpa}
     * transliterator, which {@code main} creates before calling this method. If a word occurs
     * twice, the first entry wins and a message is printed. At most {@link #LIMIT} entries are
     * loaded.
     *
     * @return the overrides, sorted by word
     * @throws IOException if the file cannot be opened or read
     */
    public static Map<String, String> getOverrides() throws IOException {
        Map<String, String> result = new TreeMap<>();
        try (BufferedReader br = FileUtilities.openUTF8Reader(cldrDataDir, "internal_overrides.txt")) {
            int counter = 0;
            while (counter < LIMIT) {
                String line = br.readLine();
                if (line == null) break;
                line = line.trim();
                if (line.length() == 0) continue;

                String[] iLine = line.split("\\s*→\\s*");
                String word = iLine[0].trim();
                if (result.containsKey(word)) {
                    System.out.println("Overrides already contain: " + word);
                    continue;
                }
                if (iLine.length < 2) {
                    result.put(word, "");
                } else {
                    String ipa = fixBadIpa.transliterate(iLine[1].trim());
                    result.put(word, ipa);
                }
                counter++; // was never advanced, which left the LIMIT check without effect
            }
        }
        return result;
    }

}