1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.lang.reflect.Field;
7 import java.util.ArrayList;
8 import java.util.Comparator;
9 import java.util.List;
10 import java.util.Locale;
11 import java.util.Map;
12 import java.util.Set;
13 import java.util.TreeMap;
14 import java.util.TreeSet;
15 
16 import org.unicode.cldr.draft.FileUtilities;
17 import org.unicode.cldr.util.CldrUtility;
18 import org.unicode.cldr.util.Pair;
19 
20 import com.ibm.icu.impl.Relation;
21 import com.ibm.icu.lang.UCharacter;
22 import com.ibm.icu.text.Collator;
23 import com.ibm.icu.text.NumberFormat;
24 import com.ibm.icu.text.Transliterator;
25 import com.ibm.icu.text.UTF16;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
28 import com.ibm.icu.util.ULocale;
29 
30 /**
31  * Takes a list of mappings (tab delimited) from source to target and produces a
32  * transliterator
33  *
34  * @author markdavis
35  *         http://en.wikipedia.org/wiki/English_phonology
36  */
37 public class MakeTransliterator {
    // DEBUGGING
    // Length threshold used by main(): sources shorter than this always get a trailing $x
    // boundary in their rule, and the working transliterator is not rebuilt when the source
    // length changes to a value below this.
    static int forceSeparateIfShorter = 4; // 4

    // When non-null, main() transliterates this word with the core base rules, prints it and exits.
    private static final String CHECK_BASE = null; // "vessel";
    // When non-null, main() transliterates this word with the previously generated en-IPA.txt
    // rules, prints it and exits.
    private static final String CHECK_BUILT = null; // "vessel";

    // Source word for which main() prints extra debugging output.
    private static final String TEST_STRING = "territories";
    // When true, main() prints every override that is applied or newly added.
    private static final boolean SHOW_OVERRIDES = true;

    // Targets whose most recently parsed "%frequency" is below this value are skipped by main()
    // (overrides are applied before this check).
    private static final int MINIMUM_FREQUENCY = 9999;

    // Selects the minimum number of fields a line must have in main(): lines with
    // pieces.length <= (isIPA ? 2 : 1) are ignored.
    static boolean isIPA = true;
    // NOTE(review): within this file this flag is referenced only from commented-out code.
    static boolean onlyToTarget = true;

    // others

    // Formats the counts and frequency totals that main() prints.
    static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);

    // Root collator used by MyComparator to order sources of equal length.
    static Collator col = Collator.getInstance(ULocale.ROOT);

    // Hard-coded Windows directory holding the input rule files and receiving the generated ones.
    static String cldrDataDir = "C:\\cvsdata\\unicode\\cldr\\tools\\java\\org\\unicode\\cldr\\util\\data\\transforms\\";
59 
    /**
     * Generates the English-to-IPA rule file from a raw word list.
     * <p>
     * The method (1) loads the fix-up rules, the overrides and the core forward/backward rules,
     * (2) reads {@code internal_raw_IPA.txt} and collects acceptable source/target pairs in a
     * store ordered by {@link #MyComparator}, (3) walks the store and, for every source that the
     * rules built so far do not already transliterate to (or close to) one of its targets, adds
     * an explicit rule, and (4) writes the resulting rules to {@code en-IPA.txt}, the added rules
     * keyed by negated frequency to {@code en-IPA_count.txt}, and diagnostics to the
     * skipped-lines file.
     * <p>
     * All paths are hard-coded; {@code args} is not read.
     *
     * @param args ignored
     * @throws IOException if any of the input or output files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        setTranslitDebug(true);

        // Debug leftover: the two Locale values are only printed; fil is not used afterwards.
        Locale fil = new Locale("fil");
        System.out.println(fil);
        fil = new Locale("fil", "US");
        System.out.println(fil);

        String sourceFile = cldrDataDir + "internal_raw_IPA.txt";
        String targetFile = cldrDataDir + "en-IPA.txt";
        String targetCountFile = cldrDataDir + "en-IPA_count.txt";
        String skippedLinesFile = "C:\\DATA\\GEN\\SkippedIPA.txt";

        // NOTE(review): skippedOut is closed only at the very end of this method; the early
        // returns below and any exception leave it open.
        PrintWriter skippedOut = FileUtilities.openUTF8Writer("", skippedLinesFile);

        // String coreRules = getCoreTransliterator();
        // fixBadIpa must be assigned before getOverrides() is called, because getOverrides()
        // runs every override target through it.
        String fixBadIpaRules = createFromFile(cldrDataDir + "internal_fixBadIpa.txt", null, null);
        fixBadIpa = Transliterator.createFromRules("foo", fixBadIpaRules, Transliterator.FORWARD);

        Map<String, String> overrides = getOverrides();

        String coreForeRules = createFromFile(cldrDataDir + "internal_baseEnglishToIpa.txt", null, null);
        coreBase = Transliterator.createFromRules("foo", coreForeRules, Transliterator.FORWARD);
        // Debugging shortcut: transliterate a single word with the core base rules and stop.
        if (CHECK_BASE != null) {
            setTranslitDebug(true);
            System.out.println(coreBase.transliterate(CHECK_BASE));
            return;
        }

        // Debugging shortcut: transliterate a single word with the previously generated file and stop.
        if (CHECK_BUILT != null) {
            String foo = createFromFile(cldrDataDir + "en-IPA.txt", null, null);
            Transliterator fooTrans = Transliterator.createFromRules("foo", foo, Transliterator.FORWARD);

            setTranslitDebug(true);
            System.out.println(fooTrans.transliterate(CHECK_BUILT));
            return;
        }

        String coreBackRules = createFromFile(cldrDataDir + "internal_English-IPA-backwards.txt", null, null);
        checkCoreReversibility(skippedOut, coreForeRules, coreBackRules);
        String coreRules = coreForeRules + coreBackRules;
        System.out.println(coreRules);

        // C:\DATA\GEN\mergedIPA2.txt
        // we have to have items in order. Longest forms need to come first, on both
        // sides.
        // The store itself iterates shortest-first (see MyComparator); buildRules() emits the
        // collected rules in reverse order, which puts the rules added last first.
        Relation<String, Pair<String, Long>> store = Relation.of(new TreeMap<String, Set<Pair<String, Long>>>(MyComparator),
            TreeSet.class);

        targetCharacters = new UnicodeSet();
        sourceCharacters = new UnicodeSet();
        allowedSourceCharacters = new UnicodeSet(
            "[[:Letter:]\u2019]").freeze();
        allowedTargetCharacters = new UnicodeSet(
            "[\u00E6 \u0251 b d\u00F0 e \u0259 \u025B f-i \u026A j-n \u014B o p r s \u0283 t u \u028A v w z \u0292 \u03B8]")
                .freeze();
        countSkipped = 0;
        totalFrequency = 0;
        skippedFrequency = 0;
        // Lines with pieces.length <= targetField are treated as having no phonetics.
        int targetField = isIPA ? 2 : 1;

        BufferedReader in = FileUtilities.openUTF8Reader("", sourceFile);
        while (true) {
            String line = in.readLine();
            if (line == null)
                break;
            // Strip a leading byte-order mark.
            if (line.startsWith("\uFEFF")) {
                line = line.substring(1);
            }
            String originalLine = line;
            // Everything from '#' onwards is a comment.
            int commentCharPosition = line.indexOf('#');
            if (commentCharPosition >= 0) {
                line = line.substring(0, commentCharPosition);
            }
            line = line.trim();
            // Reset the shared (static) frequency; it is set again by a "%"-prefixed field below
            // and read by addSourceTarget().
            frequency = -1;
            // Fields are separated by a tab or a comma, with optional surrounding spaces.
            String[] pieces = line.split(" *[\\t,] *");
            if (pieces.length <= targetField) {
                // skippedOut.println(originalLine + "\tno phonetics");
                // countSkipped++;
                continue; // no phonetics
            }
            String source = pieces[0];
            if (TEST_STRING != null && source.equals(TEST_STRING)) {
                System.out.println(line); // for debugging
            }

            // Fix Source
            // Normalize: curly apostrophe, lowercase, no trailing period.
            source = source.replace("'", "’");
            source = UCharacter.toLowerCase(ULocale.ENGLISH, source);
            if (source.endsWith(".")) {
                source = source.substring(0, source.length() - 1);
            }
            if (source.contains(" ") || source.contains("-")) {
                skippedOut.println(originalLine + "\tspace or hyphen");
                countSkipped++;
                // NOTE(review): frequency is still the -1 assigned above at this point (the
                // "%" field has not been parsed yet), so this subtracts 1 from skippedFrequency.
                skippedFrequency += frequency;
                continue;
            }

            //String bestTarget = null;

            String override = overrides.get(source);
            String spelling = spellout.transliterate(source);

            for (int i = 1; i < pieces.length; ++i) {
                String target = pieces[i];
                // A field of the form %NNN carries the frequency that applies to the fields after it.
                if (target.startsWith("%")) {
                    frequency = Long.parseLong(target.substring(1));
                    continue;
                }

                // An override replaces all listed targets; an empty override suppresses the word.
                // Only the first non-frequency field is looked at because of the break below.
                if (override != null) {
                    if (SHOW_OVERRIDES)
                        System.out.println("Overriding\t" + source + " → ! " + target + " → " + override);
                    if (override.length() != 0) {
                        if (TEST_STRING != null && source.equals(TEST_STRING)) {
                            setTranslitDebug(true);
                        }
                        target = fixBadIpa.transliterate(override);
                        setTranslitDebug(false);
                        addSourceTarget(skippedOut, source, target, originalLine, store);
                    }
                    break;
                }

                if (frequency < MINIMUM_FREQUENCY) {
                    // skippedOut.println(originalLine + "\tno frequency");
                    countSkipped++;
                    continue;
                }

                target = UCharacter.toLowerCase(ULocale.ENGLISH, target);
                target = target.replace(" ", ""); // remove extra spaces

                // Targets that start or end with a hyphen are ignored.
                if (target.startsWith("-") || target.endsWith("-")) {
                    continue;
                }

                String oldTarget = target;
                target = fixBadIpa.transliterate(target);

                // Skip pronunciations that are just the letter names spelled out.
                if (target.equals(spelling)) {
                    skippedOut.println(originalLine
                        + "\tspellout");
                    countSkipped++;
                    continue;
                }

                if (!target.equals(oldTarget)) {
                    skippedOut.println("\t### fixed IPA:\t" + source + "\t" + target
                        + "\twas: " + oldTarget);
                }

                addSourceTarget(skippedOut, source, target, originalLine, store);
            }
        }

        // add the overrides that are not in.
        // NOTE(review): these are added with whatever value the static frequency field was
        // left with by the last line of the input file.

        for (String word : overrides.keySet()) {
            if (!store.containsKey(word)) {
                String target = overrides.get(word);
                if (target.length() != 0) {
                    if (SHOW_OVERRIDES) System.out.println("New overrides:\t" + word + " → " + target);
                    addSourceTarget(skippedOut, word, target, "overrides", store);
                }
            }
        }
        // NOTE(review): the reader is closed here rather than in a finally block, so it stays
        // open if anything above throws.
        in.close();
        System.out.println("total count: " + nf.format(store.size()));
        System.out.println("skipped count: " + nf.format(countSkipped));

        System.out.println("total frequency-weighted: " + nf.format(totalFrequency));
        System.out.println("skipped frequency-weighted: " + nf.format(skippedFrequency));

        // Disabled diagnostic dump of the characters seen on each side.
        if (false) {
            System.out.println(CldrUtility.LINE_SEPARATOR + "Source Characters ");
            showSet(sourceCharacters);
            System.out.println(CldrUtility.LINE_SEPARATOR + "Target Characters ");
            showSet(targetCharacters);
        }

        // Set<String> seenSource = new HashSet<String>();
        // Set<String> seenTarget = new HashSet<String>();

        int countAdded = 0;
        int countTotal = 0;
        long frequencyAdded = 0;
        long frequencySkipped = 0;

        Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
        // build up the transliterator one length at a time.
        List<String> newRules = new ArrayList<>();
        StringBuilder buffer = new StringBuilder();

        int lastSourceLength = 1;

        // Added rules keyed by negated frequency, so that ascending key order lists the most
        // frequent words first.
        Relation<Long, String> count_failures = Relation.of(new TreeMap<Long, Set<String>>(), TreeSet.class);

        sourceLoop: for (String source : store.keySet()) {
            if (TEST_STRING != null && source.equals(TEST_STRING)) {
                System.out.println(source + "\t" + store.getAll(source));
            }
            countTotal++;
            // whenever the source changes in length, rebuild the transliterator
            // (only once the new length has reached forceSeparateIfShorter)
            if (source.length() != lastSourceLength && source.length() >= forceSeparateIfShorter) {
                System.out.println("Building transliterator for length " + lastSourceLength + " : " + newRules.size());
                System.out.flush();
                skippedOut.flush();
                String rules = buildRules(coreRules, newRules, buffer);
                // System.out.println(rules);
                base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD);

                lastSourceLength = source.length();
            }
            Set<Pair<String, Long>> targetSet = store.getAll(source);
            // see if any of the mappings fall out
            String targetUsingCore = base.transliterate(source);

            String bestTarget = null;
            int bestDistance = 999;
            // Local variable; shadows the static field of the same name.
            long frequency = 0;
            for (Pair<String, Long> targetPair : targetSet) {
                String target = targetPair.getFirst();
                if (target.length() == 0) {
                    throw new IllegalArgumentException(source + " → " + target);
                }
                frequency = targetPair.getSecond();

                if (targetUsingCore.equals(target)) {
                    // we have a match! skip this source
                    skippedOut.println("# skipping " + source + " → " + target + " ;");
                    frequencySkipped += frequency;
                    continue sourceLoop;
                }
                if (mostlyEqual(source, target, targetUsingCore)) {
                    // we have a match! skip this source
                    skippedOut.println("# skipping " + source + " → " + target + " ; # close enough to "
                        + targetUsingCore);
                    frequencySkipped += frequency;
                    continue sourceLoop;
                }
                // Remember the listed target that is closest to what the current rules produce.
                int distance = distance(source, target, targetUsingCore);
                if (bestDistance > distance) {
                    bestTarget = target;
                    bestDistance = distance;
                }
            }
            // if we get to here, we have a new rule.
            if (bestTarget != null) {
                // forceSeparate adds a trailing $x (non-letter) context so the rule matches
                // only when the source is not followed by a letter.
                boolean forceSeparate = false;
                if (source.length() < forceSeparateIfShorter || bestTarget.length() * 2 > source.length() * 3) {
                    forceSeparate = true;
                } else {
                    String spelling = spellout.transliterate(source);
                    if (bestTarget.equals(spelling)) {
                        forceSeparate = true;
                    } else {
                        // if it is likely that the word can have an extra letter added that changes the pronunciation
                        // force it to be separate
                        if (source.endsWith("e")) {
                            forceSeparate = true;
                        }
                    }
                }
                String targetUsingBaseCore = coreBase.transliterate(source);

                if (forceSeparate) {
                    source = "$x{" + source + "}$x";
                } else {
                    source = "$x{" + source;
                }
                // strange hack
                // NOTE(review): source always starts with "$x{" at this point, so the
                // startsWith("use") test below cannot be true.
                String hackSource = source.startsWith("use") ? "'" + source + "'" : source;
                newRules.add(hackSource + " → " + bestTarget + " ; # " + targetUsingCore
                    + (targetUsingBaseCore.equals(targetUsingCore) ? "" : "\t\t" + targetUsingBaseCore)
                    + CldrUtility.LINE_SEPARATOR);
                skippedOut.println("# couldn't replace  " + source + " → " + bestTarget + " ; # " + targetUsingCore);
                // NOTE(review): frequency holds the value of the last pair iterated above,
                // which is not necessarily the pair that supplied bestTarget.
                count_failures.put(-frequency, source + " → " + bestTarget + " ; # " + targetUsingCore);
                countAdded++;
                frequencyAdded += frequency;
            }
        }

        String rules = buildRules(coreRules, newRules, buffer);
        base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD); // verify that it builds

        PrintWriter out = FileUtilities.openUTF8Writer("", targetFile);
        out.println(rules);
        out.close();

        // The count printed on each line is the negated frequency used as the key above.
        out = FileUtilities.openUTF8Writer("", targetCountFile);
        for (long count : count_failures.keySet()) {
            for (String line : count_failures.getAll(count)) {
                out.println(count + "\t" + line);
            }
        }
        out.close();

        // if (false) {
        //
        // // now write out the transliterator file
        // PrintWriter out = FileUtilities.openUTF8Writer("", targetFile);
        // for (String source : store.keySet()) {
        // Set<String> targetSet = store.getAll(source);
        // for (String target : targetSet) {
        // if (seenSource.contains(source)) {
        // if (onlyToTarget) {
        // // nothing
        // } else if (seenTarget.contains(target)) {
        // skippedOut.println("# " + source + " → " + target + " ;");
        // countSkipped++;
        // } else {
        // out.println(source + " ← " + target + " ;");
        // countSourceFromTarget++;
        // }
        // } else if (onlyToTarget || seenTarget.contains(target)) {
        // out.println(source + " → " + target + " ;");
        // countSourceToTarget++;
        // } else {
        // out.println(source + " ↔ " + target + " ;");
        // countSourceAndTarget++;
        // }
        // seenSource.add(source);
        // seenTarget.add(target);
        // }
        // }
        // out.close();
        // }
        skippedOut.close();
        System.out.println("countTotal: " + nf.format(countTotal));
        System.out.println("countAdded: " + nf.format(countAdded));
        System.out.println("countSkipped: " + nf.format(countTotal - countAdded));
        System.out.println("frequencyTotal: " + nf.format(frequencyAdded + frequencySkipped));
        System.out.println("frequencyAdded: " + nf.format(frequencyAdded));
        System.out.println("frequencySkipped: " + nf.format(frequencySkipped));
    }
398 
setTranslitDebug(boolean newSetting)399     private static void setTranslitDebug(boolean newSetting) {
400         // Transliterator.DEBUG = newSetting;
401         try {
402             Field debug = Transliterator.class.getField("DEBUG");
403             debug.setBoolean(Transliterator.class, newSetting);
404         } catch (Exception e) {
405             e.printStackTrace();
406         }
407     }
408 
addSourceTarget(PrintWriter skippedOut, String source, String target, String originalLine, Relation<String, Pair<String, Long>> store)409     private static void addSourceTarget(PrintWriter skippedOut, String source, String target, String originalLine,
410         Relation<String, Pair<String, Long>> store) {
411         if (source.equals("teh")) {
412             System.out.println("debug");
413         }
414         if (!allowedSourceCharacters.containsAll(source)) {
415             skippedOut.println(originalLine
416                 + "\t# Strange source values:\t"
417                 + source
418                 + "\t"
419                 + new UnicodeSet().addAll(source)
420                     .removeAll(allowedSourceCharacters).toPattern(false));
421             countSkipped++;
422             skippedFrequency += frequency;
423             return;
424         }
425         if (!allowedTargetCharacters.containsAll(target)) {
426             System.out.println(originalLine
427                 + "\t# Strange target values:\t"
428                 + target
429                 + "\t"
430                 + new UnicodeSet().addAll(target)
431                     .removeAll(allowedTargetCharacters).toPattern(false));
432             countSkipped++;
433             skippedFrequency += frequency;
434             return;
435         }
436 
437         sourceCharacters.addAll(source);
438         targetCharacters.addAll(target);
439         store.put(source, new Pair<>(target, frequency));
440         totalFrequency += frequency;
441 
442     }
443 
checkCoreReversibility(PrintWriter skippedOut, String coreRules, String coreBackRules)444     private static void checkCoreReversibility(PrintWriter skippedOut, String coreRules, String coreBackRules) {
445         Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
446         Transliterator back = Transliterator.createFromRules("foo2", coreBackRules, Transliterator.REVERSE);
447         String[] tests = "bat bait bet beet bit bite bot boat but bute bout boot book boy pat bat vat fat mat tat dat thew father nat sat zoo ash asia gate cat late rate hate yet rang chat jet"
448             .split("\\s");
449         for (String test : tests) {
450             String test2 = base.transliterate(test);
451             String test3 = back.transliterate(test2);
452             skippedOut.println(test + "\t " + test2 + "\t " + test3);
453         }
454         skippedOut.flush();
455     }
456 
buildRules(String coreRules, List<String> newRules, StringBuilder buffer)457     private static String buildRules(String coreRules, List<String> newRules, StringBuilder buffer) {
458         // Transliterator base;
459         // build backwards!!
460         buffer.setLength(0);
461         buffer.append(
462             "# Author: M Davis" + CldrUtility.LINE_SEPARATOR +
463                 "# Email: mark.davis@icu-project.org" + CldrUtility.LINE_SEPARATOR +
464                 "# Description: English to IPA" + CldrUtility.LINE_SEPARATOR +
465                 // "$nletter {([A-Z]+)} $nletter > &en-IPA/spellout($1) ; " + Utility.LINE_SEPARATOR +
466                 ":: lower(); " + CldrUtility.LINE_SEPARATOR +
467                 "$x = [:^letter:] ;" + CldrUtility.LINE_SEPARATOR);
468         for (int i = newRules.size() - 1; i >= 0; --i) {
469             buffer.append(newRules.get(i));
470         }
471         buffer.append(coreRules);
472         // System.out.println(buffer);
473         String result = buffer.toString();
474         // ensure it builds
475         return result;
476     }
477 
showSet(UnicodeSet sourceCharacters)478     private static void showSet(UnicodeSet sourceCharacters) {
479         for (UnicodeSetIterator it = new UnicodeSetIterator(sourceCharacters); it
480             .next();) {
481             System.out.println(com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t("
482                 + UTF16.valueOf(it.codepoint) + ")\t"
483                 + UCharacter.getName(it.codepoint));
484         }
485     }
486 
    // All characters treated as vowels by distance(): plain a/e/i/o/u, the IPA vowel symbols,
    // and the placeholders â î ô that distinguishLongVowels substitutes for diphthongs.
    public static UnicodeSet vowels = new UnicodeSet("[aeiou æ ɑ ə ɛ ɪ ʊ â î ô]").freeze();
    // Short vowels; mostlyEqual() lets any of these stand in for a schwa and distance()
    // strips them before its second comparison.
    public static UnicodeSet short_vowels = new UnicodeSet("[ɑ æ ə ɛ ɪ ʊ]").freeze();
    /*
     * Scratch sets reused by distance() on every call (cleared and refilled there).
     * Because they are shared static state, distance() is not safe to call concurrently.
     */
    static UnicodeSet targetChars = new UnicodeSet();
    static UnicodeSet targetCoreChars = new UnicodeSet();
    static UnicodeSet tempDiff = new UnicodeSet();
    // Replaces each of the diphthongs ɑʊ, ɑɪ, oɪ with a single placeholder character
    // (â, î, ô) before distance() compares the character sets of two transcriptions.
    static Transliterator distinguishLongVowels = Transliterator.createFromRules("faa",
        "ɑʊ > â ;" +
            "ɑɪ > î ;" +
            "oɪ > ô ;",
        Transliterator.FORWARD);
506 
distance(String source, String target, String targetUsingCore)507     private static int distance(String source, String target, String targetUsingCore) {
508         if (target.equals(targetUsingCore)) return 0;
509         if (mostlyEqual(source, target, targetUsingCore)) return 1;
510         // first compare the consonants. Count each difference as 3
511         String zappedTarget = distinguishLongVowels.transliterate(target);
512         String zappedCoreTarget = distinguishLongVowels.transliterate(targetUsingCore);
513 
514         targetChars.clear().addAll(zappedTarget); //
515         targetCoreChars.clear().addAll(zappedCoreTarget);
516         if (targetChars.equals(targetCoreChars)) {
517             return 3;
518         }
519         targetChars.removeAll(short_vowels);
520         targetCoreChars.removeAll(short_vowels);
521         if (targetChars.equals(targetCoreChars)) {
522             return 5;
523         }
524 
525         targetChars.removeAll(vowels);
526         targetCoreChars.removeAll(vowels);
527         if (targetChars.equals(targetCoreChars)) {
528             return 5;
529         }
530 
531         tempDiff.clear().addAll(targetChars).removeAll(targetCoreChars);
532         int result = 7 + tempDiff.size();
533         tempDiff.clear().addAll(targetCoreChars).removeAll(targetChars);
534         result += tempDiff.size();
535         return result;
536     }
537 
    // Applied by mostlyEqual() to both transcriptions before they are compared character by
    // character. Each rule rewrites a two-character sequence (a diphthong or a vowel followed
    // by r) into another two-character sequence whose first character is not in short_vowels,
    // presumably so that the vowel in these contexts is not treated as interchangeable with a
    // schwa. The rules "ær > er" and "ɛr > er" also make those two sequences compare equal.
    static final Transliterator skeletonize = Transliterator.createFromRules("faa",
        "ɑʊ > âʊ ;" +
            "ɑɪ > âi ;" +
            "oɪ > oi ;" +
            "ɑr > âr ;" +
            "ær > er ;" +
            "ɛr > er ;" +
            "ɪr > ir ;" +
            "ʊr > ur ;",
        Transliterator.FORWARD);
548 
mostlyEqual(String inSource, String inTarget, String inTargetUsingCore)549     private static boolean mostlyEqual(String inSource, String inTarget, String inTargetUsingCore) {
550 
551         if (inTarget.length() != inTargetUsingCore.length()) return false;
552 
553         // transform these -- simplest that way
554         String target = skeletonize.transliterate(inTarget);
555         String targetUsingCore = skeletonize.transliterate(inTargetUsingCore);
556 
557         for (int i = 0; i < target.length(); ++i) {
558             char ca = target.charAt(i);
559             char cb = targetUsingCore.charAt(i);
560             if (ca != cb) {
561                 // disregard differences with short vowels
562                 if (ca == 'ə' && short_vowels.contains(cb) || short_vowels.contains(ca) && cb == 'ə') {
563                     continue;
564                 }
565                 // ɛ")  && a.startsWith("ɪ")
566                 if (ca == 'ɪ' && cb == 'ɛ' || ca == 'ɪ' && cb == 'ɛ') {
567                     continue;
568                 }
569                 return false;
570             }
571         }
572         return true; // return diffCount == 0 ? true : diffCount < vowelCount;
573     }
574 
    // Maps each lowercase letter a-z to a transcription of that letter's name, so that
    // transliterating a word yields the word "spelled out" letter by letter. main() uses the
    // result to drop targets that are merely the spelled-out form of the source, and to force
    // a word boundary on rules whose best target equals the spelled-out form.
    // NOTE(review): the entries for q ("kwu") and w ("dəbjə") look unusual — confirm they are intended.
    static Transliterator spellout = Transliterator.createFromRules("foo",
        "a > e ;"
            + "b > bi ;"
            + "c > si ;"
            + "d > di ;"
            + "e > i ;"
            + "f > ɛf ;"
            + "g > dʒi ;"
            + "h > etʃ ;"
            + "i > ɑɪ ;"
            + "j > dʒe ;"
            + "k > ke ;"
            + "l > ɛl ;"
            + "m > ɛm ;"
            + "n > ɛn ;"
            + "o > o ;"
            + "p > pi ;"
            + "q > kwu ;"
            + "r > ɑr ;"
            + "s > ɛs ;"
            + "t > ti ;"
            + "u > ju ;"
            + "v > vi ;"
            + "w > dəbjə ;"
            + "x > ɛks ;"
            + "y > wɑɪ ;"
            + "z > zi ;",
        Transliterator.FORWARD);
603 
604     /**
605      * Returns items sorted alphabetically, shortest first
606      */
607     static Comparator MyComparator = new Comparator() {
608 
609         @Override
610         public int compare(Object a, Object b) {
611             String as = (String) a;
612             String bs = (String) b;
613             if (as.length() < bs.length())
614                 return -1;
615             if (as.length() > bs.length())
616                 return 1;
617             int result = col.compare(as, bs);
618             if (result != 0) {
619                 return result;
620             }
621             return as.compareTo(bs);
622         }
623 
624     };
625 
626     // static String dataDir = "C:\\cvsdata\\unicode\\ucd\\unicodetools\\dictionary\\Data\\";
627 // private static String getCoreTransliterator() throws IOException {
628     //
629     // String accentRules = createFromFile(dataDir + "accentRules.txt", null, null);
630     //
631     // Transliterator doAccentRules = Transliterator.createFromRules("foo", accentRules, Transliterator.FORWARD);
632     //
633     // String markedToIpa = createFromFile(dataDir + "IPARules.txt", doAccentRules, null);
634     // System.out.println(markedToIpa);
635     // Transliterator doMarkedToIpa = Transliterator.createFromRules("foo", markedToIpa, Transliterator.FORWARD);
636     //
637     // String trial = "ạ>æ";
638     // String result = doMarkedToIpa.transliterate(trial);
639     // System.out.println("****" + result);
640     //
641     // String englishToIpaBase = createFromFile(dataDir + "reduceRules.txt", doAccentRules, doMarkedToIpa);
642     //
643     // System.out.println(englishToIpaBase);
644     //
645     // //Transform file name into id
646     //
647     // return englishToIpaBase;
648     // }
649 
createFromFile(String fileName, Transliterator pretrans, Transliterator pretrans2)650     public static String createFromFile(String fileName, Transliterator pretrans, Transliterator pretrans2)
651         throws IOException {
652         StringBuilder buffer = new StringBuilder();
653         BufferedReader fli = FileUtilities.openUTF8Reader("", fileName);
654         while (true) {
655             String line = fli.readLine();
656             if (line == null) break;
657             if (line.startsWith("\uFEFF")) line = line.substring(1);
658             if (pretrans != null) {
659                 line = pretrans.transliterate(line);
660             }
661             if (pretrans2 != null) {
662                 line = pretrans2.transliterate(line);
663             }
664 
665             buffer.append(line);
666             buffer.append(CldrUtility.LINE_SEPARATOR); // separate with whitespace
667         }
668         fli.close();
669         return buffer.toString();
670     }
671 
    // Loop bound used by getOverrides().
    static int LIMIT = Integer.MAX_VALUE;
    // Rules that normalize IPA targets; assigned in main() and also used by getOverrides().
    private static Transliterator fixBadIpa;
    // Characters seen in accepted targets / sources; filled by addSourceTarget().
    private static UnicodeSet targetCharacters;
    private static UnicodeSet sourceCharacters;
    // Characters a source / target may contain; assigned (frozen) in main(), checked by addSourceTarget().
    private static UnicodeSet allowedSourceCharacters;
    private static UnicodeSet allowedTargetCharacters;
    // Running statistics updated by main() and addSourceTarget().
    private static int countSkipped;
    private static long skippedFrequency;
    // Frequency of the line currently being processed by main(); read by addSourceTarget().
    private static long frequency;
    private static long totalFrequency;
    // Transliterator built from the core forward rules only; assigned in main().
    private static Transliterator coreBase;
683 
getOverrides()684     public static Map<String, String> getOverrides() throws IOException {
685         Map<String, String> result = new TreeMap<>();
686         BufferedReader br = FileUtilities.openUTF8Reader(cldrDataDir, "internal_overrides.txt");
687         try {
688             int counter = 0;
689             while (counter < LIMIT) {
690                 String line = br.readLine();
691                 if (line == null) break;
692                 line = line.trim();
693                 if (line.length() == 0) continue;
694 
695                 String[] iLine = line.split("\\s*→\\s*");
696                 String word = iLine[0].trim();
697                 if (result.containsKey(word)) {
698                     System.out.println("Overrides already contain: " + word);
699                     continue;
700                 }
701                 if (iLine.length < 2) {
702                     result.put(word, "");
703                 } else {
704                     String ipa = fixBadIpa.transliterate(iLine[1].trim());
705                     result.put(word, ipa);
706                 }
707             }
708         } finally {
709             br.close();
710         }
711         return result;
712     }
713 
714 }