1 package org.unicode.cldr.test;
2 
3 import java.io.PrintWriter;
4 import java.math.BigDecimal;
5 import java.text.ParsePosition;
6 import java.util.ArrayList;
7 import java.util.Arrays;
8 import java.util.BitSet;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.EnumSet;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Locale;
17 import java.util.Map;
18 import java.util.Set;
19 import java.util.TreeSet;
20 import java.util.regex.Matcher;
21 
22 import org.unicode.cldr.util.CLDRFile;
23 import org.unicode.cldr.util.CLDRFile.Status;
24 import org.unicode.cldr.util.CLDRPaths;
25 import org.unicode.cldr.util.CldrUtility;
26 import org.unicode.cldr.util.DtdType;
27 import org.unicode.cldr.util.Factory;
28 import org.unicode.cldr.util.Iso639Data;
29 import org.unicode.cldr.util.Iso639Data.Scope;
30 import org.unicode.cldr.util.Level;
31 import org.unicode.cldr.util.Pair;
32 import org.unicode.cldr.util.PatternCache;
33 import org.unicode.cldr.util.SimpleFactory;
34 import org.unicode.cldr.util.StandardCodes;
35 import org.unicode.cldr.util.VariantFolder;
36 import org.unicode.cldr.util.VariantFolder.CanonicalFolder;
37 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder;
38 import org.unicode.cldr.util.VariantFolder.CompatibilityFolder;
39 import org.unicode.cldr.util.XPathParts;
40 import org.unicode.cldr.util.props.BagFormatter;
41 
42 import com.ibm.icu.lang.UCharacter;
43 import com.ibm.icu.lang.UScript;
44 import com.ibm.icu.text.Collator;
45 import com.ibm.icu.text.DecimalFormat;
46 import com.ibm.icu.text.NumberFormat;
47 import com.ibm.icu.text.Transliterator;
48 import com.ibm.icu.text.UTF16;
49 import com.ibm.icu.text.UnicodeSet;
50 import com.ibm.icu.text.UnicodeSetIterator;
51 import com.ibm.icu.util.Currency;
52 import com.ibm.icu.util.ULocale;
53 
54 public class TestMisc {
55 
    /** Currency instance for the code "CHF"; main() applies it to the de-CH currency formatter. */
    static Currency SWISS_FRANC = Currency.getInstance("CHF");
57 
58     static class Lists {
sortedCopy(Collection<E> iterable)59         public static <E extends Comparable> List<E> sortedCopy(Collection<E> iterable) {
60             List<E> list = new ArrayList<>();
61             list.addAll(iterable);
62             Collections.sort(list);
63             return list;
64         }
65     }
66 
67     enum Foo {
68         A, M, Z
69     }
70 
main(String[] args)71     public static void main(String[] args) {
72 
73         checkAliases();
74         if (true) return;
75 
76         Transliterator en_ru = Transliterator.getInstance("en-ru");
77         System.out.println("Mark + " + en_ru.transform("Mark"));
78 
79         Transliterator latn_cyrl = Transliterator.getInstance("Latn-Cyrl");
80         System.out.println("Mark + " + latn_cyrl.transform("Mark"));
81 
82         Transliterator ulatn_ucyrl = Transliterator.getInstance("und_Latn-und_Cyrl");
83         System.out.println("Mark + " + latn_cyrl.transform("Mark"));
84 
85         Locale locale = new Locale("abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi");
86 
87         System.out
88             .println("Locale locale = new Locale(\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\");");
89         System.out.println("locale.toString() == \"" + locale + "\"");
90 
91         MyXSymbolTable sym = new MyXSymbolTable();
92         BagFormatter bf = new BagFormatter();
93         for (String test : new String[] {
94             "[:reduceCase=[[Åå{fi}]]:]",
95             "[:reduceCanonical=[[Åå{fi}]]:]",
96             "[[,٫.]]",
97             "[[,٫.][:close=compatibility:]]",
98             "[[\\ ,٬.']]",
99             "[[\\ ,٬.'][:close=compatibility:]]",
100             "[[\u002E\u2024\uFE52\uFF0E\u3002][:close=compatibility:]]",
101             "[[[\u002C \u002E \u066B \u2024 \u3002 \uFE52 \uFF0E、، \u002E \u2024 \uFE52 \uFF0E \u3002]-[\u002E\u2024\uFE52\uFF0E\u3002]][:close=compatibility:]]",
102 
103             "[[" +
104                 "\\u0020" +
105                 "[, ٬ ..․﹒ '' \u2018 \u2019 ]" +
106                 "-[.\u2024\u3002\uFE12\uFE52\uFF0E\uFF61]" +
107                 "-[,\u060C\u066B\u3001\uFE10\uFE11\uFE50\uFE51\uFF0C\uFF64]]" +
108                 "[:close=compatibility:]]",
109 
110             /*
111              * "[[Åå{fi}][:close=canonical:]]",
112              * "[[Åå{fi}][:close=compatibility:]]",
113              * "[[Åå{fi}][:reduce=case:]]",
114              * "[[Åå{fi}][:reduce=canonical:]]",
115              * "[[Åå{fi}][:reduce=compatibility:]]",
116              */
117         }) {
118             ParsePosition p = new ParsePosition(0);
119             UnicodeSet set = new UnicodeSet(test, p, sym);
120             UnicodeSet codes = set.complement().complement();
121             System.out.println(test + CldrUtility.LINE_SEPARATOR +
122                 codes.toPattern(true) + CldrUtility.LINE_SEPARATOR +
123                 bf.showSetNames(set.complement().complement()) + CldrUtility.LINE_SEPARATOR);
124         }
125         if (true) return;
126 
127         StandardCodes sc = StandardCodes.make();
128         for (String s : new String[] { "language", "script", "territory" }) {
129             System.out.println(s + ":\t" + sc.getGoodAvailableCodes(s).size());
130         }
131         if (true) return;
132 
133         Set<Foo> inFileOrder = EnumSet.allOf(Foo.class);
134         List<Foo> inAlphaOrder = Lists.sortedCopy(inFileOrder);
135         System.out.println(inFileOrder);
136         System.out.println(inAlphaOrder);
137 
138         DecimalFormat currencyFormat = (DecimalFormat) NumberFormat.getCurrencyInstance(new ULocale("de-CH"));
139         currencyFormat.setCurrency(SWISS_FRANC);
140         // sometime later...
141         // we want the financial format of the currency, not the retail format
142         System.out.println("Retail:\t" + currencyFormat.format(123.53));
143         BigDecimal increment = currencyFormat.getRoundingIncrement();
144         System.out.println("Rounding Increment:\t" + increment);
145         double double_increment = increment.doubleValue();
146         System.out.println("Double rounding Increment:\t" + double_increment);
147         double log = Math.log10(double_increment);
148         System.out.println("Double log:\t" + log);
149         double new_increment = Math.pow(10, Math.floor(log));
150         System.out.println("Floored Increment:\t" + new_increment);
151         currencyFormat.setRoundingIncrement(new_increment);
152         System.out.println("Financial:\t" + currencyFormat.format(123.53));
153 
154         if (true) return;
155 
156         testWeights();
157         if (true) return;
158 
159         testScripts();
160         testToRegex();
161         // checkEastAsianWidth();
162         if (true) return;
163         // import ICU
164         UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]");
165 
166         checkCollections();
167 
168         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
169         CLDRFile englishFile = cldrFactory.make("en", true);
170         ExampleGenerator eg = new ExampleGenerator(englishFile, englishFile, CLDRPaths.SUPPLEMENTAL_DIRECTORY);
171         System.out
172             .println(eg
173                 .getHelpHtml(
174                     "//ldml/numbers/currencyFormats/currencyFormatLength/currencyFormat[@type=\"standard\"]/pattern[@type=\"standard\"][@draft=\"provisional\"]",
175                     ""));
176         System.out.println(eg.getHelpHtml("/exemplarCharacters", ""));
177         System.out.println(eg.getHelpHtml("/calendar/pattern", ""));
178 
179         if (true) return;
180         Set<String> s = new HashSet<>(Arrays.asList("a", "A", "c"));
181         Collator caselessCompare = Collator.getInstance(Locale.ENGLISH);
182         caselessCompare.setStrength(Collator.PRIMARY);
183         Set<String> t = new TreeSet<>(caselessCompare);
184         t.addAll(Arrays.asList("a", "b", "c"));
185         System.out.println("s equals t: " + s.equals(t));
186         System.out.println("t equals s: " + t.equals(s));
187 
188         Set<String> u = Collections.unmodifiableSet(t);
189         System.out.println("s==t " + (s.equals(t)));
190         System.out.println("s==u " + (s.equals(u)));
191         UnicodeSet x = new UnicodeSet("[a-z]");
192         UnicodeSet y = new UnicodeSet("[a-z]").freeze();
193         System.out.println("x==y " + (x.equals(y)));
194         // showEnglish();
195         // checkPrivateUse();
196         // testPopulous();
197         // checkDistinguishing();
198         // checkEastAsianWidth();
199         // checkEnglishPaths();
200         System.out.println("Done");
201     }
202 
checkAliases()203     private static void checkAliases() {
204         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
205         CLDRFile en = cldrFactory.make("root", true);
206         Status status = new Status();
207         Matcher m = PatternCache.get("gregorian.*dayPeriods").matcher("");
208         for (Iterator<String> it = en.iterator(null, en.getComparator()); it.hasNext();) {
209             String path = it.next();
210             if (!m.reset(path).find()) {
211                 continue;
212             }
213             //String locale = en.getSourceLocaleID(path, status);
214             String value = en.getStringValue(path);
215             String fullPath = en.getFullXPath(path);
216             System.out.println("value:\t" + value + "\tpath:\t" + fullPath);
217             if (!path.equals(status.pathWhereFound)) {
218                 System.out.println("\torigin:\t" + status);
219             }
220             // System.out.println("locale:\t" + locale);
221             System.out.println();
222         }
223     }
224 
testWeights()225     private static void testWeights() {
226         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
227         CLDRFile english = cldrFactory.make("en", true);
228         Set<Pair<Integer, String>> rel = new TreeSet<>();
229         for (String desiredLocale : cldrFactory.getAvailable()) {
230             int vote = Level.getDefaultWeight("google", desiredLocale);
231             rel.add(new Pair<>(vote, desiredLocale));
232         }
233         for (Pair<Integer, String> p : rel) {
234             System.out.println(p + "\t" + english.getName(p.getSecond()));
235         }
236     }
237 
testScripts()238     private static void testScripts() {
239         BagFormatter bf = new BagFormatter();
240 
241         UnicodeSet caseFolded = new UnicodeSet();
242         UnicodeSet simpleCaseFolded = new UnicodeSet();
243         for (int i = 0; i < 0x10FFFF; ++i) {
244             String form = UTF16.valueOf(i);
245             if (UCharacter.foldCase(form, true).equals(form)) {
246                 caseFolded.add(i);
247             }
248             if (UCharacter.foldCase(i, true) == i) {
249                 simpleCaseFolded.add(i);
250             }
251         }
252         caseFolded.freeze();
253         simpleCaseFolded.freeze();
254 
255         UnicodeSet functionalExceptCase = new UnicodeSet("[" +
256             "[:L:][:Mc:][:Mn:][:Nd:]" +
257             "&[:^NFKC_QuickCheck=No:]" +
258             "&[:^default_ignorable_code_point:]]").freeze();
259 
260         UnicodeSet asciiIdn = new UnicodeSet("[-A-Z0-9]").freeze();
261 
262         UnicodeSet archaic = new UnicodeSet("[" +
263             "[:script=Bugi:]" +
264             "[:script=Copt:]" +
265             "[:script=Cprt:]" +
266             "[:script=Dsrt:]" +
267             "[:script=Glag:]" +
268             "[:script=Goth:]" +
269             "[:script=Hano:]" +
270             "[:script=Ital:]" +
271             "[:script=Khar:]" +
272             "[:script=Linb:]" +
273             "[:script=Ogam:]" +
274             "[:script=Osma:]" +
275             "[:script=Phag:]" +
276             "[:script=Phnx:]" +
277             "[:script=Runr:]" +
278             "[:script=Shaw:]" +
279             "[:script=Sylo:]" +
280             "[:script=Syrc:]" +
281             "[:script=Tagb:]" +
282             "[:script=Tglg:]" +
283             "[:script=Ugar:]" +
284             "[:script=Xpeo:]" +
285             "[:script=Xsux:]" +
286             // "[:script=Arab:]" +
287             // "[:script=Armn:]" +
288             // "[:script=Beng:]" +
289             // "[:script=Bopo:]" +
290             "[:block=Combining_Diacritical_Marks _for_Symbols:]" +
291             "[:block=Musical_Symbols:]" +
292             "[:block=Ancient_Greek_Musical_Notation:]]").freeze();
293 
294         System.out.println("functionalExceptCase: " + functionalExceptCase);
295         System.out.println("archaic: " + archaic);
296 
297         System.out.println("SimpleCaseFolded & !CaseFolded & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
298             + bf.showSetNames(new UnicodeSet(simpleCaseFolded)
299                 .removeAll(caseFolded)
300                 .retainAll(functionalExceptCase)
301                 .removeAll(archaic).removeAll(asciiIdn)));
302 
303         UnicodeSet functional = new UnicodeSet(functionalExceptCase).retainAll(caseFolded).freeze();
304         System.out.println("functional: " + functional.size());
305         UnicodeSet functionalAndNotArchaic = new UnicodeSet(functional).removeAll(archaic).freeze();
306         System.out.println("archaic: " + archaic.size());
307         System.out.println("functionalAndNotArchaic: " + functionalAndNotArchaic.size());
308 
309         // System.out.println(bf.showSetNames("Case Folded", caseFolded,"Simple Case Folded", simpleCaseFolded));
310 
311         UnicodeSet functionalCommon = new UnicodeSet("[:script=common:]").retainAll(functional).removeAll(archaic)
312             .removeAll(asciiIdn);
313         System.out.println("Common & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
314             + bf.showSetNames(functionalCommon));
315 
316         UnicodeSet functionalInherited = new UnicodeSet("[:script=inherited:]").retainAll(functional)
317             .removeAll(archaic).removeAll(asciiIdn);
318         System.out.println("Inherited & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
319             + bf.showSetNames(functionalInherited));
320 
321         UnicodeSet nl = new UnicodeSet("[:Nl:]").retainAll(functional).removeAll(archaic);
322         System.out.println("Nl:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(new UnicodeSet("[:Nl:]")));
323         System.out.println("Nl & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(nl));
324 
325         UnicodeSet restrictedXidContinue = new UnicodeSet(
326             "[[:xid_continue:]" +
327                 "&[:^NFKC_QuickCheck=No:]" +
328                 "&[:^default_ignorable_code_point:]" +
329                 "&[:^Pc:]]").retainAll(caseFolded);
330 
331         System.out.println(bf.showSetDifferences("IDNA Functional", functional,
332             "Unicode XID & NFKC &!DefaultIgnorable &! Pc", restrictedXidContinue));
333 
334         Transliterator t = Transliterator.getInstance("lower");
335         System.out.println("ABC " + t.transliterate("ABC"));
336         /*
337          * generalCategory(cp) is {Ll, Lu, Lo, Lm, Mn, Mc, Nd}, AND
338          * NFKC(cp) == cp, AND
339          * casefold(cp) == cp, AND
340          * !defaultIgnorableCodePoint(cp)
341          */
342         BitSet scripts = new BitSet();
343         for (int cp = 0; cp < 0x10FFFF; ++cp) {
344             int script = UScript.getScript(cp);
345             if (script == UScript.COMMON || script == UScript.UNKNOWN || script == UScript.INHERITED) {
346                 continue;
347             }
348             scripts.set(script);
349         }
350         Set<String> toPrint = new TreeSet<>();
351         for (int script = 0; script < scripts.size(); ++script) {
352             if (!scripts.get(script)) continue;
353             String code = UScript.getShortName(script);
354             String name = UScript.getName(script);
355             if (StandardCodes.isScriptModern(code)) {
356                 toPrint.add("modern\t" + code + "\t" + name);
357             } else {
358                 toPrint.add("archaic\t" + code + "\t" + name);
359             }
360         }
361         for (String line : toPrint) {
362             System.out.println(line);
363         }
364     }
365 
checkCollections()366     private static void checkCollections() {
367         System.out.println("Collections");
368         new org.unicode.cldr.util.CldrUtility.Apply<String>() {
369             @Override
370             public void apply(String item) {
371                 if (Iso639Data.getScope(item.toString()) != Scope.Collection) return;
372                 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", "));
373             }
374         }.applyTo(Iso639Data.getAvailable());
375         System.out.println(CldrUtility.LINE_SEPARATOR + "Macrolanguages");
376         new org.unicode.cldr.util.CldrUtility.Apply<String>() {
377             @Override
378             public void apply(String item) {
379                 if (Iso639Data.getScope(item.toString()) != Scope.Macrolanguage) return;
380                 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", "));
381             }
382         }.applyTo(Iso639Data.getAvailable());
383     }
384 
testToRegex()385     static void testToRegex() {
386         String[] tests = { "\\-", "a", "d-f", "\\u2000", "\\uAC00-\\uAC12", "{AB}", "{CDE}", "\\uFFF0-\\U0010000F",
387             "\\U0010100F-\\U0010300F" }; // }; //
388         for (int i = (1 << tests.length) - 1; i >= 0; --i) {
389             String test = "[";
390             for (int j = 0; j < tests.length; ++j) {
391                 if ((i & (1 << j)) != 0) {
392                     test += tests[j];
393                 }
394             }
395             test += "]";
396             testToRegex(new UnicodeSet(test));
397         }
398     }
399 
testToRegex(UnicodeSet test)400     private static void testToRegex(UnicodeSet test) {
401         String formatted = CldrUtility.toRegex(test);
402         System.out.println(test + "\t->\t" + formatted);
403         Matcher newTest = PatternCache.get(formatted).matcher("");
404         UnicodeSet failures = new UnicodeSet();
405         for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next();) {
406             if (!newTest.reset(it.getString()).matches()) {
407                 failures.add(it.getString());
408             }
409         }
410         if (failures.size() != 0) {
411             System.out.println("\tFailed on: " + failures);
412         }
413         System.out.flush();
414     }
415 
checkEastAsianWidth()416     static void checkEastAsianWidth() {
417         UnicodeSet dontCares = new UnicodeSet("[[:surrogate:][:unassigned:][:control:]]").freeze();
418         UnicodeSet dontCares2 = new UnicodeSet("[:^letter:]").freeze();
419 
420         // UnicodeSet wide = new UnicodeSet("[[:East_Asian_Width=wide:][:East_Asian_Width=fullwidth:][:Co:]]"); //
421         // remove supplementaries
422         // System.out.format("Wide %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR, wide);
423         // System.out.format("Wide(spanned) %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR,
424         // Utility.addDontCareSpans(wide, dontCares));
425         // UnicodeSet zeroWidth = new
426         // UnicodeSet("[[:default_ignorable_code_point:][:Mn:][:Me:]-[:Noncharacter_Code_Point:]-[:Cc:]]"); // remove
427         // supplementaries
428         // System.out.format("ZeroWidth %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR, zeroWidth);
429         // System.out.format("ZeroWidth(spanned) %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR,
430         // Utility.addDontCareSpans(zeroWidth, dontCares));
431 
432         // P2. In each paragraph, find the first character of type L, AL, or R.
433         UnicodeSet strongL = new UnicodeSet("[[:BidiClass=L:]-[:unassigned:]]").freeze(); //
434         showSpans("Bidi L", strongL, dontCares);
435         showSpans("Bidi L*", strongL, dontCares2);
436 
437         UnicodeSet strongRAL = new UnicodeSet("[[:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze();
438         showSpans("Bidi R,AL", strongRAL, dontCares);
439         showSpans("Bidi R,AL*", strongRAL, dontCares2);
440 
441         UnicodeSet strong = new UnicodeSet(
442             "[[:BidiClass=L:][:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze();
443         showSpans("Strong", strong, dontCares);
444         showSpans("Strong*", strong, dontCares2);
445 
446     }
447 
showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares)448     private static void showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares) {
449         System.out.println(title);
450         System.out.format("\tSource Set: %s" + CldrUtility.LINE_SEPARATOR, sourceSet);
451         System.out.format("\tDon't Cares: %s" + CldrUtility.LINE_SEPARATOR, dontCares);
452         UnicodeSet spanned = new UnicodeSet(sourceSet).addBridges(dontCares);
453         spanned = spanned.complement().complement();
454         String spannedString = spanned.toString();
455         String unescapedString = spanned.toPattern(false);
456         System.out.format("\tRanges: %d" + CldrUtility.LINE_SEPARATOR, spanned.getRangeCount());
457         System.out.format("\tStrlen(\\u): %d" + CldrUtility.LINE_SEPARATOR, spannedString.length());
458         System.out.format("\tStrlen(!\\u): %d" + CldrUtility.LINE_SEPARATOR, unescapedString.length());
459         String title2 = "Result";
460         String sample = spannedString;
461         if (false) {
462             if (sample.length() > 60) {
463                 title2 = "Sample";
464                 sample = sample.substring(0, 60) + " ...";
465             }
466         }
467         System.out.format("\t%s: %s" + CldrUtility.LINE_SEPARATOR, title2, sample);
468         System.out.println();
469     }
470 
    /**
     * Code points in the U+3006..U+30FC and U+FF70..U+FF9F ranges; the trailing comment on each
     * entry gives the character name and general category.
     * NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still used.
     */
    static int[] extraCJK = {

        0x3006, // IDEOGRAPHIC CLOSING MARK;Lo
        0x302A, // IDEOGRAPHIC LEVEL TONE MARK;Mn
        0x302B, // IDEOGRAPHIC RISING TONE MARK;Mn
        0x302C, // IDEOGRAPHIC DEPARTING TONE MARK;Mn
        0x302D, // IDEOGRAPHIC ENTERING TONE MARK;Mn
        0x302E, // HANGUL SINGLE DOT TONE MARK;Mn
        0x302F, // HANGUL DOUBLE DOT TONE MARK;Mn
        0x3031, // VERTICAL KANA REPEAT MARK;Lm
        0x3032, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK;Lm
        0x3033, // VERTICAL KANA REPEAT MARK UPPER HALF;Lm
        0x3034, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF;Lm
        0x3035, // VERTICAL KANA REPEAT MARK LOWER HALF;Lm
        0x303C, // MASU MARK;Lo
        0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn
        0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn
        0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK;Sk
        0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Sk
        0x30A0, // KATAKANA-HIRAGANA DOUBLE HYPHEN;Pd
        0x30FC, // KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK;Lm
        0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm
    };
496 
    /**
     * Placeholder with no implementation; the body is only a note naming the scripts
     * (Han, Hangul, Hiragana, Katakana, Bopomofo) a future check would presumably cover.
     */
    void checkCFK() {
        // UnicodeSet Han, Hangul, Hiragana, Katakana, or Bopomofo
    }
500 
checkDistinguishing()501     private static void checkDistinguishing() {
502         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
503         Set<String> cldrFiles = cldrFactory.getAvailableLanguages();
504         Set<String> distinguishing = new TreeSet<>();
505         Set<String> nondistinguishing = new TreeSet<>();
506         for (Iterator<String> it = cldrFiles.iterator(); it.hasNext();) {
507             CLDRFile cldrFile = cldrFactory.make(it.next(), false);
508             DtdType dtdType = null;
509             if (cldrFile.isNonInheriting()) {
510                 continue;
511             }
512             for (Iterator<String> it2 = cldrFile.iterator(); it2.hasNext();) {
513                 String path = it2.next();
514                 if (dtdType == null) {
515                     dtdType = DtdType.fromPath(path);
516                 }
517                 String fullPath = cldrFile.getFullXPath(path);
518                 if (path.equals(fullPath)) {
519                     continue;
520                 }
521                 XPathParts parts = XPathParts.getFrozenInstance(fullPath);
522                 for (int i = 0; i < parts.size(); ++i) {
523                     Map<String, String> m = parts.getAttributes(i);
524                     if (m.size() == 0) {
525                         continue;
526                     }
527                     String element = parts.getElement(i);
528                     for (Iterator<String> mit = m.keySet().iterator(); mit.hasNext();) {
529                         String attribute = mit.next();
530                         if (CLDRFile.isDistinguishing(dtdType, element, attribute)) {
531                             distinguishing.add(attribute + "\tD\t" + element);
532                         } else {
533                             nondistinguishing.add(attribute + "\tN\t" + element);
534                         }
535                     }
536                 }
537             }
538         }
539         System.out.println("Distinguishing");
540         for (Iterator<String> it = distinguishing.iterator(); it.hasNext();) {
541             System.out.println(it.next());
542         }
543         System.out.println();
544         System.out.println("Non-Distinguishing");
545         for (Iterator<String> it = nondistinguishing.iterator(); it.hasNext();) {
546             System.out.println(it.next());
547         }
548     }
549 
showEnglish()550     private static void showEnglish() {
551         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
552         String requestedLocale = "en";
553         CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
554         CLDRFile.Status status = new CLDRFile.Status();
555         for (Iterator<String> it = cldrFile.iterator(); it.hasNext();) {
556             String requestedPath = it.next();
557             String localeWhereFound = cldrFile.getSourceLocaleID(requestedPath, status);
558             if (!localeWhereFound.equals(requestedLocale) || !status.pathWhereFound.equals(requestedPath)) {
559                 System.out.println("requested path:\t" + requestedPath
560                     + "\tfound locale:\t" + localeWhereFound
561                     + "\tsame?\t" + localeWhereFound.equals(requestedLocale)
562                     + "\tfound path:\t" + status.pathWhereFound
563                     + "\tsame?\t" + status.pathWhereFound.equals(requestedPath));
564             }
565         }
566     }
567 
checkPrivateUse()568     private static void checkPrivateUse() {
569         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
570         String requestedLocale = "en";
571         CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
572         StandardCodes sc = StandardCodes.make();
573         Set<String> careAbout = new HashSet<>(Arrays.asList(new String[] { "language", "script", "territory", "variant" }));
574         HashMap<String, Set<String>> foundItems = new HashMap<>();
575         TreeSet<String> problems = new TreeSet<>();
576         for (Iterator<String> it = cldrFile.iterator("", new UTF16.StringComparator(true, false, 0)); it.hasNext();) {
577             String requestedPath = it.next();
578             XPathParts parts = XPathParts.getFrozenInstance(requestedPath);
579             String element = parts.getElement(-1);
580             if (!careAbout.contains(element)) {
581                 continue;
582             }
583             String type = parts.getAttributeValue(-1, "type");
584             if (type == null) {
585                 continue;
586             }
587             Set<String> foundSet = foundItems.get(element);
588             if (foundSet == null) {
589                 foundItems.put(element, foundSet = new TreeSet<>());
590             }
591             foundSet.add(type);
592 
593             List<String> data = sc.getFullData(element, type);
594             if (data == null) {
595                 problems.add("No RFC3066bis data for: " + element + "\t" + type + "\t"
596                     + cldrFile.getStringValue(requestedPath));
597                 continue;
598             }
599             if (isPrivateOrDeprecated(data)) {
600                 problems.add("Private/Deprecated Data for: " + element + "\t" + type + "\t"
601                     + cldrFile.getStringValue(requestedPath) + "\t" + data);
602             }
603             // String canonical_value = (String)data.get(2);
604         }
605         for (Iterator<String> it = problems.iterator(); it.hasNext();) {
606             System.out.println(it.next());
607         }
608         for (Iterator<String> it = careAbout.iterator(); it.hasNext();) {
609             String element = it.next();
610             Set<String> real = sc.getAvailableCodes(element);
611             Set<String> notFound = new TreeSet<>(real);
612             notFound.removeAll(foundItems.get(element));
613             for (Iterator<String> it2 = notFound.iterator(); it2.hasNext();) {
614                 String type = it2.next();
615                 List<String> data = sc.getFullData(element, type);
616                 if (isPrivateOrDeprecated(data)) continue;
617                 System.out.println("Missing Translation for: " + element + "\t" + type + "\t"
618                     + "\t" + data);
619             }
620         }
621     }
622 
isPrivateOrDeprecated(List<String> data)623     static boolean isPrivateOrDeprecated(List<String> data) {
624         if (data.toString().indexOf("PRIVATE") >= 0) {
625             return true;
626         }
627         if ("PRIVATE USE".equals(data.get(0))) return true;
628         if (data.size() < 3) return false;
629         if (data.get(2) == null) return false;
630         if (data.get(2).toString().length() != 0) return true;
631         return false;
632     }
633 
testPopulous()634     static void testPopulous() {
635         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
636         CLDRFile supp = cldrFactory.make("supplementalData", false);
637         CLDRFile temp = SimpleFactory.makeFile("supplemental");
638         temp.setNonInheriting(true);
639         for (Iterator<String> it = supp.iterator(null, supp.getComparator()); it.hasNext();) {
640             String path = it.next();
641             String value = supp.getStringValue(path);
642             String fullPath = supp.getFullXPath(path);
643             XPathParts parts = XPathParts.getFrozenInstance(fullPath);
644             String type = parts.getAttributeValue(-1, "type");
645             String pop = language_territory_hack_map.get(type);
646             if (pop != null) {
647                 parts = parts.cloneAsThawed();
648                 parts.putAttributeValue(-1, "mostPopulousTerritory", pop);
649                 fullPath = parts.toString();
650             }
651             temp.add(fullPath, value);
652         }
653         PrintWriter pw = new PrintWriter(System.out);
654         temp.write(pw);
655         pw.close();
656     }
657 
    // Lookup read by testPopulous(): the value found for a "type" attribute is written out as that
    // element's mostPopulousTerritory attribute.
    // NOTE(review): nothing in the visible part of this file fills this map — presumably it is
    // populated from language_territory_hack elsewhere; confirm.
    private static final Map<String, String> language_territory_hack_map = new HashMap<>();
659     private static final String[][] language_territory_hack = {
660         { "af", "ZA" },
661         { "am", "ET" },
662         { "ar", "SA" },
663         { "as", "IN" },
664         { "ay", "PE" },
665         { "az", "AZ" },
666         { "bal", "PK" },
667         { "be", "BY" },
668         { "bg", "BG" },
669         { "bn", "IN" },
670         { "bs", "BA" },
671         { "ca", "ES" },
672         { "ch", "MP" },
673         { "cpe", "SL" },
674         { "cs", "CZ" },
675         { "cy", "GB" },
676         { "da", "DK" },
677         { "de", "DE" },
678         { "dv", "MV" },
679         { "dz", "BT" },
680         { "el", "GR" },
681         { "en", "US" },
682         { "es", "ES" },
683         { "et", "EE" },
684         { "eu", "ES" },
685         { "fa", "IR" },
686         { "fi", "FI" },
687         { "fil", "PH" },
688         { "fj", "FJ" },
689         { "fo", "FO" },
690         { "fr", "FR" },
691         { "ga", "IE" },
692         { "gd", "GB" },
693         { "gl", "ES" },
694         { "gn", "PY" },
695         { "gu", "IN" },
696         { "gv", "GB" },
697         { "ha", "NG" },
698         { "he", "IL" },
699         { "hi", "IN" },
700         { "ho", "PG" },
701         { "hr", "HR" },
702         { "ht", "HT" },
703         { "hu", "HU" },
704         { "hy", "AM" },
705         { "id", "ID" },
706         { "is", "IS" },
707         { "it", "IT" },
708         { "ja", "JP" },
709         { "ka", "GE" },
710         { "kk", "KZ" },
711         { "kl", "GL" },
712         { "km", "KH" },
713         { "kn", "IN" },
714         { "ko", "KR" },
715         { "kok", "IN" },
716         { "ks", "IN" },
717         { "ku", "TR" },
718         { "ky", "KG" },
719         { "la", "VA" },
720         { "lb", "LU" },
721         { "ln", "CG" },
722         { "lo", "LA" },
723         { "lt", "LT" },
724         { "lv", "LV" },
725         { "mai", "IN" },
726         { "men", "GN" },
727         { "mg", "MG" },
728         { "mh", "MH" },
729         { "mk", "MK" },
730         { "ml", "IN" },
731         { "mn", "MN" },
732         { "mni", "IN" },
733         { "mo", "MD" },
734         { "mr", "IN" },
735         { "ms", "MY" },
736         { "mt", "MT" },
737         { "my", "MM" },
738         { "na", "NR" },
739         { "nb", "NO" },
740         { "nd", "ZA" },
741         { "ne", "NP" },
742         { "niu", "NU" },
743         { "nl", "NL" },
744         { "nn", "NO" },
745         { "no", "NO" },
746         { "nr", "ZA" },
747         { "nso", "ZA" },
748         { "ny", "MW" },
749         { "om", "KE" },
750         { "or", "IN" },
751         { "pa", "IN" },
752         { "pau", "PW" },
753         { "pl", "PL" },
754         { "ps", "PK" },
755         { "pt", "BR" },
756         { "qu", "PE" },
757         { "rn", "BI" },
758         { "ro", "RO" },
759         { "ru", "RU" },
760         { "rw", "RW" },
761         { "sd", "IN" },
762         { "sg", "CF" },
763         { "si", "LK" },
764         { "sk", "SK" },
765         { "sl", "SI" },
766         { "sm", "WS" },
767         { "so", "DJ" },
768         { "sq", "CS" },
769         { "sr", "CS" },
770         { "ss", "ZA" },
771         { "st", "ZA" },
772         { "sv", "SE" },
773         { "sw", "KE" },
774         { "ta", "IN" },
775         { "te", "IN" },
776         { "tem", "SL" },
777         { "tet", "TL" },
778         { "th", "TH" },
779         { "ti", "ET" },
780         { "tg", "TJ" },
781         { "tk", "TM" },
782         { "tkl", "TK" },
783         { "tvl", "TV" },
784         { "tl", "PH" },
785         { "tn", "ZA" },
786         { "to", "TO" },
787         { "tpi", "PG" },
788         { "tr", "TR" },
789         { "ts", "ZA" },
790         { "uk", "UA" },
791         { "ur", "IN" },
792         { "uz", "UZ" },
793         { "ve", "ZA" },
794         { "vi", "VN" },
795         { "wo", "SN" },
796         { "xh", "ZA" },
797         { "zh", "CN" },
798         { "zh_Hant", "TW" },
799         { "zu", "ZA" },
800         { "aa", "ET" },
801         { "byn", "ER" },
802         { "eo", "DE" },
803         { "gez", "ET" },
804         { "haw", "US" },
805         { "iu", "CA" },
806         { "kw", "GB" },
807         { "sa", "IN" },
808         { "sh", "HR" },
809         { "sid", "ET" },
810         { "syr", "SY" },
811         { "tig", "ER" },
812         { "tt", "RU" },
813         { "wal", "ET" }, };
814     static {
815         for (int i = 0; i < language_territory_hack.length; ++i) {
language_territory_hack_map.put(language_territory_hack[i][0], language_territory_hack[i][1])816             language_territory_hack_map.put(language_territory_hack[i][0], language_territory_hack[i][1]);
817         }
818     }
819 
820     static class MyXSymbolTable extends UnicodeSet.XSymbolTable {
821         static VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder());
822         static VariantFolder canonicalFolder = new VariantFolder(new CanonicalFolder());
823         static VariantFolder compatibilityFolder = new VariantFolder(new CompatibilityFolder());
824 
825         @Override
applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)826         public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
827             if (propertyName.equalsIgnoreCase("close")) {
828                 if (propertyValue.equalsIgnoreCase("case")) {
829                     result.addAll(caseFolder.getClosure(result));
830                 } else if (propertyValue.equalsIgnoreCase("canonical")) {
831                     result.addAll(canonicalFolder.getClosure(result));
832                 } else if (propertyValue.equalsIgnoreCase("compatibility")) {
833                     result.addAll(compatibilityFolder.getClosure(result));
834                 }
835                 return true;
836             } else if (propertyName.equalsIgnoreCase("reduce")) {
837                 if (propertyValue.equalsIgnoreCase("case")) {
838                     UnicodeSet temp = caseFolder.reduce(result);
839                     result.clear().addAll(temp);
840                 } else if (propertyValue.equalsIgnoreCase("canonical")) {
841                     UnicodeSet temp = canonicalFolder.reduce(result);
842                     result.clear().addAll(temp);
843                 } else if (propertyValue.equalsIgnoreCase("compatibility")) {
844                     UnicodeSet temp = compatibilityFolder.reduce(result);
845                     result.clear().addAll(temp);
846                 }
847                 return true;
848             } else if (propertyName.equalsIgnoreCase("reduceCase")) {
849                 UnicodeSet temp = caseFolder.reduce(new UnicodeSet(propertyValue.replace(
850                     "·]", ":]")));
851                 result.clear().addAll(temp);
852                 return true;
853             } else if (propertyName.equalsIgnoreCase("reduceCanonical")) {
854                 UnicodeSet temp = canonicalFolder.reduce(new UnicodeSet(propertyValue.replace(
855                     "·]", ":]")));
856                 result.clear().addAll(temp);
857                 return true;
858             } else if (propertyName.equalsIgnoreCase("reduceCase")) {
859                 UnicodeSet temp = caseFolder.reduce(new UnicodeSet(propertyValue.replace(
860                     "·]", ":]")));
861                 result.clear().addAll(temp);
862                 return true;
863             }
864             return false;
865         }
866     }
867 
868 }