package org.unicode.cldr.test;

import java.io.PrintWriter;
import java.math.BigDecimal;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;

import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.Status;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Scope;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.VariantFolder;
import org.unicode.cldr.util.VariantFolder.CanonicalFolder;
import org.unicode.cldr.util.VariantFolder.CaseVariantFolder;
import org.unicode.cldr.util.VariantFolder.CompatibilityFolder;
import org.unicode.cldr.util.XPathParts;
import org.unicode.cldr.util.props.BagFormatter;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;

/**
 * Collection of ad-hoc, manually run experiments against CLDR data and ICU APIs.
 *
 * <p>Everything is driven from {@link #main(String[])}, which runs a sequence of stages separated by
 * {@code if (true) return;} statements. Only the stages before the first such statement execute; the
 * later ones are kept so that they continue to compile and can be re-enabled by moving or removing a
 * {@code return}. All results are written to {@code System.out}.
 */
public class TestMisc {

    /** Currency applied to the currency format in the rounding-increment demo in {@code main}. */
    static final Currency SWISS_FRANC = Currency.getInstance("CHF");

    /** Small list helpers used by the demos in this class. */
    static class Lists {
        /**
         * Returns a new, independent list holding the elements of {@code iterable} in natural order.
         *
         * @param iterable the elements to copy; it is not modified
         * @return a freshly allocated, sorted {@link ArrayList}
         */
        public static <E extends Comparable<? super E>> List<E> sortedCopy(Collection<E> iterable) {
            List<E> list = new ArrayList<>(iterable);
            Collections.sort(list);
            return list;
        }
    }

    /** Sample enum for the {@link EnumSet} / {@link Lists#sortedCopy} ordering demo. */
    enum Foo {
        A, M, Z
    }

    /**
     * Runs the experiments. As checked in, only {@link #checkAliases()} executes; see the class comment
     * for how the remaining stages are enabled.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        checkAliases();
        if (true) return;

        // Transliteration demo: the same input through three differently named transforms.
        Transliterator en_ru = Transliterator.getInstance("en-ru");
        System.out.println("Mark + " + en_ru.transform("Mark"));

        Transliterator latn_cyrl = Transliterator.getInstance("Latn-Cyrl");
        System.out.println("Mark + " + latn_cyrl.transform("Mark"));

        Transliterator ulatn_ucyrl = Transliterator.getInstance("und_Latn-und_Cyrl");
        // Fixed: this previously printed the result of latn_cyrl again and never used ulatn_ucyrl.
        System.out.println("Mark + " + ulatn_ucyrl.transform("Mark"));

        // java.util.Locale with deliberately malformed fields.
        Locale locale = new Locale("abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi");

        System.out
            .println("Locale locale = new Locale(\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\");");
        System.out.println("locale.toString() == \"" + locale + "\"");

        // UnicodeSet patterns using the custom "close"/"reduce" properties from MyXSymbolTable.
        MyXSymbolTable sym = new MyXSymbolTable();
        BagFormatter bf = new BagFormatter();
        for (String test : new String[] {
            "[:reduceCase=[[Åå{fi}]]:]",
            "[:reduceCanonical=[[Åå{fi}]]:]",
            "[[,٫.]]",
            "[[,٫.][:close=compatibility:]]",
            "[[\\ ,٬.']]",
            "[[\\ ,٬.'][:close=compatibility:]]",
            "[[\u002E\u2024\uFE52\uFF0E\u3002][:close=compatibility:]]",
            "[[[\u002C \u002E \u066B \u2024 \u3002 \uFE52 \uFF0E、، \u002E \u2024 \uFE52 \uFF0E \u3002]-[\u002E\u2024\uFE52\uFF0E\u3002]][:close=compatibility:]]",

            "[[" +
                "\\u0020" +
                "[, ٬ ..․﹒ '' \u2018 \u2019 ]" +
                "-[.\u2024\u3002\uFE12\uFE52\uFF0E\uFF61]" +
                "-[,\u060C\u066B\u3001\uFE10\uFE11\uFE50\uFE51\uFF0C\uFF64]]" +
                "[:close=compatibility:]]",
        }) {
            ParsePosition p = new ParsePosition(0);
            UnicodeSet set = new UnicodeSet(test, p, sym);
            // NOTE(review): the double complement looks like a way to force a normalized set before
            // printing -- confirm it is still needed.
            UnicodeSet codes = set.complement().complement();
            System.out.println(test + CldrUtility.LINE_SEPARATOR +
                codes.toPattern(true) + CldrUtility.LINE_SEPARATOR +
                bf.showSetNames(set.complement().complement()) + CldrUtility.LINE_SEPARATOR);
        }
        if (true) return;

        // Number of "good" codes per code type.
        StandardCodes sc = StandardCodes.make();
        for (String codeType : new String[] { "language", "script", "territory" }) {
            System.out.println(codeType + ":\t" + sc.getGoodAvailableCodes(codeType).size());
        }
        if (true) return;

        // Enum declaration order versus natural (sorted) order.
        Set<Foo> inFileOrder = EnumSet.allOf(Foo.class);
        List<Foo> inAlphaOrder = Lists.sortedCopy(inFileOrder);
        System.out.println(inFileOrder);
        System.out.println(inAlphaOrder);

        // Rounding-increment demo: replace the format's increment by the power of ten at or below it.
        DecimalFormat currencyFormat = (DecimalFormat) NumberFormat.getCurrencyInstance(new ULocale("de-CH"));
        currencyFormat.setCurrency(SWISS_FRANC);
        // sometime later...
        // we want the financial format of the currency, not the retail format
        System.out.println("Retail:\t" + currencyFormat.format(123.53));
        BigDecimal increment = currencyFormat.getRoundingIncrement();
        System.out.println("Rounding Increment:\t" + increment);
        double double_increment = increment.doubleValue();
        System.out.println("Double rounding Increment:\t" + double_increment);
        double log = Math.log10(double_increment);
        System.out.println("Double log:\t" + log);
        double new_increment = Math.pow(10, Math.floor(log));
        System.out.println("Floored Increment:\t" + new_increment);
        currencyFormat.setRoundingIncrement(new_increment);
        System.out.println("Financial:\t" + currencyFormat.format(123.53));

        if (true) return;

        testWeights();
        if (true) return;

        testScripts();
        testToRegex();
        if (true) return;

        checkCollections();

        // Help HTML produced by ExampleGenerator for a few paths.
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        CLDRFile englishFile = cldrFactory.make("en", true);
        ExampleGenerator eg = new ExampleGenerator(englishFile, englishFile, CLDRPaths.SUPPLEMENTAL_DIRECTORY);
        System.out
            .println(eg
                .getHelpHtml(
                    "//ldml/numbers/currencyFormats/currencyFormatLength/currencyFormat[@type=\"standard\"]/pattern[@type=\"standard\"][@draft=\"provisional\"]",
                    ""));
        System.out.println(eg.getHelpHtml("/exemplarCharacters", ""));
        System.out.println(eg.getHelpHtml("/calendar/pattern", ""));

        if (true) return;

        // Set equality between a HashSet and a TreeSet ordered by a primary-strength collator.
        Set<String> hashed = new HashSet<>(Arrays.asList("a", "A", "c"));
        Collator caselessCompare = Collator.getInstance(Locale.ENGLISH);
        caselessCompare.setStrength(Collator.PRIMARY);
        Set<String> collated = new TreeSet<>(caselessCompare);
        collated.addAll(Arrays.asList("a", "b", "c"));
        System.out.println("s equals t: " + hashed.equals(collated));
        System.out.println("t equals s: " + collated.equals(hashed));

        Set<String> unmodifiable = Collections.unmodifiableSet(collated);
        System.out.println("s==t " + (hashed.equals(collated)));
        System.out.println("s==u " + (hashed.equals(unmodifiable)));
        UnicodeSet x = new UnicodeSet("[a-z]");
        UnicodeSet y = new UnicodeSet("[a-z]").freeze();
        System.out.println("x==y " + (x.equals(y)));
        // Further checks that can be called from here by hand: showEnglish(), checkPrivateUse(),
        // testPopulous(), checkDistinguishing(), checkEastAsianWidth().
        System.out.println("Done");
    }

    /**
     * Prints value and full path for every path of the resolved "root" file that matches
     * {@code gregorian.*dayPeriods}, plus the lookup status when the value was found under a
     * different path than the one requested.
     */
    private static void checkAliases() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        CLDRFile root = cldrFactory.make("root", true);
        Status status = new Status();
        Matcher m = PatternCache.get("gregorian.*dayPeriods").matcher("");
        for (Iterator<String> it = root.iterator(null, root.getComparator()); it.hasNext();) {
            String path = it.next();
            if (!m.reset(path).find()) {
                continue;
            }
            // Fixed: the lookup that fills in `status` had been commented out, so pathWhereFound was
            // never set and the comparison below was true for every path.
            root.getSourceLocaleID(path, status);
            String value = root.getStringValue(path);
            String fullPath = root.getFullXPath(path);
            System.out.println("value:\t" + value + "\tpath:\t" + fullPath);
            if (!path.equals(status.pathWhereFound)) {
                System.out.println("\torigin:\t" + status);
            }
            System.out.println();
        }
    }

    /**
     * Prints every available locale together with the default vote weight of the organization
     * "google" for it and the locale's English name, ordered by (weight, locale).
     */
    private static void testWeights() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        CLDRFile english = cldrFactory.make("en", true);
        Set<Pair<Integer, String>> byWeight = new TreeSet<>();
        for (String desiredLocale : cldrFactory.getAvailable()) {
            int vote = Level.getDefaultWeight("google", desiredLocale);
            byWeight.add(new Pair<>(vote, desiredLocale));
        }
        for (Pair<Integer, String> p : byWeight) {
            System.out.println(p + "\t" + english.getName(p.getSecond()));
        }
    }

    /**
     * Builds several character sets (case-folded, "functional", "archaic", ...) and prints how they
     * intersect, then lists every script that has at least one code point, tagged "modern" or
     * "archaic" according to {@link StandardCodes#isScriptModern}.
     */
    private static void testScripts() {
        BagFormatter bf = new BagFormatter();

        // Code points that are unchanged by full (string) and by simple (code point) case folding.
        UnicodeSet caseFolded = new UnicodeSet();
        UnicodeSet simpleCaseFolded = new UnicodeSet();
        // Fixed: the bound used to be "< 0x10FFFF", which skipped the last code point.
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; ++cp) {
            String form = UTF16.valueOf(cp);
            if (UCharacter.foldCase(form, true).equals(form)) {
                caseFolded.add(cp);
            }
            if (UCharacter.foldCase(cp, true) == cp) {
                simpleCaseFolded.add(cp);
            }
        }
        caseFolded.freeze();
        simpleCaseFolded.freeze();

        UnicodeSet functionalExceptCase = new UnicodeSet("[" +
            "[:L:][:Mc:][:Mn:][:Nd:]" +
            "&[:^NFKC_QuickCheck=No:]" +
            "&[:^default_ignorable_code_point:]]").freeze();

        UnicodeSet asciiIdn = new UnicodeSet("[-A-Z0-9]").freeze();

        UnicodeSet archaic = new UnicodeSet("[" +
            "[:script=Bugi:]" +
            "[:script=Copt:]" +
            "[:script=Cprt:]" +
            "[:script=Dsrt:]" +
            "[:script=Glag:]" +
            "[:script=Goth:]" +
            "[:script=Hano:]" +
            "[:script=Ital:]" +
            "[:script=Khar:]" +
            "[:script=Linb:]" +
            "[:script=Ogam:]" +
            "[:script=Osma:]" +
            "[:script=Phag:]" +
            "[:script=Phnx:]" +
            "[:script=Runr:]" +
            "[:script=Shaw:]" +
            "[:script=Sylo:]" +
            "[:script=Syrc:]" +
            "[:script=Tagb:]" +
            "[:script=Tglg:]" +
            "[:script=Ugar:]" +
            "[:script=Xpeo:]" +
            "[:script=Xsux:]" +
            "[:block=Combining_Diacritical_Marks _for_Symbols:]" +
            "[:block=Musical_Symbols:]" +
            "[:block=Ancient_Greek_Musical_Notation:]]").freeze();

        System.out.println("functionalExceptCase: " + functionalExceptCase);
        System.out.println("archaic: " + archaic);

        System.out.println("SimpleCaseFolded & !CaseFolded & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
            + bf.showSetNames(new UnicodeSet(simpleCaseFolded)
                .removeAll(caseFolded)
                .retainAll(functionalExceptCase)
                .removeAll(archaic).removeAll(asciiIdn)));

        UnicodeSet functional = new UnicodeSet(functionalExceptCase).retainAll(caseFolded).freeze();
        System.out.println("functional: " + functional.size());
        UnicodeSet functionalAndNotArchaic = new UnicodeSet(functional).removeAll(archaic).freeze();
        System.out.println("archaic: " + archaic.size());
        System.out.println("functionalAndNotArchaic: " + functionalAndNotArchaic.size());

        UnicodeSet functionalCommon = new UnicodeSet("[:script=common:]").retainAll(functional).removeAll(archaic)
            .removeAll(asciiIdn);
        System.out.println("Common & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
            + bf.showSetNames(functionalCommon));

        UnicodeSet functionalInherited = new UnicodeSet("[:script=inherited:]").retainAll(functional)
            .removeAll(archaic).removeAll(asciiIdn);
        System.out.println("Inherited & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR
            + bf.showSetNames(functionalInherited));

        UnicodeSet nl = new UnicodeSet("[:Nl:]").retainAll(functional).removeAll(archaic);
        System.out.println("Nl:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(new UnicodeSet("[:Nl:]")));
        System.out.println("Nl & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(nl));

        UnicodeSet restrictedXidContinue = new UnicodeSet(
            "[[:xid_continue:]" +
                "&[:^NFKC_QuickCheck=No:]" +
                "&[:^default_ignorable_code_point:]" +
                "&[:^Pc:]]").retainAll(caseFolded);

        System.out.println(bf.showSetDifferences("IDNA Functional", functional,
            "Unicode XID & NFKC &!DefaultIgnorable &! Pc", restrictedXidContinue));

        Transliterator t = Transliterator.getInstance("lower");
        System.out.println("ABC " + t.transliterate("ABC"));
        /*
         * generalCategory(cp) is {Ll, Lu, Lo, Lm, Mn, Mc, Nd}, AND
         * NFKC(cp) == cp, AND
         * casefold(cp) == cp, AND
         * !defaultIgnorableCodePoint(cp)
         */
        // Collect every script, other than Common/Unknown/Inherited, that owns at least one code point.
        BitSet scripts = new BitSet();
        // Fixed: same off-by-one bound as in the loop at the top of this method.
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; ++cp) {
            int script = UScript.getScript(cp);
            if (script == UScript.COMMON || script == UScript.UNKNOWN || script == UScript.INHERITED) {
                continue;
            }
            scripts.set(script);
        }
        // TreeSet: lines come out sorted, so all "archaic" lines precede the "modern" ones.
        Set<String> toPrint = new TreeSet<>();
        for (int script = scripts.nextSetBit(0); script >= 0; script = scripts.nextSetBit(script + 1)) {
            String code = UScript.getShortName(script);
            String name = UScript.getName(script);
            String age = StandardCodes.isScriptModern(code) ? "modern" : "archaic";
            toPrint.add(age + "\t" + code + "\t" + name);
        }
        for (String line : toPrint) {
            System.out.println(line);
        }
    }

    /** Prints the ISO 639 codes whose scope is Collection, then those whose scope is Macrolanguage. */
    private static void checkCollections() {
        System.out.println("Collections");
        printIso639CodesWithScope(Scope.Collection);
        System.out.println(CldrUtility.LINE_SEPARATOR + "Macrolanguages");
        printIso639CodesWithScope(Scope.Macrolanguage);
    }

    /** Prints "code TAB names" for each available ISO 639 code whose scope equals {@code wanted}. */
    private static void printIso639CodesWithScope(final Scope wanted) {
        new org.unicode.cldr.util.CldrUtility.Apply<String>() {
            @Override
            public void apply(String item) {
                if (Iso639Data.getScope(item) != wanted) return;
                System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", "));
            }
        }.applyTo(Iso639Data.getAvailable());
    }

    /**
     * Exercises {@link CldrUtility#toRegex} on every subset of a fixed list of UnicodeSet fragments
     * (single characters, ranges, strings, supplementary ranges), from all fragments down to none.
     */
    static void testToRegex() {
        String[] tests = { "\\-", "a", "d-f", "\\u2000", "\\uAC00-\\uAC12", "{AB}", "{CDE}", "\\uFFF0-\\U0010000F",
            "\\U0010100F-\\U0010300F" };
        // Bit j of mask decides whether fragment j is part of the pattern.
        for (int mask = (1 << tests.length) - 1; mask >= 0; --mask) {
            StringBuilder pattern = new StringBuilder("[");
            for (int j = 0; j < tests.length; ++j) {
                if ((mask & (1 << j)) != 0) {
                    pattern.append(tests[j]);
                }
            }
            pattern.append(']');
            testToRegex(new UnicodeSet(pattern.toString()));
        }
    }

    /**
     * Converts {@code test} to a regular expression, prints the conversion, and reports every element
     * of the set that the resulting regular expression does not match in full.
     *
     * @param test the set to convert and verify
     */
    private static void testToRegex(UnicodeSet test) {
        String formatted = CldrUtility.toRegex(test);
        System.out.println(test + "\t->\t" + formatted);
        Matcher newTest = PatternCache.get(formatted).matcher("");
        UnicodeSet failures = new UnicodeSet();
        for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next();) {
            String element = it.getString();
            if (!newTest.reset(element).matches()) {
                failures.add(element);
            }
        }
        if (!failures.isEmpty()) {
            System.out.println("\tFailed on: " + failures);
        }
        System.out.flush();
    }

    /**
     * Despite its name, this prints span statistics for the strong bidi classes (L, R+AL, and all
     * three together), each once with surrogate/unassigned/control code points as "don't cares" and
     * once with all non-letters as "don't cares".
     */
    static void checkEastAsianWidth() {
        UnicodeSet dontCares = new UnicodeSet("[[:surrogate:][:unassigned:][:control:]]").freeze();
        UnicodeSet dontCares2 = new UnicodeSet("[:^letter:]").freeze();

        // P2. In each paragraph, find the first character of type L, AL, or R.
        UnicodeSet strongL = new UnicodeSet("[[:BidiClass=L:]-[:unassigned:]]").freeze();
        showSpans("Bidi L", strongL, dontCares);
        showSpans("Bidi L*", strongL, dontCares2);

        UnicodeSet strongRAL = new UnicodeSet("[[:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze();
        showSpans("Bidi R,AL", strongRAL, dontCares);
        showSpans("Bidi R,AL*", strongRAL, dontCares2);

        UnicodeSet strong = new UnicodeSet(
            "[[:BidiClass=L:][:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze();
        showSpans("Strong", strong, dontCares);
        showSpans("Strong*", strong, dontCares2);

    }

    /**
     * Prints {@code sourceSet} after {@code addBridges(dontCares)} has been applied to a copy of it,
     * together with its range count and the lengths of its escaped and unescaped patterns.
     *
     * @param title heading printed before the statistics
     * @param sourceSet the set to widen; it is copied, not modified
     * @param dontCares the set handed to {@code addBridges}
     */
    private static void showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares) {
        System.out.println(title);
        System.out.format("\tSource Set: %s" + CldrUtility.LINE_SEPARATOR, sourceSet);
        System.out.format("\tDon't Cares: %s" + CldrUtility.LINE_SEPARATOR, dontCares);
        UnicodeSet spanned = new UnicodeSet(sourceSet).addBridges(dontCares);
        // NOTE(review): the double complement looks like a way to force a normalized set -- confirm.
        spanned = spanned.complement().complement();
        String spannedString = spanned.toString();
        String unescapedString = spanned.toPattern(false);
        System.out.format("\tRanges: %d" + CldrUtility.LINE_SEPARATOR, spanned.getRangeCount());
        System.out.format("\tStrlen(\\u): %d" + CldrUtility.LINE_SEPARATOR, spannedString.length());
        System.out.format("\tStrlen(!\\u): %d" + CldrUtility.LINE_SEPARATOR, unescapedString.length());
        String title2 = "Result";
        String sample = spannedString;
        // Truncation of long results is switched off; change the condition to re-enable it.
        if (false) {
            if (sample.length() > 60) {
                title2 = "Sample";
                sample = sample.substring(0, 60) + " ...";
            }
        }
        System.out.format("\t%s: %s" + CldrUtility.LINE_SEPARATOR, title2, sample);
        System.out.println();
    }

    /** Additional CJK-related code points; each entry is annotated with its name and category. */
    static int[] extraCJK = {

        0x3006, // IDEOGRAPHIC CLOSING MARK;Lo
        0x302A, // IDEOGRAPHIC LEVEL TONE MARK;Mn
        0x302B, // IDEOGRAPHIC RISING TONE MARK;Mn
        0x302C, // IDEOGRAPHIC DEPARTING TONE MARK;Mn
        0x302D, // IDEOGRAPHIC ENTERING TONE MARK;Mn
        0x302E, // HANGUL SINGLE DOT TONE MARK;Mn
        0x302F, // HANGUL DOUBLE DOT TONE MARK;Mn
        0x3031, // VERTICAL KANA REPEAT MARK;Lm
        0x3032, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK;Lm
        0x3033, // VERTICAL KANA REPEAT MARK UPPER HALF;Lm
        0x3034, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF;Lm
        0x3035, // VERTICAL KANA REPEAT MARK LOWER HALF;Lm
        0x303C, // MASU MARK;Lo
        0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn
        0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn
        0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK;Sk
        0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Sk
        0x30A0, // KATAKANA-HIRAGANA DOUBLE HYPHEN;Pd
        0x30FC, // KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK;Lm
        0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm
    };

    /** Placeholder; intentionally empty. */
    void checkCFK() {
        // UnicodeSet Han, Hangul, Hiragana, Katakana, or Bopomofo
    }

    /**
     * Scans the unresolved file of every available language and prints two sorted lists of
     * "attribute / D-or-N / element" lines: the combinations that
     * {@link CLDRFile#isDistinguishing} reports as distinguishing, and those it does not.
     */
    private static void checkDistinguishing() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        Set<String> cldrFiles = cldrFactory.getAvailableLanguages();
        Set<String> distinguishing = new TreeSet<>();
        Set<String> nondistinguishing = new TreeSet<>();
        for (String localeId : cldrFiles) {
            CLDRFile cldrFile = cldrFactory.make(localeId, false);
            if (cldrFile.isNonInheriting()) {
                continue;
            }
            // Determined from the first path of the file and reused for the remaining paths.
            DtdType dtdType = null;
            for (Iterator<String> it2 = cldrFile.iterator(); it2.hasNext();) {
                String path = it2.next();
                if (dtdType == null) {
                    dtdType = DtdType.fromPath(path);
                }
                String fullPath = cldrFile.getFullXPath(path);
                if (path.equals(fullPath)) {
                    continue;
                }
                XPathParts parts = XPathParts.getFrozenInstance(fullPath);
                for (int i = 0; i < parts.size(); ++i) {
                    Map<String, String> attributes = parts.getAttributes(i);
                    if (attributes.size() == 0) {
                        continue;
                    }
                    String element = parts.getElement(i);
                    for (String attribute : attributes.keySet()) {
                        if (CLDRFile.isDistinguishing(dtdType, element, attribute)) {
                            distinguishing.add(attribute + "\tD\t" + element);
                        } else {
                            nondistinguishing.add(attribute + "\tN\t" + element);
                        }
                    }
                }
            }
        }
        System.out.println("Distinguishing");
        for (String line : distinguishing) {
            System.out.println(line);
        }
        System.out.println();
        System.out.println("Non-Distinguishing");
        for (String line : nondistinguishing) {
            System.out.println(line);
        }
    }

    /**
     * Prints every path of the resolved "en" file whose value was found in a different locale or
     * under a different path than the one requested.
     */
    private static void showEnglish() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        String requestedLocale = "en";
        CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
        CLDRFile.Status status = new CLDRFile.Status();
        for (Iterator<String> it = cldrFile.iterator(); it.hasNext();) {
            String requestedPath = it.next();
            String localeWhereFound = cldrFile.getSourceLocaleID(requestedPath, status);
            boolean sameLocale = localeWhereFound.equals(requestedLocale);
            boolean samePath = status.pathWhereFound.equals(requestedPath);
            if (!sameLocale || !samePath) {
                System.out.println("requested path:\t" + requestedPath
                    + "\tfound locale:\t" + localeWhereFound
                    + "\tsame?\t" + sameLocale
                    + "\tfound path:\t" + status.pathWhereFound
                    + "\tsame?\t" + samePath);
            }
        }
    }

    /**
     * Cross-checks the language/script/territory/variant entries of the resolved "en" file against
     * {@link StandardCodes}: reports entries with no registry data or with private/deprecated data,
     * then reports registry codes that have no entry in the file.
     */
    private static void checkPrivateUse() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        String requestedLocale = "en";
        CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
        StandardCodes sc = StandardCodes.make();
        Set<String> careAbout = new HashSet<>(Arrays.asList("language", "script", "territory", "variant"));
        Map<String, Set<String>> foundItems = new HashMap<>();
        Set<String> problems = new TreeSet<>();
        for (Iterator<String> it = cldrFile.iterator("", new UTF16.StringComparator(true, false, 0)); it.hasNext();) {
            String requestedPath = it.next();
            XPathParts parts = XPathParts.getFrozenInstance(requestedPath);
            String element = parts.getElement(-1);
            if (!careAbout.contains(element)) {
                continue;
            }
            String type = parts.getAttributeValue(-1, "type");
            if (type == null) {
                continue;
            }
            Set<String> foundSet = foundItems.get(element);
            if (foundSet == null) {
                foundSet = new TreeSet<>();
                foundItems.put(element, foundSet);
            }
            foundSet.add(type);

            List<String> data = sc.getFullData(element, type);
            if (data == null) {
                problems.add("No RFC3066bis data for: " + element + "\t" + type + "\t"
                    + cldrFile.getStringValue(requestedPath));
                continue;
            }
            if (isPrivateOrDeprecated(data)) {
                problems.add("Private/Deprecated Data for: " + element + "\t" + type + "\t"
                    + cldrFile.getStringValue(requestedPath) + "\t" + data);
            }
        }
        for (String problem : problems) {
            System.out.println(problem);
        }
        for (String element : careAbout) {
            Set<String> notFound = new TreeSet<>(sc.getAvailableCodes(element));
            // Fixed: an element kind with no entries in the file has no set here; removeAll(null) threw.
            Set<String> found = foundItems.get(element);
            if (found != null) {
                notFound.removeAll(found);
            }
            for (String type : notFound) {
                // May be null, as in the first loop; isPrivateOrDeprecated tolerates that.
                List<String> data = sc.getFullData(element, type);
                if (isPrivateOrDeprecated(data)) continue;
                System.out.println("Missing Translation for: " + element + "\t" + type + "\t"
                    + "\t" + data);
            }
        }
    }

    /**
     * Tells whether a registry record describes a private-use or deprecated code.
     *
     * @param data the record as returned by {@code StandardCodes.getFullData}; may be null or empty
     * @return {@code true} if any field contains the text "PRIVATE", or if the record has a non-null,
     *         non-empty third field; {@code false} otherwise, including for null or empty records
     */
    static boolean isPrivateOrDeprecated(List<String> data) {
        // Fixed: null used to throw NullPointerException, an empty list IndexOutOfBoundsException.
        if (data == null || data.isEmpty()) {
            return false;
        }
        // Also covers a first field equal to "PRIVATE USE", which used to be tested separately.
        if (data.toString().contains("PRIVATE")) {
            return true;
        }
        if (data.size() < 3) {
            return false;
        }
        String thirdField = data.get(2);
        return thirdField != null && !thirdField.isEmpty();
    }

    /**
     * Copies the unresolved "supplementalData" file into a new non-inheriting file, adding a
     * {@code mostPopulousTerritory} attribute to each path whose last element has a {@code type}
     * listed in the language/territory table below, and writes the copy to {@code System.out}.
     */
    static void testPopulous() {
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        CLDRFile supp = cldrFactory.make("supplementalData", false);
        CLDRFile temp = SimpleFactory.makeFile("supplemental");
        temp.setNonInheriting(true);
        for (Iterator<String> it = supp.iterator(null, supp.getComparator()); it.hasNext();) {
            String path = it.next();
            String value = supp.getStringValue(path);
            String fullPath = supp.getFullXPath(path);
            XPathParts parts = XPathParts.getFrozenInstance(fullPath);
            String type = parts.getAttributeValue(-1, "type");
            String pop = language_territory_hack_map.get(type);
            if (pop != null) {
                // The frozen instance cannot be modified, so work on a thawed copy.
                parts = parts.cloneAsThawed();
                parts.putAttributeValue(-1, "mostPopulousTerritory", pop);
                fullPath = parts.toString();
            }
            temp.add(fullPath, value);
        }
        PrintWriter pw = new PrintWriter(System.out);
        temp.write(pw);
        // Fixed: flush rather than close. Closing the writer also closed System.out, which silenced
        // anything printed afterwards.
        pw.flush();
    }

    /** Lookup form of {@link #language_territory_hack}: language code to territory code. */
    private static final Map<String, String> language_territory_hack_map = new HashMap<>();
    /** Hard-coded { language, territory } pairs consulted by {@link #testPopulous()}. */
    private static final String[][] language_territory_hack = {
        { "af", "ZA" },
        { "am", "ET" },
        { "ar", "SA" },
        { "as", "IN" },
        { "ay", "PE" },
        { "az", "AZ" },
        { "bal", "PK" },
        { "be", "BY" },
        { "bg", "BG" },
        { "bn", "IN" },
        { "bs", "BA" },
        { "ca", "ES" },
        { "ch", "MP" },
        { "cpe", "SL" },
        { "cs", "CZ" },
        { "cy", "GB" },
        { "da", "DK" },
        { "de", "DE" },
        { "dv", "MV" },
        { "dz", "BT" },
        { "el", "GR" },
        { "en", "US" },
        { "es", "ES" },
        { "et", "EE" },
        { "eu", "ES" },
        { "fa", "IR" },
        { "fi", "FI" },
        { "fil", "PH" },
        { "fj", "FJ" },
        { "fo", "FO" },
        { "fr", "FR" },
        { "ga", "IE" },
        { "gd", "GB" },
        { "gl", "ES" },
        { "gn", "PY" },
        { "gu", "IN" },
        { "gv", "GB" },
        { "ha", "NG" },
        { "he", "IL" },
        { "hi", "IN" },
        { "ho", "PG" },
        { "hr", "HR" },
        { "ht", "HT" },
        { "hu", "HU" },
        { "hy", "AM" },
        { "id", "ID" },
        { "is", "IS" },
        { "it", "IT" },
        { "ja", "JP" },
        { "ka", "GE" },
        { "kk", "KZ" },
        { "kl", "GL" },
        { "km", "KH" },
        { "kn", "IN" },
        { "ko", "KR" },
        { "kok", "IN" },
        { "ks", "IN" },
        { "ku", "TR" },
        { "ky", "KG" },
        { "la", "VA" },
        { "lb", "LU" },
        { "ln", "CG" },
        { "lo", "LA" },
        { "lt", "LT" },
        { "lv", "LV" },
        { "mai", "IN" },
        { "men", "GN" },
        { "mg", "MG" },
        { "mh", "MH" },
        { "mk", "MK" },
        { "ml", "IN" },
        { "mn", "MN" },
        { "mni", "IN" },
        { "mo", "MD" },
        { "mr", "IN" },
        { "ms", "MY" },
        { "mt", "MT" },
        { "my", "MM" },
        { "na", "NR" },
        { "nb", "NO" },
        { "nd", "ZA" },
        { "ne", "NP" },
        { "niu", "NU" },
        { "nl", "NL" },
        { "nn", "NO" },
        { "no", "NO" },
        { "nr", "ZA" },
        { "nso", "ZA" },
        { "ny", "MW" },
        { "om", "KE" },
        { "or", "IN" },
        { "pa", "IN" },
        { "pau", "PW" },
        { "pl", "PL" },
        { "ps", "PK" },
        { "pt", "BR" },
        { "qu", "PE" },
        { "rn", "BI" },
        { "ro", "RO" },
        { "ru", "RU" },
        { "rw", "RW" },
        { "sd", "IN" },
        { "sg", "CF" },
        { "si", "LK" },
        { "sk", "SK" },
        { "sl", "SI" },
        { "sm", "WS" },
        { "so", "DJ" },
        { "sq", "CS" },
        { "sr", "CS" },
        { "ss", "ZA" },
        { "st", "ZA" },
        { "sv", "SE" },
        { "sw", "KE" },
        { "ta", "IN" },
        { "te", "IN" },
        { "tem", "SL" },
        { "tet", "TL" },
        { "th", "TH" },
        { "ti", "ET" },
        { "tg", "TJ" },
        { "tk", "TM" },
        { "tkl", "TK" },
        { "tvl", "TV" },
        { "tl", "PH" },
        { "tn", "ZA" },
        { "to", "TO" },
        { "tpi", "PG" },
        { "tr", "TR" },
        { "ts", "ZA" },
        { "uk", "UA" },
        { "ur", "IN" },
        { "uz", "UZ" },
        { "ve", "ZA" },
        { "vi", "VN" },
        { "wo", "SN" },
        { "xh", "ZA" },
        { "zh", "CN" },
        { "zh_Hant", "TW" },
        { "zu", "ZA" },
        { "aa", "ET" },
        { "byn", "ER" },
        { "eo", "DE" },
        { "gez", "ET" },
        { "haw", "US" },
        { "iu", "CA" },
        { "kw", "GB" },
        { "sa", "IN" },
        { "sh", "HR" },
        { "sid", "ET" },
        { "syr", "SY" },
        { "tig", "ER" },
        { "tt", "RU" },
        { "wal", "ET" }, };
    static {
        for (String[] pair : language_territory_hack) {
            language_territory_hack_map.put(pair[0], pair[1]);
        }
    }

    /**
     * Symbol table that adds pseudo-properties to UnicodeSet patterns:
     * <ul>
     * <li>{@code close=case|canonical|compatibility} adds the corresponding closure of the set built
     * so far;</li>
     * <li>{@code reduce=case|canonical|compatibility} replaces the set built so far by its reduced
     * form;</li>
     * <li>{@code reduceCase=}, {@code reduceCanonical=} and {@code reduceCompatibility=} take a
     * UnicodeSet pattern as the value and replace the result by the reduced form of that set.</li>
     * </ul>
     */
    static class MyXSymbolTable extends UnicodeSet.XSymbolTable {
        static final VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder());
        static final VariantFolder canonicalFolder = new VariantFolder(new CanonicalFolder());
        static final VariantFolder compatibilityFolder = new VariantFolder(new CompatibilityFolder());

        /**
         * Applies one of the pseudo-properties described on the class to {@code result}.
         *
         * @param propertyName the property name; compared case-insensitively
         * @param propertyValue a folding kind for {@code close}/{@code reduce}, otherwise a set pattern
         * @param result the set to update in place
         * @return {@code true} if the property name is one of those handled here (for
         *         {@code close}/{@code reduce} even when the kind is unknown, in which case
         *         {@code result} is left untouched); {@code false} otherwise
         */
        @Override
        public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
            if (propertyName.equalsIgnoreCase("close")) {
                VariantFolder folder = folderFor(propertyValue);
                if (folder != null) {
                    result.addAll(folder.getClosure(result));
                }
                return true;
            }
            if (propertyName.equalsIgnoreCase("reduce")) {
                VariantFolder folder = folderFor(propertyValue);
                if (folder != null) {
                    UnicodeSet reduced = folder.reduce(result);
                    result.clear().addAll(reduced);
                }
                return true;
            }
            if (propertyName.equalsIgnoreCase("reduceCase")) {
                setToReduced(caseFolder, propertyValue, result);
                return true;
            }
            if (propertyName.equalsIgnoreCase("reduceCanonical")) {
                setToReduced(canonicalFolder, propertyValue, result);
                return true;
            }
            // Fixed: this branch used to test "reduceCase" a second time and could never be reached.
            if (propertyName.equalsIgnoreCase("reduceCompatibility")) {
                setToReduced(compatibilityFolder, propertyValue, result);
                return true;
            }
            return false;
        }

        /** Maps a folding kind (case-insensitive) to its folder, or returns null for an unknown kind. */
        private static VariantFolder folderFor(String kind) {
            if (kind.equalsIgnoreCase("case")) {
                return caseFolder;
            }
            if (kind.equalsIgnoreCase("canonical")) {
                return canonicalFolder;
            }
            if (kind.equalsIgnoreCase("compatibility")) {
                return compatibilityFolder;
            }
            return null;
        }

        /**
         * Replaces the contents of {@code result} by the reduced form of the set described by
         * {@code pattern}. Every "·]" in the pattern is rewritten to ":]" before parsing.
         */
        private static void setToReduced(VariantFolder folder, String pattern, UnicodeSet result) {
            UnicodeSet reduced = folder.reduce(new UnicodeSet(pattern.replace(
                "·]", ":]")));
            result.clear().addAll(reduced);
        }
    }

}