1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.lang.invoke.MethodHandles;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.Collections;
9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.Map;
16 import java.util.Map.Entry;
17 import java.util.Set;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
20 import java.util.regex.Pattern;
21 
22 import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo;
23 import org.unicode.cldr.util.CLDRConfig;
24 import org.unicode.cldr.util.CLDRFile;
25 import org.unicode.cldr.util.CLDRPaths;
26 import org.unicode.cldr.util.ChainedMap;
27 import org.unicode.cldr.util.ChainedMap.M3;
28 import org.unicode.cldr.util.DtdType;
29 import org.unicode.cldr.util.Factory;
30 import org.unicode.cldr.util.Pair;
31 import org.unicode.cldr.util.PatternCache;
32 import org.unicode.cldr.util.StandardCodes;
33 import org.unicode.cldr.util.StandardCodes.LstrField;
34 import org.unicode.cldr.util.StandardCodes.LstrType;
35 import org.unicode.cldr.util.SupplementalDataInfo;
36 import org.unicode.cldr.util.Validity;
37 import org.unicode.cldr.util.Validity.Status;
38 import org.unicode.cldr.util.WikiSubdivisionLanguages;
39 import org.unicode.cldr.util.XMLFileReader;
40 import org.unicode.cldr.util.XPathParts;
41 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
42 
43 import com.google.common.base.Joiner;
44 import com.ibm.icu.impl.Relation;
45 import com.ibm.icu.impl.Row.R2;
46 import com.ibm.icu.impl.Utility;
47 import com.ibm.icu.lang.UCharacter;
48 import com.ibm.icu.text.CaseMap;
49 import com.ibm.icu.text.Collator;
50 import com.ibm.icu.text.LocaleDisplayNames;
51 import com.ibm.icu.text.Normalizer2;
52 import com.ibm.icu.text.RuleBasedCollator;
53 import com.ibm.icu.util.ULocale;
54 
55 public class SubdivisionNode {
56     static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
57     static final Map<String, R2<List<String>, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory");
58     static final Set<String> containment = SDI.getContainers();
59     static final Map<String, Map<LstrField, String>> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region);
60 
61     static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH);
62 
63     static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase();
64     static final Comparator<String> ROOT_COL;
65     static {
66         RuleBasedCollator _ROOT_COL = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);
67         _ROOT_COL.setNumericCollation(true);
_ROOT_COL.freeze()68         _ROOT_COL.freeze();
69         ROOT_COL = (Comparator) _ROOT_COL;
70     }
71     static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
72     static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish();
73     static final Normalizer2 nfc = Normalizer2.getNFCInstance();
74 
convertToCldr(String regionOrSubdivision)75     public static String convertToCldr(String regionOrSubdivision) {
76         return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT)
77             : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT);
78     }
79 
80     final SubdivisionSet sset;
81     final String code;
82     final int level;
83     final SubdivisionNode parent;
84     final Map<String, SubdivisionNode> children = new TreeMap<>(ROOT_COL);
85 
SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset)86     public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) {
87         this.code = code;
88         this.level = parent == null ? -1 : parent.level + 1;
89         this.parent = parent;
90         this.sset = sset;
91         sset.ID_TO_NODE.put(code, this);
92     }
93 
addName(String lang, String value)94     public SubdivisionNode addName(String lang, String value) {
95         sset.NAMES.put(code, lang, value);
96         return this;
97     }
98 
99     static class SubdivisionSet {
100 
101         final M3<String, String, String> NAMES = ChainedMap.of(
102             new TreeMap<String, Object>(),
103             new TreeMap<String, Object>(),
104             String.class);
105         final Map<String, String> TO_COUNTRY_CODE = new TreeMap<>();
106         final Relation<String, String> ID_SAMPLE = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
107         final Map<String, String> SUB_TO_CAT = new TreeMap<>();
108         final Relation<String, String> REGION_CONTAINS = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
109         final Map<String, SubdivisionNode> ID_TO_NODE = new HashMap<>();
110 
111         final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World");
112 
addName(String code, String lang, String value)113         public void addName(String code, String lang, String value) {
114             int parenPos = value.indexOf("(see also separate country");
115             if (parenPos >= 0) {
116                 /*
117                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire"
118                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba"
119                 Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius"
120                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard"
121                 Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen"
122                  */
123                 // OLD code to guess country from comment
124 //              String paren = value.substring(value.length() - 3, value.length() - 1);
125 //                if (!paren.equals("BQ") && !paren.equals("SJ")) {
126 //                    String old = TO_COUNTRY_CODE.get(code);
127 //                    if (old != null) {
128 //                        System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren);
129 //                    }
130 //                    TO_COUNTRY_CODE.put(code, paren);
131 //                }
132                 value = value.substring(0, parenPos).trim();
133             }
134             value = value.replace("*", "");
135             NAMES.put(code, lang, value);
136         }
137 
138 
139 
140 
141         static final String[] CRUFT = {
142             "Emirate",
143             "Parish",
144             "County",
145             "District",
146             "Region",
147             "Province of",
148             "Province",
149             "Republic",
150             ", Barbados",
151             ", Burkina Faso",
152             "Governorate",
153             "Department",
154             "Canton of",
155             "(Région des)",
156             "(Région du)",
157             "(Région de la)",
158             "Autonomous",
159             "Archipelago of",
160             "Canton",
161             "kanton",
162             ", Bahamas",
163             "province",
164             "(Région)",
165             "(Région de l')",
166             ", Cameroon",
167             "State of",
168             "State",
169             "Metropolitan Borough of",
170             "London Borough of",
171             "Royal Borough of",
172             "Borough of",
173             "Borough",
174             "Council of",
175             "Council",
176             "City of",
177             ", The",
178             "prefecture",
179             "Prefecture",
180             "municipality"
181         };
182 
183         static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b");
184         static final Pattern BRACKETED = PatternCache.get("\\[.*\\]");
185 
clean(String input)186         static String clean(String input) {
187             if (input == null) {
188                 return input;
189             }
190             // Quick & dirty
191             input = BRACKETED.matcher(input).replaceAll("");
192             input = CRUFT_PATTERN.matcher(input).replaceAll("");
193 //            for (String cruft : CRUFT) {
194 //                int pos = input.indexOf(cruft);
195 //                if (pos >= 0) {
196 //                    input = input.substring(0,pos) + input.substring(pos + cruft.length());
197 //                }
198 //            }
199             input = input.replace("  ", " ");
200             if (input.endsWith(",")) {
201                 input = input.substring(0, input.length() - 1);
202             }
203             return fixName(input);
204         }
205 
206 
207 
appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level)208         private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException {
209             if (name == null) {
210                 return;
211             }
212             String cldrCode = convertToCldr(sdCode);
213             String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]";
214             String oldValue = fileSubdivisions.getStringValue(path);
215             if (oldValue != null) {
216                 return; // don't override old values
217             }
218             fileSubdivisions.add(path, name);
219             if (level != null) {
220                 fileSubdivisions.addComment(path, level, CommentType.LINE);
221             }
222         }
223 
isKosher(String regionCode)224         private boolean isKosher(String regionCode) {
225             if (regionCode.equals("001")) {
226                 return false;
227             }
228             if (territoryAliases.containsKey(regionCode)
229                 || containment.contains(regionCode)
230                 || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) {
231                 Set<String> rc = REGION_CONTAINS.get(regionCode);
232                 if (rc != null) {
233                     throw new IllegalArgumentException("? " + regionCode + ": " + rc);
234                 }
235                 return false;
236             }
237             return true;
238         }
239 
addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2)240         private static void addChildren(Set<SubdivisionNode> ordered, Map<String, SubdivisionNode> children2) {
241             TreeMap<String, SubdivisionNode> temp = new TreeMap<>(ROOT_COL);
242             temp.putAll(children2);
243             ordered.addAll(temp.values());
244             for (SubdivisionNode n : temp.values()) {
245                 if (!n.children.isEmpty()) {
246                     addChildren(ordered, n.children);
247                 }
248             }
249         }
250 
251         static Map<String, String> NAME_CORRECTIONS = new HashMap<>();
252 
getBestName(String value, boolean useIso)253         private String getBestName(String value, boolean useIso) {
254             String cldrName = null;
255             cldrName = NAME_CORRECTIONS.get(value);
256             if (cldrName != null) {
257                 return fixName(cldrName);
258             }
259             R2<List<String>, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value);
260             if (subdivisionAlias != null) {
261                 String country = subdivisionAlias.get0().get(0);
262                 cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country);
263                 if (cldrName != null) {
264                     return fixName(cldrName);
265                 }
266             }
267 
268 
269             cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value);
270             if (cldrName != null) {
271                 return fixName(cldrName);
272             }
273 
274             Collection<String> oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value);
275             if (oldAliases != null) {
276                 for (String oldAlias : oldAliases) {
277                     cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias);
278                     if (cldrName != null) {
279                         return fixName(cldrName);
280                     }
281                 }
282             }
283 
284             if (useIso) {
285                 cldrName = getIsoName(value);
286                 if (cldrName == null) {
287                     cldrName = "UNKNOWN";
288                     //throw new IllegalArgumentException("Failed to find name: " + value);
289                 }
290                 return fixName(cldrName);
291             }
292             return null;
293         }
294 
fixName(String name)295         private static String fixName(String name) {
296             return name == null ? null : nfc.normalize(name.replace('\'', '’').replace("  ", " ").trim());
297         }
298 
SubdivisionSet(String sourceFile)299         public SubdivisionSet(String sourceFile) {
300 
301             //    <country id="AD" version="16">
302             //           <subdivision-code footnote="*">AD-02</subdivision-code>
303             //             <subdivision-locale lang3code="eng" xml:lang="en">
304             //                  <subdivision-locale-name>Otago</subdivision-locale-name>
305 
306             List<Pair<String, String>> pathValues = XMLFileReader.loadPathValues(
307                 sourceFile,
308                 new ArrayList<Pair<String, String>>(), false);
309             int maxIndent = 0;
310             SubdivisionNode lastNode = null;
311             String lastCode = null;
312             Set<String> conflictingTargetCountries = new HashSet<>();
313 
314             for (Pair<String, String> pair : pathValues) {
315                 String path = pair.getFirst();
316                 boolean code = path.contains("/subdivision-code");
317                 boolean name = path.contains("/subdivision-locale-name");
318                 boolean nameCat = path.contains("/category-name");
319                 boolean relatedCountry = path.contains("/subdivision-related-country");
320 
321                 //    <country id="AD" version="16">
322                 //       <category id="262">
323                 //  <category-name lang3code="fra" xml:lang="fr">paroisse</category-name>
324                 //  <category-name lang3code="eng" xml:lang="en">parish</category-name>
325                 // also languages in region...
326 
327                 // new XML from ISO, so we don't have to guess the country code:
328                 //            <subdivision-code footnote="*">NL-BQ1</subdivision-code>
329                 //            <subdivision-related-country country-id="BQ" xml:lang="en">BONAIRE, SINT EUSTATIUS AND SABA</subdivision-related-country>
330 
331                 if (!code && !name && !nameCat && !relatedCountry) {
332                     continue;
333                 }
334                 XPathParts parts = XPathParts.getFrozenInstance(path);
335                 String value = pair.getSecond();
336                 if (relatedCountry) {
337                     String target = parts.getAttributeValue(-1, "country-id");
338                     // remove conflicting target countries
339                     for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
340                         if (entry.getValue().equals(target)) {
341                             conflictingTargetCountries.add(target);
342                             TO_COUNTRY_CODE.remove(entry.getKey(), target); // there can be at most one
343                             break;
344                         }
345                     }
346                     if (!conflictingTargetCountries.contains(target)) {
347                         TO_COUNTRY_CODE.put(lastCode, target);
348                         //System.out.println(lastCode + " => " + target);
349                     }
350                 } else if (name) {
351                     int elementNum = -2;
352                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
353                     if (lang == null) {
354                         lang = parts.getAttributeValue(elementNum, "lang3code");
355                     }
356                     addName(lastCode, lang, value);
357                     //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
358                 } else if (nameCat) {
359                     //country-codes[@generated="2015-05-04T15:40:13.424465+02:00"]/country[@id="AD"][@version="16"]/category[@id="262"]/category-name[@lang3code="fra"][@xml:lang="fr"]
360                     int elementNum = -1;
361                     String lang = parts.getAttributeValue(elementNum, "xml:lang");
362                     if (lang == null) {
363                         lang = parts.getAttributeValue(elementNum, "lang3code");
364                     }
365                     String category = parts.getAttributeValue(-2, "id");
366                     addName(category, lang, value);
367                     //output.println(count + Utility.repeat("\t", indent) + "\tlang=" + lang + ":\t«" + value + "»\t");
368                 } else {
369                     int countSubdivision = 0;
370                     for (int i = 0; i < parts.size(); ++i) {
371                         if (parts.getElement(i).equals("subdivision")) {
372                             ++countSubdivision;
373                         }
374                     }
375                     if (maxIndent < countSubdivision) {
376                         maxIndent = countSubdivision;
377                     }
378                     value = convertToCldr(value);
379                     if (countSubdivision == 1) {
380                         lastNode = addNode(null, value);
381                     } else {
382                         lastNode = addNode(lastNode, value);
383                     }
384                     lastCode = value;
385                     int subdivisionElement = parts.findElement("subdivision");
386                     String id = parts.getAttributeValue(subdivisionElement, "category-id");
387                     addIdSample(id, value);
388                     //<subdivision category-id="262">//<subdivision-code footnote="*">AD-06</subdivision-code>
389                     // <subdivision category-id="262">
390                     //output.println(++count + Utility.repeat("\t", indent) + "code=" + value);
391                 }
392             }
393         }
394 
addIdSample(String id, String value)395         public void addIdSample(String id, String value) {
396             SUB_TO_CAT.put(value, id);
397             ID_SAMPLE.put(getIsoName(id), value);
398         }
399 
addNode(SubdivisionNode lastSubdivision, String subdivision)400         final SubdivisionNode addNode(SubdivisionNode lastSubdivision, String subdivision) {
401             // "NZ-S", x
402             String region = SubdivisionNames.getRegionFromSubdivision(subdivision);
403             REGION_CONTAINS.put(region, subdivision);
404             if (lastSubdivision == null) {
405                 lastSubdivision = BASE.children.get(region);
406                 if (lastSubdivision == null) {
407                     lastSubdivision = new SubdivisionNode(region, BASE, this).addName("en", ENGLISH_ICU.regionDisplayName(region));
408                     BASE.children.put(region, lastSubdivision);
409                 }
410                 return add(lastSubdivision, subdivision);
411             }
412             add(lastSubdivision, subdivision);
413             return lastSubdivision;
414         }
415 
add(SubdivisionNode subdivisionNode1, String subdivision2)416         private SubdivisionNode add(SubdivisionNode subdivisionNode1, String subdivision2) {
417             SubdivisionNode subdivisionNode2 = subdivisionNode1.children.get(subdivision2);
418             if (subdivisionNode2 == null) {
419                 subdivisionNode2 = new SubdivisionNode(subdivision2, subdivisionNode1, this);
420             }
421             subdivisionNode1.children.put(subdivision2, subdivisionNode2);
422             return subdivisionNode2;
423         }
424 
getName(SubdivisionNode base2)425         private String getName(SubdivisionNode base2) {
426             return getIsoName(base2.code);
427         }
428 
getIsoName(String code)429         private String getIsoName(String code) {
430             if (code == null) {
431                 return null;
432             }
433             Map<String, String> map = NAMES.get(code);
434             if (map == null) {
435                 return "???";
436             }
437             String name = map.get("en");
438             if (name != null) {
439                 return name;
440             }
441             name = map.get("es");
442             if (name != null) {
443                 return name;
444             }
445             name = map.get("fr");
446             if (name != null) {
447                 return name;
448             }
449             if (name == null) {
450                 name = map.entrySet().iterator().next().getValue();
451             }
452             return name;
453         }
print(PrintWriter out)454         public void print(PrintWriter out) {
455             print(out, 0, "", BASE);
456             for (Entry<String, String> entry : TO_COUNTRY_CODE.entrySet()) {
457                 out.println(entry.getKey() + "\t" + entry.getValue());
458             }
459         }
print(PrintWriter out, int indent, String prefix, SubdivisionNode base2)460         private void print(PrintWriter out, int indent, String prefix, SubdivisionNode base2) {
461             if (!prefix.isEmpty()) {
462                 prefix += "\t";
463             }
464             prefix += base2.code;
465             final String indentString = Utility.repeat("\t", 4-indent);
466             out.println(prefix + indentString + getName(base2));
467             if (base2.children.isEmpty()) {
468                 return;
469             }
470             for (SubdivisionNode child : base2.children.values()) {
471                 print(out, indent + 1, prefix, child);
472             }
473         }
474     }
475 
476     static class SubDivisionExtractor {
477         final SubdivisionSet sdset;
478         final Validity validityFormer;
479         final Map<String, R2<List<String>, String>> subdivisionAliasesFormer;
480         final Relation<String, String> formerRegionToSubdivisions;
481 
SubDivisionExtractor(SubdivisionSet sdset, Validity validityFormer, Map<String, R2<List<String>, String>> subdivisionAliasesFormer, Relation<String, String> formerRegionToSubdivisions)482         public SubDivisionExtractor(SubdivisionSet sdset,
483             Validity validityFormer,
484             Map<String, R2<List<String>, String>> subdivisionAliasesFormer,
485             Relation<String, String> formerRegionToSubdivisions) {
486             this.sdset = sdset;
487             this.validityFormer = validityFormer;
488             this.subdivisionAliasesFormer = subdivisionAliasesFormer;
489             this.formerRegionToSubdivisions = formerRegionToSubdivisions;
490         }
491 
printXml(Appendable output)492         void printXml(Appendable output) throws IOException {
493 
494             /*
495             <subdivisionContainment>
496             <group type="NZ" category="island" contains="NZ-N NZ-S"/> <!-- New Zealand -->
497             <group type="NZ" category="special island authority" contains="NZ-CIT"/> <!-- New Zealand -->
498             <group type="NZ-N" contains="NZ-AUK NZ-BOP NZ-GIS NZ-HKB NZ-MWT NZ-NTL NZ-AUK NZ-TKI NZ-WGN NZ-WKO"/> <!-- North Island -->
499             <group type="NZ-S" contains="NZ-CAN NZ-MBH NZ-STL NZ-NSN NZ-OTA NZ-TAS NZ-WTC"/> <!-- South Island -->
500             </subdivisionContainment>
501              */
502             output.append(
503                 DtdType.supplementalData.header(MethodHandles.lookup().lookupClass())
504                 + "\t<version number=\"$Revision" + "$\"/>\n"
505                 + "\t<subdivisionContainment>\n");
506             printXml(output, sdset.BASE, 0);
507             output.append("\t</subdivisionContainment>\n</supplementalData>\n");
508         }
509 
510 //        private static String header(DtdType type) {
511 //            return "<?xml version='1.0' encoding='UTF-8' ?>\n"
512 //                + "<!DOCTYPE " + type // supplementalData
513 //                + " SYSTEM '../../" + type.dtdPath + "'>\n" // "common/dtd/ldmlSupplemental.dtd"
514 //                + "<!--\n"
515 //                + "Copyright © 1991-2013 Unicode, Inc.\n"
516 //                + "CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)\n"
517 //                + "For terms of use, see http://www.unicode.org/copyright.html\n"
518 //                + "-->\n";
519 //        }
520 
printAliases(Appendable output)521         void printAliases(Appendable output) throws IOException {
522             addAliases(output, sdset.TO_COUNTRY_CODE.keySet());
523 
524             // Get the old validity data
525             Map<Status, Set<String>> oldSubdivisionData = validityFormer.getStatusToCodes(LstrType.subdivision);
526             Set<String> missing = new TreeSet<>(ROOT_COL);
527             missing.addAll(sdset.TO_COUNTRY_CODE.keySet());
528             Set<String> nowValid = sdset.ID_TO_NODE.keySet();
529             for (Entry<Status, Set<String>> e : oldSubdivisionData.entrySet()) {
530                 Status v = e.getKey();
531                 if (v == Status.unknown) {
532                     continue;
533                 }
534                 Set<String> set = e.getValue();
535                 for (String sdcodeRaw : set) {
536                     String sdcode = sdcodeRaw; // .toUpperCase(Locale.ROOT);
537 //                  sdcode = sdcode.substring(0,2) + "-" + sdcode.substring(2);
538                     if (!nowValid.contains(sdcode)) {
539                         missing.add(sdcode);
540                     }
541                 }
542             }
543             missing.removeAll(sdset.TO_COUNTRY_CODE.keySet());
544             addAliases(output, missing);
545         }
546 
addAliases(Appendable output, Set<String> missing)547         private void addAliases(Appendable output, Set<String> missing) throws IOException {
548             for (String toReplace : missing) {
549                 List<String> replaceBy = null;
550                 String reason = "deprecated";
551                 R2<List<String>, String> aliasInfo = subdivisionAliasesFormer.get(toReplace);
552                 if (aliasInfo != null) {
553                     replaceBy = aliasInfo.get0();
554                     reason = aliasInfo.get1();
555                     System.out.println("Adding former alias: " + toReplace + " => " + replaceBy);
556                 } else {
557                     String replacement = sdset.TO_COUNTRY_CODE.get(toReplace);
558                     if (replacement != null) {
559                         replaceBy = Collections.singletonList(replacement);
560                         reason = "overlong";
561                         System.out.println("Adding country code alias: " + toReplace + " => " + replaceBy);
562                     }
563                 }
564                 addAlias(output, toReplace, replaceBy, reason);
565             }
566         }
567 
addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason)568         private void addAlias(Appendable output, final String toReplace, final List<String> replaceBy, final String reason) throws IOException {
569             // <languageAlias type="art_lojban" replacement="jbo" reason="deprecated"/> <!-- Lojban -->
570             output.append("\t\t\t");
571             if (replaceBy == null) {
572                 output.append("<!-- ");
573             }
574             output.append("<subdivisionAlias"
575                 + " type=\"" + toReplace + "\""
576                 + " replacement=\"" + (replaceBy == null ? toReplace.substring(0, 2) + "?" :
577                 Joiner.on(" ").join(replaceBy)) + "\""
578                 + " reason=\"" + reason + "\"/>"
579                 + (replaceBy == null ? " <!- - " : " <!-- ")
580                 + sdset.getBestName(toReplace, true) + " => " + (replaceBy == null ? "??" : getBestName(replaceBy, true)) + " -->"
581                 + "\n");
582         }
583 
getBestName(List<String> replaceBy, boolean useIso)584         private String getBestName(List<String> replaceBy, boolean useIso) {
585             StringBuilder result = new StringBuilder();
586             for (String s : replaceBy) {
587                 if (result.length() != 0) {
588                     result.append(", ");
589                 }
590                 if (SubdivisionNames.isRegionCode(s)) {
591                     result.append(ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, s));
592                 } else {
593                     result.append(sdset.getBestName(s, useIso));
594                 }
595             }
596             return result.toString();
597         }
598 
printXml(Appendable output, SubdivisionNode base2, int indent)599         private void printXml(Appendable output, SubdivisionNode base2, int indent) throws IOException {
600             if (base2.children.isEmpty()) {
601                 return;
602             }
603             String type = base2.code;
604             if (base2 != sdset.BASE) {
605                 type = convertToCldr(type);
606                 output.append("\t\t" + "<subgroup"
607                     + " type=\"" + type + "\""
608                     + " contains=\"");
609                 boolean first = true;
610                 for (String child : base2.children.keySet()) {
611                     if (first) {
612                         first = false;
613                     } else {
614                         output.append(' ');
615                     }
616                     String subregion = convertToCldr(child);
617                     output.append(subregion);
618                 }
619                 output.append("\"/>\n");
620             }
621             for (SubdivisionNode child : base2.children.values()) {
622                 printXml(output, child, indent);
623             }
624         }
625 
printSamples(Appendable pw)626         public void printSamples(Appendable pw) throws IOException {
627             Set<String> seen = new HashSet<>();
628             for (Entry<String, Set<String>> entry : sdset.ID_SAMPLE.keyValuesSet()) {
629                 pw.append(entry.getKey());
630                 //int max = 10;
631                 seen.clear();
632                 for (String sample : entry.getValue()) {
633                     String region = sample.substring(0, 2);
634                     if (seen.contains(region)) {
635                         continue;
636                     }
637                     seen.add(region);
638                     pw.append(";\t" + ENGLISH_ICU.regionDisplayName(region) + ": " + sdset.getIsoName(sample)
639                     + " (" + sample + ")");
640                     //if (--max < 0) break;
641                 }
642                 pw.append(System.lineSeparator());
643             }
644         }
645 
printEnglishComp(Appendable output)646         public void printEnglishComp(Appendable output) throws IOException {
647             Set<String> countEqual = new TreeSet<>();
648             String lastCC = null;
649             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\tEqual\n");
650             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
651                 final String countryCode = entry.getKey();
652                 if (!countryCode.equals(lastCC)) {
653                     if (lastCC != null && countEqual.size() != 0) {
654                         output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
655                     }
656                     countEqual.clear();
657 
658                     lastCC = countryCode;
659                 }
660                 for (String value : entry.getValue()) {
661                     String cldrName = sdset.getBestName(value, false);
662                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
663                     final String iso = sdset.getIsoName(value);
664                     if (iso.equals(wiki)) {
665                         countEqual.add(iso);
666                         continue;
667                     }
668                     output.append(
669                         ENGLISH_ICU.regionDisplayName(countryCode)
670 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
671                         + "\t" + cldrName
672                         + "\t" + value
673                         + "\t" + iso
674                         + "\t" + wiki
675                         + "\n");
676                 }
677             }
678             if (countEqual.size() != 0) {
679                 output.append(ENGLISH_ICU.regionDisplayName(lastCC) + "\t\t\tEquals:\t" + countEqual.size() + "\t" + countEqual + "\n");
680             }
681         }
682 
printEnglishCompFull(Appendable output)683         public void printEnglishCompFull(Appendable output) throws IOException {
684             output.append("Country\tMID\tSubdivision\tCLDR\tISO\tWikidata\n");
685             for (Entry<String, Set<String>> entry : sdset.REGION_CONTAINS.keyValuesSet()) {
686                 final String countryCode = entry.getKey();
687                 for (String value : entry.getValue()) {
688                     String cldrName = sdset.getBestName(value, false);
689                     //getBestName(value);
690                     String wiki = WikiSubdivisionLanguages.getBestWikiEnglishName(value);
691                     final String iso = sdset.getIsoName(value);
692                     output.append(
693                         ENGLISH_ICU.regionDisplayName(countryCode)
694 //                        + "\t" + WikiSubdivisionLanguages.WIKIDATA_TO_MID.get(value)
695                         + "\t" + value
696                         + "\t" + cldrName
697                         + "\t" + iso
698                         + "\t" + wiki
699                         + "\n");
700                 }
701             }
702         }
703 
printEnglish(PrintWriter output)704         public void printEnglish(PrintWriter output) throws IOException {
705             TreeSet<String> allRegions = new TreeSet<>();
706             allRegions.addAll(codeToData.keySet());
707             allRegions.addAll(formerRegionToSubdivisions.keySet()); // override
708 
709             Factory cldrFactorySubdivisions = Factory.make(CLDRPaths.SUBDIVISIONS_DIRECTORY, ".*");
710             CLDRFile oldFileSubdivisions = cldrFactorySubdivisions.make("en", false);
711             CLDRFile fileSubdivisions = oldFileSubdivisions.cloneAsThawed();
712 
713             Set<String> skipped = new LinkedHashSet<>();
714 
715             for (String regionCode : allRegions) {
716                 if (!sdset.isKosher(regionCode)) {
717                     if (regionCode.length() != 3) {
718                         skipped.add(regionCode);
719                     }
720                     continue;
721                 }
722                 Set<String> remainder = formerRegionToSubdivisions.get(regionCode);
723                 remainder = remainder == null ? Collections.emptySet() : new LinkedHashSet<>(remainder);
724 
725                 SubdivisionNode regionNode = sdset.ID_TO_NODE.get(regionCode);
726                 if (regionNode == null) {
727                     continue;
728                 }
729 
730                 Set<SubdivisionNode> ordered = new LinkedHashSet<>();
731                 SubdivisionSet.addChildren(ordered, regionNode.children);
732 
733                 for (SubdivisionNode node : ordered) {
734                     final String sdCode = node.code;
735                     String name = sdset.getBestName(sdCode, true);
736                     String upper = UCharacter.toUpperCase(name);
737                     String title = SubdivisionNode.TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(Locale.ROOT, null, name);
738                     if (name.equals(upper) || !name.equals(title)) {
739                         System.out.println("Suspicious name: " + name);
740                     }
741                     SubdivisionSet.appendName(fileSubdivisions, sdCode, name, null);
742                     remainder.remove(sdCode);
743                 }
744                 for (String sdCode : remainder) {
745                     String name = sdset.getBestName(sdCode, true);
746                     if (!name.equals("???")) {
747                         SubdivisionSet.appendName(fileSubdivisions, sdCode, name, "\t<!-- deprecated -->");
748                     }
749                 }
750             }
751             System.out.println("Skipping: " + skipped);
752             fileSubdivisions.write(output);
753         }
754 
printMissingMIDs(PrintWriter pw)755         public void printMissingMIDs(PrintWriter pw) {
756 //          for (Entry<String, String> entry : WikiSubdivisionLanguages.WIKIDATA_TO_MID.entrySet()) {
757 //              String mid = entry.getValue();
758 //              if (!mid.isEmpty()) {
759 //                  continue;
760 //              }
761 //              String subCode = entry.getKey();
762 //              String wiki = clean(getWikiName(subCode));
763 //              String iso = clean(getIsoName(subCode));
764 //              String countryCode = subCode.substring(0, 2);
765 //              String cat = SUB_TO_CAT.get(subCode);
766 //              String catName = getIsoName(cat);
767 //              pw.append(
768 //                  ENGLISH_ICU.regionDisplayName(countryCode)
769 //                  + "\t" + mid
770 //                  + "\t" + subCode
771 //                  + "\t" + catName
772 //                  + "\t" + wiki
773 //                  + "\t" + iso
774 //                  + "\n"
775 //                  );
776 //          }
777         }
778     }
779 }