package org.unicode.cldr.tool; import java.io.IOException; import java.io.PrintWriter; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; import org.unicode.cldr.tool.GenerateSubdivisions.SubdivisionInfo; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.ChainedMap; import org.unicode.cldr.util.ChainedMap.M3; import org.unicode.cldr.util.DtdType; import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.Pair; import org.unicode.cldr.util.PatternCache; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.LstrField; import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.SupplementalDataInfo; import org.unicode.cldr.util.Validity; import org.unicode.cldr.util.Validity.Status; import org.unicode.cldr.util.WikiSubdivisionLanguages; import org.unicode.cldr.util.XMLFileReader; import org.unicode.cldr.util.XPathParts; import org.unicode.cldr.util.XPathParts.Comments.CommentType; import com.google.common.base.Joiner; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row.R2; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.CaseMap; import com.ibm.icu.text.Collator; import com.ibm.icu.text.LocaleDisplayNames; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; public class SubdivisionNode { static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); static final Map, String>> territoryAliases = SDI.getLocaleAliasInfo().get("territory"); static final Set containment = SDI.getContainers(); static final Map> codeToData = StandardCodes.getEnumLstreg().get(LstrType.region); static LocaleDisplayNames ENGLISH_ICU = LocaleDisplayNames.getInstance(ULocale.ENGLISH); static final CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = CaseMap.toTitle().wholeString().noLowercase(); static final Comparator ROOT_COL; static { RuleBasedCollator _ROOT_COL = (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH); _ROOT_COL.setNumericCollation(true); _ROOT_COL.freeze(); ROOT_COL = (Comparator) _ROOT_COL; } static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); static final CLDRFile ENGLISH_CLDR = CLDR_CONFIG.getEnglish(); static final Normalizer2 nfc = Normalizer2.getNFCInstance(); public static String convertToCldr(String regionOrSubdivision) { return SubdivisionNames.isRegionCode(regionOrSubdivision) ? regionOrSubdivision.toUpperCase(Locale.ROOT) : regionOrSubdivision.replace("-", "").toLowerCase(Locale.ROOT); } final SubdivisionSet sset; final String code; final int level; final SubdivisionNode parent; final Map children = new TreeMap<>(ROOT_COL); public SubdivisionNode(String code, SubdivisionNode parent, SubdivisionSet sset) { this.code = code; this.level = parent == null ? -1 : parent.level + 1; this.parent = parent; this.sset = sset; sset.ID_TO_NODE.put(code, this); } public SubdivisionNode addName(String lang, String value) { sset.NAMES.put(code, lang, value); return this; } static class SubdivisionSet { final M3 NAMES = ChainedMap.of( new TreeMap(), new TreeMap(), String.class); final Map TO_COUNTRY_CODE = new TreeMap<>(); final Relation ID_SAMPLE = Relation.of(new TreeMap>(), TreeSet.class); final Map SUB_TO_CAT = new TreeMap<>(); final Relation REGION_CONTAINS = Relation.of(new TreeMap>(), TreeSet.class); final Map ID_TO_NODE = new HashMap<>(); final SubdivisionNode BASE = new SubdivisionNode("001", null, this).addName("en", "World"); public void addName(String code, String lang, String value) { int parenPos = value.indexOf("(see also separate country"); if (parenPos >= 0) { /* Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ1: expected "Caribbean Netherlands", got "Bonaire" Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ2: expected "Caribbean Netherlands", got "Saba" Error: (TestSubdivisions.java:66) : country BQ = subdivisionNL-BQ3: expected "Caribbean Netherlands", got "Sint Eustatius" Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-21: expected "Svalbard & Jan Mayen", got "Svalbard" Error: (TestSubdivisions.java:66) : country SJ = subdivisionNO-22: expected "Svalbard & Jan Mayen", got "Jan Mayen" */ // OLD code to guess country from comment // String paren = value.substring(value.length() - 3, value.length() - 1); // if (!paren.equals("BQ") && !paren.equals("SJ")) { // String old = TO_COUNTRY_CODE.get(code); // if (old != null) { // System.err.println("Duplicate: " + code + "\t" + old + "\t" + paren); // } // TO_COUNTRY_CODE.put(code, paren); // } value = value.substring(0, parenPos).trim(); } value = value.replace("*", ""); NAMES.put(code, lang, value); } static final String[] CRUFT = { "Emirate", "Parish", "County", "District", "Region", "Province of", "Province", "Republic", ", Barbados", ", Burkina Faso", "Governorate", "Department", "Canton of", "(Région des)", "(Région du)", "(Région de la)", "Autonomous", "Archipelago of", "Canton", "kanton", ", Bahamas", "province", "(Région)", "(Région de l')", ", Cameroon", "State of", "State", "Metropolitan Borough of", "London Borough of", "Royal Borough of", "Borough of", "Borough", "Council of", "Council", "City of", ", The", "prefecture", "Prefecture", "municipality" }; static final Pattern CRUFT_PATTERN = PatternCache.get("(?i)\\b" + String.join("|", CRUFT) + "\\b"); static final Pattern BRACKETED = PatternCache.get("\\[.*\\]"); static String clean(String input) { if (input == null) { return input; } // Quick & dirty input = BRACKETED.matcher(input).replaceAll(""); input = CRUFT_PATTERN.matcher(input).replaceAll(""); // for (String cruft : CRUFT) { // int pos = input.indexOf(cruft); // if (pos >= 0) { // input = input.substring(0,pos) + input.substring(pos + cruft.length()); // } // } input = input.replace(" ", " "); if (input.endsWith(",")) { input = input.substring(0, input.length() - 1); } return fixName(input); } private static void appendName(CLDRFile fileSubdivisions, final String sdCode, String name, String level) throws IOException { if (name == null) { return; } String cldrCode = convertToCldr(sdCode); String path = "//ldml/localeDisplayNames/subdivisions/subdivision[@type=\"" + cldrCode + "\"]"; String oldValue = fileSubdivisions.getStringValue(path); if (oldValue != null) { return; // don't override old values } fileSubdivisions.add(path, name); if (level != null) { fileSubdivisions.addComment(path, level, CommentType.LINE); } } private boolean isKosher(String regionCode) { if (regionCode.equals("001")) { return false; } if (territoryAliases.containsKey(regionCode) || containment.contains(regionCode) || codeToData.get(regionCode).get(LstrField.Description).contains("Private use")) { Set rc = REGION_CONTAINS.get(regionCode); if (rc != null) { throw new IllegalArgumentException("? " + regionCode + ": " + rc); } return false; } return true; } private static void addChildren(Set ordered, Map children2) { TreeMap temp = new TreeMap<>(ROOT_COL); temp.putAll(children2); ordered.addAll(temp.values()); for (SubdivisionNode n : temp.values()) { if (!n.children.isEmpty()) { addChildren(ordered, n.children); } } } static Map NAME_CORRECTIONS = new HashMap<>(); private String getBestName(String value, boolean useIso) { String cldrName = null; cldrName = NAME_CORRECTIONS.get(value); if (cldrName != null) { return fixName(cldrName); } R2, String> subdivisionAlias = SubdivisionInfo.SUBDIVISION_ALIASES_FORMER.get(value); if (subdivisionAlias != null) { String country = subdivisionAlias.get0().get(0); cldrName = ENGLISH_CLDR.getName(CLDRFile.TERRITORY_NAME, country); if (cldrName != null) { return fixName(cldrName); } } cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(value); if (cldrName != null) { return fixName(cldrName); } Collection oldAliases = SubdivisionInfo.subdivisionIdToOld.get(value); if (oldAliases != null) { for (String oldAlias : oldAliases) { cldrName = SubdivisionInfo.SUBDIVISION_NAMES_ENGLISH_FORMER.get(oldAlias); if (cldrName != null) { return fixName(cldrName); } } } if (useIso) { cldrName = getIsoName(value); if (cldrName == null) { cldrName = "UNKNOWN"; //throw new IllegalArgumentException("Failed to find name: " + value); } return fixName(cldrName); } return null; } private static String fixName(String name) { return name == null ? null : nfc.normalize(name.replace('\'', '’').replace(" ", " ").trim()); } public SubdivisionSet(String sourceFile) { //