1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.IOException; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 import java.util.HashSet; 8 import java.util.LinkedHashSet; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Map.Entry; 12 import java.util.Set; 13 import java.util.TreeMap; 14 import java.util.TreeSet; 15 import java.util.regex.Matcher; 16 17 import org.unicode.cldr.tool.FormattedFileWriter.Anchors; 18 import org.unicode.cldr.util.CLDRConfig; 19 import org.unicode.cldr.util.CLDRFile; 20 import org.unicode.cldr.util.CLDRFile.DraftStatus; 21 import org.unicode.cldr.util.CLDRFile.ExemplarType; 22 import org.unicode.cldr.util.CLDRFile.NumberingSystem; 23 import org.unicode.cldr.util.CLDRFile.WinningChoice; 24 import org.unicode.cldr.util.CLDRPaths; 25 import org.unicode.cldr.util.Factory; 26 import org.unicode.cldr.util.FileCopier; 27 import org.unicode.cldr.util.Pair; 28 import org.unicode.cldr.util.PatternCache; 29 import org.unicode.cldr.util.XMLFileReader; 30 import org.unicode.cldr.util.XPathParts; 31 32 import com.google.common.base.Joiner; 33 import com.google.common.base.Splitter; 34 import com.ibm.icu.text.Collator; 35 import com.ibm.icu.text.RuleBasedCollator; 36 import com.ibm.icu.text.Transliterator; 37 import com.ibm.icu.text.UnicodeSet; 38 39 public class ChartCollation extends Chart { 40 41 static final String NOT_TAILORED = "notTailored"; 42 static final String NOT_EXEMPLARS = "notExemplars"; 43 44 private static final String KNOWN_PROBLEMS = 45 "<ul>" + LS 46 + "<li>The characters used in the illustration are:" + LS 47 + "<ol>" + LS 48 + "<li>those <span class='" + NOT_TAILORED + "'>not tailored</span> (added from standard exemplars for context)</li>" + LS 49 + "<li>those <span class='" + NOT_EXEMPLARS + "'>tailored</span>, but not in any exemplars (standard, aux, punctuation)</li>" + LS 50 + "<li>those both tailored and in exemplars</li>" + LS 51 + "</ol>" + LS 52 + "<li>The tailored characters may include:" + LS 53 + "<ol>" + LS 54 + "<li>some longer strings (contractions) from the rules</li>" + LS 55 + "<li>generated Unicode characters (for <i>canonical closure</i>)</li>" + LS 56 + "</ol>" + LS 57 + "</li>" + LS 58 + "</ul>" + LS; 59 60 private static final Factory CLDR_FACTORY = CLDRConfig.getInstance().getCldrFactory(); 61 private static final boolean DEBUG = false; 62 private static final String DIR = CLDRPaths.CHART_DIRECTORY + "collation/"; 63 64 //static Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "collation/", ".*"); 65 main(String[] args)66 public static void main(String[] args) { 67 new ChartCollation().writeChart(null); 68 } 69 70 @Override getDirectory()71 public String getDirectory() { 72 return DIR; 73 } 74 75 @Override getTitle()76 public String getTitle() { 77 return "Collation Charts"; 78 } 79 80 @Override getFileName()81 public String getFileName() { 82 return "index"; 83 } 84 85 @Override getExplanation()86 public String getExplanation() { 87 return "<p>Collation tailorings provide language or locale-specific modifications of the standard Unicode CLDR collation order, " 88 + "which is based on <a target='_blank' href='http://unicode.org/charts/collation/'>Unicode default collation charts</a>. " 89 + "Locales that just use the standard CLDR order (<a href='root.html'>Root</a>) are not listed.</p>" 90 + dataScrapeMessage("/tr35-collation.html", "common/testData/units/unitsTest.txt", "common/collation")+ LS; 91 } 92 93 @Override writeContents(FormattedFileWriter pw)94 public void writeContents(FormattedFileWriter pw) throws IOException { 95 FileCopier.ensureDirectoryExists(DIR); 96 FileCopier.copy(Chart.class, "index.css", DIR); 97 FormattedFileWriter.copyIncludeHtmls(DIR); 98 99 FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors(); 100 writeSubcharts(anchors); 101 pw.setIndex("Main Chart Index", "../index.html"); 102 pw.write(anchors.toString()); 103 } 104 105 static class Data { 106 RuleBasedCollator collator; 107 Set<String> settings = new LinkedHashSet<>(); 108 } 109 writeSubcharts(Anchors anchors)110 public void writeSubcharts(Anchors anchors) throws IOException { 111 Matcher settingsMatcher = PatternCache.get( 112 "//ldml/collations/collation" 113 + "\\[@type=\"([^\"]+)\"]" 114 + "(.*)?" 115 + "/(settings|import|cr)" 116 + "(.*)") 117 .matcher(""); 118 Splitter settingSplitter = Splitter.onPattern("[\\[\\]@]").omitEmptyStrings().trimResults(); 119 File baseDir = new File(CLDRPaths.COMMON_DIRECTORY + "collation/"); 120 Transliterator fromUnicode = Transliterator.getInstance("Hex-Any"); 121 List<Pair<String, String>> pathValueList = new ArrayList<>(); 122 HashSet<String> mainAvailable = new HashSet<>(CLDR_FACTORY.getAvailable()); 123 // for (String xmlName : baseDir.list()) { 124 // if (!xmlName.endsWith(".xml")) { 125 // continue; 126 // } 127 // String locale = xmlName.substring(0,xmlName.length()-4); 128 // } 129 for (String xmlName : baseDir.list()) { 130 if (!xmlName.endsWith(".xml")) { 131 continue; 132 } 133 String locale = xmlName.substring(0, xmlName.length() - 4); 134 if (!mainAvailable.contains(locale)) { 135 System.out.println("Skipping locale not in main: " + locale); 136 continue; 137 } 138 139 pathValueList.clear(); 140 XMLFileReader.loadPathValues(CLDRPaths.COMMON_DIRECTORY + "collation/" + xmlName, pathValueList, true); 141 Map<String, Data> data = new TreeMap<>(); 142 143 for (Pair<String, String> entry : pathValueList) { 144 String path = entry.getFirst(); 145 String value = entry.getSecond(); 146 if (path.startsWith("//ldml/identity/")) { 147 continue; 148 } 149 150 if (path.equals("//ldml/collations/defaultCollation")) { 151 addCollator(data, value, "defaultCollation", Arrays.asList("true")); 152 continue; 153 } 154 155 // Root collator being empty isn't really a failure - just skip it. 156 if (xmlName.equals("root.xml") && path.equals("//ldml/collations/collation[@type=\"standard\"]")) { 157 continue; 158 } 159 XPathParts xpp = XPathParts.getFrozenInstance(path); 160 DraftStatus status = DraftStatus.forString(xpp.findFirstAttributeValue("draft")); 161 if (status == DraftStatus.unconfirmed) { 162 System.out.println("Skipping " + path + " in: " + xmlName + " due to draft status = " + status.toString()); 163 continue; 164 } 165 166 if (!settingsMatcher.reset(path).matches()) { 167 System.out.println("Failure in " + xmlName + " with: " + path); 168 continue; 169 } 170 String type = settingsMatcher.group(1); 171 String otherAttributes = settingsMatcher.group(2); 172 String leaf = settingsMatcher.group(3); 173 String values = settingsMatcher.group(4); 174 175 if (leaf.equals("settings") || leaf.equals("import")) { 176 //ldml/collations/collation[@type="compat"][@visibility="external"]/settings[@reorder="Arab"] 177 List<String> settings = settingSplitter.splitToList(values); 178 addCollator(data, type, leaf, settings); 179 continue; 180 } 181 String rules = value; 182 if (!rules.contains("'#⃣'")) { 183 rules = rules.replace("#⃣", "'#⃣'").replace("*⃣", "'*⃣'"); //hack for 8288 184 } 185 rules = fromUnicode.transform(rules); 186 187 try { 188 RuleBasedCollator col = new RuleBasedCollator(rules); 189 col.setStrength(Collator.IDENTICAL); 190 col.freeze(); 191 addCollator(data, type, col); 192 } catch (Exception e) { 193 System.out.println("*** Skipping " + locale + ":" + type + ", " + e); 194 } 195 } 196 if (data.isEmpty()) { // remove completely empty 197 continue; 198 } 199 if (!data.containsKey("standard")) { 200 addCollator(data, "standard", (RuleBasedCollator) null); 201 } 202 new Subchart(ENGLISH.getName(locale, true, CLDRFile.SHORT_ALTS), locale, data).writeChart(anchors); 203 } 204 } 205 addCollator(Map<String, Data> data, String type, String leaf, List<String> settings)206 private void addCollator(Map<String, Data> data, String type, String leaf, List<String> settings) { 207 if (type.startsWith("private-")) { 208 type = "\uFFFF" + type; 209 } 210 Data dataItem = data.get(type); 211 if (dataItem == null) { 212 data.put(type, dataItem = new Data()); 213 } 214 dataItem.settings.add(leaf + ":" + Joiner.on(";").join(settings)); 215 } 216 addCollator(Map<String, Data> data, String type, RuleBasedCollator col)217 private void addCollator(Map<String, Data> data, String type, RuleBasedCollator col) { 218 if (type.startsWith("private-")) { 219 type = "\uFFFF\uFFFF" + type; 220 } else if (type.equals("search")) { 221 type = "\uFFFF" + type; 222 } 223 Data dataItem = data.get(type); 224 if (dataItem == null) { 225 data.put(type, dataItem = new Data()); 226 } 227 dataItem.collator = col; 228 } 229 230 //RuleBasedCollator ROOT = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 231 232 private class Subchart extends Chart { 233 private static final String HIGH_COLLATION_PRIMARY = "\uFFFF"; 234 String title; 235 String file; 236 private Map<String, Data> data; 237 238 @Override getShowDate()239 public boolean getShowDate() { 240 return false; 241 } 242 Subchart(String title, String file, Map<String, Data> data2)243 public Subchart(String title, String file, Map<String, Data> data2) { 244 this.title = title; 245 this.file = file; 246 this.data = data2; 247 } 248 249 @Override getDirectory()250 public String getDirectory() { 251 return DIR; 252 } 253 254 @Override getTitle()255 public String getTitle() { 256 return title; 257 } 258 259 @Override getFileName()260 public String getFileName() { 261 return file; 262 } 263 264 @Override getExplanation()265 public String getExplanation() { 266 return "<p>The following illustrates the ordering for the " + title 267 + " collation tailorings. It does not show the <i>strength differences</i>, such as where case is ignored where there are letter differences. " 268 + " The <i>search</i> order is special: it only used for comparing characters for similarity, so the order among the characters does not matter. " 269 + " Where a type is not present, such as <i>emoji</i> or <i>search</i>, it defaults to the <a href='root.html'>Root</a> type.</p>" + LS 270 + KNOWN_PROBLEMS 271 + dataScrapeMessage("/tr35-collation.html", null, "common/collation") + LS 272 ; 273 } 274 275 @Override writeContents(FormattedFileWriter pw)276 public void writeContents(FormattedFileWriter pw) throws IOException { 277 278 CLDRFile cldrFile = CLDR_FACTORY.make(file, true); 279 UnicodeSet exemplars_all = new UnicodeSet(); 280 for (ExemplarType ex : ExemplarType.values()) { 281 UnicodeSet possExemplars = cldrFile.getExemplarSet(ex, WinningChoice.WINNING).freeze(); 282 exemplars_all.addAll(possExemplars); 283 } 284 // UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING).freeze(); 285 // 286 // UnicodeSet exemplars_all = new UnicodeSet(exemplars); 287 // UnicodeSet exemplars_auxiliary = cldrFile.getExemplarSet("auxiliary", WinningChoice.WINNING); 288 // UnicodeSet exemplars_punctuation = cldrFile.getExemplarSet("punctuation", WinningChoice.WINNING); 289 // exemplars_all.addAll(exemplars_auxiliary) 290 // .addAll(exemplars_punctuation); 291 292 for (NumberingSystem system : NumberingSystem.values()) { 293 UnicodeSet exemplars_numeric = cldrFile.getExemplarsNumeric(system); 294 if (exemplars_numeric != null) { 295 exemplars_all.addAll(exemplars_numeric); 296 //System.out.println(file + "\t" + system + "\t" + exemplars_numeric.toPattern(false)); 297 } 298 } 299 exemplars_all.freeze(); 300 301 TablePrinter tablePrinter = new TablePrinter() 302 .addColumn("Type", "class='source'", null, "class='source'", true) 303 .addColumn("Ordering", "class='target'", null, "class='target_nofont'", true); 304 305 for (Entry<String, Data> entry : data.entrySet()) { 306 // sort the characters 307 String type = entry.getKey(); 308 if (type.startsWith(HIGH_COLLATION_PRIMARY)) { 309 type = type.substring(1); 310 } 311 RuleBasedCollator col = entry.getValue().collator; 312 Set<String> settings = entry.getValue().settings; 313 StringBuilder list = new StringBuilder(); 314 if (!settings.isEmpty()) { 315 list.append(Joiner.on("<br>").join(settings)); 316 list.append("<br><b><i>plus</i></b><br>"); 317 } 318 if (col == null) { 319 list.append("<i>CLDR default character order</i>"); 320 } else { 321 UnicodeSet tailored = new UnicodeSet(col.getTailoredSet()); 322 Set<String> sorted = new TreeSet<>(col); 323 exemplars_all.addAllTo(sorted); 324 tailored.addAllTo(sorted); 325 boolean first = true; 326 for (String s : sorted) { 327 // if (--maxCount < 0) { 328 // list.append(" …"); 329 // break; 330 // } 331 if (first) { 332 first = false; 333 } else { 334 list.append(' '); 335 } 336 if (s.startsWith("\uFDD0")) { // special CJK markers 337 int len = list.length(); 338 if (len > 4 && list.substring(len - 4, len).equals("<br>")) { 339 list.append("<br>"); 340 } 341 continue; 342 } 343 if (!tailored.contains(s)) { 344 list.append("<span class='" + NOT_TAILORED + "'>").append(s).append("</span>"); 345 } else if (!exemplars_all.containsAll(s) && !file.equals("root")) { 346 list.append("<span class='" + NOT_EXEMPLARS + "'>").append(s).append("</span>"); 347 } else { 348 list.append(s); 349 } 350 } 351 } 352 tablePrinter 353 .addRow() 354 .addCell(type) 355 .addCell(list.toString()); 356 tablePrinter.finishRow(); 357 } 358 pw.write(tablePrinter.toTable()); 359 } 360 } 361 } 362