1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashSet;
8 import java.util.LinkedHashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeMap;
14 import java.util.TreeSet;
15 import java.util.regex.Matcher;
16 
17 import org.unicode.cldr.tool.FormattedFileWriter.Anchors;
18 import org.unicode.cldr.util.CLDRConfig;
19 import org.unicode.cldr.util.CLDRFile;
20 import org.unicode.cldr.util.CLDRFile.DraftStatus;
21 import org.unicode.cldr.util.CLDRFile.ExemplarType;
22 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
23 import org.unicode.cldr.util.CLDRFile.WinningChoice;
24 import org.unicode.cldr.util.CLDRPaths;
25 import org.unicode.cldr.util.Factory;
26 import org.unicode.cldr.util.FileCopier;
27 import org.unicode.cldr.util.Pair;
28 import org.unicode.cldr.util.PatternCache;
29 import org.unicode.cldr.util.XMLFileReader;
30 import org.unicode.cldr.util.XPathParts;
31 
32 import com.google.common.base.Joiner;
33 import com.google.common.base.Splitter;
34 import com.ibm.icu.text.Collator;
35 import com.ibm.icu.text.RuleBasedCollator;
36 import com.ibm.icu.text.Transliterator;
37 import com.ibm.icu.text.UnicodeSet;
38 
39 public class ChartCollation extends Chart {
40 
41     static final String NOT_TAILORED = "notTailored";
42     static final String NOT_EXEMPLARS = "notExemplars";
43 
44     private static final String KNOWN_PROBLEMS =
45         "<ul>" + LS
46         + "<li>The characters used in the illustration are:" + LS
47         + "<ol>" + LS
48         + "<li>those <span class='" + NOT_TAILORED + "'>not tailored</span> (added from standard exemplars for context)</li>" + LS
49         + "<li>those <span class='" + NOT_EXEMPLARS + "'>tailored</span>, but not in any exemplars (standard, aux, punctuation)</li>" + LS
50         + "<li>those both tailored and in exemplars</li>" + LS
51         + "</ol>" + LS
52         + "<li>The tailored characters may include:" + LS
53         + "<ol>" + LS
54         + "<li>some longer strings (contractions) from the rules</li>" + LS
55         + "<li>generated Unicode characters (for <i>canonical closure</i>)</li>" + LS
56         + "</ol>" + LS
57         + "</li>" + LS
58         + "</ul>" + LS;
59 
60     private static final Factory CLDR_FACTORY = CLDRConfig.getInstance().getCldrFactory();
61     private static final boolean DEBUG = false;
62     private static final String DIR = CLDRPaths.CHART_DIRECTORY + "collation/";
63 
64     //static Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "collation/", ".*");
65 
main(String[] args)66     public static void main(String[] args) {
67         new ChartCollation().writeChart(null);
68     }
69 
70     @Override
getDirectory()71     public String getDirectory() {
72         return DIR;
73     }
74 
75     @Override
getTitle()76     public String getTitle() {
77         return "Collation Charts";
78     }
79 
80     @Override
getFileName()81     public String getFileName() {
82         return "index";
83     }
84 
85     @Override
getExplanation()86     public String getExplanation() {
87         return "<p>Collation tailorings provide language or locale-specific modifications of the standard Unicode CLDR collation order, "
88             + "which is based on <a target='_blank' href='http://unicode.org/charts/collation/'>Unicode default collation charts</a>. "
89             + "Locales that just use the standard CLDR order (<a href='root.html'>Root</a>) are not listed.</p>"
90             + dataScrapeMessage("/tr35-collation.html", "common/testData/units/unitsTest.txt", "common/collation")+ LS;
91     }
92 
93     @Override
writeContents(FormattedFileWriter pw)94     public void writeContents(FormattedFileWriter pw) throws IOException {
95         FileCopier.ensureDirectoryExists(DIR);
96         FileCopier.copy(Chart.class, "index.css", DIR);
97         FormattedFileWriter.copyIncludeHtmls(DIR);
98 
99         FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors();
100         writeSubcharts(anchors);
101         pw.setIndex("Main Chart Index", "../index.html");
102         pw.write(anchors.toString());
103     }
104 
105     static class Data {
106         RuleBasedCollator collator;
107         Set<String> settings = new LinkedHashSet<>();
108     }
109 
writeSubcharts(Anchors anchors)110     public void writeSubcharts(Anchors anchors) throws IOException {
111         Matcher settingsMatcher = PatternCache.get(
112             "//ldml/collations/collation"
113                 + "\\[@type=\"([^\"]+)\"]"
114                 + "(.*)?"
115                 + "/(settings|import|cr)"
116                 + "(.*)")
117             .matcher("");
118         Splitter settingSplitter = Splitter.onPattern("[\\[\\]@]").omitEmptyStrings().trimResults();
119         File baseDir = new File(CLDRPaths.COMMON_DIRECTORY + "collation/");
120         Transliterator fromUnicode = Transliterator.getInstance("Hex-Any");
121         List<Pair<String, String>> pathValueList = new ArrayList<>();
122         HashSet<String> mainAvailable = new HashSet<>(CLDR_FACTORY.getAvailable());
123 //        for (String xmlName : baseDir.list()) {
124 //            if (!xmlName.endsWith(".xml")) {
125 //                continue;
126 //            }
127 //            String locale = xmlName.substring(0,xmlName.length()-4);
128 //        }
129         for (String xmlName : baseDir.list()) {
130             if (!xmlName.endsWith(".xml")) {
131                 continue;
132             }
133             String locale = xmlName.substring(0, xmlName.length() - 4);
134             if (!mainAvailable.contains(locale)) {
135                 System.out.println("Skipping locale not in main: " + locale);
136                 continue;
137             }
138 
139             pathValueList.clear();
140             XMLFileReader.loadPathValues(CLDRPaths.COMMON_DIRECTORY + "collation/" + xmlName, pathValueList, true);
141             Map<String, Data> data = new TreeMap<>();
142 
143             for (Pair<String, String> entry : pathValueList) {
144                 String path = entry.getFirst();
145                 String value = entry.getSecond();
146                 if (path.startsWith("//ldml/identity/")) {
147                     continue;
148                 }
149 
150                 if (path.equals("//ldml/collations/defaultCollation")) {
151                     addCollator(data, value, "defaultCollation", Arrays.asList("true"));
152                     continue;
153                 }
154 
155                 // Root collator being empty isn't really a failure - just skip it.
156                 if (xmlName.equals("root.xml") && path.equals("//ldml/collations/collation[@type=\"standard\"]")) {
157                     continue;
158                 }
159                 XPathParts xpp = XPathParts.getFrozenInstance(path);
160                 DraftStatus status = DraftStatus.forString(xpp.findFirstAttributeValue("draft"));
161                 if (status == DraftStatus.unconfirmed) {
162                     System.out.println("Skipping " + path + " in: " + xmlName + " due to draft status = " + status.toString());
163                     continue;
164                 }
165 
166                 if (!settingsMatcher.reset(path).matches()) {
167                     System.out.println("Failure in " + xmlName + " with: " + path);
168                     continue;
169                 }
170                 String type = settingsMatcher.group(1);
171                 String otherAttributes = settingsMatcher.group(2);
172                 String leaf = settingsMatcher.group(3);
173                 String values = settingsMatcher.group(4);
174 
175                 if (leaf.equals("settings") || leaf.equals("import")) {
176                     //ldml/collations/collation[@type="compat"][@visibility="external"]/settings[@reorder="Arab"]
177                     List<String> settings = settingSplitter.splitToList(values);
178                     addCollator(data, type, leaf, settings);
179                     continue;
180                 }
181                 String rules = value;
182                 if (!rules.contains("'#⃣'")) {
183                     rules = rules.replace("#⃣", "'#⃣'").replace("*⃣", "'*⃣'"); //hack for 8288
184                 }
185                 rules = fromUnicode.transform(rules);
186 
187                 try {
188                     RuleBasedCollator col = new RuleBasedCollator(rules);
189                     col.setStrength(Collator.IDENTICAL);
190                     col.freeze();
191                     addCollator(data, type, col);
192                 } catch (Exception e) {
193                     System.out.println("*** Skipping " + locale + ":" + type + ", " + e);
194                 }
195             }
196             if (data.isEmpty()) { // remove completely empty
197                 continue;
198             }
199             if (!data.containsKey("standard")) {
200                 addCollator(data, "standard", (RuleBasedCollator) null);
201             }
202             new Subchart(ENGLISH.getName(locale, true, CLDRFile.SHORT_ALTS), locale, data).writeChart(anchors);
203         }
204     }
205 
addCollator(Map<String, Data> data, String type, String leaf, List<String> settings)206     private void addCollator(Map<String, Data> data, String type, String leaf, List<String> settings) {
207         if (type.startsWith("private-")) {
208             type = "\uFFFF" + type;
209         }
210         Data dataItem = data.get(type);
211         if (dataItem == null) {
212             data.put(type, dataItem = new Data());
213         }
214         dataItem.settings.add(leaf + ":" + Joiner.on(";").join(settings));
215     }
216 
addCollator(Map<String, Data> data, String type, RuleBasedCollator col)217     private void addCollator(Map<String, Data> data, String type, RuleBasedCollator col) {
218         if (type.startsWith("private-")) {
219             type = "\uFFFF\uFFFF" + type;
220         } else if (type.equals("search")) {
221             type = "\uFFFF" + type;
222         }
223         Data dataItem = data.get(type);
224         if (dataItem == null) {
225             data.put(type, dataItem = new Data());
226         }
227         dataItem.collator = col;
228     }
229 
230     //RuleBasedCollator ROOT = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
231 
232     private class Subchart extends Chart {
233         private static final String HIGH_COLLATION_PRIMARY = "\uFFFF";
234         String title;
235         String file;
236         private Map<String, Data> data;
237 
238         @Override
getShowDate()239         public boolean getShowDate() {
240             return false;
241         }
242 
Subchart(String title, String file, Map<String, Data> data2)243         public Subchart(String title, String file, Map<String, Data> data2) {
244             this.title = title;
245             this.file = file;
246             this.data = data2;
247         }
248 
249         @Override
getDirectory()250         public String getDirectory() {
251             return DIR;
252         }
253 
254         @Override
getTitle()255         public String getTitle() {
256             return title;
257         }
258 
259         @Override
getFileName()260         public String getFileName() {
261             return file;
262         }
263 
264         @Override
getExplanation()265         public String getExplanation() {
266             return "<p>The following illustrates the ordering for the " + title
267                 + " collation tailorings. It does not show the <i>strength differences</i>, such as where case is ignored where there are letter differences. "
268                 + " The <i>search</i> order is special: it only used for comparing characters for similarity, so the order among the characters does not matter. "
269                 + " Where a type is not present, such as <i>emoji</i> or <i>search</i>, it defaults to the <a href='root.html'>Root</a> type.</p>" + LS
270                 + KNOWN_PROBLEMS
271                 + dataScrapeMessage("/tr35-collation.html", null, "common/collation") + LS
272                 ;
273         }
274 
275         @Override
writeContents(FormattedFileWriter pw)276         public void writeContents(FormattedFileWriter pw) throws IOException {
277 
278             CLDRFile cldrFile = CLDR_FACTORY.make(file, true);
279             UnicodeSet exemplars_all = new UnicodeSet();
280             for (ExemplarType ex : ExemplarType.values()) {
281                 UnicodeSet possExemplars = cldrFile.getExemplarSet(ex, WinningChoice.WINNING).freeze();
282                 exemplars_all.addAll(possExemplars);
283             }
284 //            UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING).freeze();
285 //
286 //            UnicodeSet exemplars_all = new UnicodeSet(exemplars);
287 //            UnicodeSet exemplars_auxiliary = cldrFile.getExemplarSet("auxiliary", WinningChoice.WINNING);
288 //            UnicodeSet exemplars_punctuation = cldrFile.getExemplarSet("punctuation", WinningChoice.WINNING);
289 //            exemplars_all.addAll(exemplars_auxiliary)
290 //                .addAll(exemplars_punctuation);
291 
292             for (NumberingSystem system : NumberingSystem.values()) {
293                 UnicodeSet exemplars_numeric = cldrFile.getExemplarsNumeric(system);
294                 if (exemplars_numeric != null) {
295                     exemplars_all.addAll(exemplars_numeric);
296                     //System.out.println(file + "\t" + system + "\t" + exemplars_numeric.toPattern(false));
297                 }
298             }
299             exemplars_all.freeze();
300 
301             TablePrinter tablePrinter = new TablePrinter()
302                 .addColumn("Type", "class='source'", null, "class='source'", true)
303                 .addColumn("Ordering", "class='target'", null, "class='target_nofont'", true);
304 
305             for (Entry<String, Data> entry : data.entrySet()) {
306                 // sort the characters
307                 String type = entry.getKey();
308                 if (type.startsWith(HIGH_COLLATION_PRIMARY)) {
309                     type = type.substring(1);
310                 }
311                 RuleBasedCollator col = entry.getValue().collator;
312                 Set<String> settings = entry.getValue().settings;
313                 StringBuilder list = new StringBuilder();
314                 if (!settings.isEmpty()) {
315                     list.append(Joiner.on("<br>").join(settings));
316                     list.append("<br><b><i>plus</i></b><br>");
317                 }
318                 if (col == null) {
319                     list.append("<i>CLDR default character order</i>");
320                 } else {
321                     UnicodeSet tailored = new UnicodeSet(col.getTailoredSet());
322                     Set<String> sorted = new TreeSet<>(col);
323                     exemplars_all.addAllTo(sorted);
324                     tailored.addAllTo(sorted);
325                     boolean first = true;
326                     for (String s : sorted) {
327 //                        if (--maxCount < 0) {
328 //                            list.append(" …");
329 //                            break;
330 //                        }
331                         if (first) {
332                             first = false;
333                         } else {
334                             list.append(' ');
335                         }
336                         if (s.startsWith("\uFDD0")) { // special CJK markers
337                             int len = list.length();
338                             if (len > 4 && list.substring(len - 4, len).equals("<br>")) {
339                                 list.append("<br>");
340                             }
341                             continue;
342                         }
343                         if (!tailored.contains(s)) {
344                             list.append("<span class='" + NOT_TAILORED + "'>").append(s).append("</span>");
345                         } else if (!exemplars_all.containsAll(s) && !file.equals("root")) {
346                             list.append("<span class='" + NOT_EXEMPLARS + "'>").append(s).append("</span>");
347                         } else {
348                             list.append(s);
349                         }
350                     }
351                 }
352                 tablePrinter
353                 .addRow()
354                 .addCell(type)
355                 .addCell(list.toString());
356                 tablePrinter.finishRow();
357             }
358             pw.write(tablePrinter.toTable());
359         }
360     }
361 }
362