1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.HashMap;
11 import java.util.Iterator;
12 import java.util.LinkedHashSet;
13 import java.util.Map;
14 import java.util.Map.Entry;
15 import java.util.Set;
16 import java.util.TreeMap;
17 
18 import org.unicode.cldr.util.CharUtilities.CharSourceWrapper;
19 import org.unicode.cldr.util.Dictionary.Matcher;
20 import org.unicode.cldr.util.Dictionary.Matcher.Filter;
21 import org.unicode.cldr.util.Dictionary.Matcher.Status;
22 import org.unicode.cldr.util.SimpleDictionary.SimpleDictionaryBuilder;
23 
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.text.DateFormat;
26 import com.ibm.icu.text.SimpleDateFormat;
27 import com.ibm.icu.text.UnicodeSet;
28 import com.ibm.icu.text.UnicodeSetIterator;
29 import com.ibm.icu.util.TimeZone;
30 import com.ibm.icu.util.ULocale;
31 
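/**
 * Manual test driver that builds a state dictionary and a simple dictionary from the same
 * mappings and checks that both give the same matching results.
 * <p>
 * Should be in the package usertest, but it's a pain to rename files in CVS.
 *
 * @author markdavis
 *
 * @param <T> the type of the values stored in the dictionaries under test
 */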
39 public class TestStateDictionaryBuilder<T> {
40     private static final boolean SHORT_TEST = true;
41 
42     private static final boolean SHOW_CONTENTS = true;
43 
44     private static final boolean CHECK_BOOLEAN = false;
45 
46     private final boolean SHOW_STATES = true;
47 
48     boolean SIMPLE_ONLY = false;
49 
50     boolean TEST_AGAINST_SIMPLE = true;
51 
52     Dictionary<T> stateDictionary;
53     Dictionary.Matcher<T> stateMatcher;
54 
55     Dictionary<T> simpleDictionary;
56     Dictionary.Matcher<T> simpleMatcher;
57 
58     Map<CharSequence, T> baseMapping = new TreeMap<>();
59 
60     final StateDictionaryBuilder<T> stateDictionaryBuilder = new StateDictionaryBuilder<>();
61     final SimpleDictionaryBuilder<T> simpleDictionaryBuilder = new SimpleDictionaryBuilder<>();
62 
63     // TODO: convert to TestFramework
main(String[] args)64     public static void main(String[] args) {
65 
66         try {
67             new TestStateDictionaryBuilder<String>().test(args);
68         } finally {
69             System.out.println("DONE");
70         }
71     }
72 
73     @SuppressWarnings({ "unchecked" })
test(String[] args)74     public void test(String[] args) {
75 
76         for (String arg : args) {
77             if (arg.equalsIgnoreCase("utf8")) {
78                 stateDictionaryBuilder.setByteConverter(new Utf8StringByteConverter());
79             } else if (arg.equalsIgnoreCase("normal")) {
80                 stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(false));
81             } else if (arg.equalsIgnoreCase("compact")) {
82                 stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(true));
83             }
84         }
85         baseMapping.put("GMT+0000", (T) ("t"));
86         baseMapping.put("GMT+0100", (T) ("t"));
87         baseMapping.put("GMT+0200", (T) ("t"));
88         baseMapping.put("GMT+0300", (T) ("t"));
89         baseMapping.put("GMT+0307", (T) ("t"));
90         showDictionaryContents();
91 
92         addToBoth("man", 1);
93         addToBoth("manner", 100);
94         addToBoth("many", 10);
95         addToBoth("any", 83);
96         showDictionaryContents();
97 
98         baseMapping.put("man", (T) "Woman");
99         baseMapping.put("many", (T) "Few");
100         baseMapping.put("any", (T) "All");
101         showDictionaryContents();
102 
103         for (Filter filter : Filter.values()) {
104             final String string = "many manners ma";
105             tryFind(string, new CharSourceWrapper<CharSequence>(string), stateDictionary, filter);
106         }
107 
108         showWords("ma");
109         showWords("ma!");
110         showWords("!ma");
111         showWords("man");
112         showWords("man!");
113         showWords("mann");
114         showWords("mann!");
115         showWords("many");
116         showWords("many!");
117         compare();
118 
119         addToBoth("m\u03B1nner", 1000);
120         showDictionaryContents();
121         showWords("m\u03B1");
122         compare();
123 
124         // if (true) return;
125         // clear out
126 
127         addToBoth("fish", 10);
128         showDictionaryContents();
129         showWords("a fisherman");
130         compare();
131 
132         addToBoth("fisher", 13);
133         showDictionaryContents();
134         showWords("a fisherman");
135         compare();
136 
137         addToBoth("her", 55);
138         showDictionaryContents();
139         showWords("a fisherman");
140         compare();
141 
142         // clear out
143 
144         // check some non-latin
145         String[] zoneIDs = TimeZone.getAvailableIDs();
146         SimpleDateFormat dt = (SimpleDateFormat) DateFormat.getDateInstance(DateFormat.LONG, new ULocale("hi"));
147         dt.applyPattern("vvvv");
148         for (String zoneID : zoneIDs) {
149             TimeZone zone = TimeZone.getTimeZone(zoneID);
150             dt.setTimeZone(zone);
151             String zoneName = dt.format(0);
152             addToBoth(zoneName, (T) (CHECK_BOOLEAN ? "t" : zoneID));
153         }
154         compare();
155         showDictionaryContents();
156         ((StateDictionary<T>) stateDictionary).flatten();
157 
158         if (SIMPLE_ONLY) {
159             testWithUnicodeNames();
160 
161             ((StateDictionary<T>) stateDictionary).flatten();
162             compare();
163             System.out.println();
164             showDictionaryContents();
165         }
166 
167     }
168 
tryFind(CharSequence originalText, CharSource charListText, Dictionary<U> dictionary, Filter filter)169     static public <U> void tryFind(CharSequence originalText, CharSource charListText, Dictionary<U> dictionary,
170         Filter filter) {
171         System.out.println("Using dictionary: "
172             + Dictionary.load(dictionary.getMapping(), new TreeMap<CharSequence, U>()));
173         System.out.println("Searching in: {" + originalText + "} with filter=" + filter);
174         // Dictionaries are immutable, so we create a Matcher to search/test text.
175         Matcher<U> matcher = dictionary.getMatcher();
176         matcher.setText(charListText);
177         while (true) {
178             Status status = matcher.find(filter);
179             String unique = ""; // only set if needed
180             if (status == Status.NONE) {
181                 break;
182             } else if (status == Status.PARTIAL) {
183                 // sets the match value to the "first" partial match
184                 if (matcher.nextUniquePartial()) {
185                     unique = "\tUnique";
186                 } else {
187                     unique = "\tNot Unique";
188                 }
189             }
190             // Show results
191             System.out.println("{"
192                 + showBoth(charListText, 0, matcher.getOffset()) + "[["
193                 + showBoth(charListText, matcher.getOffset(), matcher.getMatchEnd())
194                 + "]]" + showBoth(charListText, matcher.getMatchEnd(), charListText.getKnownLength())
195                 + "}\t" + status + "  \t{" + matcher.getMatchValue() + "}\t" + unique);
196         }
197         System.out.println();
198     }
199 
showBoth(CharSource source, int start, int end)200     static public CharSequence showBoth(CharSource source, int start, int end) {
201         if (source instanceof CharSourceWrapper) {
202             CharSourceWrapper new_name = (CharSourceWrapper) source;
203             return new_name.sourceSubSequence(start, end);
204         }
205         return source.subSequence(start, end);
206     }
207 
showDictionaryContents()208     private void showDictionaryContents() {
209         // build stuff to use from now on
210         simpleDictionary = simpleDictionaryBuilder.make(baseMapping);
211         simpleMatcher = simpleDictionary.getMatcher();
212         stateDictionary = stateDictionaryBuilder.make(baseMapping);
213         stateMatcher = stateDictionary.getMatcher();
214         baseMapping.clear();
215 
216         // ((Dictionary.Builder) simpleDictionary).addMapping(string, i);
217         // ((Dictionary.Builder) stateDictionary).addMapping(string, i);
218 
219         System.out.println("Dictionary: "
220             + Dictionary.load(stateDictionary.getMapping(), new TreeMap<CharSequence, T>()));
221         System.out.println();
222         if (SHOW_STATES) {
223             System.out.println("States:" + CldrUtility.LINE_SEPARATOR + stateDictionary);
224             System.out.println();
225         }
226         if (SHOW_CONTENTS) {
227             System.out.println("Structure:" + CldrUtility.LINE_SEPARATOR + stateDictionary.debugShow());
228             System.out.println();
229         }
230     }
231 
232     @SuppressWarnings("unchecked")
testWithUnicodeNames()233     private void testWithUnicodeNames() {
234         UnicodeSet testSet = new UnicodeSet(
235             "[[:assigned:] - [:ideographic:] - [:Co:] - [:Cs:]]"); // &
236         // [\\u0000-\\u0FFF]
237         int count = 0;
238         Map<String, T> data = new TreeMap<>();
239         for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
240             String name = UCharacter.getExtendedName(it.codepoint);
241             if (name == null) {
242                 continue;
243             }
244             if ((++count & 0xFF) == 0) {
245                 System.out.println(count + ":\t"
246                     + com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t" + name);
247             }
248             data.put(name, (T) com.ibm.icu.impl.Utility.hex(it.codepoint, 4));
249         }
250         count = 0;
251         for (String item : data.keySet()) {
252             if (SHORT_TEST && count++ > 500) continue; //
253             addToBoth(item, data.get(item));
254         }
255         simpleDictionary = simpleDictionaryBuilder.make(baseMapping);
256         stateDictionary = stateDictionaryBuilder.make(baseMapping);
257         baseMapping.clear();
258         compare();
259     }
260 
compare()261     private void compare() {
262         System.out.println("Comparing results: ");
263 
264         Map<CharSequence, T> dictionaryData = Dictionary.load(stateDictionary.getMapping(),
265             new HashMap<CharSequence, T>());
266         Map<CharSequence, T> simpleDictionaryData = Dictionary.load(simpleDictionary.getMapping(),
267             new HashMap<CharSequence, T>());
268 
269         assert dictionaryData.equals(simpleDictionaryData) : showDifference(dictionaryData, simpleDictionaryData);
270         if (SHOW_STATES) {
271             System.out.println("Size: " + dictionaryData.size());
272             System.out.println("Rows: "
273                 + ((StateDictionary<T>) stateDictionary).getRowCount());
274         }
275 
276         System.out.println("Checking values: state dictionary");
277         checkSimpleMatches(stateMatcher, dictionaryData);
278         System.out.println("Checking values: simple dictionary");
279         checkSimpleMatches(simpleMatcher, simpleDictionaryData);
280         int count = 0;
281         System.out.println("Cross-checking all values");
282         for (CharSequence myText : simpleDictionaryData.keySet()) {
283             if ((++count & 0xFF) == 0xFF) {
284                 System.out.println(count + ":\t" + myText);
285             }
286             crossCheck(new CharSourceWrapper<>(myText));
287             crossCheck("!" + myText);
288             crossCheck(myText + "!");
289         }
290     }
291 
showDifference(Map<CharSequence, T> dictionaryData, Map<CharSequence, T> simpleDictionaryData)292     private String showDifference(Map<CharSequence, T> dictionaryData, Map<CharSequence, T> simpleDictionaryData) {
293         System.out.println(dictionaryData.size() + ", " + simpleDictionaryData.size());
294         Iterator<Entry<CharSequence, T>> it1 = dictionaryData.entrySet().iterator();
295         Iterator<Entry<CharSequence, T>> it2 = simpleDictionaryData.entrySet().iterator();
296         while (it1.hasNext() || it2.hasNext()) {
297             Entry<CharSequence, T> item1 = it1.hasNext() ? it1.next() : null;
298             Entry<CharSequence, T> item2 = it2.hasNext() ? it2.next() : null;
299             System.out.println(item1 + ", " + item2);
300             if (item1 == null || item2 == null || !item1.equals(item2)) {
301                 return item1 + "!=" + item2;
302             }
303         }
304         return "no difference";
305     }
306 
crossCheck(CharSequence myText)307     private void crossCheck(CharSequence myText) {
308         crossCheck(new CharSourceWrapper<>(myText));
309     }
310 
crossCheck(CharSource myText)311     private void crossCheck(CharSource myText) {
312         stateMatcher.setText(myText); // set the text to operate on
313         simpleMatcher.setText(myText); // set the text to operate on
314         for (int i = 0; stateMatcher.getText().hasCharAt(i); ++i) {
315             stateMatcher.setOffset(i);
316             simpleMatcher.setOffset(i);
317             while (true) {
318                 Status stateStatus = stateMatcher.next();
319                 Status simpleStatus = simpleMatcher.next();
320                 assert stateStatus == simpleStatus : showValues(stateStatus, simpleStatus);
321                 final int stateEnd = stateMatcher.getMatchEnd();
322                 final int simpleEnd = simpleMatcher.getMatchEnd();
323                 assert stateEnd == simpleEnd : showValues(stateStatus, simpleStatus);
324                 if (stateStatus == Status.PARTIAL) {
325                     boolean stateUnique = stateMatcher.nextUniquePartial();
326                     boolean simpleUnique = simpleMatcher.nextUniquePartial();
327                     assert stateUnique == simpleUnique : showValues(stateStatus, simpleStatus);
328                 }
329                 // test this after checking PARTIAL
330                 assert stateMatcher.getMatchValue() == simpleMatcher.getMatchValue() : showValues(stateStatus,
331                     simpleStatus);
332                 if (stateStatus != Status.MATCH) {
333                     break;
334                 }
335             }
336         }
337     }
338 
showValues(Status stateStatus, Status simpleStatus)339     private String showValues(Status stateStatus, Status simpleStatus) {
340         return CldrUtility.LINE_SEPARATOR + "TEXT:\t" + stateMatcher.text + CldrUtility.LINE_SEPARATOR + "STATE:\t"
341             + showValues(stateStatus, stateMatcher) + CldrUtility.LINE_SEPARATOR + "SIMPLE:\t"
342             + showValues(simpleStatus, simpleMatcher);
343     }
344 
showValues(Status status, Matcher<T> matcher)345     private String showValues(Status status, Matcher<T> matcher) {
346         boolean uniquePartial = status == Status.PARTIAL && matcher.nextUniquePartial(); // sets matchValue for PARTIAL
347         return String.format("\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s %s",
348             matcher.getOffset(),
349             matcher.getMatchEnd(),
350             status,
351             matcher.getMatchText(),
352             matcher.getMatchValue(),
353             status == Status.PARTIAL && uniquePartial ? "\tUNIQUE" : "");
354     }
355 
356     /**
357      * Check that the words all match against themselves.
358      *
359      * @param matcher
360      * @param data
361      */
checkSimpleMatches(Matcher<T> matcher, Map<CharSequence, T> data)362     private void checkSimpleMatches(Matcher<T> matcher, Map<CharSequence, T> data) {
363         int count = 0;
364         for (CharSequence myText : data.keySet()) {
365             if ((count++ & 0xFF) == 0xFF) {
366                 System.out.println(count + ":\t" + myText);
367             }
368             matcher.setText(myText); // set the text to operate on
369 
370             matcher.setOffset(0);
371             int matchEnd = -1;
372             T matchValue = null;
373             // find the longest match
374             while (true) {
375                 Dictionary.Matcher.Status next1 = matcher.next();
376                 if (next1 == Dictionary.Matcher.Status.MATCH) {
377                     matchEnd = matcher.getMatchEnd();
378                     matchValue = matcher.getMatchValue();
379                 } else {
380                     break;
381                 }
382             }
383             assert matchEnd == myText.length() : "failed to find end of <" + myText + "> got instead " + matchEnd;
384             assert matchValue == data.get(myText);
385         }
386     }
387 
388     @SuppressWarnings("unchecked")
addToBoth(CharSequence string, int i)389     private void addToBoth(CharSequence string, int i) {
390         baseMapping.put(string, (T) (i + "/" + string));
391     }
392 
addToBoth(CharSequence string, T i)393     private void addToBoth(CharSequence string, T i) {
394         baseMapping.put(string, i);
395         // if (simpleDictionary.contains(string)) return;
396         // if (!stateDictionary.contains(string)) {
397         // stateDictionary.contains(string);
398         // }
399         // assert stateDictionary.contains(string);
400     }
401 
showWords(String myText)402     private void showWords(String myText) {
403         System.out.format("Finding words in: \"%s\"" + CldrUtility.LINE_SEPARATOR, myText);
404         if (SIMPLE_ONLY) {
405             showWords("", simpleMatcher, myText);
406         } else {
407             Set<String> simpleResult = showWords("Simple", simpleMatcher, myText);
408             Set<String> stateResult = showWords("STATE", stateMatcher, myText);
409             if (!simpleResult.equals(stateResult)) {
410                 // repeat, for debugging
411                 System.out.println("  DIFFERENCE");
412                 showWords("Simple", simpleMatcher, myText);
413                 showWords("STATE", stateMatcher, myText);
414                 Set<String> simpleMinusState = new LinkedHashSet<>(simpleResult);
415                 simpleMinusState.removeAll(stateResult);
416                 System.out.println("Simple-State" + simpleMinusState);
417                 Set<String> stateMinusSimple = new LinkedHashSet<>(stateResult);
418                 stateMinusSimple.removeAll(simpleResult);
419                 System.out.println("State-Simple" + stateMinusSimple);
420             }
421         }
422     }
423 
showWords(String title, Matcher<T> matcher, String myText)424     private Set<String> showWords(String title, Matcher<T> matcher, String myText) {
425         title = title.equals("") ? "" : "\tType: " + title;
426         // Walk through a strings and gather information about what we find
427         // according to the matcher
428         Set<String> result = new LinkedHashSet<>();
429         // Set the text to operate on
430         matcher.setText(myText);
431         boolean uniquePartial = false;
432         for (int i = 0; matcher.hasCharAt(i); ++i) {
433             matcher.setOffset(i);
434             Status status;
435             // We might get multiple matches at each point, so walk through all of
436             // them. The last one might be a partial, so collect some extra
437             // information in that case.
438             do {
439                 // Sets matchValue if there is a MATCH
440                 status = matcher.next();
441                 if (status == Status.PARTIAL) {
442                     // Sets matchValue if the next() status was PARTIAL
443                     uniquePartial = matcher.nextUniquePartial();
444                 }
445                 // Format all of the information
446                 String info = String.format(
447                     "\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s%s", //
448                     matcher.getOffset(), matcher.getMatchEnd(), status, //
449                     matcher.getMatchText(), matcher.getMatchValue(), //
450                     status == Status.PARTIAL && uniquePartial ? "\tUNIQUE" : "");
451                 result.add(info);
452                 if (status != Status.NONE) {
453                     // If there was a match or partial match, show what we got
454                     System.out.println(title + info);
455                 }
456             } while (status == Status.MATCH);
457         }
458         return result;
459     }
460 }