1 package org.unicode.cldr.util;
2 
3 import java.util.Collection;
4 import java.util.Comparator;
5 import java.util.HashSet;
6 import java.util.Set;
7 import java.util.TreeSet;
8 
9 import org.unicode.cldr.util.XEquivalenceClass.SetMaker;
10 
11 import com.ibm.icu.lang.UCharacter;
12 import com.ibm.icu.text.Normalizer;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UnicodeSet;
15 import com.ibm.icu.text.UnicodeSetIterator;
16 import com.ibm.icu.util.ULocale;
17 
18 public class VariantFolder {
19     private AlternateFetcher alternateFetcher;
20     public static SetMaker mySetMaker = new SetMaker() {
21         Comparator c = new UTF16.StringComparator(true, false, 0);
22         Comparator bestIsLowest = new Comparator() {
23             @Override
24             public int compare(Object o1, Object o2) {
25                 String s1 = o1.toString();
26                 String s2 = o2.toString();
27                 final boolean casefold1 = UCharacter.foldCase(s1, true).equals(s1);
28                 final boolean casefold2 = UCharacter.foldCase(s2, true).equals(s2);
29                 if (casefold1 != casefold2) {
30                     return casefold1 ? -1 : 1;
31                 }
32                 final boolean canonical1 = Normalizer.isNormalized(s1, Normalizer.COMPOSE, 0);
33                 final boolean canonical2 = Normalizer.isNormalized(s2, Normalizer.COMPOSE, 0);
34                 if (canonical1 != canonical2) {
35                     return canonical1 ? -1 : 1;
36                 }
37                 int len1 = s1.codePointCount(0, s1.length());
38                 int len2 = s2.codePointCount(0, s2.length());
39                 if (len1 != len2) {
40                     return len1 - len2;
41                 }
42                 return c.compare(s1, s2);
43             }
44 
45         };
46 
47         @Override
48         public Set make() {
49             return new TreeSet(bestIsLowest);
50         }
51     };
52     public static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
53 
54     //private String source;
55 
56     //private Set<String> result;
57 
58     public interface AlternateFetcher {
59         /**
60          * The input string MUST be in the output set. Note that the results must be
61          * valid even if input string might not be on even code point boundaries.
62          * For example, if the input is "XabY" where X and Y are have surrogates,
63          * and the alternates are by case, then the results have to be {"XabY",
64          * "XAbY", "XaBY", "XABY"}.
65          * <p>
66          * The caller must never modify the set.
67          *
68          * @param item
69          * @return
70          */
getAlternates(String item, Set<String> output)71         Set<String> getAlternates(String item, Set<String> output);
72     }
73 
74     public static class CompatibilityFolder implements VariantFolder.AlternateFetcher {
75         private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
76         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
77         static {
78             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) {
79                 String item = it.getString();
equivalents.add(item, Normalizer.decompose(item, true))80                 equivalents.add(item, Normalizer.decompose(item, true));
equivalents.add(item, Normalizer.compose(item, true))81                 equivalents.add(item, Normalizer.compose(item, true));
82             }
83         }
84 
85         @Override
getAlternates(String item, Set<String> output)86         public Set<String> getAlternates(String item, Set<String> output) {
87             output.add(item);
88             return equivalents.getEquivalences(item);
89         }
90 
91     }
92 
93     public static class CanonicalFolder implements VariantFolder.AlternateFetcher {
94         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
95         static {
96             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) {
97                 String item = it.getString();
equivalents.add(item, Normalizer.decompose(item, false))98                 equivalents.add(item, Normalizer.decompose(item, false));
equivalents.add(item, Normalizer.compose(item, false))99                 equivalents.add(item, Normalizer.compose(item, false));
100             }
101         }
102 
103         @Override
getAlternates(String item, Set<String> output)104         public Set<String> getAlternates(String item, Set<String> output) {
105             output.add(item);
106             return equivalents.getEquivalences(item);
107         }
108 
109     }
110 
111     public static class CaseVariantFolder implements VariantFolder.AlternateFetcher {
112         private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
113         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
114         static {
115             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) {
116                 String item = it.getString();
equivalents.add(item, UCharacter.toLowerCase(item))117                 equivalents.add(item, UCharacter.toLowerCase(item));
equivalents.add(item, UCharacter.toUpperCase(item))118                 equivalents.add(item, UCharacter.toUpperCase(item));
equivalents.add(item, UCharacter.foldCase(item, true))119                 equivalents.add(item, UCharacter.foldCase(item, true));
equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null))120                 equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null));
121             }
122         }
123 
124         @Override
getAlternates(String item, Set<String> output)125         public Set<String> getAlternates(String item, Set<String> output) {
126             output.add(item);
127             return equivalents.getEquivalences(item);
128         }
129     }
130 
131     /**
132      * The class is designed to be immutable, at least as far as Java allows. That is, if the alternateFetcher is, then
133      * it will be.
134      *
135      * @param alternateFetcher
136      */
VariantFolder(AlternateFetcher alternateFetcher)137     public VariantFolder(AlternateFetcher alternateFetcher) {
138         this.alternateFetcher = alternateFetcher;
139     }
140 
141     // We keep track of the alternates for each combination of start,len
142     // so with a length of 3 we have the following structure
143     // {{0,1}, {1,2}, {2,3} -- length of 1
144     // {0,2}, {1,3}
145     // {0,3}}
146 
147     @SuppressWarnings("unchecked")
getClosure(String source)148     public Set<String> getClosure(String source) {
149         int stringLength = source.length();
150         if (stringLength == 0) {
151             Set<String> result = new HashSet<>();
152             result.add(source);
153             return result;
154         }
155         Set<String>[][] combos = new Set[stringLength][];
156         for (int i = 0; i < stringLength; ++i) {
157             combos[i] = new Set[stringLength - i];
158         }
159         for (int i = 0; i < stringLength; ++i) {
160             combos[0][i] = alternateFetcher.getAlternates(source.substring(i, i + 1),
161                 new HashSet<String>());
162         }
163         for (int level = 1; level < stringLength; ++level) {
164             // at each level, we add strings of that length (plus 1)
165             for (int start = 0; start < stringLength - level; ++start) {
166                 int limit = start + level + 1;
167                 // System.out.println(start + ", " + limit);
168                 // we first add any longer alternates
169                 Collection<String> current = combos[level][start] = new HashSet<>();
170                 current.addAll(alternateFetcher.getAlternates(source.substring(start,
171                     limit), new HashSet<String>()));
172                 // then we add the cross product of shorter strings
173                 for (int breakPoint = start + 1; breakPoint < limit; ++breakPoint) {
174                     addCrossProduct(combos[breakPoint - start - 1][start], combos[limit
175                         - breakPoint - 1][breakPoint], current);
176                 }
177             }
178         }
179         return combos[combos.length - 1][0];
180     }
181 
addCrossProduct(Collection<String> source1, Collection<String> source2, Collection<String> output)182     private void addCrossProduct(Collection<String> source1,
183         Collection<String> source2, Collection<String> output) {
184         for (String x : source1) {
185             for (String y : source2) {
186                 output.add(x + y);
187             }
188         }
189     }
190 
getClosure(UnicodeSet input)191     public UnicodeSet getClosure(UnicodeSet input) {
192         UnicodeSet result = new UnicodeSet();
193         for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) {
194             Set<String> temp = getClosure(it.getString());
195             for (String s : temp) {
196                 result.add(s);
197             }
198         }
199         return result;
200     }
201 
reduce(String s)202     public String reduce(String s) {
203         Set<String> temp = getClosure(s);
204         return temp.iterator().next();
205     }
206 
reduce(UnicodeSet input)207     public UnicodeSet reduce(UnicodeSet input) {
208         UnicodeSet result = new UnicodeSet();
209         for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) {
210             final String reduce = reduce(it.getString());
211             result.add(reduce);
212         }
213         return result;
214     }
215 }