1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.BitSet;
12 import java.util.Collection;
13 import java.util.Comparator;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 
22 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
23 
24 public class StateDictionary<T> extends Dictionary<T> {
25 
26     private static final boolean DEBUG_FLATTEN = false;
27 
28     // results of build
29     private final ArrayList<Row> builtRows;
30 
31     private final Row builtBaseRow;
32 
33     private final IntMap<T> builtResults;
34 
35     private final int builtMaxByteLength;
36 
37     private final StringByteConverter byteString;
38 
39     // TODO remove before deployment; not thread safe
40     private static int debugReferenceCount = 0;
41     private int debugReferenceNumber = debugReferenceCount++;
42 
43     /**
44      * Only should be called by StateDictionaryBuilder
45      *
46      * @param builtBaseRow2
47      * @param builtRows2
48      * @param builtResults2
49      * @param builtMaxByteLength
50      *            TODO
51      * @param byteConverter
52      *            TODO
53      */
54     StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2,
55         IntMap<T> builtResults2, int builtMaxByteLength,
56         StringByteConverter byteConverter) {
57         builtBaseRow = builtBaseRow2;
58         builtRows = builtRows2;
59         builtResults = builtResults2;
60         this.builtMaxByteLength = builtMaxByteLength;
61         this.byteString = byteConverter;
62         // builtBaseValue = builtResults.get(0);
63     }
64 
65     @Override
66     public Matcher<T> getMatcher() {
67         return new StateMatcher();
68     }
69 
70     @Override
71     public String toString() {
72         StringBuilder result = new StringBuilder();
73         // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows);
74         for (Row row : builtRows) {
75             result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR);
76         }
77         Map<T, Integer> map = builtResults.getValueMap();
78         Set<Pair<Integer, String>> sorted = new TreeSet<>();
79         for (T item : map.keySet()) {
80             sorted.add(new Pair<>(map.get(item), item.toString()));
81         }
82         for (Pair<Integer, String> pair : sorted) {
83             result.append(pair.getFirst()).append("*=").append(pair.getSecond()).append(CldrUtility.LINE_SEPARATOR);
84         }
85         return result.toString();
86     }
87 
88     @Override
89     public Iterator<Entry<CharSequence, T>> getMapping() {
90         // TODO Optimize this to only return the items on demand
91         return new TextFetcher().getWords().entrySet().iterator();
92     }
93 
94     @Override
95     public String debugShow() {
96         return new TextFetcher().debugShow();
97     }
98 
99     public int getRowCount() {
100         return builtRows.size();
101     }
102 
103     /**
104      * Internals. The text is transformed into a byte stream. A state table is
105      * used to successively map {state, byte, result} to {newstate, newresult,
106      * isReturn}. A state is represented by a Row, which is a mapping from byte to
107      * a Cell, where each cell has the {nextRow, delta result, returns flag}.
108      *
109      * <pre>
110      *  state = next state (row)
111      *  result += delta result
112      *  if (returns) return the result
113      *  &lt;pre&gt;
114      * However, the result and state are preserved for the next call on next().
115      *
116      */
117 
118     static class Row implements Comparable {
119         enum Uniqueness {
120             // the unknown value is only used in building
121             UNIQUE, AMBIGUOUS, UNKNOWN;
122 
123             public String debugName() {
124                 switch (this) {
125                 case UNIQUE:
126                     return ("¹");
127                 case AMBIGUOUS:
128                     return "²";
129                 default:
130                     return "?";
131                 }
132             }
133         }
134 
135         Uniqueness hasUniqueValue = Uniqueness.UNKNOWN;
136 
137         final TreeMap<Byte, Cell> byteToCell = new TreeMap<>();
138 
139         // keeps track of the number of cells with returns
140         transient int returnCount;
141 
142         transient int terminatingReturnCount;
143 
144         private int referenceNumber;
145 
146         Row(int rowNumber) {
147             referenceNumber = rowNumber;
148         }
149 
150         public int nonTerminating() {
151             return byteToCell.size() - terminatingReturnCount;
152         }
153 
154         public int nonReturn() {
155             return byteToCell.size() - returnCount;
156         }
157 
158         public int maximumDepth() {
159             int result = 0;
160             for (Cell cell : byteToCell.values()) {
161                 if (cell.nextRow != null) {
162                     int temp = cell.nextRow.maximumDepth() + 1;
163                     if (result < temp) {
164                         result = temp;
165                     }
166                 }
167             }
168             return result;
169         }
170 
171         @Override
172         public int compareTo(Object o) {
173             Row other = (Row) o;
174             int result;
175             // we want to sort items first with the fewest number of non-terminating
176             // returns
177             // cells, then most
178             // number of terminating returns, then most number of returns
179             if (0 != (result = maximumDepth() - other.maximumDepth()))
180                 return result;
181             if (0 != (result = byteToCell.size() - other.byteToCell.size()))
182                 return result;
183             // otherwise, try alphabetic among the keys. We are guaranteed that the
184             // sizes are the same
185             java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator();
186             for (byte key : byteToCell.keySet()) {
187                 int otherKey = otherIt.next();
188                 if (0 != (result = key - otherKey)) {
189                     return result;
190                 }
191                 // at this point, we are guaranteed that the keys are the same. Compare
192                 // deltaResults, and row
193                 Cell cell = byteToCell.get(key);
194                 Cell otherCell = other.byteToCell.get(key);
195                 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) {
196                     return result;
197                 }
198             }
199             // if we fail completely, use the age.
200             return referenceNumber - other.referenceNumber;
201         }
202 
203         @Override
204         public String toString() {
205             StringBuilder buffer = new StringBuilder();
206             buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{");
207             boolean first = true;
208             Set<Byte> sorted = new TreeSet<>(unsignedByteComparator);
209             sorted.addAll(byteToCell.keySet());
210             for (Byte key : sorted) {
211                 if (first) {
212                     first = false;
213                 } else {
214                     buffer.append(' ');
215                 }
216                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
217                 buffer.append('=');
218                 buffer.append(byteToCell.get(key));
219             }
220             buffer.append('}');
221             return buffer.toString();
222         }
223 
224         public String toStringCells() {
225             StringBuilder buffer = new StringBuilder();
226             for (Byte key : byteToCell.keySet()) {
227                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
228                 buffer.append(byteToCell.get(key).toString());
229                 buffer.append(' ');
230             }
231             return buffer.toString();
232         }
233 
234         public int getReferenceNumber() {
235             return referenceNumber;
236         }
237 
238         int compact(byte[] target) {
239             int pos = 0;
240             for (Byte key : byteToCell.keySet()) {
241                 target[pos++] = key;
242                 pos = byteToCell.get(key).addBytes(target, pos, 0);
243             }
244             target[pos++] = 0;
245             return pos;
246         }
247     }
248 
249     static class Cell {
250         public Row nextRow; // next state
251 
252         public int deltaResult;
253 
254         public boolean returns;
255 
256         public int addBytes(byte[] target, int pos, int rowDelta) {
257             pos = StateDictionary.addBytes(deltaResult, target, pos);
258             int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber();
259             rowOffset <<= 1; // make room for returns
260             if (returns)
261                 rowOffset |= 1;
262             return StateDictionary.addBytes(rowOffset, target, pos);
263         }
264 
265         @Override
266         public String toString() {
267             String result = deltaResult == 0 ? "" : String.valueOf(deltaResult);
268             if (returns) {
269                 result += "*";
270             }
271             if (nextRow != null) {
272                 if (result.length() != 0) {
273                     result += "/";
274                 }
275                 result += "R" + nextRow.getReferenceNumber();
276             }
277             return result;
278         }
279     }
280 
281     // should be private, but easier to debug if package private
282     class StateMatcher extends Matcher<T> {
283         private static final boolean SHOW_DEBUG = false;
284 
285         final private byte[] matchByteBuffer = new byte[byteString
286             .getMaxBytesPerChar()];
287 
288         private int matchByteStringIndex;
289 
290         private int matchByteBufferLength;
291 
292         // only used in matching
293         private Row matchCurrentRow;
294 
295         private int matchIntValue = -1;
296 
297         private Row matchLastRow;
298 
299         @Override
300         public Matcher<T> setOffset(int offset) {
301             matchCurrentRow = builtBaseRow;
302             partialLastRow = null; // can remove this later, only for debugging
303             partialMatchValue = 0; // ditto
304             matchIntValue = 0;
305             myMatchEnd = offset;
306             matchValue = null;
307             byteString.clear();
308             matchByteStringIndex = offset;
309             return super.setOffset(offset);
310         }
311 
312         int myMatchEnd;
313 
314         private Row partialLastRow;
315 
316         private int partialMatchValue;
317 
318         @Override
319         public Status next() {
320             if (SHOW_DEBUG) {
321                 System.out.println("NEXT: " + this);
322             }
323             if (matchCurrentRow == null) {
324                 matchIntValue = -1;
325                 matchValue = null;
326                 return Status.NONE;
327             }
328             Status result = Status.PARTIAL;
329 
330             while (text.hasCharAt(myMatchEnd)) {
331                 // get more bytes IF matchEnd is set right
332                 if (myMatchEnd == matchByteStringIndex) {
333                     if (true) { // matchEnd < text.length()
334                         char ch = text.charAt(matchByteStringIndex++);
335                         matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0);
336                         if (SHOW_DEBUG) {
337                             System.out.println("\tChar: " + ch + "\t" + com.ibm.icu.impl.Utility.hex(ch) + "\t->\t"
338                                 + CldrUtility.hex(matchByteBuffer, 0, matchByteBufferLength, " "));
339                         }
340                     } else {
341                         matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0);
342                     }
343                 }
344                 for (int i = 0; i < matchByteBufferLength; ++i) {
345                     result = nextByte(matchByteBuffer[i]);
346                     if (result != Status.PARTIAL) {
347                         break;
348                     }
349                 }
350                 // Normally, we will never have a return value except at the end of a character, so we don't need
351                 // to check after each nextByte. However, if the byteString converts C to a sequence of bytes that
352                 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* a character
353 
354                 if (result == Status.PARTIAL) {
355                     ++myMatchEnd;
356                     // and continue with the loop
357                 } else if (result == Status.MATCH) {
358                     ++myMatchEnd;
359                     matchValue = builtResults.get(matchIntValue);
360                     matchEnd = myMatchEnd;
361                     if (SHOW_DEBUG) {
362                         System.out.println("NEXT RESULT: " + result + "\t" + this);
363                     }
364                     return result;
365                 } else {
366                     // if we didn't get a MATCH, we have NONE. But in reality, there could be a possible match
367                     // so we check to see whether the current row allows for any continuation.
368                     if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) {
369                         result = Status.PARTIAL;
370                     }
371                     if (result == Status.NONE) {
372                         matchIntValue = -1;
373                         matchValue = null;
374                     }
375                     break;
376                 }
377             }
378             matchLastRow = matchCurrentRow;
379             matchCurrentRow = null;
380             if (result == Status.PARTIAL) {
381                 matchValue = builtResults.get(matchIntValue);
382                 matchEnd = myMatchEnd;
383                 partialLastRow = matchLastRow;
384                 partialMatchValue = matchIntValue;
385                 if (SHOW_DEBUG) {
386                     System.out.println("NEXT RESULT: " + result + "\t" + this);
387                 }
388             }
389             return result;
390         }
391 
392         /**
393          * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL otherwise.
394          * If we couldn't go any farther, then the currentRow is left alone.
395          *
396          * @param chunk
397          * @return
398          */
399         private Status nextByte(int chunk) {
400             if (SHOW_DEBUG) {
401                 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow);
402             }
403             Cell cell = matchCurrentRow.byteToCell.get((byte) chunk);
404             if (cell == null) {
405                 return Status.NONE;
406             }
407             matchIntValue += cell.deltaResult;
408             matchCurrentRow = cell.nextRow;
409             if (cell.returns) {
410                 return Status.MATCH;
411             }
412             return Status.PARTIAL;
413         }
414 
415         public int getIntMatchValue() {
416             return matchIntValue;
417         }
418 
419         @Override
420         public boolean nextUniquePartial() {
421             if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) {
422                 matchValue = builtResults.get(partialMatchValue);
423                 matchEnd = myMatchEnd;
424                 return true;
425             }
426             return false;
427         }
428 
429         @Override
430         public StateDictionary<T> getDictionary() {
431             return StateDictionary.this;
432         }
433     }
434 
435     static final Comparator<Byte> unsignedByteComparator = new Comparator<Byte>() {
436 
437         @Override
438         public int compare(Byte o1, Byte o2) {
439             int b1 = o1 & 0xFF;
440             int b2 = o2 & 0xFF;
441             return b1 < b2 ? -1 : b1 > b2 ? 1 : 0;
442         }
443 
444     };
445 
446     static final Comparator<Row> rowComparator = new Comparator<Row>() {
447 
448         @Override
449         public int compare(Row row1, Row row2) {
450             if (row1 == row2) {
451                 return 0;
452             } else if (row1 == null) {
453                 return -1;
454             } else if (row2 == null) {
455                 return 1;
456             }
457             int result;
458             if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) {
459                 return result;
460             }
461             java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator();
462             for (byte key : row1.byteToCell.keySet()) {
463                 byte otherKey = otherIt.next();
464                 if (0 != (result = key - otherKey)) {
465                     return result;
466                 }
467                 // at this point, we are guaranteed that the keys are the same. Compare
468                 // deltaResults, returns, and then recurse on the the row
469                 Cell cell1 = row1.byteToCell.get(key);
470                 Cell cell2 = row2.byteToCell.get(key);
471                 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) {
472                     return result;
473                 }
474                 if (cell1.returns != cell2.returns) {
475                     return cell1.returns ? 1 : -1;
476                 }
477                 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) {
478                     return result;
479                 }
480             }
481             return 0;
482 
483         }
484 
485     };
486 
487     static int addBytes(int source, byte[] target, int pos) {
488         // swap the top bit
489         if (source < 0) {
490             source = ((-source) << 1) | 1;
491         } else {
492             source <<= 1;
493         }
494         // emit the rest as 7 bit quantities with 1 as termination bit
495         while (true) {
496             byte b = (byte) (source & 0x7F);
497             source >>>= 7;
498             if (source == 0) {
499                 b |= 0x80;
500                 target[pos++] = b;
501                 return pos;
502             }
503             target[pos++] = b;
504         }
505     }
506 
507     private class TextFetcher {
508 
509         Map<CharSequence, T> result = new TreeMap<>();
510 
511         byte[] soFar = new byte[builtMaxByteLength];
512 
513         BitSet shown = new BitSet();
514 
515         StringBuilder buffer = new StringBuilder();
516 
517         StringBuilder debugTreeView = new StringBuilder();
518 
519         private HashSet<Row> rowsSeen = new HashSet<>();
520 
521         public Map<CharSequence, T> getWords() {
522             result.clear();
523             getWords(0, 0, builtBaseRow);
524             return result;
525         }
526 
527         public String debugShow() {
528             rowsSeen.clear();
529             Counter<Integer> debugCounter = new Counter<>();
530             getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE);
531             for (Row row : builtRows) {
532                 debugCounter.add(row.byteToCell.size(), 1);
533             }
534             for (Integer item : (Collection<Integer>) debugCounter
535                 .getKeysetSortedByKey()) {
536                 debugTreeView.append("cells in row=\t").append(item).append(
537                     "\trows with count=\t").append(debugCounter.getCount(item)).append(
538                         CldrUtility.LINE_SEPARATOR);
539             }
540             return debugTreeView.toString();
541         }
542 
543         private void getDebugWords(int byteLength, int resultSoFar, Row row,
544             int suppressAbove) {
545             // we do this to show when rows have already been seen, and so the structure has been compacted
546             if (rowsSeen.contains(row)) {
547                 // reset if bigger
548                 if (suppressAbove > byteLength) {
549                     suppressAbove = byteLength;
550                 }
551             } else {
552                 rowsSeen.add(row);
553             }
554             // walk through the cells, display and recurse
555             Set<Byte> sorted = new TreeSet<>(unsignedByteComparator);
556             sorted.addAll(row.byteToCell.keySet());
557             for (Byte key : sorted) {
558                 Cell cell = row.byteToCell.get(key);
559                 soFar[byteLength] = key;
560                 shown.set(byteLength, false);
561                 int currentValue = resultSoFar + cell.deltaResult;
562                 if (cell.returns) {
563                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
564                     T value2 = builtResults.get(currentValue);
565                     for (int i = 0; i <= byteLength; ++i) {
566                         debugTreeView.append(' ');
567                         if (i >= suppressAbove) {
568                             debugTreeView.append("++");
569                         } else if (shown.get(i)) {
570                             debugTreeView.append("--");
571                         } else {
572                             debugTreeView.append(com.ibm.icu.impl.Utility.hex(
573                                 soFar[i] & 0xFF, 2));
574                             shown.set(i);
575                         }
576                     }
577                     debugTreeView.append("\t<").append(key2).append(">\t<")
578                         .append(value2).append(">" + CldrUtility.LINE_SEPARATOR);
579                 }
580                 if (cell.nextRow != null) {
581                     getDebugWords(byteLength + 1, currentValue, cell.nextRow,
582                         suppressAbove);
583                 }
584             }
585         }
586 
587         // recurse through the strings
588         private void getWords(int byteLength, int resultSoFar, Row row) {
589             for (Byte key : row.byteToCell.keySet()) {
590                 Cell cell = row.byteToCell.get(key);
591                 soFar[byteLength] = key;
592                 int currentValue = resultSoFar + cell.deltaResult;
593                 if (cell.returns) {
594                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
595                     T value2 = builtResults.get(currentValue);
596                     result.put(key2, value2);
597                 }
598                 if (cell.nextRow != null) {
599                     getWords(byteLength + 1, currentValue, cell.nextRow);
600                 }
601             }
602         }
603 
604         private CharSequence stringFromBytes(byte[] soFar, int len) {
605             buffer.setLength(0);
606             byteString.fromBytes(soFar, 0, len, buffer);
607             return buffer.toString();
608         }
609     }
610 
611     /**
612      * Just for testing flattening.
613      *
614      *
615      */
616     public void flatten() {
617         TreeSet<Row> s = new TreeSet<>(builtRows);
618         int count = 0;
619         int oldDepth = 999;
620         String oldCell = "";
621         int uniqueCount = 0;
622         int cellCount = 0;
623         byte[] target = new byte[500];
624         int totalBytesCompacted = 0;
625         for (Row row : s) {
626             row.referenceNumber = count++;
627             int depth = row.maximumDepth();
628             if (depth != oldDepth) {
629                 if (DEBUG_FLATTEN) {
630                     System.out.println("*** " + depth + "***");
631                 }
632                 oldDepth = depth;
633             }
634             int bytesCompacted = row.compact(target);
635             if (DEBUG_FLATTEN) {
636                 System.out.println(bytesCompacted + "\t" + row);
637             }
638             String newCell = row.toStringCells();
639             if (!newCell.equals(oldCell)) {
640                 uniqueCount++;
641                 totalBytesCompacted += bytesCompacted;
642                 cellCount += row.byteToCell.size();
643             }
644             oldCell = newCell;
645 
646             for (Cell cell : row.byteToCell.values()) {
647                 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) {
648                     if (DEBUG_FLATTEN) {
649                         System.out.println("*** Fail");
650                     }
651                     break;
652                 }
653             }
654         }
655         System.out.println("Count: " + count);
656         System.out.println("UniqueCount: " + uniqueCount);
657         System.out.println("CellCount: " + cellCount);
658         System.out.println("TotalBytesCompacted: " + totalBytesCompacted);
659     }
660 
661 }