1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.BitSet; 12 import java.util.Collection; 13 import java.util.Comparator; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 22 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness; 23 24 public class StateDictionary<T> extends Dictionary<T> { 25 26 private static final boolean DEBUG_FLATTEN = false; 27 28 // results of build 29 private final ArrayList<Row> builtRows; 30 31 private final Row builtBaseRow; 32 33 private final IntMap<T> builtResults; 34 35 private final int builtMaxByteLength; 36 37 private final StringByteConverter byteString; 38 39 // TODO remove before deployment; not thread safe 40 private static int debugReferenceCount = 0; 41 private int debugReferenceNumber = debugReferenceCount++; 42 43 /** 44 * Only should be called by StateDictionaryBuilder 45 * 46 * @param builtBaseRow2 47 * @param builtRows2 48 * @param builtResults2 49 * @param builtMaxByteLength 50 * TODO 51 * @param byteConverter 52 * TODO 53 */ 54 StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2, 55 IntMap<T> builtResults2, int builtMaxByteLength, 56 StringByteConverter byteConverter) { 57 builtBaseRow = builtBaseRow2; 58 builtRows = builtRows2; 59 builtResults = builtResults2; 60 this.builtMaxByteLength = builtMaxByteLength; 61 this.byteString = byteConverter; 62 // builtBaseValue = builtResults.get(0); 63 } 64 65 @Override 66 public Matcher<T> getMatcher() { 67 return new StateMatcher(); 68 } 69 70 @Override 71 public String toString() { 72 StringBuilder result = new StringBuilder(); 73 // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows); 74 for (Row row : builtRows) { 75 result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR); 76 } 77 Map<T, Integer> map = builtResults.getValueMap(); 78 Set<Pair<Integer, String>> sorted = new TreeSet<>(); 79 for (T item : map.keySet()) { 80 sorted.add(new Pair<>(map.get(item), item.toString())); 81 } 82 for (Pair<Integer, String> pair : sorted) { 83 result.append(pair.getFirst()).append("*=").append(pair.getSecond()).append(CldrUtility.LINE_SEPARATOR); 84 } 85 return result.toString(); 86 } 87 88 @Override 89 public Iterator<Entry<CharSequence, T>> getMapping() { 90 // TODO Optimize this to only return the items on demand 91 return new TextFetcher().getWords().entrySet().iterator(); 92 } 93 94 @Override 95 public String debugShow() { 96 return new TextFetcher().debugShow(); 97 } 98 99 public int getRowCount() { 100 return builtRows.size(); 101 } 102 103 /** 104 * Internals. The text is transformed into a byte stream. A state table is 105 * used to successively map {state, byte, result} to {newstate, newresult, 106 * isReturn}. A state is represented by a Row, which is a mapping from byte to 107 * a Cell, where each cell has the {nextRow, delta result, returns flag}. 108 * 109 * <pre> 110 * state = next state (row) 111 * result += delta result 112 * if (returns) return the result 113 * <pre> 114 * However, the result and state are preserved for the next call on next(). 115 * 116 */ 117 118 static class Row implements Comparable { 119 enum Uniqueness { 120 // the unknown value is only used in building 121 UNIQUE, AMBIGUOUS, UNKNOWN; 122 123 public String debugName() { 124 switch (this) { 125 case UNIQUE: 126 return ("¹"); 127 case AMBIGUOUS: 128 return "²"; 129 default: 130 return "?"; 131 } 132 } 133 } 134 135 Uniqueness hasUniqueValue = Uniqueness.UNKNOWN; 136 137 final TreeMap<Byte, Cell> byteToCell = new TreeMap<>(); 138 139 // keeps track of the number of cells with returns 140 transient int returnCount; 141 142 transient int terminatingReturnCount; 143 144 private int referenceNumber; 145 146 Row(int rowNumber) { 147 referenceNumber = rowNumber; 148 } 149 150 public int nonTerminating() { 151 return byteToCell.size() - terminatingReturnCount; 152 } 153 154 public int nonReturn() { 155 return byteToCell.size() - returnCount; 156 } 157 158 public int maximumDepth() { 159 int result = 0; 160 for (Cell cell : byteToCell.values()) { 161 if (cell.nextRow != null) { 162 int temp = cell.nextRow.maximumDepth() + 1; 163 if (result < temp) { 164 result = temp; 165 } 166 } 167 } 168 return result; 169 } 170 171 @Override 172 public int compareTo(Object o) { 173 Row other = (Row) o; 174 int result; 175 // we want to sort items first with the fewest number of non-terminating 176 // returns 177 // cells, then most 178 // number of terminating returns, then most number of returns 179 if (0 != (result = maximumDepth() - other.maximumDepth())) 180 return result; 181 if (0 != (result = byteToCell.size() - other.byteToCell.size())) 182 return result; 183 // otherwise, try alphabetic among the keys. We are guaranteed that the 184 // sizes are the same 185 java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator(); 186 for (byte key : byteToCell.keySet()) { 187 int otherKey = otherIt.next(); 188 if (0 != (result = key - otherKey)) { 189 return result; 190 } 191 // at this point, we are guaranteed that the keys are the same. Compare 192 // deltaResults, and row 193 Cell cell = byteToCell.get(key); 194 Cell otherCell = other.byteToCell.get(key); 195 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) { 196 return result; 197 } 198 } 199 // if we fail completely, use the age. 200 return referenceNumber - other.referenceNumber; 201 } 202 203 @Override 204 public String toString() { 205 StringBuilder buffer = new StringBuilder(); 206 buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{"); 207 boolean first = true; 208 Set<Byte> sorted = new TreeSet<>(unsignedByteComparator); 209 sorted.addAll(byteToCell.keySet()); 210 for (Byte key : sorted) { 211 if (first) { 212 first = false; 213 } else { 214 buffer.append(' '); 215 } 216 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 217 buffer.append('='); 218 buffer.append(byteToCell.get(key)); 219 } 220 buffer.append('}'); 221 return buffer.toString(); 222 } 223 224 public String toStringCells() { 225 StringBuilder buffer = new StringBuilder(); 226 for (Byte key : byteToCell.keySet()) { 227 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 228 buffer.append(byteToCell.get(key).toString()); 229 buffer.append(' '); 230 } 231 return buffer.toString(); 232 } 233 234 public int getReferenceNumber() { 235 return referenceNumber; 236 } 237 238 int compact(byte[] target) { 239 int pos = 0; 240 for (Byte key : byteToCell.keySet()) { 241 target[pos++] = key; 242 pos = byteToCell.get(key).addBytes(target, pos, 0); 243 } 244 target[pos++] = 0; 245 return pos; 246 } 247 } 248 249 static class Cell { 250 public Row nextRow; // next state 251 252 public int deltaResult; 253 254 public boolean returns; 255 256 public int addBytes(byte[] target, int pos, int rowDelta) { 257 pos = StateDictionary.addBytes(deltaResult, target, pos); 258 int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber(); 259 rowOffset <<= 1; // make room for returns 260 if (returns) 261 rowOffset |= 1; 262 return StateDictionary.addBytes(rowOffset, target, pos); 263 } 264 265 @Override 266 public String toString() { 267 String result = deltaResult == 0 ? "" : String.valueOf(deltaResult); 268 if (returns) { 269 result += "*"; 270 } 271 if (nextRow != null) { 272 if (result.length() != 0) { 273 result += "/"; 274 } 275 result += "R" + nextRow.getReferenceNumber(); 276 } 277 return result; 278 } 279 } 280 281 // should be private, but easier to debug if package private 282 class StateMatcher extends Matcher<T> { 283 private static final boolean SHOW_DEBUG = false; 284 285 final private byte[] matchByteBuffer = new byte[byteString 286 .getMaxBytesPerChar()]; 287 288 private int matchByteStringIndex; 289 290 private int matchByteBufferLength; 291 292 // only used in matching 293 private Row matchCurrentRow; 294 295 private int matchIntValue = -1; 296 297 private Row matchLastRow; 298 299 @Override 300 public Matcher<T> setOffset(int offset) { 301 matchCurrentRow = builtBaseRow; 302 partialLastRow = null; // can remove this later, only for debugging 303 partialMatchValue = 0; // ditto 304 matchIntValue = 0; 305 myMatchEnd = offset; 306 matchValue = null; 307 byteString.clear(); 308 matchByteStringIndex = offset; 309 return super.setOffset(offset); 310 } 311 312 int myMatchEnd; 313 314 private Row partialLastRow; 315 316 private int partialMatchValue; 317 318 @Override 319 public Status next() { 320 if (SHOW_DEBUG) { 321 System.out.println("NEXT: " + this); 322 } 323 if (matchCurrentRow == null) { 324 matchIntValue = -1; 325 matchValue = null; 326 return Status.NONE; 327 } 328 Status result = Status.PARTIAL; 329 330 while (text.hasCharAt(myMatchEnd)) { 331 // get more bytes IF matchEnd is set right 332 if (myMatchEnd == matchByteStringIndex) { 333 if (true) { // matchEnd < text.length() 334 char ch = text.charAt(matchByteStringIndex++); 335 matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0); 336 if (SHOW_DEBUG) { 337 System.out.println("\tChar: " + ch + "\t" + com.ibm.icu.impl.Utility.hex(ch) + "\t->\t" 338 + CldrUtility.hex(matchByteBuffer, 0, matchByteBufferLength, " ")); 339 } 340 } else { 341 matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0); 342 } 343 } 344 for (int i = 0; i < matchByteBufferLength; ++i) { 345 result = nextByte(matchByteBuffer[i]); 346 if (result != Status.PARTIAL) { 347 break; 348 } 349 } 350 // Normally, we will never have a return value except at the end of a character, so we don't need 351 // to check after each nextByte. However, if the byteString converts C to a sequence of bytes that 352 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* a character 353 354 if (result == Status.PARTIAL) { 355 ++myMatchEnd; 356 // and continue with the loop 357 } else if (result == Status.MATCH) { 358 ++myMatchEnd; 359 matchValue = builtResults.get(matchIntValue); 360 matchEnd = myMatchEnd; 361 if (SHOW_DEBUG) { 362 System.out.println("NEXT RESULT: " + result + "\t" + this); 363 } 364 return result; 365 } else { 366 // if we didn't get a MATCH, we have NONE. But in reality, there could be a possible match 367 // so we check to see whether the current row allows for any continuation. 368 if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) { 369 result = Status.PARTIAL; 370 } 371 if (result == Status.NONE) { 372 matchIntValue = -1; 373 matchValue = null; 374 } 375 break; 376 } 377 } 378 matchLastRow = matchCurrentRow; 379 matchCurrentRow = null; 380 if (result == Status.PARTIAL) { 381 matchValue = builtResults.get(matchIntValue); 382 matchEnd = myMatchEnd; 383 partialLastRow = matchLastRow; 384 partialMatchValue = matchIntValue; 385 if (SHOW_DEBUG) { 386 System.out.println("NEXT RESULT: " + result + "\t" + this); 387 } 388 } 389 return result; 390 } 391 392 /** 393 * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL otherwise. 394 * If we couldn't go any farther, then the currentRow is left alone. 395 * 396 * @param chunk 397 * @return 398 */ 399 private Status nextByte(int chunk) { 400 if (SHOW_DEBUG) { 401 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow); 402 } 403 Cell cell = matchCurrentRow.byteToCell.get((byte) chunk); 404 if (cell == null) { 405 return Status.NONE; 406 } 407 matchIntValue += cell.deltaResult; 408 matchCurrentRow = cell.nextRow; 409 if (cell.returns) { 410 return Status.MATCH; 411 } 412 return Status.PARTIAL; 413 } 414 415 public int getIntMatchValue() { 416 return matchIntValue; 417 } 418 419 @Override 420 public boolean nextUniquePartial() { 421 if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) { 422 matchValue = builtResults.get(partialMatchValue); 423 matchEnd = myMatchEnd; 424 return true; 425 } 426 return false; 427 } 428 429 @Override 430 public StateDictionary<T> getDictionary() { 431 return StateDictionary.this; 432 } 433 } 434 435 static final Comparator<Byte> unsignedByteComparator = new Comparator<Byte>() { 436 437 @Override 438 public int compare(Byte o1, Byte o2) { 439 int b1 = o1 & 0xFF; 440 int b2 = o2 & 0xFF; 441 return b1 < b2 ? -1 : b1 > b2 ? 1 : 0; 442 } 443 444 }; 445 446 static final Comparator<Row> rowComparator = new Comparator<Row>() { 447 448 @Override 449 public int compare(Row row1, Row row2) { 450 if (row1 == row2) { 451 return 0; 452 } else if (row1 == null) { 453 return -1; 454 } else if (row2 == null) { 455 return 1; 456 } 457 int result; 458 if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) { 459 return result; 460 } 461 java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator(); 462 for (byte key : row1.byteToCell.keySet()) { 463 byte otherKey = otherIt.next(); 464 if (0 != (result = key - otherKey)) { 465 return result; 466 } 467 // at this point, we are guaranteed that the keys are the same. Compare 468 // deltaResults, returns, and then recurse on the the row 469 Cell cell1 = row1.byteToCell.get(key); 470 Cell cell2 = row2.byteToCell.get(key); 471 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) { 472 return result; 473 } 474 if (cell1.returns != cell2.returns) { 475 return cell1.returns ? 1 : -1; 476 } 477 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) { 478 return result; 479 } 480 } 481 return 0; 482 483 } 484 485 }; 486 487 static int addBytes(int source, byte[] target, int pos) { 488 // swap the top bit 489 if (source < 0) { 490 source = ((-source) << 1) | 1; 491 } else { 492 source <<= 1; 493 } 494 // emit the rest as 7 bit quantities with 1 as termination bit 495 while (true) { 496 byte b = (byte) (source & 0x7F); 497 source >>>= 7; 498 if (source == 0) { 499 b |= 0x80; 500 target[pos++] = b; 501 return pos; 502 } 503 target[pos++] = b; 504 } 505 } 506 507 private class TextFetcher { 508 509 Map<CharSequence, T> result = new TreeMap<>(); 510 511 byte[] soFar = new byte[builtMaxByteLength]; 512 513 BitSet shown = new BitSet(); 514 515 StringBuilder buffer = new StringBuilder(); 516 517 StringBuilder debugTreeView = new StringBuilder(); 518 519 private HashSet<Row> rowsSeen = new HashSet<>(); 520 521 public Map<CharSequence, T> getWords() { 522 result.clear(); 523 getWords(0, 0, builtBaseRow); 524 return result; 525 } 526 527 public String debugShow() { 528 rowsSeen.clear(); 529 Counter<Integer> debugCounter = new Counter<>(); 530 getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE); 531 for (Row row : builtRows) { 532 debugCounter.add(row.byteToCell.size(), 1); 533 } 534 for (Integer item : (Collection<Integer>) debugCounter 535 .getKeysetSortedByKey()) { 536 debugTreeView.append("cells in row=\t").append(item).append( 537 "\trows with count=\t").append(debugCounter.getCount(item)).append( 538 CldrUtility.LINE_SEPARATOR); 539 } 540 return debugTreeView.toString(); 541 } 542 543 private void getDebugWords(int byteLength, int resultSoFar, Row row, 544 int suppressAbove) { 545 // we do this to show when rows have already been seen, and so the structure has been compacted 546 if (rowsSeen.contains(row)) { 547 // reset if bigger 548 if (suppressAbove > byteLength) { 549 suppressAbove = byteLength; 550 } 551 } else { 552 rowsSeen.add(row); 553 } 554 // walk through the cells, display and recurse 555 Set<Byte> sorted = new TreeSet<>(unsignedByteComparator); 556 sorted.addAll(row.byteToCell.keySet()); 557 for (Byte key : sorted) { 558 Cell cell = row.byteToCell.get(key); 559 soFar[byteLength] = key; 560 shown.set(byteLength, false); 561 int currentValue = resultSoFar + cell.deltaResult; 562 if (cell.returns) { 563 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 564 T value2 = builtResults.get(currentValue); 565 for (int i = 0; i <= byteLength; ++i) { 566 debugTreeView.append(' '); 567 if (i >= suppressAbove) { 568 debugTreeView.append("++"); 569 } else if (shown.get(i)) { 570 debugTreeView.append("--"); 571 } else { 572 debugTreeView.append(com.ibm.icu.impl.Utility.hex( 573 soFar[i] & 0xFF, 2)); 574 shown.set(i); 575 } 576 } 577 debugTreeView.append("\t<").append(key2).append(">\t<") 578 .append(value2).append(">" + CldrUtility.LINE_SEPARATOR); 579 } 580 if (cell.nextRow != null) { 581 getDebugWords(byteLength + 1, currentValue, cell.nextRow, 582 suppressAbove); 583 } 584 } 585 } 586 587 // recurse through the strings 588 private void getWords(int byteLength, int resultSoFar, Row row) { 589 for (Byte key : row.byteToCell.keySet()) { 590 Cell cell = row.byteToCell.get(key); 591 soFar[byteLength] = key; 592 int currentValue = resultSoFar + cell.deltaResult; 593 if (cell.returns) { 594 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 595 T value2 = builtResults.get(currentValue); 596 result.put(key2, value2); 597 } 598 if (cell.nextRow != null) { 599 getWords(byteLength + 1, currentValue, cell.nextRow); 600 } 601 } 602 } 603 604 private CharSequence stringFromBytes(byte[] soFar, int len) { 605 buffer.setLength(0); 606 byteString.fromBytes(soFar, 0, len, buffer); 607 return buffer.toString(); 608 } 609 } 610 611 /** 612 * Just for testing flattening. 613 * 614 * 615 */ 616 public void flatten() { 617 TreeSet<Row> s = new TreeSet<>(builtRows); 618 int count = 0; 619 int oldDepth = 999; 620 String oldCell = ""; 621 int uniqueCount = 0; 622 int cellCount = 0; 623 byte[] target = new byte[500]; 624 int totalBytesCompacted = 0; 625 for (Row row : s) { 626 row.referenceNumber = count++; 627 int depth = row.maximumDepth(); 628 if (depth != oldDepth) { 629 if (DEBUG_FLATTEN) { 630 System.out.println("*** " + depth + "***"); 631 } 632 oldDepth = depth; 633 } 634 int bytesCompacted = row.compact(target); 635 if (DEBUG_FLATTEN) { 636 System.out.println(bytesCompacted + "\t" + row); 637 } 638 String newCell = row.toStringCells(); 639 if (!newCell.equals(oldCell)) { 640 uniqueCount++; 641 totalBytesCompacted += bytesCompacted; 642 cellCount += row.byteToCell.size(); 643 } 644 oldCell = newCell; 645 646 for (Cell cell : row.byteToCell.values()) { 647 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) { 648 if (DEBUG_FLATTEN) { 649 System.out.println("*** Fail"); 650 } 651 break; 652 } 653 } 654 } 655 System.out.println("Count: " + count); 656 System.out.println("UniqueCount: " + uniqueCount); 657 System.out.println("CellCount: " + cellCount); 658 System.out.println("TotalBytesCompacted: " + totalBytesCompacted); 659 } 660 661 }