1 /* 2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 package sun.text; 41 42 import java.nio.BufferUnderflowException; 43 import java.nio.ByteBuffer; 44 import java.util.MissingResourceException; 45 import sun.text.CompactByteArray; 46 import sun.text.SupplementaryCharacterData; 47 48 /** 49 * This is the class that represents the list of known words used by 50 * DictionaryBasedBreakIterator. The conceptual data structure used 51 * here is a trie: there is a node hanging off the root node for every 52 * letter that can start a word. Each of these nodes has a node hanging 53 * off of it for every letter that can be the second letter of a word 54 * if this node is the first letter, and so on. The trie is represented 55 * as a two-dimensional array that can be treated as a table of state 56 * transitions. Indexes are used to compress this array, taking 57 * advantage of the fact that this array will always be very sparse. 58 */ 59 class BreakDictionary { 60 61 //========================================================================= 62 // data members 63 //========================================================================= 64 65 /** 66 * The version of the dictionary that was read in. 67 */ 68 private static int supportedVersion = 1; 69 70 /** 71 * Maps from characters to column numbers. The main use of this is to 72 * avoid making room in the array for empty columns. 73 */ 74 private CompactByteArray columnMap = null; 75 private SupplementaryCharacterData supplementaryCharColumnMap = null; 76 77 /** 78 * The number of actual columns in the table 79 */ 80 private int numCols; 81 82 /** 83 * Columns are organized into groups of 32. This says how many 84 * column groups. (We could calculate this, but we store the 85 * value to avoid having to repeatedly calculate it.) 86 */ 87 private int numColGroups; 88 89 /** 90 * The actual compressed state table. Each conceptual row represents 91 * a state, and the cells in it contain the row numbers of the states 92 * to transition to for each possible letter. 0 is used to indicate 93 * an illegal combination of letters (i.e., the error state). The 94 * table is compressed by eliminating all the unpopulated (i.e., zero) 95 * cells. Multiple conceptual rows can then be doubled up in a single 96 * physical row by sliding them up and possibly shifting them to one 97 * side or the other so the populated cells don't collide. Indexes 98 * are used to identify unpopulated cells and to locate populated cells. 99 */ 100 private short[] table = null; 101 102 /** 103 * This index maps logical row numbers to physical row numbers 104 */ 105 private short[] rowIndex = null; 106 107 /** 108 * A bitmap is used to tell which cells in the comceptual table are 109 * populated. This array contains all the unique bit combinations 110 * in that bitmap. If the table is more than 32 columns wide, 111 * successive entries in this array are used for a single row. 112 */ 113 private int[] rowIndexFlags = null; 114 115 /** 116 * This index maps from a logical row number into the bitmap table above. 117 * (This keeps us from storing duplicate bitmap combinations.) Since there 118 * are a lot of rows with only one populated cell, instead of wasting space 119 * in the bitmap table, we just store a negative number in this index for 120 * rows with one populated cell. The absolute value of that number is 121 * the column number of the populated cell. 122 */ 123 private short[] rowIndexFlagsIndex = null; 124 125 /** 126 * For each logical row, this index contains a constant that is added to 127 * the logical column number to get the physical column number 128 */ 129 private byte[] rowIndexShifts = null; 130 131 //========================================================================= 132 // deserialization 133 //========================================================================= 134 BreakDictionary(String dictionaryName, byte[] dictionaryData)135 BreakDictionary(String dictionaryName, byte[] dictionaryData) { 136 try { 137 setupDictionary(dictionaryName, dictionaryData); 138 } catch (BufferUnderflowException bue) { 139 MissingResourceException e; 140 e = new MissingResourceException("Corrupted dictionary data", 141 dictionaryName, ""); 142 e.initCause(bue); 143 throw e; 144 } 145 } 146 setupDictionary(String dictionaryName, byte[] dictionaryData)147 private void setupDictionary(String dictionaryName, byte[] dictionaryData) { 148 ByteBuffer bb = ByteBuffer.wrap(dictionaryData); 149 150 // check version 151 int version = bb.getInt(); 152 if (version != supportedVersion) { 153 throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", 154 dictionaryName, ""); 155 } 156 157 // Check data size 158 int len = bb.getInt(); 159 if (bb.position() + len != bb.limit()) { 160 throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(), 161 dictionaryName, ""); 162 } 163 164 // read in the column map for BMP characteres (this is serialized in 165 // its internal form: an index array followed by a data array) 166 len = bb.getInt(); 167 short[] temp = new short[len]; 168 for (int i = 0; i < len; i++) { 169 temp[i] = bb.getShort(); 170 } 171 len = bb.getInt(); 172 byte[] temp2 = new byte[len]; 173 bb.get(temp2); 174 columnMap = new CompactByteArray(temp, temp2); 175 176 // read in numCols and numColGroups 177 numCols = bb.getInt(); 178 numColGroups = bb.getInt(); 179 180 // read in the row-number index 181 len = bb.getInt(); 182 rowIndex = new short[len]; 183 for (int i = 0; i < len; i++) { 184 rowIndex[i] = bb.getShort(); 185 } 186 187 // load in the populated-cells bitmap: index first, then bitmap list 188 len = bb.getInt(); 189 rowIndexFlagsIndex = new short[len]; 190 for (int i = 0; i < len; i++) { 191 rowIndexFlagsIndex[i] = bb.getShort(); 192 } 193 len = bb.getInt(); 194 rowIndexFlags = new int[len]; 195 for (int i = 0; i < len; i++) { 196 rowIndexFlags[i] = bb.getInt(); 197 } 198 199 // load in the row-shift index 200 len = bb.getInt(); 201 rowIndexShifts = new byte[len]; 202 bb.get(rowIndexShifts); 203 204 // load in the actual state table 205 len = bb.getInt(); 206 table = new short[len]; 207 for (int i = 0; i < len; i++) { 208 table[i] = bb.getShort(); 209 } 210 211 // finally, prepare the column map for supplementary characters 212 len = bb.getInt(); 213 int[] temp3 = new int[len]; 214 for (int i = 0; i < len; i++) { 215 temp3[i] = bb.getInt(); 216 } 217 assert bb.position() == bb.limit(); 218 219 supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); 220 } 221 222 //========================================================================= 223 // access to the words 224 //========================================================================= 225 226 /** 227 * Uses the column map to map the character to a column number, then 228 * passes the row and column number to getNextState() 229 * @param row The current state 230 * @param ch The character whose column we're interested in 231 * @return The new state to transition to 232 */ getNextStateFromCharacter(int row, int ch)233 public final short getNextStateFromCharacter(int row, int ch) { 234 int col; 235 if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 236 col = columnMap.elementAt((char)ch); 237 } else { 238 col = supplementaryCharColumnMap.getValue(ch); 239 } 240 return getNextState(row, col); 241 } 242 243 /** 244 * Returns the value in the cell with the specified (logical) row and 245 * column numbers. In DictionaryBasedBreakIterator, the row number is 246 * a state number, the column number is an input, and the return value 247 * is the row number of the new state to transition to. (0 is the 248 * "error" state, and -1 is the "end of word" state in a dictionary) 249 * @param row The row number of the current state 250 * @param col The column number of the input character (0 means "not a 251 * dictionary character") 252 * @return The row number of the new state to transition to 253 */ getNextState(int row, int col)254 public final short getNextState(int row, int col) { 255 if (cellIsPopulated(row, col)) { 256 // we map from logical to physical row number by looking up the 257 // mapping in rowIndex; we map from logical column number to 258 // physical column number by looking up a shift value for this 259 // logical row and offsetting the logical column number by 260 // the shift amount. Then we can use internalAt() to actually 261 // get the value out of the table. 262 return internalAt(rowIndex[row], col + rowIndexShifts[row]); 263 } 264 else { 265 return 0; 266 } 267 } 268 269 /** 270 * Given (logical) row and column numbers, returns true if the 271 * cell in that position is populated 272 */ cellIsPopulated(int row, int col)273 private boolean cellIsPopulated(int row, int col) { 274 // look up the entry in the bitmap index for the specified row. 275 // If it's a negative number, it's the column number of the only 276 // populated cell in the row 277 if (rowIndexFlagsIndex[row] < 0) { 278 return col == -rowIndexFlagsIndex[row]; 279 } 280 281 // if it's a positive number, it's the offset of an entry in the bitmap 282 // list. If the table is more than 32 columns wide, the bitmap is stored 283 // successive entries in the bitmap list, so we have to divide the column 284 // number by 32 and offset the number we got out of the index by the result. 285 // Once we have the appropriate piece of the bitmap, test the appropriate 286 // bit and return the result. 287 else { 288 int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; 289 return (flags & (1 << (col & 0x1f))) != 0; 290 } 291 } 292 293 /** 294 * Implementation of getNextState() when we know the specified cell is 295 * populated. 296 * @param row The PHYSICAL row number of the cell 297 * @param col The PHYSICAL column number of the cell 298 * @return The value stored in the cell 299 */ internalAt(int row, int col)300 private short internalAt(int row, int col) { 301 // the table is a one-dimensional array, so this just does the math necessary 302 // to treat it as a two-dimensional array (we don't just use a two-dimensional 303 // array because two-dimensional arrays are inefficient in Java) 304 return table[row * numCols + col]; 305 } 306 } 307