1 /*
2  * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /*
27  *
28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30  *
31  * The original version of this source code and documentation
32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
33  * subsidiary of IBM. These materials are provided under terms
34  * of a License Agreement between Taligent and Sun. This technology
35  * is protected by multiple US and International patents.
36  *
37  * This notice and attribution to Taligent may not be removed.
38  * Taligent is a registered trademark of Taligent, Inc.
39  */
40 package sun.text;
41 
42 import java.nio.BufferUnderflowException;
43 import java.nio.ByteBuffer;
44 import java.util.MissingResourceException;
45 import sun.text.CompactByteArray;
46 import sun.text.SupplementaryCharacterData;
47 
48 /**
49  * This is the class that represents the list of known words used by
50  * DictionaryBasedBreakIterator.  The conceptual data structure used
51  * here is a trie: there is a node hanging off the root node for every
52  * letter that can start a word.  Each of these nodes has a node hanging
53  * off of it for every letter that can be the second letter of a word
54  * if this node is the first letter, and so on.  The trie is represented
55  * as a two-dimensional array that can be treated as a table of state
56  * transitions.  Indexes are used to compress this array, taking
57  * advantage of the fact that this array will always be very sparse.
58  */
59 class BreakDictionary {
60 
61     //=========================================================================
62     // data members
63     //=========================================================================
64 
65     /**
66       * The version of the dictionary that was read in.
67       */
68     private static int supportedVersion = 1;
69 
70     /**
71      * Maps from characters to column numbers.  The main use of this is to
72      * avoid making room in the array for empty columns.
73      */
74     private CompactByteArray columnMap = null;
75     private SupplementaryCharacterData supplementaryCharColumnMap = null;
76 
77     /**
78      * The number of actual columns in the table
79      */
80     private int numCols;
81 
82     /**
83      * Columns are organized into groups of 32.  This says how many
84      * column groups.  (We could calculate this, but we store the
85      * value to avoid having to repeatedly calculate it.)
86      */
87     private int numColGroups;
88 
89     /**
90      * The actual compressed state table.  Each conceptual row represents
91      * a state, and the cells in it contain the row numbers of the states
92      * to transition to for each possible letter.  0 is used to indicate
93      * an illegal combination of letters (i.e., the error state).  The
94      * table is compressed by eliminating all the unpopulated (i.e., zero)
95      * cells.  Multiple conceptual rows can then be doubled up in a single
96      * physical row by sliding them up and possibly shifting them to one
97      * side or the other so the populated cells don't collide.  Indexes
98      * are used to identify unpopulated cells and to locate populated cells.
99      */
100     private short[] table = null;
101 
102     /**
103      * This index maps logical row numbers to physical row numbers
104      */
105     private short[] rowIndex = null;
106 
107     /**
108      * A bitmap is used to tell which cells in the comceptual table are
109      * populated.  This array contains all the unique bit combinations
110      * in that bitmap.  If the table is more than 32 columns wide,
111      * successive entries in this array are used for a single row.
112      */
113     private int[] rowIndexFlags = null;
114 
115     /**
116      * This index maps from a logical row number into the bitmap table above.
117      * (This keeps us from storing duplicate bitmap combinations.)  Since there
118      * are a lot of rows with only one populated cell, instead of wasting space
119      * in the bitmap table, we just store a negative number in this index for
120      * rows with one populated cell.  The absolute value of that number is
121      * the column number of the populated cell.
122      */
123     private short[] rowIndexFlagsIndex = null;
124 
125     /**
126      * For each logical row, this index contains a constant that is added to
127      * the logical column number to get the physical column number
128      */
129     private byte[] rowIndexShifts = null;
130 
131     //=========================================================================
132     // deserialization
133     //=========================================================================
134 
BreakDictionary(String dictionaryName, byte[] dictionaryData)135     BreakDictionary(String dictionaryName, byte[] dictionaryData) {
136         try {
137             setupDictionary(dictionaryName, dictionaryData);
138         } catch (BufferUnderflowException bue) {
139             MissingResourceException e;
140             e = new MissingResourceException("Corrupted dictionary data",
141                                              dictionaryName, "");
142             e.initCause(bue);
143             throw e;
144         }
145     }
146 
setupDictionary(String dictionaryName, byte[] dictionaryData)147     private void setupDictionary(String dictionaryName, byte[] dictionaryData) {
148         ByteBuffer bb = ByteBuffer.wrap(dictionaryData);
149 
150         // check version
151         int version = bb.getInt();
152         if (version != supportedVersion) {
153             throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
154                                                dictionaryName, "");
155         }
156 
157         // Check data size
158         int len = bb.getInt();
159         if (bb.position() + len != bb.limit()) {
160             throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(),
161                                                dictionaryName, "");
162         }
163 
164         // read in the column map for BMP characteres (this is serialized in
165         // its internal form: an index array followed by a data array)
166         len = bb.getInt();
167         short[] temp = new short[len];
168         for (int i = 0; i < len; i++) {
169             temp[i] = bb.getShort();
170         }
171         len = bb.getInt();
172         byte[] temp2 = new byte[len];
173         bb.get(temp2);
174         columnMap = new CompactByteArray(temp, temp2);
175 
176         // read in numCols and numColGroups
177         numCols = bb.getInt();
178         numColGroups = bb.getInt();
179 
180         // read in the row-number index
181         len = bb.getInt();
182         rowIndex = new short[len];
183         for (int i = 0; i < len; i++) {
184             rowIndex[i] = bb.getShort();
185         }
186 
187         // load in the populated-cells bitmap: index first, then bitmap list
188         len = bb.getInt();
189         rowIndexFlagsIndex = new short[len];
190         for (int i = 0; i < len; i++) {
191             rowIndexFlagsIndex[i] = bb.getShort();
192         }
193         len = bb.getInt();
194         rowIndexFlags = new int[len];
195         for (int i = 0; i < len; i++) {
196             rowIndexFlags[i] = bb.getInt();
197         }
198 
199         // load in the row-shift index
200         len = bb.getInt();
201         rowIndexShifts = new byte[len];
202         bb.get(rowIndexShifts);
203 
204         // load in the actual state table
205         len = bb.getInt();
206         table = new short[len];
207         for (int i = 0; i < len; i++) {
208             table[i] = bb.getShort();
209         }
210 
211         // finally, prepare the column map for supplementary characters
212         len = bb.getInt();
213         int[] temp3 = new int[len];
214         for (int i = 0; i < len; i++) {
215             temp3[i] = bb.getInt();
216         }
217         assert bb.position() == bb.limit();
218 
219         supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
220     }
221 
222     //=========================================================================
223     // access to the words
224     //=========================================================================
225 
226     /**
227      * Uses the column map to map the character to a column number, then
228      * passes the row and column number to getNextState()
229      * @param row The current state
230      * @param ch The character whose column we're interested in
231      * @return The new state to transition to
232      */
getNextStateFromCharacter(int row, int ch)233     public final short getNextStateFromCharacter(int row, int ch) {
234         int col;
235         if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
236             col = columnMap.elementAt((char)ch);
237         } else {
238             col = supplementaryCharColumnMap.getValue(ch);
239         }
240         return getNextState(row, col);
241     }
242 
243     /**
244      * Returns the value in the cell with the specified (logical) row and
245      * column numbers.  In DictionaryBasedBreakIterator, the row number is
246      * a state number, the column number is an input, and the return value
247      * is the row number of the new state to transition to.  (0 is the
248      * "error" state, and -1 is the "end of word" state in a dictionary)
249      * @param row The row number of the current state
250      * @param col The column number of the input character (0 means "not a
251      * dictionary character")
252      * @return The row number of the new state to transition to
253      */
getNextState(int row, int col)254     public final short getNextState(int row, int col) {
255         if (cellIsPopulated(row, col)) {
256             // we map from logical to physical row number by looking up the
257             // mapping in rowIndex; we map from logical column number to
258             // physical column number by looking up a shift value for this
259             // logical row and offsetting the logical column number by
260             // the shift amount.  Then we can use internalAt() to actually
261             // get the value out of the table.
262             return internalAt(rowIndex[row], col + rowIndexShifts[row]);
263         }
264         else {
265             return 0;
266         }
267     }
268 
269     /**
270      * Given (logical) row and column numbers, returns true if the
271      * cell in that position is populated
272      */
cellIsPopulated(int row, int col)273     private boolean cellIsPopulated(int row, int col) {
274         // look up the entry in the bitmap index for the specified row.
275         // If it's a negative number, it's the column number of the only
276         // populated cell in the row
277         if (rowIndexFlagsIndex[row] < 0) {
278             return col == -rowIndexFlagsIndex[row];
279         }
280 
281         // if it's a positive number, it's the offset of an entry in the bitmap
282         // list.  If the table is more than 32 columns wide, the bitmap is stored
283         // successive entries in the bitmap list, so we have to divide the column
284         // number by 32 and offset the number we got out of the index by the result.
285         // Once we have the appropriate piece of the bitmap, test the appropriate
286         // bit and return the result.
287         else {
288             int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
289             return (flags & (1 << (col & 0x1f))) != 0;
290         }
291     }
292 
293     /**
294      * Implementation of getNextState() when we know the specified cell is
295      * populated.
296      * @param row The PHYSICAL row number of the cell
297      * @param col The PHYSICAL column number of the cell
298      * @return The value stored in the cell
299      */
internalAt(int row, int col)300     private short internalAt(int row, int col) {
301         // the table is a one-dimensional array, so this just does the math necessary
302         // to treat it as a two-dimensional array (we don't just use a two-dimensional
303         // array because two-dimensional arrays are inefficient in Java)
304         return table[row * numCols + col];
305     }
306 }
307