1 /*
2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /*
27  *******************************************************************************
28  * Copyright (C) 2009-2014, International Business Machines Corporation and
29  * others. All Rights Reserved.
30  *******************************************************************************
31  */
32 
33 package sun.text.normalizer;
34 
35 import java.io.IOException;
36 import java.nio.ByteBuffer;
37 
38 
39 /**
40  * @author aheninger
41  *
42  * A read-only Trie2, holding 16 bit data values.
43  *
44  * A Trie2 is a highly optimized data structure for mapping from Unicode
45  * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
46  *
47  * See class Trie2 for descriptions of the API for accessing the contents of a trie.
48  *
49  * The fundamental data access methods are declared final in this class, with
50  * the intent that applications might gain a little extra performance, when compared
51  * with calling the same methods via the abstract UTrie2 base class.
52  */
53 public final class Trie2_16 extends Trie2 {
54 
55     /**
56      *  Internal constructor, not for general use.
57      */
Trie2_16()58     Trie2_16() {
59     }
60 
61 
62     /**
63      * Create a Trie2 from its serialized form.  Inverse of utrie2_serialize().
64      * The serialized format is identical between ICU4C and ICU4J, so this function
65      * will work with serialized Trie2s from either.
66      *
67      * The serialized Trie2 in the bytes may be in either little or big endian byte order.
68      * This allows using serialized Tries from ICU4C without needing to consider the
69      * byte order of the system that created them.
70      *
71      * @param bytes a byte buffer to the serialized form of a UTrie2.
72      * @return An unserialized Trie2_16, ready for use.
73      * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
74      * @throws IOException if a read error occurs in the buffer.
75      * @throws ClassCastException if the bytes contain a serialized Trie2_32
76      */
createFromSerialized(ByteBuffer bytes)77     public static Trie2_16  createFromSerialized(ByteBuffer bytes) throws IOException {
78         return (Trie2_16) Trie2.createFromSerialized(bytes);
79     }
80 
81     /**
82      * Get the value for a code point as stored in the Trie2.
83      *
84      * @param codePoint the code point
85      * @return the value
86      */
87     @Override
get(int codePoint)88     public final int get(int codePoint) {
89         int value;
90         int ix;
91 
92         if (codePoint >= 0) {
93             if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
94                 // Ordinary BMP code point, excluding leading surrogates.
95                 // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
96                 // 16 bit data is stored in the index array itself.
97                 ix = index[codePoint >> UTRIE2_SHIFT_2];
98                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
99                 value = index[ix];
100                 return value;
101             }
102             if (codePoint <= 0xffff) {
103                 // Lead Surrogate Code Point.  A Separate index section is stored for
104                 // lead surrogate code units and code points.
105                 //   The main index has the code unit data.
106                 //   For this function, we need the code point data.
107                 // Note: this expression could be refactored for slightly improved efficiency, but
108                 //       surrogate code points will be so rare in practice that it's not worth it.
109                 ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
110                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
111                 value = index[ix];
112                 return value;
113             }
114             if (codePoint < highStart) {
115                 // Supplemental code point, use two-level lookup.
116                 ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
117                 ix = index[ix];
118                 ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
119                 ix = index[ix];
120                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
121                 value = index[ix];
122                 return value;
123             }
124             if (codePoint <= 0x10ffff) {
125                 value = index[highValueIndex];
126                 return value;
127             }
128         }
129 
130         // Fall through.  The code point is outside of the legal range of 0..0x10ffff.
131         return errorValue;
132     }
133 
134 
135     /**
136      * Get a Trie2 value for a UTF-16 code unit.
137      *
138      * This function returns the same value as get() if the input
139      * character is outside of the lead surrogate range
140      *
141      * There are two values stored in a Trie2 for inputs in the lead
142      * surrogate range.  This function returns the alternate value,
143      * while Trie2.get() returns the main value.
144      *
145      * @param codeUnit a 16 bit code unit or lead surrogate value.
146      * @return the value
147      */
148     @Override
getFromU16SingleLead(char codeUnit)149     public int getFromU16SingleLead(char codeUnit) {
150         int value;
151         int ix;
152 
153         // Because the input is a 16 bit char, we can skip the tests for it being in
154         // the BMP range.  It is.
155         ix = index[codeUnit >> UTRIE2_SHIFT_2];
156         ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
157         value = index[ix];
158         return value;
159     }
160 
161     /**
162      * @return the number of bytes of the serialized trie
163      */
getSerializedLength()164     public int getSerializedLength() {
165         return 16+(header.indexLength+dataLength)*2;
166     }
167 }
168