1 /*
2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 /*
26  *******************************************************************************
27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
28  *                                                                             *
29  * The original version of this source code and documentation is copyrighted   *
30  * and owned by IBM, These materials are provided under terms of a License     *
31  * Agreement between IBM and Sun. This technology is protected by multiple     *
32  * US and International patents. This notice and attribution to IBM may not    *
33  * to removed.                                                                 *
34  *******************************************************************************
35  */
36 
37 package sun.text.normalizer;
38 
39 import java.io.BufferedInputStream;
40 import java.io.InputStream;
41 import java.io.IOException;
42 import java.util.MissingResourceException;
43 
/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, an array of char is
 * used to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href="UCharacter.html">UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, February 1st 2002
 */
58 
59 public final class UCharacterProperty
60 {
61     // public data members -----------------------------------------------
62 
63     /**
64     * Trie data
65     */
66     public CharTrie m_trie_;
67     /**
68      * Optimization
69      * CharTrie index array
70      */
71     public char[] m_trieIndex_;
72     /**
73      * Optimization
74      * CharTrie data array
75      */
76     public char[] m_trieData_;
77     /**
78      * Optimization
79      * CharTrie data offset
80      */
81     public int m_trieInitialValue_;
82     /**
83     * Unicode version
84     */
85     public VersionInfo m_unicodeVersion_;
86 
87     // uprops.h enum UPropertySource --------------------------------------- ***
88 
89     /** From uchar.c/uprops.icu properties vectors trie */
90     public static final int SRC_PROPSVEC=2;
91     /** One more than the highest UPropertySource (SRC_) constant. */
92     public static final int SRC_COUNT=9;
93 
94     // public methods ----------------------------------------------------
95 
96     /**
97      * Java friends implementation
98      */
setIndexData(CharTrie.FriendAgent friendagent)99     public void setIndexData(CharTrie.FriendAgent friendagent)
100     {
101         m_trieIndex_ = friendagent.getPrivateIndex();
102         m_trieData_ = friendagent.getPrivateData();
103         m_trieInitialValue_ = friendagent.getPrivateInitialValue();
104     }
105 
106     /**
107     * Gets the property value at the index.
108     * This is optimized.
109     * Note this is alittle different from CharTrie the index m_trieData_
110     * is never negative.
111     * @param ch code point whose property value is to be retrieved
112     * @return property value of code point
113     */
getProperty(int ch)114     public final int getProperty(int ch)
115     {
116         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
117             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
118                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
119             // BMP codepoint 0000..D7FF or DC00..FFFF
120             // optimized
121             try { // using try for ch < 0 is faster than using an if statement
122                 return m_trieData_[
123                     (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
124                           << Trie.INDEX_STAGE_2_SHIFT_)
125                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
126             } catch (ArrayIndexOutOfBoundsException e) {
127                 return m_trieInitialValue_;
128             }
129         }
130         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
131             // lead surrogate D800..DBFF
132             return m_trieData_[
133                     (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
134                                   + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
135                           << Trie.INDEX_STAGE_2_SHIFT_)
136                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
137         }
138         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
139             // supplementary code point 10000..10FFFF
140             // look at the construction of supplementary characters
141             // trail forms the ends of it.
142             return m_trie_.getSurrogateValue(
143                                           UTF16.getLeadSurrogate(ch),
144                                           (char)(ch & Trie.SURROGATE_MASK_));
145         }
146         // ch is out of bounds
147         // return m_dataOffset_ if there is an error, in this case we return
148         // the default value: m_initialValue_
149         // we cannot assume that m_initialValue_ is at offset 0
150         // this is for optimization.
151         return m_trieInitialValue_;
152 
153         // this all is an inlined form of return m_trie_.getCodePointValue(ch);
154     }
155 
156     /**
157     * Getting the unsigned numeric value of a character embedded in the property
158     * argument
159     * @param prop the character
160     * @return unsigned numberic value
161     */
getUnsignedValue(int prop)162     public static int getUnsignedValue(int prop)
163     {
164         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
165     }
166 
167     /**
168      * Gets the unicode additional properties.
169      * C version getUnicodeProperties.
170      * @param codepoint codepoint whose additional properties is to be
171      *                  retrieved
172      * @param column
173      * @return unicode properties
174      */
getAdditional(int codepoint, int column)175        public int getAdditional(int codepoint, int column) {
176         if (column == -1) {
177             return getProperty(codepoint);
178         }
179            if (column < 0 || column >= m_additionalColumnsCount_) {
180            return 0;
181        }
182        return m_additionalVectors_[
183                      m_additionalTrie_.getCodePointValue(codepoint) + column];
184        }
185 
186        /**
187      * <p>Get the "age" of the code point.</p>
188      * <p>The "age" is the Unicode version when the code point was first
189      * designated (as a non-character or for Private Use) or assigned a
190      * character.</p>
191      * <p>This can be useful to avoid emitting code points to receiving
192      * processes that do not accept newer characters.</p>
193      * <p>The data is from the UCD file DerivedAge.txt.</p>
194      * <p>This API does not check the validity of the codepoint.</p>
195      * @param codepoint The code point.
196      * @return the Unicode version number
197      */
getAge(int codepoint)198     public VersionInfo getAge(int codepoint)
199     {
200         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
201         return VersionInfo.getInstance(
202                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
203                            version & LAST_NIBBLE_MASK_, 0, 0);
204     }
205 
206     /**
207     * Forms a supplementary code point from the argument character<br>
208     * Note this is for internal use hence no checks for the validity of the
209     * surrogate characters are done
210     * @param lead lead surrogate character
211     * @param trail trailing surrogate character
212     * @return code point of the supplementary character
213     */
getRawSupplementary(char lead, char trail)214     public static int getRawSupplementary(char lead, char trail)
215     {
216         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
217     }
218 
219     /**
220     * Loads the property data and initialize the UCharacterProperty instance.
221     * @throws MissingResourceException when data is missing or data has been corrupted
222     */
getInstance()223     public static UCharacterProperty getInstance()
224     {
225         if(INSTANCE_ == null) {
226             try {
227                 INSTANCE_ = new UCharacterProperty();
228             }
229             catch (Exception e) {
230                 throw new MissingResourceException(e.getMessage(),"","");
231             }
232         }
233         return INSTANCE_;
234     }
235 
236     /**
237      * Checks if the argument c is to be treated as a white space in ICU
238      * rules. Usually ICU rule white spaces are ignored unless quoted.
239      * Equivalent to test for Pattern_White_Space Unicode property.
240      * Stable set of characters, won't change.
241      * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
242      * @param c codepoint to check
243      * @return true if c is a ICU white space
244      */
isRuleWhiteSpace(int c)245     public static boolean isRuleWhiteSpace(int c)
246     {
247         /* "white space" in the sense of ICU rule parsers
248            This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
249            See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
250            U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
251            Equivalent to test for Pattern_White_Space Unicode property.
252         */
253         return (c >= 0x0009 && c <= 0x2029 &&
254                 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
255                  c == 0x200E || c == 0x200F || c >= 0x2028));
256     }
257 
258     // protected variables -----------------------------------------------
259 
260     /**
261      * Extra property trie
262      */
263     CharTrie m_additionalTrie_;
264     /**
265      * Extra property vectors, 1st column for age and second for binary
266      * properties.
267      */
268     int m_additionalVectors_[];
269     /**
270      * Number of additional columns
271      */
272     int m_additionalColumnsCount_;
273     /**
274      * Maximum values for block, bits used as in vector word
275      * 0
276      */
277     int m_maxBlockScriptValue_;
278     /**
279      * Maximum values for script, bits used as in vector word
280      * 0
281      */
282      int m_maxJTGValue_;
283 
284     // private variables -------------------------------------------------
285 
286       /**
287      * UnicodeData.txt property object
288      */
289     private static UCharacterProperty INSTANCE_ = null;
290 
291     /**
292     * Default name of the datafile
293     */
294     private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
295 
296     /**
297     * Default buffer size of datafile
298     */
299     private static final int DATA_BUFFER_SIZE_ = 25000;
300 
301     /**
302     * Numeric value shift
303     */
304     private static final int VALUE_SHIFT_ = 8;
305 
306     /**
307     * Mask to be applied after shifting to obtain an unsigned numeric value
308     */
309     private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
310 
311     /**
312     * Shift value for lead surrogate to form a supplementary character.
313     */
314     private static final int LEAD_SURROGATE_SHIFT_ = 10;
315     /**
316     * Offset to add to combined surrogate pair to avoid msking.
317     */
318     private static final int SURROGATE_OFFSET_ =
319                            UTF16.SUPPLEMENTARY_MIN_VALUE -
320                            (UTF16.SURROGATE_MIN_VALUE <<
321                            LEAD_SURROGATE_SHIFT_) -
322                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
323 
324     // additional properties ----------------------------------------------
325 
326     /**
327      * First nibble shift
328      */
329     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
330     /**
331      * Second nibble mask
332      */
333     private static final int LAST_NIBBLE_MASK_ = 0xF;
334     /**
335      * Age value shift
336      */
337     private static final int AGE_SHIFT_ = 24;
338 
339     // private constructors --------------------------------------------------
340 
341     /**
342     * Constructor
343     * @exception IOException thrown when data reading fails or data corrupted
344     */
UCharacterProperty()345     private UCharacterProperty() throws IOException
346     {
347         // jar access
348         InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
349         BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
350         UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
351         reader.read(this);
352         b.close();
353 
354         m_trie_.putIndexData(this);
355     }
356 
upropsvec_addPropertyStarts(UnicodeSet set)357     public void upropsvec_addPropertyStarts(UnicodeSet set) {
358         /* add the start code point of each same-value range of the properties vectors trie */
359         if(m_additionalColumnsCount_>0) {
360             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
361             TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
362             RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
363             while(propsVectorsIter.next(propsVectorsResult)){
364                 set.add(propsVectorsResult.start);
365             }
366         }
367     }
368 
369 }
370