1 /*
2  * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /**
27 *******************************************************************************
28 * Copyright (C) 1996-2014, International Business Machines Corporation and
29 * others. All Rights Reserved.
30 *******************************************************************************
31 */
32 
33 package jdk.internal.icu.lang;
34 
35 import jdk.internal.icu.impl.UBiDiProps;
36 import jdk.internal.icu.impl.UCharacterProperty;
37 import jdk.internal.icu.text.Normalizer2;
38 import jdk.internal.icu.text.UTF16;
39 import jdk.internal.icu.util.VersionInfo;
40 
41 /**
42  * <p>The UCharacter class provides extensions to the
43  * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
44  * java.lang.Character</a> class. These extensions provide support for
45  * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
46  * class, provide support for supplementary characters (those with code
47  * points above U+FFFF).
48  * Each ICU release supports the latest version of Unicode available at that time.
49  *
50  * <p>Code points are represented in these API using ints. While it would be
51  * more convenient in Java to have a separate primitive datatype for them,
52  * ints suffice in the meantime.
53  *
54  * <p>To use this class please add the jar file name icu4j.jar to the
55  * class path, since it contains data files which supply the information used
56  * by this file.<br>
57  * E.g. In Windows <br>
58  * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
59  * Otherwise, another method would be to copy the files uprops.dat and
60  * unames.icu from the icu4j source subdirectory
61  * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
62  * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
63  *
64  * <p>Aside from the additions for UTF-16 support, and the updated Unicode
65  * properties, the main differences between UCharacter and Character are:
66  * <ul>
67  * <li> UCharacter is not designed to be a char wrapper and does not have
68  *      APIs that involve management of that single char.<br>
69  *      These include:
70  *      <ul>
71  *        <li> char charValue(),
72  *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
73  *      </ul>
74  * <li> UCharacter does not include Character APIs that are deprecated, nor
75  *      does it include the Java-specific character information, such as
76  *      boolean isJavaIdentifierPart(char ch).
77  * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
78  *      values '10' - '35'. UCharacter also does this in digit and
79  *      getNumericValue, to adhere to the java semantics of these
80  *      methods.  New methods unicodeDigit, and
81  *      getUnicodeNumericValue do not treat the above code points
82  *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
83  * </ul>
84  * <p>
85  * Further detail on differences can be determined using the program
86  *        <a href=
87  * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
88  *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
89  * </p>
90  * <p>
91  * In addition to Java compatibility functions, which calculate derived properties,
92  * this API provides low-level access to the Unicode Character Database.
93  * </p>
94  * <p>
95  * Unicode assigns each code point (not just each assigned character) values for
96  * many properties.
97  * Most of them are simple boolean flags, or constants from a small enumerated list.
98  * For some properties, values are strings or other relatively more complex types.
99  * </p>
100  * <p>
101  * For more information see
102  * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
103  * (http://www.unicode.org/ucd/)
104  * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
105  * User Guide chapter on Properties</a>
106  * (http://www.icu-project.org/userguide/properties.html).
107  * </p>
108  * <p>
109  * There are also functions that provide easy migration from C/POSIX functions
110  * like isblank(). Their use is generally discouraged because the C/POSIX
111  * standards do not define their semantics beyond the ASCII range, which means
112  * that different implementations exhibit very different behavior.
113  * Instead, Unicode properties should be used directly.
114  * </p>
115  * <p>
116  * There are also only a few, broad C/POSIX character classes, and they tend
117  * to be used for conflicting purposes. For example, the "isalpha()" class
118  * is sometimes used to determine word boundaries, while a more sophisticated
119  * approach would at least distinguish initial letters from continuation
120  * characters (the latter including combining marks).
121  * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
122  * Another example: There is no "istitle()" class for titlecase characters.
123  * </p>
124  * <p>
125  * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
126  * ICU implements them according to the Standard Recommendations in
127  * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
128  * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
129  * </p>
130  * <p>
131  * API access for C/POSIX character classes is as follows:
132  * <pre>{@code
133  * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
134  * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
135  * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
136  * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
137  *               (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
138  *               (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
139  * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
140  * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
141  * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
142  * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
143  * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
144  * - cntrl:     getType(c)==CONTROL
145  * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
146  * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
147  * }</pre>
148  * </p>
149  * <p>
150  * The C/POSIX character classes are also available in UnicodeSet patterns,
151  * using patterns like [:graph:] or \p{graph}.
152  * </p>
153  *
154  * There are several ICU (and Java) whitespace functions.
155  * Comparison:<ul>
156  * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
157  *       most of general categories "Z" (separators) + most whitespace ISO controls
158  *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
159  * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
160  * <li> isSpaceChar: just Z (including no-break spaces)</ul>
161  * </p>
162  * <p>
163  * This class is not subclassable.
164  * </p>
165  * @author Syn Wee Quek
166  * @stable ICU 2.1
167  * @see com.ibm.icu.lang.UCharacterEnums
168  */
169 
170 public final class UCharacter
171 {
172 
173     /**
174      * Joining Group constants.
175      * @see UProperty#JOINING_GROUP
176      * @stable ICU 2.4
177      */
178     public static interface JoiningGroup
179     {
180         /**
181          * @stable ICU 2.4
182          */
183         public static final int NO_JOINING_GROUP = 0;
184     }
185 
186     /**
187      * Numeric Type constants.
188      * @see UProperty#NUMERIC_TYPE
189      * @stable ICU 2.4
190      */
191     public static interface NumericType
192     {
193         /**
194          * @stable ICU 2.4
195          */
196         public static final int NONE = 0;
197         /**
198          * @stable ICU 2.4
199          */
200         public static final int DECIMAL = 1;
201         /**
202          * @stable ICU 2.4
203          */
204         public static final int DIGIT = 2;
205         /**
206          * @stable ICU 2.4
207          */
208         public static final int NUMERIC = 3;
209         /**
210          * @stable ICU 2.4
211          */
212         public static final int COUNT = 4;
213     }
214 
215     /**
216      * Hangul Syllable Type constants.
217      *
218      * @see UProperty#HANGUL_SYLLABLE_TYPE
219      * @stable ICU 2.6
220      */
221     public static interface HangulSyllableType
222     {
223         /**
224          * @stable ICU 2.6
225          */
226         public static final int NOT_APPLICABLE      = 0;   /*[NA]*/ /*See note !!*/
227         /**
228          * @stable ICU 2.6
229          */
230         public static final int LEADING_JAMO        = 1;   /*[L]*/
231         /**
232          * @stable ICU 2.6
233          */
234         public static final int VOWEL_JAMO          = 2;   /*[V]*/
235         /**
236          * @stable ICU 2.6
237          */
238         public static final int TRAILING_JAMO       = 3;   /*[T]*/
239         /**
240          * @stable ICU 2.6
241          */
242         public static final int LV_SYLLABLE         = 4;   /*[LV]*/
243         /**
244          * @stable ICU 2.6
245          */
246         public static final int LVT_SYLLABLE        = 5;   /*[LVT]*/
247         /**
248          * @stable ICU 2.6
249          */
250         public static final int COUNT               = 6;
251     }
252 
253     // public data members -----------------------------------------------
254 
255     /**
256      * The lowest Unicode code point value.
257      * @stable ICU 2.1
258      */
259     public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
260 
261     /**
262      * The highest Unicode code point value (scalar value) according to the
263      * Unicode Standard.
264      * This is a 21-bit value (21 bits, rounded up).<br>
265      * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
266      * @stable ICU 2.1
267      */
268     public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
269 
270     // public methods ----------------------------------------------------
271 
272     /**
273      * Returns the numeric value of a decimal digit code point.
274      * <br>This method observes the semantics of
275      * <code>java.lang.Character.digit()</code>.  Note that this
276      * will return positive values for code points for which isDigit
277      * returns false, just like java.lang.Character.
278      * <br><em>Semantic Change:</em> In release 1.3.1 and
279      * prior, this did not treat the European letters as having a
280      * digit value, and also treated numeric letters and other numbers as
281      * digits.
282      * This has been changed to conform to the java semantics.
283      * <br>A code point is a valid digit if and only if:
284      * <ul>
285      *   <li>ch is a decimal digit or one of the european letters, and
286      *   <li>the value of ch is less than the specified radix.
287      * </ul>
288      * @param ch the code point to query
289      * @param radix the radix
290      * @return the numeric value represented by the code point in the
291      * specified radix, or -1 if the code point is not a decimal digit
292      * or if its value is too large for the radix
293      * @stable ICU 2.1
294      */
digit(int ch, int radix)295     public static int digit(int ch, int radix)
296     {
297         if (2 <= radix && radix <= 36) {
298             int value = digit(ch);
299             if (value < 0) {
300                 // ch is not a decimal digit, try latin letters
301                 value = UCharacterProperty.getEuropeanDigit(ch);
302             }
303             return (value < radix) ? value : -1;
304         } else {
305             return -1;  // invalid radix
306         }
307     }
308 
309     /**
310      * Returns the numeric value of a decimal digit code point.
311      * <br>This is a convenience overload of <code>digit(int, int)</code>
312      * that provides a decimal radix.
313      * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
314      * treated numeric letters and other numbers as digits.  This has
315      * been changed to conform to the java semantics.
316      * @param ch the code point to query
317      * @return the numeric value represented by the code point,
318      * or -1 if the code point is not a decimal digit or if its
319      * value is too large for a decimal radix
320      * @stable ICU 2.1
321      */
digit(int ch)322     public static int digit(int ch)
323     {
324         return UCharacterProperty.INSTANCE.digit(ch);
325     }
326 
327     /**
328      * Returns a value indicating a code point's Unicode category.
329      * Up-to-date Unicode implementation of java.lang.Character.getType()
330      * except for the above mentioned code points that had their category
331      * changed.<br>
332      * Return results are constants from the interface
333      * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
334      * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
335      * those returned by java.lang.Character.getType.  UCharacterCategory values
336      * match the ones used in ICU4C, while java.lang.Character type
337      * values, though similar, skip the value 17.</p>
338      * @param ch code point whose type is to be determined
339      * @return category which is a value of UCharacterCategory
340      * @stable ICU 2.1
341      */
getType(int ch)342     public static int getType(int ch)
343     {
344         return UCharacterProperty.INSTANCE.getType(ch);
345     }
346 
347     /**
348      * Returns the Bidirection property of a code point.
349      * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
350      * property.<br>
351      * Result returned belongs to the interface
352      * <a href=UCharacterDirection.html>UCharacterDirection</a>
353      * @param ch the code point to be determined its direction
354      * @return direction constant from UCharacterDirection.
355      * @stable ICU 2.1
356      */
getDirection(int ch)357     public static int getDirection(int ch)
358     {
359         return UBiDiProps.INSTANCE.getClass(ch);
360     }
361 
362     /**
363      * Maps the specified code point to a "mirror-image" code point.
364      * For code points with the "mirrored" property, implementations sometimes
365      * need a "poor man's" mapping to another code point such that the default
366      * glyph may serve as the mirror-image of the default glyph of the
367      * specified code point.<br>
368      * This is useful for text conversion to and from codepages with visual
369      * order, and for displays without glyph selection capabilities.
370      * @param ch code point whose mirror is to be retrieved
371      * @return another code point that may serve as a mirror-image substitute,
372      *         or ch itself if there is no such mapping or ch does not have the
373      *         "mirrored" property
374      * @stable ICU 2.1
375      */
getMirror(int ch)376     public static int getMirror(int ch)
377     {
378         return UBiDiProps.INSTANCE.getMirror(ch);
379     }
380 
381     /**
382      * Maps the specified character to its paired bracket character.
383      * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
384      * Otherwise c itself is returned.
385      * See http://www.unicode.org/reports/tr9/
386      *
387      * @param c the code point to be mapped
388      * @return the paired bracket code point,
389      *         or c itself if there is no such mapping
390      *         (Bidi_Paired_Bracket_Type=None)
391      *
392      * @see UProperty#BIDI_PAIRED_BRACKET
393      * @see UProperty#BIDI_PAIRED_BRACKET_TYPE
394      * @see #getMirror(int)
395      * @stable ICU 52
396      */
getBidiPairedBracket(int c)397     public static int getBidiPairedBracket(int c) {
398         return UBiDiProps.INSTANCE.getPairedBracket(c);
399     }
400 
401     /**
402      * Returns the combining class of the argument codepoint
403      * @param ch code point whose combining is to be retrieved
404      * @return the combining class of the codepoint
405      * @stable ICU 2.1
406      */
getCombiningClass(int ch)407     public static int getCombiningClass(int ch)
408     {
409         return Normalizer2.getNFDInstance().getCombiningClass(ch);
410     }
411 
412     /**
413      * Returns the version of Unicode data used.
414      * @return the unicode version number used
415      * @stable ICU 2.1
416      */
getUnicodeVersion()417     public static VersionInfo getUnicodeVersion()
418     {
419         return UCharacterProperty.INSTANCE.m_unicodeVersion_;
420     }
421 
422     /**
423      * Returns a code point corresponding to the two UTF16 characters.
424      * @param lead the lead char
425      * @param trail the trail char
426      * @return code point if surrogate characters are valid.
427      * @exception IllegalArgumentException thrown when argument characters do
428      *            not form a valid codepoint
429      * @stable ICU 2.1
430      */
getCodePoint(char lead, char trail)431     public static int getCodePoint(char lead, char trail)
432     {
433         if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
434             return UCharacterProperty.getRawSupplementary(lead, trail);
435         }
436         throw new IllegalArgumentException("Illegal surrogate characters");
437     }
438 
439     /**
440      * Returns the "age" of the code point.</p>
441      * <p>The "age" is the Unicode version when the code point was first
442      * designated (as a non-character or for Private Use) or assigned a
443      * character.
444      * <p>This can be useful to avoid emitting code points to receiving
445      * processes that do not accept newer characters.</p>
446      * <p>The data is from the UCD file DerivedAge.txt.</p>
447      * @param ch The code point.
448      * @return the Unicode version number
449      * @stable ICU 2.6
450      */
getAge(int ch)451     public static VersionInfo getAge(int ch)
452     {
453         if (ch < MIN_VALUE || ch > MAX_VALUE) {
454             throw new IllegalArgumentException("Codepoint out of bounds");
455         }
456         return UCharacterProperty.INSTANCE.getAge(ch);
457     }
458 
459     /**
460      * Returns the property value for an Unicode property type of a code point.
461      * Also returns binary and mask property values.</p>
462      * <p>Unicode, especially in version 3.2, defines many more properties than
463      * the original set in UnicodeData.txt.</p>
464      * <p>The properties APIs are intended to reflect Unicode properties as
465      * defined in the Unicode Character Database (UCD) and Unicode Technical
466      * Reports (UTR). For details about the properties see
467      * http://www.unicode.org/.</p>
468      * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
469      * </p>
470      * <pre>
471      * Sample usage:
472      * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
473      * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
474      * boolean b = (ideo == 1) ? true : false;
475      * </pre>
476      * @param ch code point to test.
477      * @param type UProperty selector constant, identifies which binary
478      *        property to check. Must be
479      *        UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
480      *        UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
481      *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
482      * @return numeric value that is directly the property value or,
483      *         for enumerated properties, corresponds to the numeric value of
484      *         the enumerated constant of the respective property value
485      *         enumeration type (cast to enum type if necessary).
486      *         Returns 0 or 1 (for false / true) for binary Unicode properties.
487      *         Returns a bit-mask for mask properties.
488      *         Returns 0 if 'type' is out of bounds or if the Unicode version
489      *         does not have data for the property at all, or not for this code
490      *         point.
491      * @see UProperty
492      * @see #hasBinaryProperty
493      * @see #getIntPropertyMinValue
494      * @see #getIntPropertyMaxValue
495      * @see #getUnicodeVersion
496      * @stable ICU 2.4
497      */
498      // for BiDiBase.java
getIntPropertyValue(int ch, int type)499     public static int getIntPropertyValue(int ch, int type) {
500         return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
501     }
502 
503     // private constructor -----------------------------------------------
504 
505     /**
506      * Private constructor to prevent instantiation
507      */
UCharacter()508     private UCharacter() { }
509 
510       /*
511        * Copied from UCharacterEnums.java
512        */
513 
514         /**
515          * Character type Mn
516          * @stable ICU 2.1
517          */
518         public static final byte NON_SPACING_MARK        = 6;
519         /**
520          * Character type Me
521          * @stable ICU 2.1
522          */
523         public static final byte ENCLOSING_MARK          = 7;
524         /**
525          * Character type Mc
526          * @stable ICU 2.1
527          */
528         public static final byte COMBINING_SPACING_MARK  = 8;
529         /**
530          * Character type count
531          * @stable ICU 2.1
532          */
533         public static final byte CHAR_CATEGORY_COUNT     = 30;
534 
535         /**
536          * Directional type R
537          * @stable ICU 2.1
538          */
539         public static final int RIGHT_TO_LEFT              = 1;
540         /**
541          * Directional type AL
542          * @stable ICU 2.1
543          */
544         public static final int RIGHT_TO_LEFT_ARABIC       = 13;
545 }
546