1 /*
2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 /**
26  *******************************************************************************
27  * Copyright (C) 1996-2014, International Business Machines Corporation and
28  * others. All Rights Reserved.
29  *******************************************************************************
30  */
31 
32 package sun.text.normalizer;
33 
34 /**
35  * <p>Standalone utility class providing UTF16 character conversions and
36  * indexing conversions.
37  * <p>Code that uses strings alone rarely needs modification.
38  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
39  * operation. Similarly, concatenation is always safe. Substringing is safe if
40  * the start and end are both on UTF-32 boundaries. In normal code, the values
41  * for start and end are on those boundaries, since they arose from operations
42  * like searching. If not, the nearest UTF-32 boundaries can be determined
43  * using <code>bounds()</code>.
44  * <strong>Examples:</strong>
45  * <p>The following examples illustrate use of some of these methods.
46  * <pre>{@code
47  * // iteration forwards: Original
48  * for (int i = 0; i < s.length(); ++i) {
49  *     char ch = s.charAt(i);
50  *     doSomethingWith(ch);
51  * }
52  *
53  * // iteration forwards: Changes for UTF-32
54  * int ch;
55  * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
56  *     ch = UTF16.charAt(s, i);
57  *     doSomethingWith(ch);
58  * }
59  *
60  * // iteration backwards: Original
61  * for (int i = s.length() - 1; i >= 0; --i) {
62  *     char ch = s.charAt(i);
63  *     doSomethingWith(ch);
64  * }
65  *
66  * // iteration backwards: Changes for UTF-32
67  * int ch;
68  * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
69  *     ch = UTF16.charAt(s, i);
70  *     doSomethingWith(ch);
71  * }
72  * }</pre>
73  * <strong>Notes:</strong>
74  * <ul>
75  *   <li>
76  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
77  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
78  *   sense of their ordering in a string. <code>offset16</code> and
79  *   <code>offset32</code> are used to distinguish offsets to UTF-16
80  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
81  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
82  *   which is a UTF-16 code unit.
83  *   </li>
84  *   <li>
85  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
86  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
87  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
88  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
89  *   </li>
90  *   <li>
91  *   <strong>Exceptions:</strong> The error checking will throw an exception
92  *   if indices are out of bounds. Other than that, all methods will
93  *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
94  *   values are present. <code>UCharacter.isLegal()</code> can be used to check
95  *   for validity if desired.
96  *   </li>
97  *   <li>
98  *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
99  *   surrogates, then these are counted as one UTF-32 value. This matches
100  *   their iteration behavior, which is vital. It also matches common display
101  *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
102  *   </li>
103  *   <li>
104  *   <strong>Optimization:</strong> The method implementations may need
105  *   optimization if the compiler doesn't fold static final methods. Since
106  *   surrogate pairs will form an exceedingly small percentage of all the text
107  *   in the world, the singleton case should always be optimized for.
108  *   </li>
109  * </ul>
110  * @author Mark Davis, with help from Markus Scherer
111  * @stable ICU 2.1
112  */
113 
public final class UTF16 {
    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * The minimum value for supplementary code points.
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    /**
     * Lead surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Trail surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Lead surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Trail surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Surrogate minimum value (same as the lead surrogate minimum).
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    /**
     * Mask that keeps every bit except the low 10; a char is a lead
     * surrogate when the masked value equals {@link #LEAD_SURROGATE_BITS}.
     */
    private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
    /**
     * Mask that keeps every bit except the low 10; a char is a trail
     * surrogate when the masked value equals {@link #TRAIL_SURROGATE_BITS}.
     */
    private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
    /**
     * Mask that keeps every bit except the low 11; a char is a surrogate
     * (lead or trail) when the masked value equals {@link #SURROGATE_BITS}.
     */
    private static final int SURROGATE_BITMASK = 0xFFFFF800;
    /**
     * High bits shared by all lead surrogates.
     */
    private static final int LEAD_SURROGATE_BITS = 0xD800;
    /**
     * High bits shared by all trail surrogates.
     */
    private static final int TRAIL_SURROGATE_BITS = 0xDC00;
    /**
     * High bits shared by all surrogates.
     */
    private static final int SURROGATE_BITS = 0xD800;

    // constructor --------------------------------------------------------

    // /CLOVER:OFF
    /**
     * Prevent instance from being created.
     */
    private UTF16() {
    }

    // /CLOVER:ON
    // public method ------------------------------------------------------

    /**
     * Extracts the code point that contains the char at {@code offset16}.
     * <p>If that char is one half of a well-formed surrogate pair (the other
     * half being the adjacent char on the appropriate side), the combined
     * supplementary code point is returned regardless of which half
     * {@code offset16} addresses. An unmatched surrogate is returned as is.
     * <p>This overload exists for callers compiled against the
     * {@code String} signature; it behaves exactly like
     * {@link #charAt(CharSequence, int)}.
     *
     * @param source string to read from
     * @param offset16 UTF-16 offset of the char of interest
     * @return the code point containing the char at {@code offset16}
     * @exception IndexOutOfBoundsException thrown if offset16 is out of
     *            bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        return charAt((CharSequence) source, offset16);
    }

    /**
     * Extracts the code point that contains the char at {@code offset16}.
     * <p>If that char is one half of a well-formed surrogate pair (the other
     * half being the adjacent char on the appropriate side), the combined
     * supplementary code point is returned regardless of which half
     * {@code offset16} addresses. An unmatched surrogate is returned as is.
     *
     * @param source sequence of UTF-16 chars
     * @param offset16 UTF-16 offset of the char of interest
     * @return the code point containing the char at {@code offset16}
     * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
     * @stable ICU 2.1
     */
    public static int charAt(CharSequence source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete
        // code point on its own.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return surrogateCharAt(source, offset16, single);
    }

    /**
     * Slow path of {@link #charAt(CharSequence, int)} for chars at or above
     * the surrogate range. Looks at the neighbouring char in the direction
     * implied by the kind of surrogate found.
     *
     * @param source sequence of UTF-16 chars
     * @param offset16 valid index of {@code single} within {@code source}
     * @param single the char already read from {@code offset16}
     * @return the combined code point, or {@code single} when it is not
     *         part of a well-formed pair
     */
    private static int surrogateCharAt(CharSequence source, int offset16, char single) {
        if (!isSurrogate(single)) {
            return single;
        }

        if (isLeadSurrogate(single)) {
            // A lead surrogate pairs with the char that follows it.
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 > 0) {
            // A trail surrogate pairs with the char that precedes it.
            char lead = source.charAt(offset16 - 1);
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // unmatched surrogate
    }

    /**
     * Extracts the code point that contains the char at {@code offset16}
     * within the sub-array {@code [start, limit)}.
     * <p>Neighbouring chars outside {@code [start, limit)} are never
     * consulted, so a surrogate whose partner lies outside the sub-array is
     * returned as an unmatched surrogate.
     *
     * @param source array of UTF-16 chars
     * @param start index of the first char of the sub-array (inclusive)
     * @param limit index just past the last char of the sub-array (exclusive)
     * @param offset16 UTF-16 offset relative to start
     * @return the code point containing the char at {@code start + offset16}
     * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit, int offset16) {
        int index = start + offset16;
        if (index < start || index >= limit) {
            throw new ArrayIndexOutOfBoundsException(index);
        }

        char single = source[index];
        if (!isSurrogate(single)) {
            return single;
        }

        if (isLeadSurrogate(single)) {
            // A lead surrogate pairs with the following char, if in range.
            int next = index + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (index > start) {
            // A trail surrogate pairs with the preceding char, if in range.
            char lead = source[index - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * No validity check is performed on {@code char32}.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true if the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true if the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true if the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
    }

    /**
     * Returns the lead surrogate.
     * No validity check is performed on {@code char32}.
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(ch) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (LEAD_SURROGATE_OFFSET_
                           + (char32 >> LEAD_SURROGATE_SHIFT_));
        }
        return 0;
    }

    /**
     * Returns the trail surrogate.
     * No validity check is performed on {@code char32}.
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
     *         the character itself
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (TRAIL_SURROGATE_MIN_VALUE
                           + (char32 & TRAIL_SURROGATE_MASK_));
        }
        return (char) char32;
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 is an invalid
     *            codepoint.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32) {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer
     * @exception IllegalArgumentException thrown when char32 does not lie
     *            within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * <p>A lead surrogate immediately followed by a trail surrogate (both
     * inside the subarray) counts as one code point; every other char,
     * including an unmatched surrogate, counts as one code point by itself.
     *
     * @param source char array
     * @param start index of the first char of the subarray (inclusive)
     * @param limit index just past the last char of the subarray (exclusive)
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift; negative values move
     *                towards {@code start}
     * @return new shifted offset16 relative to start
     * @exception IndexOutOfBoundsException if the new offset16 is out of
     *            bounds with respect to the subarray or the subarray bounds
     *            are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char source[], int start, int limit,
                                          int offset16, int shift32) {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Compare against (limit - start) instead of forming start + offset16
        // first: the sum can wrap around for large offsets and would then
        // slip past the range check.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int result = start + offset16; // safe: 0 <= result <= limit <= size
        int count;
        if (shift32 > 0) {
            // Every code point occupies at least one char, so a shift larger
            // than the remaining chars can never succeed. Written as a
            // subtraction so that a huge shift32 cannot overflow.
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                char ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    result++; // step over the trail half of the pair too
                }
                count--;
                result++;
            }
        } else {
            // shift32 <= 0 and result >= 0, so this sum cannot overflow.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    break; // ran off the front; count stays non-zero
                }
                char ch = source[result];
                if (isTrailSurrogate(ch) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    result--; // step back over the lead half of the pair too
                }
            }
        }
        // A non-zero remainder means the subarray was exhausted before the
        // requested number of code points had been traversed.
        if (count != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Offset added to {@code char32 >> LEAD_SURROGATE_SHIFT_} to obtain the
     * lead surrogate of a supplementary code point.
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if an invalid codepoint is passed as
     * argument.
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        return new String(new char[] {getLeadSurrogate(ch), getTrailSurrogate(ch)});
    }
}
617