1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
2    Copyright (C) 1998, 1999, 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package java.lang;
40 
41 import gnu.java.lang.CharData;
42 
43 import java.io.Serializable;
44 import java.text.Collator;
45 import java.util.Locale;
46 
47 /**
48  * Wrapper class for the primitive char data type.  In addition, this class
49  * allows one to retrieve property information and perform transformations
50  * on the defined characters in the Unicode Standard, Version 4.0.0.
51  * java.lang.Character is designed to be very dynamic, and as such, it
52  * retrieves information on the Unicode character set from a separate
53  * database, gnu.java.lang.CharData, which can be easily upgraded.
54  *
55  * <p>For predicates, boundaries are used to describe
56  * the set of characters for which the method will return true.
57  * This syntax uses fairly normal regular expression notation.
58  * See 5.13 of the Unicode Standard, Version 4.0, for the
59  * boundary specification.
60  *
61  * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
62  * for more information on the Unicode Standard.
63  *
64  * @author Tom Tromey (tromey@cygnus.com)
65  * @author Paul N. Fisher
66  * @author Jochen Hoenicke
67  * @author Eric Blake (ebb9@email.byu.edu)
68  * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
69  * @see CharData
70  * @since 1.0
71  * @status partly updated to 1.5; some things still missing
72  */
73 public final class Character implements Serializable, Comparable<Character>
74 {
75   /**
76    * A subset of Unicode blocks.
77    *
78    * @author Paul N. Fisher
79    * @author Eric Blake (ebb9@email.byu.edu)
80    * @since 1.2
81    */
82   public static class Subset
83   {
84     /** The name of the subset. */
85     private final String name;
86 
87     /**
88      * Construct a new subset of characters.
89      *
90      * @param name the name of the subset
91      * @throws NullPointerException if name is null
92      */
Subset(String name)93     protected Subset(String name)
94     {
95       // Note that name.toString() is name, unless name was null.
96       this.name = name.toString();
97     }
98 
99     /**
100      * Compares two Subsets for equality. This is <code>final</code>, and
101      * restricts the comparison on the <code>==</code> operator, so it returns
102      * true only for the same object.
103      *
104      * @param o the object to compare
105      * @return true if o is this
106      */
equals(Object o)107     public final boolean equals(Object o)
108     {
109       return o == this;
110     }
111 
112     /**
113      * Makes the original hashCode of Object final, to be consistent with
114      * equals.
115      *
116      * @return the hash code for this object
117      */
hashCode()118     public final int hashCode()
119     {
120       return super.hashCode();
121     }
122 
123     /**
124      * Returns the name of the subset.
125      *
126      * @return the name
127      */
toString()128     public final String toString()
129     {
130       return name;
131     }
132   } // class Subset
133 
134   /**
135    * A family of character subsets in the Unicode specification. A character
136    * is in at most one of these blocks.
137    *
138    * This inner class was generated automatically from
139    * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts.
140    * This Unicode definition file can be found on the
141    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
142    * JDK 1.5 uses Unicode version 4.0.0.
143    *
144    * @author scripts/unicode-blocks.pl (written by Eric Blake)
145    * @since 1.2
146    */
147   public static final class UnicodeBlock extends Subset
148   {
149     /**
     * The first code point of the block.  The bound is inclusive:
     * <code>of(int)</code> only rejects values strictly below it.
     */
150     private final int start;
151 
152     /**
     * The last code point of the block.  The bound is inclusive:
     * <code>of(int)</code> only rejects values strictly above it.
     */
153     private final int end;
154 
155     /** The canonical name of the block according to the Unicode standard. */
156     private final String canonicalName;
157 
158     /**
     * The spelling of a name passed to <code>forName()</code>:
     * <code>CANONICAL</code> is chosen when the name contains a space,
     * <code>CONSTANT</code> when it contains an underscore but no space,
     * and <code>NO_SPACES</code> otherwise.
     */
159     private enum NameType { CANONICAL, NO_SPACES, CONSTANT; }
160 
161     /**
162      * Constructor for strictly defined blocks.
163      *
164      * @param start the start character of the range
165      * @param end the end character of the range
166      * @param name the block name
167      * @param canonicalName the name of the block as defined in the Unicode
168      *        standard.
169      */
UnicodeBlock(int start, int end, String name, String canonicalName)170     private UnicodeBlock(int start, int end, String name,
171                          String canonicalName)
172     {
173       super(name);
174       this.start = start;
175       this.end = end;
176       this.canonicalName = canonicalName;
177     }
178 
179     /**
180      * Returns the Unicode character block which a character belongs to.
181      * <strong>Note</strong>: This method does not support the use of
182      * supplementary characters.  For such support, <code>of(int)</code>
183      * should be used instead.
184      *
185      * @param ch the character to look up
186      * @return the set it belongs to, or null if it is not in one
187      */
of(char ch)188     public static UnicodeBlock of(char ch)
189     {
190       return of((int) ch);
191     }
192 
193     /**
194      * Returns the Unicode character block which a code point belongs to.
195      *
196      * @param codePoint the character to look up
197      * @return the set it belongs to, or null if it is not in one.
198      * @throws IllegalArgumentException if the specified code point is
199      *         invalid.
200      * @since 1.5
201      */
of(int codePoint)202     public static UnicodeBlock of(int codePoint)
203     {
204       if (codePoint > MAX_CODE_POINT)
205         throw new IllegalArgumentException("The supplied integer value is " +
206                                            "too large to be a codepoint.");
207       // Simple binary search for the correct block.
208       int low = 0;
209       int hi = sets.length - 1;
210       while (low <= hi)
211         {
212           int mid = (low + hi) >> 1;
213           UnicodeBlock b = sets[mid];
214           if (codePoint < b.start)
215             hi = mid - 1;
216           else if (codePoint > b.end)
217             low = mid + 1;
218           else
219             return b;
220         }
221       return null;
222     }
223 
224     /**
225      * <p>
226      * Returns the <code>UnicodeBlock</code> with the given name, as defined
227      * by the Unicode standard.  The version of Unicode in use is defined by
228      * the <code>Character</code> class, and the names are given in the
229      * <code>Blocks-<version>.txt</code> file corresponding to that version.
230      * The name may be specified in one of three ways:
231      * </p>
232      * <ol>
233      * <li>The canonical, human-readable name used by the Unicode standard.
234      * This is the name with all spaces and hyphens retained.  For example,
235      * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
236      * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
237      * <li>The name used for the constants specified by this class, which
238      * is the canonical name with all spaces and hyphens replaced with
239      * underscores e.g. `BASIC_LATIN'</li>
240      * </ol>
241      * <p>
242      * The names are compared case-insensitively using the case comparison
243      * associated with the U.S. English locale.  The method recognises the
244      * previous names used for blocks as well as the current ones.  At
245      * present, this simply means that the deprecated `SURROGATES_AREA'
246      * will be recognised by this method (the <code>of()</code> methods
247      * only return one of the three new surrogate blocks).
248      * </p>
249      *
250      * @param blockName the name of the block to look up.
251      * @return the specified block.
252      * @throws NullPointerException if the <code>blockName</code> is
253      *         <code>null</code>.
254      * @throws IllegalArgumentException if the name does not match any Unicode
255      *         block.
256      * @since 1.5
257      */
forName(String blockName)258     public static final UnicodeBlock forName(String blockName)
259     {
260       NameType type;
261       if (blockName.indexOf(' ') != -1)
262         type = NameType.CANONICAL;
263       else if (blockName.indexOf('_') != -1)
264         type = NameType.CONSTANT;
265       else
266         type = NameType.NO_SPACES;
267       Collator usCollator = Collator.getInstance(Locale.US);
268       usCollator.setStrength(Collator.PRIMARY);
269       /* Special case for deprecated blocks not in sets */
270       switch (type)
271       {
272         case CANONICAL:
273           if (usCollator.compare(blockName, "Surrogates Area") == 0)
274             return SURROGATES_AREA;
275           break;
276         case NO_SPACES:
277           if (usCollator.compare(blockName, "SurrogatesArea") == 0)
278             return SURROGATES_AREA;
279           break;
280         case CONSTANT:
281           if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
282             return SURROGATES_AREA;
283           break;
284       }
285       /* Other cases */
286       switch (type)
287       {
288         case CANONICAL:
289           for (UnicodeBlock block : sets)
290             if (usCollator.compare(blockName, block.canonicalName) == 0)
291               return block;
292           break;
293         case NO_SPACES:
294           for (UnicodeBlock block : sets)
295             {
296               String nsName = block.canonicalName.replaceAll(" ","");
297               if (usCollator.compare(blockName, nsName) == 0)
298                 return block;
299             }
300           break;
301         case CONSTANT:
302           for (UnicodeBlock block : sets)
303             if (usCollator.compare(blockName, block.toString()) == 0)
304               return block;
305           break;
306       }
307       throw new IllegalArgumentException("No Unicode block found for " +
308                                          blockName + ".");
309     }
310 
311     /**
312      * Basic Latin.
313      * 0x0000 - 0x007F.
314      */
315     public static final UnicodeBlock BASIC_LATIN
316       = new UnicodeBlock(0x0000, 0x007F,
317                          "BASIC_LATIN",
318                          "Basic Latin");
319 
320     /**
321      * Latin-1 Supplement.
322      * 0x0080 - 0x00FF.
323      */
324     public static final UnicodeBlock LATIN_1_SUPPLEMENT
325       = new UnicodeBlock(0x0080, 0x00FF,
326                          "LATIN_1_SUPPLEMENT",
327                          "Latin-1 Supplement");
328 
329     /**
330      * Latin Extended-A.
331      * 0x0100 - 0x017F.
332      */
333     public static final UnicodeBlock LATIN_EXTENDED_A
334       = new UnicodeBlock(0x0100, 0x017F,
335                          "LATIN_EXTENDED_A",
336                          "Latin Extended-A");
337 
338     /**
339      * Latin Extended-B.
340      * 0x0180 - 0x024F.
341      */
342     public static final UnicodeBlock LATIN_EXTENDED_B
343       = new UnicodeBlock(0x0180, 0x024F,
344                          "LATIN_EXTENDED_B",
345                          "Latin Extended-B");
346 
347     /**
348      * IPA Extensions.
349      * 0x0250 - 0x02AF.
350      */
351     public static final UnicodeBlock IPA_EXTENSIONS
352       = new UnicodeBlock(0x0250, 0x02AF,
353                          "IPA_EXTENSIONS",
354                          "IPA Extensions");
355 
356     /**
357      * Spacing Modifier Letters.
358      * 0x02B0 - 0x02FF.
359      */
360     public static final UnicodeBlock SPACING_MODIFIER_LETTERS
361       = new UnicodeBlock(0x02B0, 0x02FF,
362                          "SPACING_MODIFIER_LETTERS",
363                          "Spacing Modifier Letters");
364 
365     /**
366      * Combining Diacritical Marks.
367      * 0x0300 - 0x036F.
368      */
369     public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
370       = new UnicodeBlock(0x0300, 0x036F,
371                          "COMBINING_DIACRITICAL_MARKS",
372                          "Combining Diacritical Marks");
373 
374     /**
375      * Greek.
376      * 0x0370 - 0x03FF.
377      */
378     public static final UnicodeBlock GREEK
379       = new UnicodeBlock(0x0370, 0x03FF,
380                          "GREEK",
381                          "Greek");
382 
383     /**
384      * Cyrillic.
385      * 0x0400 - 0x04FF.
386      */
387     public static final UnicodeBlock CYRILLIC
388       = new UnicodeBlock(0x0400, 0x04FF,
389                          "CYRILLIC",
390                          "Cyrillic");
391 
392     /**
393      * Cyrillic Supplementary.
394      * 0x0500 - 0x052F.
395      * @since 1.5
396      */
397     public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
398       = new UnicodeBlock(0x0500, 0x052F,
399                          "CYRILLIC_SUPPLEMENTARY",
400                          "Cyrillic Supplementary");
401 
402     /**
403      * Armenian.
404      * 0x0530 - 0x058F.
405      */
406     public static final UnicodeBlock ARMENIAN
407       = new UnicodeBlock(0x0530, 0x058F,
408                          "ARMENIAN",
409                          "Armenian");
410 
411     /**
412      * Hebrew.
413      * 0x0590 - 0x05FF.
414      */
415     public static final UnicodeBlock HEBREW
416       = new UnicodeBlock(0x0590, 0x05FF,
417                          "HEBREW",
418                          "Hebrew");
419 
420     /**
421      * Arabic.
422      * 0x0600 - 0x06FF.
423      */
424     public static final UnicodeBlock ARABIC
425       = new UnicodeBlock(0x0600, 0x06FF,
426                          "ARABIC",
427                          "Arabic");
428 
429     /**
430      * Syriac.
431      * 0x0700 - 0x074F.
432      * @since 1.4
433      */
434     public static final UnicodeBlock SYRIAC
435       = new UnicodeBlock(0x0700, 0x074F,
436                          "SYRIAC",
437                          "Syriac");
438 
439     /**
440      * Thaana.
441      * 0x0780 - 0x07BF.
442      * @since 1.4
443      */
444     public static final UnicodeBlock THAANA
445       = new UnicodeBlock(0x0780, 0x07BF,
446                          "THAANA",
447                          "Thaana");
448 
449     /**
450      * Devanagari.
451      * 0x0900 - 0x097F.
452      */
453     public static final UnicodeBlock DEVANAGARI
454       = new UnicodeBlock(0x0900, 0x097F,
455                          "DEVANAGARI",
456                          "Devanagari");
457 
458     /**
459      * Bengali.
460      * 0x0980 - 0x09FF.
461      */
462     public static final UnicodeBlock BENGALI
463       = new UnicodeBlock(0x0980, 0x09FF,
464                          "BENGALI",
465                          "Bengali");
466 
467     /**
468      * Gurmukhi.
469      * 0x0A00 - 0x0A7F.
470      */
471     public static final UnicodeBlock GURMUKHI
472       = new UnicodeBlock(0x0A00, 0x0A7F,
473                          "GURMUKHI",
474                          "Gurmukhi");
475 
476     /**
477      * Gujarati.
478      * 0x0A80 - 0x0AFF.
479      */
480     public static final UnicodeBlock GUJARATI
481       = new UnicodeBlock(0x0A80, 0x0AFF,
482                          "GUJARATI",
483                          "Gujarati");
484 
485     /**
486      * Oriya.
487      * 0x0B00 - 0x0B7F.
488      */
489     public static final UnicodeBlock ORIYA
490       = new UnicodeBlock(0x0B00, 0x0B7F,
491                          "ORIYA",
492                          "Oriya");
493 
494     /**
495      * Tamil.
496      * 0x0B80 - 0x0BFF.
497      */
498     public static final UnicodeBlock TAMIL
499       = new UnicodeBlock(0x0B80, 0x0BFF,
500                          "TAMIL",
501                          "Tamil");
502 
503     /**
504      * Telugu.
505      * 0x0C00 - 0x0C7F.
506      */
507     public static final UnicodeBlock TELUGU
508       = new UnicodeBlock(0x0C00, 0x0C7F,
509                          "TELUGU",
510                          "Telugu");
511 
512     /**
513      * Kannada.
514      * 0x0C80 - 0x0CFF.
515      */
516     public static final UnicodeBlock KANNADA
517       = new UnicodeBlock(0x0C80, 0x0CFF,
518                          "KANNADA",
519                          "Kannada");
520 
521     /**
522      * Malayalam.
523      * 0x0D00 - 0x0D7F.
524      */
525     public static final UnicodeBlock MALAYALAM
526       = new UnicodeBlock(0x0D00, 0x0D7F,
527                          "MALAYALAM",
528                          "Malayalam");
529 
530     /**
531      * Sinhala.
532      * 0x0D80 - 0x0DFF.
533      * @since 1.4
534      */
535     public static final UnicodeBlock SINHALA
536       = new UnicodeBlock(0x0D80, 0x0DFF,
537                          "SINHALA",
538                          "Sinhala");
539 
540     /**
541      * Thai.
542      * 0x0E00 - 0x0E7F.
543      */
544     public static final UnicodeBlock THAI
545       = new UnicodeBlock(0x0E00, 0x0E7F,
546                          "THAI",
547                          "Thai");
548 
549     /**
550      * Lao.
551      * 0x0E80 - 0x0EFF.
552      */
553     public static final UnicodeBlock LAO
554       = new UnicodeBlock(0x0E80, 0x0EFF,
555                          "LAO",
556                          "Lao");
557 
558     /**
559      * Tibetan.
560      * 0x0F00 - 0x0FFF.
561      */
562     public static final UnicodeBlock TIBETAN
563       = new UnicodeBlock(0x0F00, 0x0FFF,
564                          "TIBETAN",
565                          "Tibetan");
566 
567     /**
568      * Myanmar.
569      * 0x1000 - 0x109F.
570      * @since 1.4
571      */
572     public static final UnicodeBlock MYANMAR
573       = new UnicodeBlock(0x1000, 0x109F,
574                          "MYANMAR",
575                          "Myanmar");
576 
577     /**
578      * Georgian.
579      * 0x10A0 - 0x10FF.
580      */
581     public static final UnicodeBlock GEORGIAN
582       = new UnicodeBlock(0x10A0, 0x10FF,
583                          "GEORGIAN",
584                          "Georgian");
585 
586     /**
587      * Hangul Jamo.
588      * 0x1100 - 0x11FF.
589      */
590     public static final UnicodeBlock HANGUL_JAMO
591       = new UnicodeBlock(0x1100, 0x11FF,
592                          "HANGUL_JAMO",
593                          "Hangul Jamo");
594 
595     /**
596      * Ethiopic.
597      * 0x1200 - 0x137F.
598      * @since 1.4
599      */
600     public static final UnicodeBlock ETHIOPIC
601       = new UnicodeBlock(0x1200, 0x137F,
602                          "ETHIOPIC",
603                          "Ethiopic");
604 
605     /**
606      * Cherokee.
607      * 0x13A0 - 0x13FF.
608      * @since 1.4
609      */
610     public static final UnicodeBlock CHEROKEE
611       = new UnicodeBlock(0x13A0, 0x13FF,
612                          "CHEROKEE",
613                          "Cherokee");
614 
615     /**
616      * Unified Canadian Aboriginal Syllabics.
617      * 0x1400 - 0x167F.
618      * @since 1.4
619      */
620     public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
621       = new UnicodeBlock(0x1400, 0x167F,
622                          "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
623                          "Unified Canadian Aboriginal Syllabics");
624 
625     /**
626      * Ogham.
627      * 0x1680 - 0x169F.
628      * @since 1.4
629      */
630     public static final UnicodeBlock OGHAM
631       = new UnicodeBlock(0x1680, 0x169F,
632                          "OGHAM",
633                          "Ogham");
634 
635     /**
636      * Runic.
637      * 0x16A0 - 0x16FF.
638      * @since 1.4
639      */
640     public static final UnicodeBlock RUNIC
641       = new UnicodeBlock(0x16A0, 0x16FF,
642                          "RUNIC",
643                          "Runic");
644 
645     /**
646      * Tagalog.
647      * 0x1700 - 0x171F.
648      * @since 1.5
649      */
650     public static final UnicodeBlock TAGALOG
651       = new UnicodeBlock(0x1700, 0x171F,
652                          "TAGALOG",
653                          "Tagalog");
654 
655     /**
656      * Hanunoo.
657      * 0x1720 - 0x173F.
658      * @since 1.5
659      */
660     public static final UnicodeBlock HANUNOO
661       = new UnicodeBlock(0x1720, 0x173F,
662                          "HANUNOO",
663                          "Hanunoo");
664 
665     /**
666      * Buhid.
667      * 0x1740 - 0x175F.
668      * @since 1.5
669      */
670     public static final UnicodeBlock BUHID
671       = new UnicodeBlock(0x1740, 0x175F,
672                          "BUHID",
673                          "Buhid");
674 
675     /**
676      * Tagbanwa.
677      * 0x1760 - 0x177F.
678      * @since 1.5
679      */
680     public static final UnicodeBlock TAGBANWA
681       = new UnicodeBlock(0x1760, 0x177F,
682                          "TAGBANWA",
683                          "Tagbanwa");
684 
685     /**
686      * Khmer.
687      * 0x1780 - 0x17FF.
688      * @since 1.4
689      */
690     public static final UnicodeBlock KHMER
691       = new UnicodeBlock(0x1780, 0x17FF,
692                          "KHMER",
693                          "Khmer");
694 
695     /**
696      * Mongolian.
697      * 0x1800 - 0x18AF.
698      * @since 1.4
699      */
700     public static final UnicodeBlock MONGOLIAN
701       = new UnicodeBlock(0x1800, 0x18AF,
702                          "MONGOLIAN",
703                          "Mongolian");
704 
705     /**
706      * Limbu.
707      * 0x1900 - 0x194F.
708      * @since 1.5
709      */
710     public static final UnicodeBlock LIMBU
711       = new UnicodeBlock(0x1900, 0x194F,
712                          "LIMBU",
713                          "Limbu");
714 
715     /**
716      * Tai Le.
717      * 0x1950 - 0x197F.
718      * @since 1.5
719      */
720     public static final UnicodeBlock TAI_LE
721       = new UnicodeBlock(0x1950, 0x197F,
722                          "TAI_LE",
723                          "Tai Le");
724 
725     /**
726      * Khmer Symbols.
727      * 0x19E0 - 0x19FF.
728      * @since 1.5
729      */
730     public static final UnicodeBlock KHMER_SYMBOLS
731       = new UnicodeBlock(0x19E0, 0x19FF,
732                          "KHMER_SYMBOLS",
733                          "Khmer Symbols");
734 
735     /**
736      * Phonetic Extensions.
737      * 0x1D00 - 0x1D7F.
738      * @since 1.5
739      */
740     public static final UnicodeBlock PHONETIC_EXTENSIONS
741       = new UnicodeBlock(0x1D00, 0x1D7F,
742                          "PHONETIC_EXTENSIONS",
743                          "Phonetic Extensions");
744 
745     /**
746      * Latin Extended Additional.
747      * 0x1E00 - 0x1EFF.
748      */
749     public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
750       = new UnicodeBlock(0x1E00, 0x1EFF,
751                          "LATIN_EXTENDED_ADDITIONAL",
752                          "Latin Extended Additional");
753 
754     /**
755      * Greek Extended.
756      * 0x1F00 - 0x1FFF.
757      */
758     public static final UnicodeBlock GREEK_EXTENDED
759       = new UnicodeBlock(0x1F00, 0x1FFF,
760                          "GREEK_EXTENDED",
761                          "Greek Extended");
762 
763     /**
764      * General Punctuation.
765      * 0x2000 - 0x206F.
766      */
767     public static final UnicodeBlock GENERAL_PUNCTUATION
768       = new UnicodeBlock(0x2000, 0x206F,
769                          "GENERAL_PUNCTUATION",
770                          "General Punctuation");
771 
772     /**
773      * Superscripts and Subscripts.
774      * 0x2070 - 0x209F.
775      */
776     public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
777       = new UnicodeBlock(0x2070, 0x209F,
778                          "SUPERSCRIPTS_AND_SUBSCRIPTS",
779                          "Superscripts and Subscripts");
780 
781     /**
782      * Currency Symbols.
783      * 0x20A0 - 0x20CF.
784      */
785     public static final UnicodeBlock CURRENCY_SYMBOLS
786       = new UnicodeBlock(0x20A0, 0x20CF,
787                          "CURRENCY_SYMBOLS",
788                          "Currency Symbols");
789 
790     /**
791      * Combining Marks for Symbols.
792      * 0x20D0 - 0x20FF.
793      */
794     public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
795       = new UnicodeBlock(0x20D0, 0x20FF,
796                          "COMBINING_MARKS_FOR_SYMBOLS",
797                          "Combining Marks for Symbols");
798 
799     /**
800      * Letterlike Symbols.
801      * 0x2100 - 0x214F.
802      */
803     public static final UnicodeBlock LETTERLIKE_SYMBOLS
804       = new UnicodeBlock(0x2100, 0x214F,
805                          "LETTERLIKE_SYMBOLS",
806                          "Letterlike Symbols");
807 
808     /**
809      * Number Forms.
810      * 0x2150 - 0x218F.
811      */
812     public static final UnicodeBlock NUMBER_FORMS
813       = new UnicodeBlock(0x2150, 0x218F,
814                          "NUMBER_FORMS",
815                          "Number Forms");
816 
817     /**
818      * Arrows.
819      * 0x2190 - 0x21FF.
820      */
821     public static final UnicodeBlock ARROWS
822       = new UnicodeBlock(0x2190, 0x21FF,
823                          "ARROWS",
824                          "Arrows");
825 
826     /**
827      * Mathematical Operators.
828      * 0x2200 - 0x22FF.
829      */
830     public static final UnicodeBlock MATHEMATICAL_OPERATORS
831       = new UnicodeBlock(0x2200, 0x22FF,
832                          "MATHEMATICAL_OPERATORS",
833                          "Mathematical Operators");
834 
835     /**
836      * Miscellaneous Technical.
837      * 0x2300 - 0x23FF.
838      */
839     public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
840       = new UnicodeBlock(0x2300, 0x23FF,
841                          "MISCELLANEOUS_TECHNICAL",
842                          "Miscellaneous Technical");
843 
844     /**
845      * Control Pictures.
846      * 0x2400 - 0x243F.
847      */
848     public static final UnicodeBlock CONTROL_PICTURES
849       = new UnicodeBlock(0x2400, 0x243F,
850                          "CONTROL_PICTURES",
851                          "Control Pictures");
852 
853     /**
854      * Optical Character Recognition.
855      * 0x2440 - 0x245F.
856      */
857     public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
858       = new UnicodeBlock(0x2440, 0x245F,
859                          "OPTICAL_CHARACTER_RECOGNITION",
860                          "Optical Character Recognition");
861 
862     /**
863      * Enclosed Alphanumerics.
864      * 0x2460 - 0x24FF.
865      */
866     public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
867       = new UnicodeBlock(0x2460, 0x24FF,
868                          "ENCLOSED_ALPHANUMERICS",
869                          "Enclosed Alphanumerics");
870 
871     /**
872      * Box Drawing.
873      * 0x2500 - 0x257F.
874      */
875     public static final UnicodeBlock BOX_DRAWING
876       = new UnicodeBlock(0x2500, 0x257F,
877                          "BOX_DRAWING",
878                          "Box Drawing");
879 
880     /**
881      * Block Elements.
882      * 0x2580 - 0x259F.
883      */
884     public static final UnicodeBlock BLOCK_ELEMENTS
885       = new UnicodeBlock(0x2580, 0x259F,
886                          "BLOCK_ELEMENTS",
887                          "Block Elements");
888 
889     /**
890      * Geometric Shapes.
891      * 0x25A0 - 0x25FF.
892      */
893     public static final UnicodeBlock GEOMETRIC_SHAPES
894       = new UnicodeBlock(0x25A0, 0x25FF,
895                          "GEOMETRIC_SHAPES",
896                          "Geometric Shapes");
897 
898     /**
899      * Miscellaneous Symbols.
900      * 0x2600 - 0x26FF.
901      */
902     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
903       = new UnicodeBlock(0x2600, 0x26FF,
904                          "MISCELLANEOUS_SYMBOLS",
905                          "Miscellaneous Symbols");
906 
    /**
     * The Dingbats block, covering code points U+2700 through U+27BF.
     */
    public static final UnicodeBlock DINGBATS
      = new UnicodeBlock(0x2700, 0x27BF,
                         "DINGBATS",
                         "Dingbats");

    /**
     * The Miscellaneous Mathematical Symbols-A block, covering code
     * points U+27C0 through U+27EF.
     * @since 1.5
     */
    public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
      = new UnicodeBlock(0x27C0, 0x27EF,
                         "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
                         "Miscellaneous Mathematical Symbols-A");

    /**
     * The Supplemental Arrows-A block, covering code points U+27F0
     * through U+27FF.
     * @since 1.5
     */
    public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
      = new UnicodeBlock(0x27F0, 0x27FF,
                         "SUPPLEMENTAL_ARROWS_A",
                         "Supplemental Arrows-A");

    /**
     * The Braille Patterns block, covering code points U+2800 through
     * U+28FF.
     * @since 1.4
     */
    public static final UnicodeBlock BRAILLE_PATTERNS
      = new UnicodeBlock(0x2800, 0x28FF,
                         "BRAILLE_PATTERNS",
                         "Braille Patterns");

    /**
     * The Supplemental Arrows-B block, covering code points U+2900
     * through U+297F.
     * @since 1.5
     */
    public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
      = new UnicodeBlock(0x2900, 0x297F,
                         "SUPPLEMENTAL_ARROWS_B",
                         "Supplemental Arrows-B");

    /**
     * The Miscellaneous Mathematical Symbols-B block, covering code
     * points U+2980 through U+29FF.
     * @since 1.5
     */
    public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
      = new UnicodeBlock(0x2980, 0x29FF,
                         "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
                         "Miscellaneous Mathematical Symbols-B");

    /**
     * The Supplemental Mathematical Operators block, covering code
     * points U+2A00 through U+2AFF.
     * @since 1.5
     */
    public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
      = new UnicodeBlock(0x2A00, 0x2AFF,
                         "SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
                         "Supplemental Mathematical Operators");

    /**
     * The Miscellaneous Symbols and Arrows block, covering code points
     * U+2B00 through U+2BFF.
     * @since 1.5
     */
    public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
      = new UnicodeBlock(0x2B00, 0x2BFF,
                         "MISCELLANEOUS_SYMBOLS_AND_ARROWS",
                         "Miscellaneous Symbols and Arrows");
985 
    /**
     * The CJK Radicals Supplement block, covering code points U+2E80
     * through U+2EFF.
     * @since 1.4
     */
    public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
      = new UnicodeBlock(0x2E80, 0x2EFF,
                         "CJK_RADICALS_SUPPLEMENT",
                         "CJK Radicals Supplement");

    /**
     * The Kangxi Radicals block, covering code points U+2F00 through
     * U+2FDF.
     * @since 1.4
     */
    public static final UnicodeBlock KANGXI_RADICALS
      = new UnicodeBlock(0x2F00, 0x2FDF,
                         "KANGXI_RADICALS",
                         "Kangxi Radicals");

    /**
     * The Ideographic Description Characters block, covering code
     * points U+2FF0 through U+2FFF.
     * @since 1.4
     */
    public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
      = new UnicodeBlock(0x2FF0, 0x2FFF,
                         "IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
                         "Ideographic Description Characters");

    /**
     * The CJK Symbols and Punctuation block, covering code points
     * U+3000 through U+303F.
     */
    public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
      = new UnicodeBlock(0x3000, 0x303F,
                         "CJK_SYMBOLS_AND_PUNCTUATION",
                         "CJK Symbols and Punctuation");

    /**
     * The Hiragana block, covering code points U+3040 through U+309F.
     */
    public static final UnicodeBlock HIRAGANA
      = new UnicodeBlock(0x3040, 0x309F,
                         "HIRAGANA",
                         "Hiragana");

    /**
     * The Katakana block, covering code points U+30A0 through U+30FF.
     */
    public static final UnicodeBlock KATAKANA
      = new UnicodeBlock(0x30A0, 0x30FF,
                         "KATAKANA",
                         "Katakana");

    /**
     * The Bopomofo block, covering code points U+3100 through U+312F.
     */
    public static final UnicodeBlock BOPOMOFO
      = new UnicodeBlock(0x3100, 0x312F,
                         "BOPOMOFO",
                         "Bopomofo");

    /**
     * The Hangul Compatibility Jamo block, covering code points U+3130
     * through U+318F.
     */
    public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
      = new UnicodeBlock(0x3130, 0x318F,
                         "HANGUL_COMPATIBILITY_JAMO",
                         "Hangul Compatibility Jamo");

    /**
     * The Kanbun block, covering code points U+3190 through U+319F.
     */
    public static final UnicodeBlock KANBUN
      = new UnicodeBlock(0x3190, 0x319F,
                         "KANBUN",
                         "Kanbun");

    /**
     * The Bopomofo Extended block, covering code points U+31A0 through
     * U+31BF.
     * @since 1.4
     */
    public static final UnicodeBlock BOPOMOFO_EXTENDED
      = new UnicodeBlock(0x31A0, 0x31BF,
                         "BOPOMOFO_EXTENDED",
                         "Bopomofo Extended");

    /**
     * The Katakana Phonetic Extensions block, covering code points
     * U+31F0 through U+31FF.
     * @since 1.5
     */
    public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
      = new UnicodeBlock(0x31F0, 0x31FF,
                         "KATAKANA_PHONETIC_EXTENSIONS",
                         "Katakana Phonetic Extensions");

    /**
     * The Enclosed CJK Letters and Months block, covering code points
     * U+3200 through U+32FF.
     */
    public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
      = new UnicodeBlock(0x3200, 0x32FF,
                         "ENCLOSED_CJK_LETTERS_AND_MONTHS",
                         "Enclosed CJK Letters and Months");

    /**
     * The CJK Compatibility block, covering code points U+3300 through
     * U+33FF.
     */
    public static final UnicodeBlock CJK_COMPATIBILITY
      = new UnicodeBlock(0x3300, 0x33FF,
                         "CJK_COMPATIBILITY",
                         "CJK Compatibility");

    /**
     * The CJK Unified Ideographs Extension A block, covering code
     * points U+3400 through U+4DBF.
     * @since 1.4
     */
    public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
      = new UnicodeBlock(0x3400, 0x4DBF,
                         "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
                         "CJK Unified Ideographs Extension A");

    /**
     * The Yijing Hexagram Symbols block, covering code points U+4DC0
     * through U+4DFF.
     * @since 1.5
     */
    public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
      = new UnicodeBlock(0x4DC0, 0x4DFF,
                         "YIJING_HEXAGRAM_SYMBOLS",
                         "Yijing Hexagram Symbols");

    /**
     * The CJK Unified Ideographs block, covering code points U+4E00
     * through U+9FFF.
     */
    public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
      = new UnicodeBlock(0x4E00, 0x9FFF,
                         "CJK_UNIFIED_IDEOGRAPHS",
                         "CJK Unified Ideographs");

    /**
     * The Yi Syllables block, covering code points U+A000 through
     * U+A48F.
     * @since 1.4
     */
    public static final UnicodeBlock YI_SYLLABLES
      = new UnicodeBlock(0xA000, 0xA48F,
                         "YI_SYLLABLES",
                         "Yi Syllables");

    /**
     * The Yi Radicals block, covering code points U+A490 through
     * U+A4CF.
     * @since 1.4
     */
    public static final UnicodeBlock YI_RADICALS
      = new UnicodeBlock(0xA490, 0xA4CF,
                         "YI_RADICALS",
                         "Yi Radicals");

    /**
     * The Hangul Syllables block, covering code points U+AC00 through
     * U+D7AF.
     */
    public static final UnicodeBlock HANGUL_SYLLABLES
      = new UnicodeBlock(0xAC00, 0xD7AF,
                         "HANGUL_SYLLABLES",
                         "Hangul Syllables");
1165 
    /**
     * The High Surrogates block, covering code points U+D800 through
     * U+DB7F.
     * @since 1.5
     */
    public static final UnicodeBlock HIGH_SURROGATES
      = new UnicodeBlock(0xD800, 0xDB7F,
                         "HIGH_SURROGATES",
                         "High Surrogates");

    /**
     * The High Private Use Surrogates block, covering code points
     * U+DB80 through U+DBFF.
     * @since 1.5
     */
    public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
      = new UnicodeBlock(0xDB80, 0xDBFF,
                         "HIGH_PRIVATE_USE_SURROGATES",
                         "High Private Use Surrogates");

    /**
     * The Low Surrogates block, covering code points U+DC00 through
     * U+DFFF.
     * @since 1.5
     */
    public static final UnicodeBlock LOW_SURROGATES
      = new UnicodeBlock(0xDC00, 0xDFFF,
                         "LOW_SURROGATES",
                         "Low Surrogates");

    /**
     * The Private Use Area block, covering code points U+E000 through
     * U+F8FF.
     */
    public static final UnicodeBlock PRIVATE_USE_AREA
      = new UnicodeBlock(0xE000, 0xF8FF,
                         "PRIVATE_USE_AREA",
                         "Private Use Area");
1204 
    /**
     * The CJK Compatibility Ideographs block, covering code points
     * U+F900 through U+FAFF.
     */
    public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
      = new UnicodeBlock(0xF900, 0xFAFF,
                         "CJK_COMPATIBILITY_IDEOGRAPHS",
                         "CJK Compatibility Ideographs");

    /**
     * The Alphabetic Presentation Forms block, covering code points
     * U+FB00 through U+FB4F.
     */
    public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
      = new UnicodeBlock(0xFB00, 0xFB4F,
                         "ALPHABETIC_PRESENTATION_FORMS",
                         "Alphabetic Presentation Forms");

    /**
     * The Arabic Presentation Forms-A block, covering code points
     * U+FB50 through U+FDFF.
     */
    public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
      = new UnicodeBlock(0xFB50, 0xFDFF,
                         "ARABIC_PRESENTATION_FORMS_A",
                         "Arabic Presentation Forms-A");

    /**
     * The Variation Selectors block, covering code points U+FE00
     * through U+FE0F.
     * @since 1.5
     */
    public static final UnicodeBlock VARIATION_SELECTORS
      = new UnicodeBlock(0xFE00, 0xFE0F,
                         "VARIATION_SELECTORS",
                         "Variation Selectors");

    /**
     * The Combining Half Marks block, covering code points U+FE20
     * through U+FE2F.
     */
    public static final UnicodeBlock COMBINING_HALF_MARKS
      = new UnicodeBlock(0xFE20, 0xFE2F,
                         "COMBINING_HALF_MARKS",
                         "Combining Half Marks");

    /**
     * The CJK Compatibility Forms block, covering code points U+FE30
     * through U+FE4F.
     */
    public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
      = new UnicodeBlock(0xFE30, 0xFE4F,
                         "CJK_COMPATIBILITY_FORMS",
                         "CJK Compatibility Forms");

    /**
     * The Small Form Variants block, covering code points U+FE50
     * through U+FE6F.
     */
    public static final UnicodeBlock SMALL_FORM_VARIANTS
      = new UnicodeBlock(0xFE50, 0xFE6F,
                         "SMALL_FORM_VARIANTS",
                         "Small Form Variants");

    /**
     * The Arabic Presentation Forms-B block, covering code points
     * U+FE70 through U+FEFF.
     */
    public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
      = new UnicodeBlock(0xFE70, 0xFEFF,
                         "ARABIC_PRESENTATION_FORMS_B",
                         "Arabic Presentation Forms-B");

    /**
     * The Halfwidth and Fullwidth Forms block, covering code points
     * U+FF00 through U+FFEF.
     */
    public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
      = new UnicodeBlock(0xFF00, 0xFFEF,
                         "HALFWIDTH_AND_FULLWIDTH_FORMS",
                         "Halfwidth and Fullwidth Forms");

    /**
     * The Specials block, covering code points U+FFF0 through U+FFFF.
     */
    public static final UnicodeBlock SPECIALS
      = new UnicodeBlock(0xFFF0, 0xFFFF,
                         "SPECIALS",
                         "Specials");
1295 
    /**
     * The Linear B Syllabary block, covering code points U+10000
     * through U+1007F.
     * @since 1.5
     */
    public static final UnicodeBlock LINEAR_B_SYLLABARY
      = new UnicodeBlock(0x10000, 0x1007F,
                         "LINEAR_B_SYLLABARY",
                         "Linear B Syllabary");

    /**
     * The Linear B Ideograms block, covering code points U+10080
     * through U+100FF.
     * @since 1.5
     */
    public static final UnicodeBlock LINEAR_B_IDEOGRAMS
      = new UnicodeBlock(0x10080, 0x100FF,
                         "LINEAR_B_IDEOGRAMS",
                         "Linear B Ideograms");

    /**
     * The Aegean Numbers block, covering code points U+10100 through
     * U+1013F.
     * @since 1.5
     */
    public static final UnicodeBlock AEGEAN_NUMBERS
      = new UnicodeBlock(0x10100, 0x1013F,
                         "AEGEAN_NUMBERS",
                         "Aegean Numbers");

    /**
     * The Old Italic block, covering code points U+10300 through
     * U+1032F.
     * @since 1.5
     */
    public static final UnicodeBlock OLD_ITALIC
      = new UnicodeBlock(0x10300, 0x1032F,
                         "OLD_ITALIC",
                         "Old Italic");

    /**
     * The Gothic block, covering code points U+10330 through U+1034F.
     * @since 1.5
     */
    public static final UnicodeBlock GOTHIC
      = new UnicodeBlock(0x10330, 0x1034F,
                         "GOTHIC",
                         "Gothic");

    /**
     * The Ugaritic block, covering code points U+10380 through U+1039F.
     * @since 1.5
     */
    public static final UnicodeBlock UGARITIC
      = new UnicodeBlock(0x10380, 0x1039F,
                         "UGARITIC",
                         "Ugaritic");

    /**
     * The Deseret block, covering code points U+10400 through U+1044F.
     * @since 1.5
     */
    public static final UnicodeBlock DESERET
      = new UnicodeBlock(0x10400, 0x1044F,
                         "DESERET",
                         "Deseret");

    /**
     * The Shavian block, covering code points U+10450 through U+1047F.
     * @since 1.5
     */
    public static final UnicodeBlock SHAVIAN
      = new UnicodeBlock(0x10450, 0x1047F,
                         "SHAVIAN",
                         "Shavian");

    /**
     * The Osmanya block, covering code points U+10480 through U+104AF.
     * @since 1.5
     */
    public static final UnicodeBlock OSMANYA
      = new UnicodeBlock(0x10480, 0x104AF,
                         "OSMANYA",
                         "Osmanya");

    /**
     * The Cypriot Syllabary block, covering code points U+10800
     * through U+1083F.
     * @since 1.5
     */
    public static final UnicodeBlock CYPRIOT_SYLLABARY
      = new UnicodeBlock(0x10800, 0x1083F,
                         "CYPRIOT_SYLLABARY",
                         "Cypriot Syllabary");

    /**
     * The Byzantine Musical Symbols block, covering code points
     * U+1D000 through U+1D0FF.
     * @since 1.5
     */
    public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
      = new UnicodeBlock(0x1D000, 0x1D0FF,
                         "BYZANTINE_MUSICAL_SYMBOLS",
                         "Byzantine Musical Symbols");

    /**
     * The Musical Symbols block, covering code points U+1D100 through
     * U+1D1FF.
     * @since 1.5
     */
    public static final UnicodeBlock MUSICAL_SYMBOLS
      = new UnicodeBlock(0x1D100, 0x1D1FF,
                         "MUSICAL_SYMBOLS",
                         "Musical Symbols");

    /**
     * The Tai Xuan Jing Symbols block, covering code points U+1D300
     * through U+1D35F.
     * @since 1.5
     */
    public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
      = new UnicodeBlock(0x1D300, 0x1D35F,
                         "TAI_XUAN_JING_SYMBOLS",
                         "Tai Xuan Jing Symbols");

    /**
     * The Mathematical Alphanumeric Symbols block, covering code
     * points U+1D400 through U+1D7FF.
     * @since 1.5
     */
    public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
      = new UnicodeBlock(0x1D400, 0x1D7FF,
                         "MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
                         "Mathematical Alphanumeric Symbols");

    /**
     * The CJK Unified Ideographs Extension B block, covering code
     * points U+20000 through U+2A6DF.
     * @since 1.5
     */
    public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
      = new UnicodeBlock(0x20000, 0x2A6DF,
                         "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
                         "CJK Unified Ideographs Extension B");

    /**
     * The CJK Compatibility Ideographs Supplement block, covering code
     * points U+2F800 through U+2FA1F.
     * @since 1.5
     */
    public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
      = new UnicodeBlock(0x2F800, 0x2FA1F,
                         "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
                         "CJK Compatibility Ideographs Supplement");

    /**
     * The Tags block, covering code points U+E0000 through U+E007F.
     * @since 1.5
     */
    public static final UnicodeBlock TAGS
      = new UnicodeBlock(0xE0000, 0xE007F,
                         "TAGS",
                         "Tags");

    /**
     * The Variation Selectors Supplement block, covering code points
     * U+E0100 through U+E01EF.
     * @since 1.5
     */
    public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
      = new UnicodeBlock(0xE0100, 0xE01EF,
                         "VARIATION_SELECTORS_SUPPLEMENT",
                         "Variation Selectors Supplement");

    /**
     * The Supplementary Private Use Area-A block, covering code points
     * U+F0000 through U+FFFFF.
     * @since 1.5
     */
    public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
      = new UnicodeBlock(0xF0000, 0xFFFFF,
                         "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
                         "Supplementary Private Use Area-A");

    /**
     * The Supplementary Private Use Area-B block, covering code points
     * U+100000 through U+10FFFF.
     * @since 1.5
     */
    public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
      = new UnicodeBlock(0x100000, 0x10FFFF,
                         "SUPPLEMENTARY_PRIVATE_USE_AREA_B",
                         "Supplementary Private Use Area-B");
1495 
    /**
     * The Surrogates Area, covering code points U+D800 through U+DFFF.
     * This single range spans the same code points as the three blocks
     * {@link #HIGH_SURROGATES}, {@link #HIGH_PRIVATE_USE_SURROGATES} and
     * {@link #LOW_SURROGATES} taken together.
     *
     * @deprecated As of 1.5, the three areas,
     * {@link #HIGH_SURROGATES}, {@link #HIGH_PRIVATE_USE_SURROGATES}
     * and {@link #LOW_SURROGATES}, as defined by the Unicode standard,
     * should be used in preference to this.  These are also returned
     * from calls to <code>of(int)</code> and <code>of(char)</code>.
     */
    @Deprecated
    public static final UnicodeBlock SURROGATES_AREA
      = new UnicodeBlock(0xD800, 0xDFFF,
                         "SURROGATES_AREA",
                         "Surrogates Area");
1512 
    /**
     * The defined subsets.
     *
     * The deprecated SURROGATES_AREA is not listed; its range
     * (U+D800 through U+DFFF) is covered by HIGH_SURROGATES,
     * HIGH_PRIVATE_USE_SURROGATES and LOW_SURROGATES, which are.
     *
     * From DINGBATS onward the entries follow ascending starting code
     * point, and their ranges do not overlap.
     * NOTE(review): the entries before DINGBATS are declared earlier in
     * the file and were not re-checked here; the lookup code presumably
     * relies on this ordering -- confirm against of(int) before adding
     * or reordering entries.
     */
    private static final UnicodeBlock sets[] = {
      BASIC_LATIN,
      LATIN_1_SUPPLEMENT,
      LATIN_EXTENDED_A,
      LATIN_EXTENDED_B,
      IPA_EXTENSIONS,
      SPACING_MODIFIER_LETTERS,
      COMBINING_DIACRITICAL_MARKS,
      GREEK,
      CYRILLIC,
      CYRILLIC_SUPPLEMENTARY,
      ARMENIAN,
      HEBREW,
      ARABIC,
      SYRIAC,
      THAANA,
      DEVANAGARI,
      BENGALI,
      GURMUKHI,
      GUJARATI,
      ORIYA,
      TAMIL,
      TELUGU,
      KANNADA,
      MALAYALAM,
      SINHALA,
      THAI,
      LAO,
      TIBETAN,
      MYANMAR,
      GEORGIAN,
      HANGUL_JAMO,
      ETHIOPIC,
      CHEROKEE,
      UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
      OGHAM,
      RUNIC,
      TAGALOG,
      HANUNOO,
      BUHID,
      TAGBANWA,
      KHMER,
      MONGOLIAN,
      LIMBU,
      TAI_LE,
      KHMER_SYMBOLS,
      PHONETIC_EXTENSIONS,
      LATIN_EXTENDED_ADDITIONAL,
      GREEK_EXTENDED,
      GENERAL_PUNCTUATION,
      SUPERSCRIPTS_AND_SUBSCRIPTS,
      CURRENCY_SYMBOLS,
      COMBINING_MARKS_FOR_SYMBOLS,
      LETTERLIKE_SYMBOLS,
      NUMBER_FORMS,
      ARROWS,
      MATHEMATICAL_OPERATORS,
      MISCELLANEOUS_TECHNICAL,
      CONTROL_PICTURES,
      OPTICAL_CHARACTER_RECOGNITION,
      ENCLOSED_ALPHANUMERICS,
      BOX_DRAWING,
      BLOCK_ELEMENTS,
      GEOMETRIC_SHAPES,
      MISCELLANEOUS_SYMBOLS,
      DINGBATS,
      MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
      SUPPLEMENTAL_ARROWS_A,
      BRAILLE_PATTERNS,
      SUPPLEMENTAL_ARROWS_B,
      MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
      SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
      MISCELLANEOUS_SYMBOLS_AND_ARROWS,
      CJK_RADICALS_SUPPLEMENT,
      KANGXI_RADICALS,
      IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
      CJK_SYMBOLS_AND_PUNCTUATION,
      HIRAGANA,
      KATAKANA,
      BOPOMOFO,
      HANGUL_COMPATIBILITY_JAMO,
      KANBUN,
      BOPOMOFO_EXTENDED,
      KATAKANA_PHONETIC_EXTENSIONS,
      ENCLOSED_CJK_LETTERS_AND_MONTHS,
      CJK_COMPATIBILITY,
      CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
      YIJING_HEXAGRAM_SYMBOLS,
      CJK_UNIFIED_IDEOGRAPHS,
      YI_SYLLABLES,
      YI_RADICALS,
      HANGUL_SYLLABLES,
      HIGH_SURROGATES,
      HIGH_PRIVATE_USE_SURROGATES,
      LOW_SURROGATES,
      PRIVATE_USE_AREA,
      CJK_COMPATIBILITY_IDEOGRAPHS,
      ALPHABETIC_PRESENTATION_FORMS,
      ARABIC_PRESENTATION_FORMS_A,
      VARIATION_SELECTORS,
      COMBINING_HALF_MARKS,
      CJK_COMPATIBILITY_FORMS,
      SMALL_FORM_VARIANTS,
      ARABIC_PRESENTATION_FORMS_B,
      HALFWIDTH_AND_FULLWIDTH_FORMS,
      SPECIALS,
      LINEAR_B_SYLLABARY,
      LINEAR_B_IDEOGRAMS,
      AEGEAN_NUMBERS,
      OLD_ITALIC,
      GOTHIC,
      UGARITIC,
      DESERET,
      SHAVIAN,
      OSMANYA,
      CYPRIOT_SYLLABARY,
      BYZANTINE_MUSICAL_SYMBOLS,
      MUSICAL_SYMBOLS,
      TAI_XUAN_JING_SYMBOLS,
      MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
      CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
      CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
      TAGS,
      VARIATION_SELECTORS_SUPPLEMENT,
      SUPPLEMENTARY_PRIVATE_USE_AREA_A,
      SUPPLEMENTARY_PRIVATE_USE_AREA_B,
    };
1643   } // class UnicodeBlock
1644 
1645   /**
1646    * A class to encompass all the properties of characters in the
1647    * private use blocks in the Unicode standard.  This class extends
1648    * UnassignedCharacters because the return type from getType() is
1649    * different.
1650    * @author Anthony Balkissoon abalkiss at redhat dot com
1651    *
1652    */
1653   private static class PrivateUseCharacters extends UnassignedCharacters
1654   {
1655     /**
1656      * Returns the type of the character cp.
1657      */
getType(int cp)1658     static int getType(int cp)
1659     {
1660       // The upper 2 code points in any plane are considered unassigned,
1661       // even in the private-use planes.
1662       if ((cp & 0xffff) >= 0xfffe)
1663         return UnassignedCharacters.getType(cp);
1664       return PRIVATE_USE;
1665     }
1666 
1667     /**
1668      * Returns true if the character cp is defined.
1669      */
isDefined(int cp)1670     static boolean isDefined(int cp)
1671     {
1672       // The upper 2 code points in any plane are considered unassigned,
1673       // even in the private-use planes.
1674       if ((cp & 0xffff) >= 0xfffe)
1675         return UnassignedCharacters.isDefined(cp);
1676       return true;
1677     }
1678 
1679     /**
1680      * Gets the directionality for the character cp.
1681      */
getDirectionality(int cp)1682     static byte getDirectionality(int cp)
1683     {
1684       if ((cp & 0xffff) >= 0xfffe)
1685         return UnassignedCharacters.getDirectionality(cp);
1686       return DIRECTIONALITY_LEFT_TO_RIGHT;
1687     }
1688   }
1689 
  /**
   * A class to encompass all the properties of code points that are
   * currently undefined in the Unicode standard.  Every method ignores
   * the value of its argument and gives a fixed answer: -1 for the
   * numeric queries, <code>false</code> for the predicates, UNASSIGNED
   * and DIRECTIONALITY_UNDEFINED for the classification queries, and
   * the code point itself for the case conversions.
   * @author Anthony Balkissoon abalkiss at redhat dot com
   *
   */
  private static class UnassignedCharacters
  {
    /**
     * Returns -1 to indicate that an unassigned character has no digit
     * value in any radix.
     * @param cp the character
     * @param radix the radix (not used)
     * @return -1
     */
    static int digit(int cp, int radix)
    {
      return -1;
    }

    /**
     * Returns the Unicode directionality property for unassigned
     * characters.
     * @param cp the character
     * @return DIRECTIONALITY_UNDEFINED
     */
    static byte getDirectionality(int cp)
    {
      return DIRECTIONALITY_UNDEFINED;
    }

    /**
     * Returns -1, the numeric value for unassigned Unicode characters.
     * @param cp the character
     * @return -1
     */
    static int getNumericValue(int cp)
    {
      return -1;
    }

    /**
     * Returns UNASSIGNED, the type of unassigned Unicode characters.
     * @param cp the character
     * @return UNASSIGNED
     */
    static int getType(int cp)
    {
      return UNASSIGNED;
    }

    /**
     * Returns false to indicate that the character is not defined in the
     * Unicode standard.
     * @param cp the character
     * @return false
     */
    static boolean isDefined(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a digit.
     * @param cp the character
     * @return false
     */
    static boolean isDigit(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character cannot be ignored
     * within an identifier.
     * @param cp the character
     * @return false
     */
    static boolean isIdentifierIgnorable(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character cannot be part of a
     * Java identifier.
     * @param cp the character
     * @return false
     */
    static boolean isJavaIdentifierPart(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character cannot start a
     * Java identifier.
     * @param cp the character
     * @return false
     */
    // NOTE(review): the method name is misspelled ("Identifer"); it is
    // left as is because callers outside this class may use this
    // spelling -- confirm before renaming.
    static boolean isJavaIdentiferStart(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a letter.
     * @param cp the character
     * @return false
     */
    static boolean isLetter(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is neither a letter
     * nor a digit.
     * @param cp the character
     * @return false
     */
    static boolean isLetterOrDigit(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a lowercase letter.
     * @param cp the character
     * @return false
     */
    static boolean isLowerCase(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not mirrored.
     * @param cp the character
     * @return false
     */
    static boolean isMirrored(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a space character.
     * @param cp the character
     * @return false
     */
    static boolean isSpaceChar(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a titlecase letter.
     * @param cp the character
     * @return false
     */
    static boolean isTitleCase(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character cannot be part of a
     * Unicode identifier.
     * @param cp the character
     * @return false
     */
    static boolean isUnicodeIdentifierPart(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character cannot start a
     * Unicode identifier.
     * @param cp the character
     * @return false
     */
    static boolean isUnicodeIdentifierStart(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not an uppercase letter.
     * @param cp the character
     * @return false
     */
    static boolean isUpperCase(int cp)
    {
      return false;
    }

    /**
     * Returns false to indicate that the character is not a whitespace
     * character.
     * @param cp the character
     * @return false
     */
    static boolean isWhiteSpace(int cp)
    {
      return false;
    }

    /**
     * Returns cp to indicate this character has no lowercase conversion.
     * @param cp the character
     * @return cp
     */
    static int toLowerCase(int cp)
    {
      return cp;
    }

    /**
     * Returns cp to indicate this character has no titlecase conversion.
     * @param cp the character
     * @return cp
     */
    static int toTitleCase(int cp)
    {
      return cp;
    }

    /**
     * Returns cp to indicate this character has no uppercase conversion.
     * @param cp the character
     * @return cp
     */
    static int toUpperCase(int cp)
    {
      return cp;
    }
  }
1928 
  /**
   * The immutable value of this Character: the single
   * <code>char</code> that this object wraps.
   *
   * @serial the value of this Character
   */
  private final char value;

  /**
   * The serialization version identifier of this class.
   * Compatible with JDK 1.0+.
   */
  private static final long serialVersionUID = 3786198910865385080L;
1940 
  /**
   * Smallest value allowed for radix arguments in Java. This value is 2.
   *
   * @see #digit(char, int)
   * @see #forDigit(int, int)
   * @see Integer#toString(int, int)
   * @see Integer#valueOf(String)
   */
  public static final int MIN_RADIX = 2;

  /**
   * Largest value allowed for radix arguments in Java. This value is 36.
   *
   * @see #digit(char, int)
   * @see #forDigit(int, int)
   * @see Integer#toString(int, int)
   * @see Integer#valueOf(String)
   */
  public static final int MAX_RADIX = 36;

  /**
   * The minimum value the char data type can hold.
   * This value is <code>'&#92;u0000'</code>.
   */
  public static final char MIN_VALUE = '\u0000';

  /**
   * The maximum value the char data type can hold.
   * This value is <code>'&#92;uFFFF'</code>.
   */
  public static final char MAX_VALUE = '\uFFFF';

  /**
   * The minimum Unicode 4.0 code point.  This value is <code>0</code>.
   * @since 1.5
   */
  public static final int MIN_CODE_POINT = 0;

  /**
   * The maximum Unicode 4.0 code point, which is greater than the range
   * of the char data type.
   * This value is <code>0x10FFFF</code>.
   * @since 1.5
   */
  public static final int MAX_CODE_POINT = 0x10FFFF;
1986 
1987   /**
1988    * The minimum Unicode high surrogate code unit, or
1989    * <emph>leading-surrogate</emph>, in the UTF-16 character encoding.
1990    * This value is <code>'\uD800'</code>.
1991    * @since 1.5
1992    */
1993   public static final char MIN_HIGH_SURROGATE = '\uD800';
1994 
1995   /**
1996    * The maximum Unicode high surrogate code unit, or
1997    * <emph>leading-surrogate</emph>, in the UTF-16 character encoding.
1998    * This value is <code>'\uDBFF'</code>.
1999    * @since 1.5
2000    */
2001   public static final char MAX_HIGH_SURROGATE = '\uDBFF';
2002 
2003   /**
2004    * The minimum Unicode low surrogate code unit, or
2005    * <emph>trailing-surrogate</emph>, in the UTF-16 character encoding.
2006    * This value is <code>'\uDC00'</code>.
2007    * @since 1.5
2008    */
2009   public static final char MIN_LOW_SURROGATE = '\uDC00';
2010 
2011   /**
2012    * The maximum Unicode low surrogate code unit, or
2013    * <emph>trailing-surrogate</emph>, in the UTF-16 character encoding.
2014    * This value is <code>'\uDFFF'</code>.
2015    * @since 1.5
2016    */
2017   public static final char MAX_LOW_SURROGATE = '\uDFFF';
2018 
2019   /**
2020    * The minimum Unicode surrogate code unit in the UTF-16 character encoding.
2021    * This value is <code>'\uD800'</code>.
2022    * @since 1.5
2023    */
2024   public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
2025 
2026   /**
2027    * The maximum Unicode surrogate code unit in the UTF-16 character encoding.
2028    * This value is <code>'\uDFFF'</code>.
2029    * @since 1.5
2030    */
2031   public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
2032 
2033   /**
2034    * The lowest possible supplementary Unicode code point (the first code
2035    * point outside the basic multilingual plane (BMP)).
2036    * This value is <code>0x10000</code>.
2037    */
2038   public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
2039 
2040   /**
2041    * Class object representing the primitive char data type.
2042    *
2043    * @since 1.1
2044    */
2045   public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C');
2046 
2047   /**
2048    * The number of bits needed to represent a <code>char</code>.
2049    * @since 1.5
2050    */
2051   public static final int SIZE = 16;
2052 
2053   // This caches some Character values, and is used by boxing
2054   // conversions via valueOf().  We must cache at least 0..127;
2055   // this constant controls how much we actually cache.
2056   private static final int MAX_CACHE = 127;
2057   private static Character[] charCache = new Character[MAX_CACHE + 1];
2058   static
2059   {
2060      for (char i=0; i <= MAX_CACHE; i++)
2061        charCache[i] = new Character(i);
2062   }
2063 
2064   /**
2065    * Lu = Letter, Uppercase (Informative).
2066    *
2067    * @since 1.1
2068    */
2069   public static final byte UPPERCASE_LETTER = 1;
2070 
2071   /**
2072    * Ll = Letter, Lowercase (Informative).
2073    *
2074    * @since 1.1
2075    */
2076   public static final byte LOWERCASE_LETTER = 2;
2077 
2078   /**
2079    * Lt = Letter, Titlecase (Informative).
2080    *
2081    * @since 1.1
2082    */
2083   public static final byte TITLECASE_LETTER = 3;
2084 
2085   /**
2086    * Mn = Mark, Non-Spacing (Normative).
2087    *
2088    * @since 1.1
2089    */
2090   public static final byte NON_SPACING_MARK = 6;
2091 
2092   /**
2093    * Mc = Mark, Spacing Combining (Normative).
2094    *
2095    * @since 1.1
2096    */
2097   public static final byte COMBINING_SPACING_MARK = 8;
2098 
2099   /**
2100    * Me = Mark, Enclosing (Normative).
2101    *
2102    * @since 1.1
2103    */
2104   public static final byte ENCLOSING_MARK = 7;
2105 
2106   /**
2107    * Nd = Number, Decimal Digit (Normative).
2108    *
2109    * @since 1.1
2110    */
2111   public static final byte DECIMAL_DIGIT_NUMBER = 9;
2112 
2113   /**
2114    * Nl = Number, Letter (Normative).
2115    *
2116    * @since 1.1
2117    */
2118   public static final byte LETTER_NUMBER = 10;
2119 
2120   /**
2121    * No = Number, Other (Normative).
2122    *
2123    * @since 1.1
2124    */
2125   public static final byte OTHER_NUMBER = 11;
2126 
2127   /**
2128    * Zs = Separator, Space (Normative).
2129    *
2130    * @since 1.1
2131    */
2132   public static final byte SPACE_SEPARATOR = 12;
2133 
2134   /**
2135    * Zl = Separator, Line (Normative).
2136    *
2137    * @since 1.1
2138    */
2139   public static final byte LINE_SEPARATOR = 13;
2140 
2141   /**
2142    * Zp = Separator, Paragraph (Normative).
2143    *
2144    * @since 1.1
2145    */
2146   public static final byte PARAGRAPH_SEPARATOR = 14;
2147 
2148   /**
2149    * Cc = Other, Control (Normative).
2150    *
2151    * @since 1.1
2152    */
2153   public static final byte CONTROL = 15;
2154 
2155   /**
2156    * Cf = Other, Format (Normative).
2157    *
2158    * @since 1.1
2159    */
2160   public static final byte FORMAT = 16;
2161 
2162   /**
2163    * Cs = Other, Surrogate (Normative).
2164    *
2165    * @since 1.1
2166    */
2167   public static final byte SURROGATE = 19;
2168 
2169   /**
2170    * Co = Other, Private Use (Normative).
2171    *
2172    * @since 1.1
2173    */
2174   public static final byte PRIVATE_USE = 18;
2175 
2176   /**
2177    * Cn = Other, Not Assigned (Normative).
2178    *
2179    * @since 1.1
2180    */
2181   public static final byte UNASSIGNED = 0;
2182 
2183   /**
2184    * Lm = Letter, Modifier (Informative).
2185    *
2186    * @since 1.1
2187    */
2188   public static final byte MODIFIER_LETTER = 4;
2189 
2190   /**
2191    * Lo = Letter, Other (Informative).
2192    *
2193    * @since 1.1
2194    */
2195   public static final byte OTHER_LETTER = 5;
2196 
2197   /**
2198    * Pc = Punctuation, Connector (Informative).
2199    *
2200    * @since 1.1
2201    */
2202   public static final byte CONNECTOR_PUNCTUATION = 23;
2203 
2204   /**
2205    * Pd = Punctuation, Dash (Informative).
2206    *
2207    * @since 1.1
2208    */
2209   public static final byte DASH_PUNCTUATION = 20;
2210 
2211   /**
2212    * Ps = Punctuation, Open (Informative).
2213    *
2214    * @since 1.1
2215    */
2216   public static final byte START_PUNCTUATION = 21;
2217 
2218   /**
2219    * Pe = Punctuation, Close (Informative).
2220    *
2221    * @since 1.1
2222    */
2223   public static final byte END_PUNCTUATION = 22;
2224 
2225   /**
2226    * Pi = Punctuation, Initial Quote (Informative).
2227    *
2228    * @since 1.4
2229    */
2230   public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
2231 
2232   /**
2233    * Pf = Punctuation, Final Quote (Informative).
2234    *
2235    * @since 1.4
2236    */
2237   public static final byte FINAL_QUOTE_PUNCTUATION = 30;
2238 
2239   /**
2240    * Po = Punctuation, Other (Informative).
2241    *
2242    * @since 1.1
2243    */
2244   public static final byte OTHER_PUNCTUATION = 24;
2245 
2246   /**
2247    * Sm = Symbol, Math (Informative).
2248    *
2249    * @since 1.1
2250    */
2251   public static final byte MATH_SYMBOL = 25;
2252 
2253   /**
2254    * Sc = Symbol, Currency (Informative).
2255    *
2256    * @since 1.1
2257    */
2258   public static final byte CURRENCY_SYMBOL = 26;
2259 
2260   /**
2261    * Sk = Symbol, Modifier (Informative).
2262    *
2263    * @since 1.1
2264    */
2265   public static final byte MODIFIER_SYMBOL = 27;
2266 
2267   /**
2268    * So = Symbol, Other (Informative).
2269    *
2270    * @since 1.1
2271    */
2272   public static final byte OTHER_SYMBOL = 28;
2273 
2274   /**
2275    * Undefined bidirectional character type. Undefined char values have
2276    * undefined directionality in the Unicode specification.
2277    *
2278    * @since 1.4
2279    */
2280   public static final byte DIRECTIONALITY_UNDEFINED = -1;
2281 
2282   /**
2283    * Strong bidirectional character type "L".
2284    *
2285    * @since 1.4
2286    */
2287   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
2288 
2289   /**
2290    * Strong bidirectional character type "R".
2291    *
2292    * @since 1.4
2293    */
2294   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
2295 
2296   /**
2297    * Strong bidirectional character type "AL".
2298    *
2299    * @since 1.4
2300    */
2301   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
2302 
2303   /**
2304    * Weak bidirectional character type "EN".
2305    *
2306    * @since 1.4
2307    */
2308   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
2309 
2310   /**
2311    * Weak bidirectional character type "ES".
2312    *
2313    * @since 1.4
2314    */
2315   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
2316 
2317   /**
2318    * Weak bidirectional character type "ET".
2319    *
2320    * @since 1.4
2321    */
2322   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
2323 
2324   /**
2325    * Weak bidirectional character type "AN".
2326    *
2327    * @since 1.4
2328    */
2329   public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
2330 
2331   /**
2332    * Weak bidirectional character type "CS".
2333    *
2334    * @since 1.4
2335    */
2336   public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
2337 
2338   /**
2339    * Weak bidirectional character type "NSM".
2340    *
2341    * @since 1.4
2342    */
2343   public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
2344 
2345   /**
2346    * Weak bidirectional character type "BN".
2347    *
2348    * @since 1.4
2349    */
2350   public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
2351 
2352   /**
2353    * Neutral bidirectional character type "B".
2354    *
2355    * @since 1.4
2356    */
2357   public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
2358 
2359   /**
2360    * Neutral bidirectional character type "S".
2361    *
2362    * @since 1.4
2363    */
2364   public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
2365 
2366   /**
2367    * Strong bidirectional character type "WS".
2368    *
2369    * @since 1.4
2370    */
2371   public static final byte DIRECTIONALITY_WHITESPACE = 12;
2372 
2373   /**
2374    * Neutral bidirectional character type "ON".
2375    *
2376    * @since 1.4
2377    */
2378   public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
2379 
2380   /**
2381    * Strong bidirectional character type "LRE".
2382    *
2383    * @since 1.4
2384    */
2385   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
2386 
2387   /**
2388    * Strong bidirectional character type "LRO".
2389    *
2390    * @since 1.4
2391    */
2392   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
2393 
2394   /**
2395    * Strong bidirectional character type "RLE".
2396    *
2397    * @since 1.4
2398    */
2399   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
2400 
2401   /**
2402    * Strong bidirectional character type "RLO".
2403    *
2404    * @since 1.4
2405    */
2406   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
2407 
2408   /**
2409    * Weak bidirectional character type "PDF".
2410    *
2411    * @since 1.4
2412    */
2413   public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
2414 
2415   /**
2416    * Stores unicode block offset lookup table. Exploit package visibility of
2417    * String.value to avoid copying the array.
2418    * @see #readCodePoint(int)
2419    * @see CharData#BLOCKS
2420    */
2421   private static final char[][] blocks =
2422     new char[][]{
2423                  String.zeroBasedStringValue(CharData.BLOCKS[0]),
2424                  String.zeroBasedStringValue(CharData.BLOCKS[1]),
2425                  String.zeroBasedStringValue(CharData.BLOCKS[2]),
2426                  String.zeroBasedStringValue(CharData.BLOCKS[3]),
2427                  String.zeroBasedStringValue(CharData.BLOCKS[4]),
2428                  String.zeroBasedStringValue(CharData.BLOCKS[5]),
2429                  String.zeroBasedStringValue(CharData.BLOCKS[6]),
2430                  String.zeroBasedStringValue(CharData.BLOCKS[7]),
2431                  String.zeroBasedStringValue(CharData.BLOCKS[8]),
2432                  String.zeroBasedStringValue(CharData.BLOCKS[9]),
2433                  String.zeroBasedStringValue(CharData.BLOCKS[10]),
2434                  String.zeroBasedStringValue(CharData.BLOCKS[11]),
2435                  String.zeroBasedStringValue(CharData.BLOCKS[12]),
2436                  String.zeroBasedStringValue(CharData.BLOCKS[13]),
2437                  String.zeroBasedStringValue(CharData.BLOCKS[14]),
2438                  String.zeroBasedStringValue(CharData.BLOCKS[15]),
2439                  String.zeroBasedStringValue(CharData.BLOCKS[16])};
2440 
2441   /**
2442    * Stores unicode attribute offset lookup table. Exploit package visibility
2443    * of String.value to avoid copying the array.
2444    * @see CharData#DATA
2445    */
2446   private static final char[][] data =
2447     new char[][]{
2448                  String.zeroBasedStringValue(CharData.DATA[0]),
2449                  String.zeroBasedStringValue(CharData.DATA[1]),
2450                  String.zeroBasedStringValue(CharData.DATA[2]),
2451                  String.zeroBasedStringValue(CharData.DATA[3]),
2452                  String.zeroBasedStringValue(CharData.DATA[4]),
2453                  String.zeroBasedStringValue(CharData.DATA[5]),
2454                  String.zeroBasedStringValue(CharData.DATA[6]),
2455                  String.zeroBasedStringValue(CharData.DATA[7]),
2456                  String.zeroBasedStringValue(CharData.DATA[8]),
2457                  String.zeroBasedStringValue(CharData.DATA[9]),
2458                  String.zeroBasedStringValue(CharData.DATA[10]),
2459                  String.zeroBasedStringValue(CharData.DATA[11]),
2460                  String.zeroBasedStringValue(CharData.DATA[12]),
2461                  String.zeroBasedStringValue(CharData.DATA[13]),
2462                  String.zeroBasedStringValue(CharData.DATA[14]),
2463                  String.zeroBasedStringValue(CharData.DATA[15]),
2464                  String.zeroBasedStringValue(CharData.DATA[16])};
2465 
2466   /**
2467    * Stores unicode numeric value attribute table. Exploit package visibility
2468    * of String.value to avoid copying the array.
2469    * @see CharData#NUM_VALUE
2470    */
2471   private static final char[][] numValue =
2472     new char[][]{
2473                  String.zeroBasedStringValue(CharData.NUM_VALUE[0]),
2474                  String.zeroBasedStringValue(CharData.NUM_VALUE[1]),
2475                  String.zeroBasedStringValue(CharData.NUM_VALUE[2]),
2476                  String.zeroBasedStringValue(CharData.NUM_VALUE[3]),
2477                  String.zeroBasedStringValue(CharData.NUM_VALUE[4]),
2478                  String.zeroBasedStringValue(CharData.NUM_VALUE[5]),
2479                  String.zeroBasedStringValue(CharData.NUM_VALUE[6]),
2480                  String.zeroBasedStringValue(CharData.NUM_VALUE[7]),
2481                  String.zeroBasedStringValue(CharData.NUM_VALUE[8]),
2482                  String.zeroBasedStringValue(CharData.NUM_VALUE[9]),
2483                  String.zeroBasedStringValue(CharData.NUM_VALUE[10]),
2484                  String.zeroBasedStringValue(CharData.NUM_VALUE[11]),
2485                  String.zeroBasedStringValue(CharData.NUM_VALUE[12]),
2486                  String.zeroBasedStringValue(CharData.NUM_VALUE[13]),
2487                  String.zeroBasedStringValue(CharData.NUM_VALUE[14]),
2488                  String.zeroBasedStringValue(CharData.NUM_VALUE[15]),
2489                  String.zeroBasedStringValue(CharData.NUM_VALUE[16])};
2490 
2491   /**
2492    * Stores unicode uppercase attribute table. Exploit package visibility
2493    * of String.value to avoid copying the array.
2494    * @see CharData#UPPER
2495    */
2496   private static final char[][] upper =
2497     new char[][]{
2498                  String.zeroBasedStringValue(CharData.UPPER[0]),
2499                  String.zeroBasedStringValue(CharData.UPPER[1]),
2500                  String.zeroBasedStringValue(CharData.UPPER[2]),
2501                  String.zeroBasedStringValue(CharData.UPPER[3]),
2502                  String.zeroBasedStringValue(CharData.UPPER[4]),
2503                  String.zeroBasedStringValue(CharData.UPPER[5]),
2504                  String.zeroBasedStringValue(CharData.UPPER[6]),
2505                  String.zeroBasedStringValue(CharData.UPPER[7]),
2506                  String.zeroBasedStringValue(CharData.UPPER[8]),
2507                  String.zeroBasedStringValue(CharData.UPPER[9]),
2508                  String.zeroBasedStringValue(CharData.UPPER[10]),
2509                  String.zeroBasedStringValue(CharData.UPPER[11]),
2510                  String.zeroBasedStringValue(CharData.UPPER[12]),
2511                  String.zeroBasedStringValue(CharData.UPPER[13]),
2512                  String.zeroBasedStringValue(CharData.UPPER[14]),
2513                  String.zeroBasedStringValue(CharData.UPPER[15]),
2514                  String.zeroBasedStringValue(CharData.UPPER[16])};
2515 
2516   /**
2517    * Stores unicode lowercase attribute table. Exploit package visibility
2518    * of String.value to avoid copying the array.
2519    * @see CharData#LOWER
2520    */
2521   private static final char[][] lower =
2522     new char[][]{
2523                  String.zeroBasedStringValue(CharData.LOWER[0]),
2524                  String.zeroBasedStringValue(CharData.LOWER[1]),
2525                  String.zeroBasedStringValue(CharData.LOWER[2]),
2526                  String.zeroBasedStringValue(CharData.LOWER[3]),
2527                  String.zeroBasedStringValue(CharData.LOWER[4]),
2528                  String.zeroBasedStringValue(CharData.LOWER[5]),
2529                  String.zeroBasedStringValue(CharData.LOWER[6]),
2530                  String.zeroBasedStringValue(CharData.LOWER[7]),
2531                  String.zeroBasedStringValue(CharData.LOWER[8]),
2532                  String.zeroBasedStringValue(CharData.LOWER[9]),
2533                  String.zeroBasedStringValue(CharData.LOWER[10]),
2534                  String.zeroBasedStringValue(CharData.LOWER[11]),
2535                  String.zeroBasedStringValue(CharData.LOWER[12]),
2536                  String.zeroBasedStringValue(CharData.LOWER[13]),
2537                  String.zeroBasedStringValue(CharData.LOWER[14]),
2538                  String.zeroBasedStringValue(CharData.LOWER[15]),
2539                  String.zeroBasedStringValue(CharData.LOWER[16])};
2540 
2541   /**
2542    * Stores unicode direction attribute table. Exploit package visibility
2543    * of String.value to avoid copying the array.
2544    * @see CharData#DIRECTION
2545    */
2546   // Package visible for use by String.
2547   static final char[][] direction =
2548     new char[][]{
2549                  String.zeroBasedStringValue(CharData.DIRECTION[0]),
2550                  String.zeroBasedStringValue(CharData.DIRECTION[1]),
2551                  String.zeroBasedStringValue(CharData.DIRECTION[2]),
2552                  String.zeroBasedStringValue(CharData.DIRECTION[3]),
2553                  String.zeroBasedStringValue(CharData.DIRECTION[4]),
2554                  String.zeroBasedStringValue(CharData.DIRECTION[5]),
2555                  String.zeroBasedStringValue(CharData.DIRECTION[6]),
2556                  String.zeroBasedStringValue(CharData.DIRECTION[7]),
2557                  String.zeroBasedStringValue(CharData.DIRECTION[8]),
2558                  String.zeroBasedStringValue(CharData.DIRECTION[9]),
2559                  String.zeroBasedStringValue(CharData.DIRECTION[10]),
2560                  String.zeroBasedStringValue(CharData.DIRECTION[11]),
2561                  String.zeroBasedStringValue(CharData.DIRECTION[12]),
2562                  String.zeroBasedStringValue(CharData.DIRECTION[13]),
2563                  String.zeroBasedStringValue(CharData.DIRECTION[14]),
2564                  String.zeroBasedStringValue(CharData.DIRECTION[15]),
2565                  String.zeroBasedStringValue(CharData.DIRECTION[16])};
2566 
2567   /**
2568    * Stores unicode titlecase table. Exploit package visibility of
2569    * String.value to avoid copying the array.
2570    * @see CharData#TITLE
2571    */
2572   private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
2573 
2574   /**
2575    * Mask for grabbing the type out of the contents of data.
2576    * @see CharData#DATA
2577    */
2578   private static final int TYPE_MASK = 0x1F;
2579 
2580   /**
2581    * Mask for grabbing the non-breaking space flag out of the contents of
2582    * data.
2583    * @see CharData#DATA
2584    */
2585   private static final int NO_BREAK_MASK = 0x20;
2586 
2587   /**
2588    * Mask for grabbing the mirrored directionality flag out of the contents
2589    * of data.
2590    * @see CharData#DATA
2591    */
2592   private static final int MIRROR_MASK = 0x40;
2593 
2594   /**
2595    * Grabs an attribute offset from the Unicode attribute database. The lower
2596    * 5 bits are the character type, the next 2 bits are flags, and the top
2597    * 9 bits are the offset into the attribute tables.
2598    *
2599    * @param codePoint the character to look up
2600    * @return the character's attribute offset and type
2601    * @see #TYPE_MASK
2602    * @see #NO_BREAK_MASK
2603    * @see #MIRROR_MASK
2604    * @see CharData#DATA
2605    * @see CharData#SHIFT
2606    */
2607   // Package visible for use in String.
readCodePoint(int codePoint)2608   static char readCodePoint(int codePoint)
2609   {
2610     int plane = codePoint >>> 16;
2611     char offset = (char) (codePoint & 0xffff);
2612     return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)];
2613   }
2614 
2615   /**
2616    * Wraps up a character.
2617    *
2618    * @param value the character to wrap
2619    */
Character(char value)2620   public Character(char value)
2621   {
2622     this.value = value;
2623   }
2624 
2625   /**
2626    * Returns the character which has been wrapped by this class.
2627    *
2628    * @return the character wrapped
2629    */
charValue()2630   public char charValue()
2631   {
2632     return value;
2633   }
2634 
2635   /**
2636    * Returns the numerical value (unsigned) of the wrapped character.
2637    * Range of returned values: 0x0000-0xFFFF.
2638    *
2639    * @return the value of the wrapped character
2640    */
hashCode()2641   public int hashCode()
2642   {
2643     return value;
2644   }
2645 
2646   /**
2647    * Determines if an object is equal to this object. This is only true for
2648    * another Character object wrapping the same value.
2649    *
2650    * @param o object to compare
2651    * @return true if o is a Character with the same value
2652    */
equals(Object o)2653   public boolean equals(Object o)
2654   {
2655     return o instanceof Character && value == ((Character) o).value;
2656   }
2657 
2658   /**
2659    * Converts the wrapped character into a String.
2660    *
2661    * @return a String containing one character -- the wrapped character
2662    *         of this instance
2663    */
toString()2664   public String toString()
2665   {
2666     // Package constructor avoids an array copy.
2667     return new String(new char[] { value }, 0, 1, true);
2668   }
2669 
2670   /**
2671    * Returns a String of length 1 representing the specified character.
2672    *
2673    * @param ch the character to convert
2674    * @return a String containing the character
2675    * @since 1.4
2676    */
toString(char ch)2677   public static String toString(char ch)
2678   {
2679     // Package constructor avoids an array copy.
2680     return new String(new char[] { ch }, 0, 1, true);
2681   }
2682 
2683   /**
2684    * Determines if a character is a Unicode lowercase letter. For example,
2685    * <code>'a'</code> is lowercase.  Returns true if getType() returns
2686    * LOWERCASE_LETTER.
2687    * <br>
2688    * lowercase = [Ll]
2689    *
2690    * @param ch character to test
2691    * @return true if ch is a Unicode lowercase letter, else false
2692    * @see #isUpperCase(char)
2693    * @see #isTitleCase(char)
2694    * @see #toLowerCase(char)
2695    * @see #getType(char)
2696    */
isLowerCase(char ch)2697   public static boolean isLowerCase(char ch)
2698   {
2699     return isLowerCase((int)ch);
2700   }
2701 
2702   /**
2703    * Determines if a character is a Unicode lowercase letter. For example,
2704    * <code>'a'</code> is lowercase.  Returns true if getType() returns
2705    * LOWERCASE_LETTER.
2706    * <br>
2707    * lowercase = [Ll]
2708    *
2709    * @param codePoint character to test
2710    * @return true if ch is a Unicode lowercase letter, else false
2711    * @see #isUpperCase(char)
2712    * @see #isTitleCase(char)
2713    * @see #toLowerCase(char)
2714    * @see #getType(char)
2715    *
2716    * @since 1.5
2717    */
isLowerCase(int codePoint)2718   public static boolean isLowerCase(int codePoint)
2719   {
2720     return getType(codePoint) == LOWERCASE_LETTER;
2721   }
2722 
2723   /**
2724    * Determines if a character is a Unicode uppercase letter. For example,
2725    * <code>'A'</code> is uppercase.  Returns true if getType() returns
2726    * UPPERCASE_LETTER.
2727    * <br>
2728    * uppercase = [Lu]
2729    *
2730    * @param ch character to test
2731    * @return true if ch is a Unicode uppercase letter, else false
2732    * @see #isLowerCase(char)
2733    * @see #isTitleCase(char)
2734    * @see #toUpperCase(char)
2735    * @see #getType(char)
2736    */
isUpperCase(char ch)2737   public static boolean isUpperCase(char ch)
2738   {
2739     return isUpperCase((int)ch);
2740   }
2741 
2742   /**
2743    * Determines if a character is a Unicode uppercase letter. For example,
2744    * <code>'A'</code> is uppercase.  Returns true if getType() returns
2745    * UPPERCASE_LETTER.
2746    * <br>
2747    * uppercase = [Lu]
2748    *
2749    * @param codePoint character to test
2750    * @return true if ch is a Unicode uppercase letter, else false
2751    * @see #isLowerCase(char)
2752    * @see #isTitleCase(char)
2753    * @see #toUpperCase(char)
2754    * @see #getType(char)
2755    *
2756    * @since 1.5
2757    */
isUpperCase(int codePoint)2758   public static boolean isUpperCase(int codePoint)
2759   {
2760     return getType(codePoint) == UPPERCASE_LETTER;
2761   }
2762 
2763   /**
2764    * Determines if a character is a Unicode titlecase letter. For example,
2765    * the character "Lj" (Latin capital L with small letter j) is titlecase.
2766    * True if getType() returns TITLECASE_LETTER.
2767    * <br>
2768    * titlecase = [Lt]
2769    *
2770    * @param ch character to test
2771    * @return true if ch is a Unicode titlecase letter, else false
2772    * @see #isLowerCase(char)
2773    * @see #isUpperCase(char)
2774    * @see #toTitleCase(char)
2775    * @see #getType(char)
2776    */
isTitleCase(char ch)2777   public static boolean isTitleCase(char ch)
2778   {
2779     return isTitleCase((int)ch);
2780   }
2781 
2782   /**
2783    * Determines if a character is a Unicode titlecase letter. For example,
2784    * the character "Lj" (Latin capital L with small letter j) is titlecase.
2785    * True if getType() returns TITLECASE_LETTER.
2786    * <br>
2787    * titlecase = [Lt]
2788    *
2789    * @param codePoint character to test
2790    * @return true if ch is a Unicode titlecase letter, else false
2791    * @see #isLowerCase(char)
2792    * @see #isUpperCase(char)
2793    * @see #toTitleCase(char)
2794    * @see #getType(char)
2795    *
2796    * @since 1.5
2797    */
isTitleCase(int codePoint)2798   public static boolean isTitleCase(int codePoint)
2799   {
2800     return getType(codePoint) == TITLECASE_LETTER;
2801   }
2802 
2803 
2804   /**
2805    * Determines if a character is a Unicode decimal digit. For example,
2806    * <code>'0'</code> is a digit.  A character is a Unicode digit if
2807    * getType() returns DECIMAL_DIGIT_NUMBER.
2808    * <br>
2809    * Unicode decimal digit = [Nd]
2810    *
2811    * @param ch character to test
2812    * @return true if ch is a Unicode decimal digit, else false
2813    * @see #digit(char, int)
2814    * @see #forDigit(int, int)
2815    * @see #getType(char)
2816    */
isDigit(char ch)2817   public static boolean isDigit(char ch)
2818   {
2819     return isDigit((int)ch);
2820   }
2821 
2822   /**
2823    * Determines if a character is a Unicode decimal digit. For example,
2824    * <code>'0'</code> is a digit. A character is a Unicode digit if
2825    * getType() returns DECIMAL_DIGIT_NUMBER.
2826    * <br>
2827    * Unicode decimal digit = [Nd]
2828    *
2829    * @param codePoint character to test
2830    * @return true if ch is a Unicode decimal digit, else false
2831    * @see #digit(char, int)
2832    * @see #forDigit(int, int)
2833    * @see #getType(char)
2834    *
2835    * @since 1.5
2836    */
2837 
isDigit(int codePoint)2838   public static boolean isDigit(int codePoint)
2839   {
2840     return getType(codePoint) == DECIMAL_DIGIT_NUMBER;
2841   }
2842 
2843   /**
2844    * Determines if a character is part of the Unicode Standard. This is an
2845    * evolving standard, but covers every character in the data file.
2846    * <br>
2847    * defined = not [Cn]
2848    *
2849    * @param ch character to test
2850    * @return true if ch is a Unicode character, else false
2851    * @see #isDigit(char)
2852    * @see #isLetter(char)
2853    * @see #isLetterOrDigit(char)
2854    * @see #isLowerCase(char)
2855    * @see #isTitleCase(char)
2856    * @see #isUpperCase(char)
2857    */
isDefined(char ch)2858   public static boolean isDefined(char ch)
2859   {
2860     return isDefined((int)ch);
2861   }
2862 
2863   /**
2864    * Determines if a character is part of the Unicode Standard. This is an
2865    * evolving standard, but covers every character in the data file.
2866    * <br>
2867    * defined = not [Cn]
2868    *
2869    * @param codePoint character to test
2870    * @return true if ch is a Unicode character, else false
2871    * @see #isDigit(char)
2872    * @see #isLetter(char)
2873    * @see #isLetterOrDigit(char)
2874    * @see #isLowerCase(char)
2875    * @see #isTitleCase(char)
2876    * @see #isUpperCase(char)
2877    *
2878    * @since 1.5
2879    */
isDefined(int codePoint)2880   public static boolean isDefined(int codePoint)
2881   {
2882     return getType(codePoint) != UNASSIGNED;
2883   }
2884 
2885   /**
2886    * Determines if a character is a Unicode letter. Not all letters have case,
2887    * so this may return true when isLowerCase and isUpperCase return false.
2888    * A character is a Unicode letter if getType() returns one of
2889    * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2890    * or OTHER_LETTER.
2891    * <br>
2892    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2893    *
2894    * @param ch character to test
2895    * @return true if ch is a Unicode letter, else false
2896    * @see #isDigit(char)
2897    * @see #isJavaIdentifierStart(char)
2898    * @see #isJavaLetter(char)
2899    * @see #isJavaLetterOrDigit(char)
2900    * @see #isLetterOrDigit(char)
2901    * @see #isLowerCase(char)
2902    * @see #isTitleCase(char)
2903    * @see #isUnicodeIdentifierStart(char)
2904    * @see #isUpperCase(char)
2905    */
isLetter(char ch)2906   public static boolean isLetter(char ch)
2907   {
2908     return isLetter((int)ch);
2909   }
2910 
2911   /**
2912    * Determines if a character is a Unicode letter. Not all letters have case,
2913    * so this may return true when isLowerCase and isUpperCase return false.
2914    * A character is a Unicode letter if getType() returns one of
2915    * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2916    * or OTHER_LETTER.
2917    * <br>
2918    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2919    *
2920    * @param codePoint character to test
   * @return true if codePoint is a Unicode letter, else false
2922    * @see #isDigit(char)
2923    * @see #isJavaIdentifierStart(char)
2924    * @see #isJavaLetter(char)
2925    * @see #isJavaLetterOrDigit(char)
2926    * @see #isLetterOrDigit(char)
2927    * @see #isLowerCase(char)
2928    * @see #isTitleCase(char)
2929    * @see #isUnicodeIdentifierStart(char)
2930    * @see #isUpperCase(char)
2931    *
2932    * @since 1.5
2933    */
isLetter(int codePoint)2934   public static boolean isLetter(int codePoint)
2935   {
2936     return ((1 << getType(codePoint))
2937         & ((1 << UPPERCASE_LETTER)
2938             | (1 << LOWERCASE_LETTER)
2939             | (1 << TITLECASE_LETTER)
2940             | (1 << MODIFIER_LETTER)
2941             | (1 << OTHER_LETTER))) != 0;
2942   }
2943   /**
2944    * Returns the index into the given CharSequence that is offset
2945    * <code>codePointOffset</code> code points from <code>index</code>.
2946    * @param seq the CharSequence
2947    * @param index the start position in the CharSequence
2948    * @param codePointOffset the number of code points offset from the start
2949    * position
2950    * @return the index into the CharSequence that is codePointOffset code
2951    * points offset from index
2952    *
2953    * @throws NullPointerException if seq is null
2954    * @throws IndexOutOfBoundsException if index is negative or greater than the
2955    * length of the sequence.
2956    * @throws IndexOutOfBoundsException if codePointOffset is positive and the
2957    * subsequence from index to the end of seq has fewer than codePointOffset
2958    * code points
2959    * @throws IndexOutOfBoundsException if codePointOffset is negative and the
2960    * subsequence from the start of seq to index has fewer than
2961    * (-codePointOffset) code points
2962    * @since 1.5
2963    */
offsetByCodePoints(CharSequence seq, int index, int codePointOffset)2964   public static int offsetByCodePoints(CharSequence seq,
2965                                        int index,
2966                                        int codePointOffset)
2967   {
2968     int len = seq.length();
2969     if (index < 0 || index > len)
2970       throw new IndexOutOfBoundsException();
2971 
2972     int numToGo = codePointOffset;
2973     int offset = index;
2974     int adjust = 1;
2975     if (numToGo >= 0)
2976       {
2977         for (; numToGo > 0; offset++)
2978           {
2979             numToGo--;
2980             if (Character.isHighSurrogate(seq.charAt(offset))
2981                 && (offset + 1) < len
2982                 && Character.isLowSurrogate(seq.charAt(offset + 1)))
2983               offset++;
2984           }
2985         return offset;
2986       }
2987     else
2988       {
2989         numToGo *= -1;
2990         for (; numToGo > 0;)
2991           {
2992             numToGo--;
2993             offset--;
2994             if (Character.isLowSurrogate(seq.charAt(offset))
2995                 && (offset - 1) >= 0
2996                 && Character.isHighSurrogate(seq.charAt(offset - 1)))
2997               offset--;
2998           }
2999         return offset;
3000       }
3001   }
3002 
3003   /**
3004    * Returns the index into the given char subarray that is offset
3005    * <code>codePointOffset</code> code points from <code>index</code>.
3006    * @param a the char array
3007    * @param start the start index of the subarray
3008    * @param count the length of the subarray
3009    * @param index the index to be offset
3010    * @param codePointOffset the number of code points offset from <code>index
3011    * </code>
3012    * @return the index into the char array
3013    *
3014    * @throws NullPointerException if a is null
3015    * @throws IndexOutOfBoundsException if start or count is negative or if
3016    * start + count is greater than the length of the array
3017    * @throws IndexOutOfBoundsException if index is less than start or larger
3018    * than start + count
3019    * @throws IndexOutOfBoundsException if codePointOffset is positive and the
3020    * subarray from index to start + count - 1 has fewer than codePointOffset
3021    * code points.
3022    * @throws IndexOutOfBoundsException if codePointOffset is negative and the
3023    * subarray from start to index - 1 has fewer than (-codePointOffset) code
3024    * points
3025    *
3026    * @since 1.5
3027    */
offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset)3028   public static int offsetByCodePoints(char[] a,
3029                                        int start,
3030                                        int count,
3031                                        int index,
3032                                        int codePointOffset)
3033   {
3034     int len = a.length;
3035     int end = start + count;
3036     if (start < 0 || count < 0 || end > len || index < start || index > end)
3037       throw new IndexOutOfBoundsException();
3038 
3039     int numToGo = codePointOffset;
3040     int offset = index;
3041     int adjust = 1;
3042     if (numToGo >= 0)
3043       {
3044         for (; numToGo > 0; offset++)
3045           {
3046             numToGo--;
3047             if (Character.isHighSurrogate(a[offset])
3048                 && (offset + 1) < len
3049                 && Character.isLowSurrogate(a[offset + 1]))
3050               offset++;
3051           }
3052         return offset;
3053       }
3054     else
3055       {
3056         numToGo *= -1;
3057         for (; numToGo > 0;)
3058           {
3059             numToGo--;
3060             offset--;
3061             if (Character.isLowSurrogate(a[offset])
3062                 && (offset - 1) >= 0
3063                 && Character.isHighSurrogate(a[offset - 1]))
3064               offset--;
3065             if (offset < start)
3066               throw new IndexOutOfBoundsException();
3067           }
3068         return offset;
3069       }
3070 
3071   }
3072 
3073   /**
3074    * Returns the number of Unicode code points in the specified range of the
3075    * given CharSequence.  The first char in the range is at position
3076    * beginIndex and the last one is at position endIndex - 1.  Paired
3077    * surrogates (supplementary characters are represented by a pair of chars -
3078    * one from the high surrogates and one from the low surrogates)
3079    * count as just one code point.
3080    * @param seq the CharSequence to inspect
3081    * @param beginIndex the beginning of the range
3082    * @param endIndex the end of the range
3083    * @return the number of Unicode code points in the given range of the
3084    * sequence
3085    * @throws NullPointerException if seq is null
3086    * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is
3087    * larger than the length of seq, or if beginIndex is greater than endIndex.
3088    * @since 1.5
3089    */
codePointCount(CharSequence seq, int beginIndex, int endIndex)3090   public static int codePointCount(CharSequence seq, int beginIndex,
3091                                    int endIndex)
3092   {
3093     int len = seq.length();
3094     if (beginIndex < 0 || endIndex > len || beginIndex > endIndex)
3095       throw new IndexOutOfBoundsException();
3096 
3097     int count = 0;
3098     for (int i = beginIndex; i < endIndex; i++)
3099       {
3100         count++;
3101         // If there is a pairing, count it only once.
3102         if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex
3103             && isLowSurrogate(seq.charAt(i + 1)))
3104           i ++;
3105       }
3106     return count;
3107   }
3108 
3109   /**
3110    * Returns the number of Unicode code points in the specified range of the
3111    * given char array.  The first char in the range is at position
3112    * offset and the length of the range is count.  Paired surrogates
3113    * (supplementary characters are represented by a pair of chars -
3114    * one from the high surrogates and one from the low surrogates)
3115    * count as just one code point.
3116    * @param a the char array to inspect
3117    * @param offset the beginning of the range
3118    * @param count the length of the range
3119    * @return the number of Unicode code points in the given range of the
3120    * array
3121    * @throws NullPointerException if a is null
3122    * @throws IndexOutOfBoundsException if offset or count is negative or if
   * offset + count is larger than the length of a.
3124    * @since 1.5
3125    */
codePointCount(char[] a, int offset, int count)3126   public static int codePointCount(char[] a, int offset,
3127                                    int count)
3128   {
3129     int len = a.length;
3130     int end = offset + count;
3131     if (offset < 0 || count < 0 || end > len)
3132       throw new IndexOutOfBoundsException();
3133 
3134     int counter = 0;
3135     for (int i = offset; i < end; i++)
3136       {
3137         counter++;
3138         // If there is a pairing, count it only once.
3139         if (isHighSurrogate(a[i]) && (i + 1) < end
3140             && isLowSurrogate(a[i + 1]))
3141           i ++;
3142       }
3143     return counter;
3144   }
3145 
3146   /**
3147    * Determines if a character is a Unicode letter or a Unicode digit. This
3148    * is the combination of isLetter and isDigit.
3149    * <br>
3150    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3151    *
3152    * @param ch character to test
3153    * @return true if ch is a Unicode letter or a Unicode digit, else false
3154    * @see #isDigit(char)
3155    * @see #isJavaIdentifierPart(char)
3156    * @see #isJavaLetter(char)
3157    * @see #isJavaLetterOrDigit(char)
3158    * @see #isLetter(char)
3159    * @see #isUnicodeIdentifierPart(char)
3160    */
isLetterOrDigit(char ch)3161   public static boolean isLetterOrDigit(char ch)
3162   {
3163     return isLetterOrDigit((int)ch);
3164   }
3165 
3166   /**
3167    * Determines if a character is a Unicode letter or a Unicode digit. This
3168    * is the combination of isLetter and isDigit.
3169    * <br>
3170    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3171    *
3172    * @param codePoint character to test
   * @return true if codePoint is a Unicode letter or a Unicode digit, else
   *         false
3174    * @see #isDigit(char)
3175    * @see #isJavaIdentifierPart(char)
3176    * @see #isJavaLetter(char)
3177    * @see #isJavaLetterOrDigit(char)
3178    * @see #isLetter(char)
3179    * @see #isUnicodeIdentifierPart(char)
3180    *
3181    * @since 1.5
3182    */
isLetterOrDigit(int codePoint)3183   public static boolean isLetterOrDigit(int codePoint)
3184   {
3185     return ((1 << getType(codePoint))
3186         & ((1 << UPPERCASE_LETTER)
3187            | (1 << LOWERCASE_LETTER)
3188            | (1 << TITLECASE_LETTER)
3189            | (1 << MODIFIER_LETTER)
3190            | (1 << OTHER_LETTER)
3191            | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
3192   }
3193 
3194   /**
3195    * Determines if a character can start a Java identifier. This is the
3196    * combination of isLetter, any character where getType returns
3197    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3198    * (like '_').
3199    *
3200    * @param ch character to test
3201    * @return true if ch can start a Java identifier, else false
3202    * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
3203    * @see #isJavaLetterOrDigit(char)
3204    * @see #isJavaIdentifierStart(char)
3205    * @see #isJavaIdentifierPart(char)
3206    * @see #isLetter(char)
3207    * @see #isLetterOrDigit(char)
3208    * @see #isUnicodeIdentifierStart(char)
3209    */
isJavaLetter(char ch)3210   public static boolean isJavaLetter(char ch)
3211   {
3212     return isJavaIdentifierStart(ch);
3213   }
3214 
3215   /**
3216    * Determines if a character can follow the first letter in
3217    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3218    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3219    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3220    * or isIdentifierIgnorable.
3221    *
3222    * @param ch character to test
3223    * @return true if ch can follow the first letter in a Java identifier
3224    * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
3225    * @see #isJavaLetter(char)
3226    * @see #isJavaIdentifierStart(char)
3227    * @see #isJavaIdentifierPart(char)
3228    * @see #isLetter(char)
3229    * @see #isLetterOrDigit(char)
3230    * @see #isUnicodeIdentifierPart(char)
3231    * @see #isIdentifierIgnorable(char)
3232    */
isJavaLetterOrDigit(char ch)3233   public static boolean isJavaLetterOrDigit(char ch)
3234   {
3235     return isJavaIdentifierPart(ch);
3236   }
3237 
3238   /**
3239    * Determines if a character can start a Java identifier. This is the
3240    * combination of isLetter, any character where getType returns
3241    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3242    * (like '_').
3243    * <br>
3244    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3245    *
3246    * @param ch character to test
3247    * @return true if ch can start a Java identifier, else false
3248    * @see #isJavaIdentifierPart(char)
3249    * @see #isLetter(char)
3250    * @see #isUnicodeIdentifierStart(char)
3251    * @since 1.1
3252    */
isJavaIdentifierStart(char ch)3253   public static boolean isJavaIdentifierStart(char ch)
3254   {
3255     return isJavaIdentifierStart((int)ch);
3256   }
3257 
3258   /**
3259    * Determines if a character can start a Java identifier. This is the
3260    * combination of isLetter, any character where getType returns
3261    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3262    * (like '_').
3263    * <br>
3264    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3265    *
3266    * @param codePoint character to test
   * @return true if codePoint can start a Java identifier, else false
3268    * @see #isJavaIdentifierPart(char)
3269    * @see #isLetter(char)
3270    * @see #isUnicodeIdentifierStart(char)
3271    * @since 1.5
3272    */
isJavaIdentifierStart(int codePoint)3273   public static boolean isJavaIdentifierStart(int codePoint)
3274   {
3275     return ((1 << getType(codePoint))
3276             & ((1 << UPPERCASE_LETTER)
3277                | (1 << LOWERCASE_LETTER)
3278                | (1 << TITLECASE_LETTER)
3279                | (1 << MODIFIER_LETTER)
3280                | (1 << OTHER_LETTER)
3281                | (1 << LETTER_NUMBER)
3282                | (1 << CURRENCY_SYMBOL)
3283                | (1 << CONNECTOR_PUNCTUATION))) != 0;
3284   }
3285 
3286   /**
3287    * Determines if a character can follow the first letter in
3288    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3289    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3290    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3291    * or isIdentifierIgnorable.
3292    * <br>
3293    * Java identifier extender =
3294    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3295    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3296    *
3297    * @param ch character to test
3298    * @return true if ch can follow the first letter in a Java identifier
3299    * @see #isIdentifierIgnorable(char)
3300    * @see #isJavaIdentifierStart(char)
3301    * @see #isLetterOrDigit(char)
3302    * @see #isUnicodeIdentifierPart(char)
3303    * @since 1.1
3304    */
isJavaIdentifierPart(char ch)3305   public static boolean isJavaIdentifierPart(char ch)
3306   {
3307     return isJavaIdentifierPart((int)ch);
3308   }
3309 
3310   /**
3311    * Determines if a character can follow the first letter in
3312    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3313    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3314    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3315    * or isIdentifierIgnorable.
3316    * <br>
3317    * Java identifier extender =
3318    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3319    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3320    *
3321    * @param codePoint character to test
3322    * @return true if ch can follow the first letter in a Java identifier
3323    * @see #isIdentifierIgnorable(char)
3324    * @see #isJavaIdentifierStart(char)
3325    * @see #isLetterOrDigit(char)
3326    * @see #isUnicodeIdentifierPart(char)
3327    * @since 1.5
3328    */
isJavaIdentifierPart(int codePoint)3329   public static boolean isJavaIdentifierPart(int codePoint)
3330   {
3331     int category = getType(codePoint);
3332     return ((1 << category)
3333             & ((1 << UPPERCASE_LETTER)
3334                | (1 << LOWERCASE_LETTER)
3335                | (1 << TITLECASE_LETTER)
3336                | (1 << MODIFIER_LETTER)
3337                | (1 << OTHER_LETTER)
3338                | (1 << NON_SPACING_MARK)
3339                | (1 << COMBINING_SPACING_MARK)
3340                | (1 << DECIMAL_DIGIT_NUMBER)
3341                | (1 << LETTER_NUMBER)
3342                | (1 << CURRENCY_SYMBOL)
3343                | (1 << CONNECTOR_PUNCTUATION)
3344                | (1 << FORMAT))) != 0
3345       || (category == CONTROL && isIdentifierIgnorable(codePoint));
3346   }
3347 
3348   /**
3349    * Determines if a character can start a Unicode identifier.  Only
3350    * letters can start a Unicode identifier, but this includes characters
3351    * in LETTER_NUMBER.
3352    * <br>
3353    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3354    *
3355    * @param ch character to test
3356    * @return true if ch can start a Unicode identifier, else false
3357    * @see #isJavaIdentifierStart(char)
3358    * @see #isLetter(char)
3359    * @see #isUnicodeIdentifierPart(char)
3360    * @since 1.1
3361    */
isUnicodeIdentifierStart(char ch)3362   public static boolean isUnicodeIdentifierStart(char ch)
3363   {
3364     return isUnicodeIdentifierStart((int)ch);
3365   }
3366 
3367   /**
3368    * Determines if a character can start a Unicode identifier.  Only
3369    * letters can start a Unicode identifier, but this includes characters
3370    * in LETTER_NUMBER.
3371    * <br>
3372    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3373    *
3374    * @param codePoint character to test
   * @return true if codePoint can start a Unicode identifier, else false
3376    * @see #isJavaIdentifierStart(char)
3377    * @see #isLetter(char)
3378    * @see #isUnicodeIdentifierPart(char)
3379    * @since 1.5
3380    */
isUnicodeIdentifierStart(int codePoint)3381   public static boolean isUnicodeIdentifierStart(int codePoint)
3382   {
3383     return ((1 << getType(codePoint))
3384             & ((1 << UPPERCASE_LETTER)
3385                | (1 << LOWERCASE_LETTER)
3386                | (1 << TITLECASE_LETTER)
3387                | (1 << MODIFIER_LETTER)
3388                | (1 << OTHER_LETTER)
3389                | (1 << LETTER_NUMBER))) != 0;
3390   }
3391 
3392   /**
3393    * Determines if a character can follow the first letter in
3394    * a Unicode identifier. This includes letters, connecting punctuation,
3395    * digits, numeric letters, combining marks, non-spacing marks, and
3396    * isIdentifierIgnorable.
3397    * <br>
3398    * Unicode identifier extender =
   *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]
3400    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3401    *
3402    * @param ch character to test
3403    * @return true if ch can follow the first letter in a Unicode identifier
3404    * @see #isIdentifierIgnorable(char)
3405    * @see #isJavaIdentifierPart(char)
3406    * @see #isLetterOrDigit(char)
3407    * @see #isUnicodeIdentifierStart(char)
3408    * @since 1.1
3409    */
isUnicodeIdentifierPart(char ch)3410   public static boolean isUnicodeIdentifierPart(char ch)
3411   {
3412     return isUnicodeIdentifierPart((int)ch);
3413   }
3414 
3415   /**
3416    * Determines if a character can follow the first letter in
3417    * a Unicode identifier. This includes letters, connecting punctuation,
3418    * digits, numeric letters, combining marks, non-spacing marks, and
3419    * isIdentifierIgnorable.
3420    * <br>
3421    * Unicode identifier extender =
3422    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
3423    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3424    *
3425    * @param codePoint character to test
3426    * @return true if ch can follow the first letter in a Unicode identifier
3427    * @see #isIdentifierIgnorable(char)
3428    * @see #isJavaIdentifierPart(char)
3429    * @see #isLetterOrDigit(char)
3430    * @see #isUnicodeIdentifierStart(char)
3431    * @since 1.5
3432    */
isUnicodeIdentifierPart(int codePoint)3433   public static boolean isUnicodeIdentifierPart(int codePoint)
3434   {
3435     int category = getType(codePoint);
3436     return ((1 << category)
3437             & ((1 << UPPERCASE_LETTER)
3438                | (1 << LOWERCASE_LETTER)
3439                | (1 << TITLECASE_LETTER)
3440                | (1 << MODIFIER_LETTER)
3441                | (1 << OTHER_LETTER)
3442                | (1 << NON_SPACING_MARK)
3443                | (1 << COMBINING_SPACING_MARK)
3444                | (1 << DECIMAL_DIGIT_NUMBER)
3445                | (1 << LETTER_NUMBER)
3446                | (1 << CONNECTOR_PUNCTUATION)
3447                | (1 << FORMAT))) != 0
3448       || (category == CONTROL && isIdentifierIgnorable(codePoint));
3449   }
3450 
3451   /**
3452    * Determines if a character is ignorable in a Unicode identifier. This
3453    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3454    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3455    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3456    * <code>'\u009F'</code>), and FORMAT characters.
3457    * <br>
3458    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3459    *    |U+007F-U+009F
3460    *
3461    * @param ch character to test
3462    * @return true if ch is ignorable in a Unicode or Java identifier
3463    * @see #isJavaIdentifierPart(char)
3464    * @see #isUnicodeIdentifierPart(char)
3465    * @since 1.1
3466    */
isIdentifierIgnorable(char ch)3467   public static boolean isIdentifierIgnorable(char ch)
3468   {
3469     return isIdentifierIgnorable((int)ch);
3470   }
3471 
3472   /**
3473    * Determines if a character is ignorable in a Unicode identifier. This
3474    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3475    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3476    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3477    * <code>'\u009F'</code>), and FORMAT characters.
3478    * <br>
3479    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3480    *    |U+007F-U+009F
3481    *
3482    * @param codePoint character to test
3483    * @return true if ch is ignorable in a Unicode or Java identifier
3484    * @see #isJavaIdentifierPart(char)
3485    * @see #isUnicodeIdentifierPart(char)
3486    * @since 1.5
3487    */
isIdentifierIgnorable(int codePoint)3488   public static boolean isIdentifierIgnorable(int codePoint)
3489   {
3490     if ((codePoint >= 0 && codePoint <= 0x0008)
3491         || (codePoint >= 0x000E && codePoint <= 0x001B)
3492         || (codePoint >= 0x007F && codePoint <= 0x009F)
3493         || getType(codePoint) == FORMAT)
3494       return true;
3495     return false;
3496   }
3497 
3498   /**
3499    * Converts a Unicode character into its lowercase equivalent mapping.
3500    * If a mapping does not exist, then the character passed is returned.
3501    * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3502    *
3503    * @param ch character to convert to lowercase
3504    * @return lowercase mapping of ch, or ch if lowercase mapping does
3505    *         not exist
3506    * @see #isLowerCase(char)
3507    * @see #isUpperCase(char)
3508    * @see #toTitleCase(char)
3509    * @see #toUpperCase(char)
3510    */
toLowerCase(char ch)3511   public static char toLowerCase(char ch)
3512   {
3513     return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch);
3514   }
3515 
3516   /**
3517    * Converts a Unicode character into its lowercase equivalent mapping.
3518    * If a mapping does not exist, then the character passed is returned.
3519    * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3520    *
3521    * @param codePoint character to convert to lowercase
3522    * @return lowercase mapping of ch, or ch if lowercase mapping does
3523    *         not exist
3524    * @see #isLowerCase(char)
3525    * @see #isUpperCase(char)
3526    * @see #toTitleCase(char)
3527    * @see #toUpperCase(char)
3528    *
3529    * @since 1.5
3530    */
toLowerCase(int codePoint)3531   public static int toLowerCase(int codePoint)
3532   {
3533     // If the code point is unassigned or in one of the private use areas
3534     // then we delegate the call to the appropriate private static inner class.
3535     int plane = codePoint >>> 16;
3536     if (plane > 2 && plane < 14)
3537       return UnassignedCharacters.toLowerCase(codePoint);
3538     if (plane > 14)
3539       return PrivateUseCharacters.toLowerCase(codePoint);
3540 
3541     // The short value stored in lower[plane] is the signed difference between
3542     // codePoint and its lowercase conversion.
3543     return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3544   }
3545 
3546   /**
3547    * Converts a Unicode character into its uppercase equivalent mapping.
3548    * If a mapping does not exist, then the character passed is returned.
3549    * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3550    *
3551    * @param ch character to convert to uppercase
3552    * @return uppercase mapping of ch, or ch if uppercase mapping does
3553    *         not exist
3554    * @see #isLowerCase(char)
3555    * @see #isUpperCase(char)
3556    * @see #toLowerCase(char)
3557    * @see #toTitleCase(char)
3558    */
toUpperCase(char ch)3559   public static char toUpperCase(char ch)
3560   {
3561     return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch);
3562   }
3563 
3564   /**
3565    * Converts a Unicode character into its uppercase equivalent mapping.
3566    * If a mapping does not exist, then the character passed is returned.
3567    * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3568    *
3569    * @param codePoint character to convert to uppercase
3570    * @return uppercase mapping of ch, or ch if uppercase mapping does
3571    *         not exist
3572    * @see #isLowerCase(char)
3573    * @see #isUpperCase(char)
3574    * @see #toLowerCase(char)
3575    * @see #toTitleCase(char)
3576    *
3577    * @since 1.5
3578    */
toUpperCase(int codePoint)3579   public static int toUpperCase(int codePoint)
3580   {
3581     // If the code point is unassigned or in one of the private use areas
3582     // then we delegate the call to the appropriate private static inner class.
3583     int plane = codePoint >>> 16;
3584     if (plane > 2 && plane < 14)
3585       return UnassignedCharacters.toUpperCase(codePoint);
3586     if (plane > 14)
3587       return PrivateUseCharacters.toUpperCase(codePoint);
3588 
3589     // The short value stored in upper[plane] is the signed difference between
3590     // codePoint and its uppercase conversion.
3591     return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3592   }
3593 
3594   /**
3595    * Converts a Unicode character into its titlecase equivalent mapping.
3596    * If a mapping does not exist, then the character passed is returned.
3597    * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3598    *
3599    * @param ch character to convert to titlecase
3600    * @return titlecase mapping of ch, or ch if titlecase mapping does
3601    *         not exist
3602    * @see #isTitleCase(char)
3603    * @see #toLowerCase(char)
3604    * @see #toUpperCase(char)
3605    */
toTitleCase(char ch)3606   public static char toTitleCase(char ch)
3607   {
3608     // As title is short, it doesn't hurt to exhaustively iterate over it.
3609     for (int i = title.length - 2; i >= 0; i -= 2)
3610       if (title[i] == ch)
3611         return title[i + 1];
3612     return toUpperCase(ch);
3613   }
3614 
3615   /**
3616    * Converts a Unicode character into its titlecase equivalent mapping.
3617    * If a mapping does not exist, then the character passed is returned.
3618    * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3619    *
3620    * @param codePoint character to convert to titlecase
3621    * @return titlecase mapping of ch, or ch if titlecase mapping does
3622    *         not exist
3623    * @see #isTitleCase(char)
3624    * @see #toLowerCase(char)
3625    * @see #toUpperCase(char)
3626    *
3627    * @since 1.5
3628    */
toTitleCase(int codePoint)3629   public static int toTitleCase(int codePoint)
3630   {
3631     // As of Unicode 4.0.0 no characters outside of plane 0 have
3632     // titlecase mappings that are different from their uppercase
3633     // mapping.
3634     if (codePoint < 0x10000)
3635       return (int) toTitleCase((char)codePoint);
3636     return toUpperCase(codePoint);
3637   }
3638 
3639   /**
   * Converts a character into a digit of the specified radix. If the radix
   * is less than MIN_RADIX or greater than MAX_RADIX, or if the result of
   * getNumericValue(ch) is not less than the radix, or if ch is not a decimal
   * digit or in the case insensitive set of 'a'-'z', the result is -1.
3644    * <br>
3645    * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3646    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3647    *
3648    * @param ch character to convert into a digit
3649    * @param radix radix in which ch is a digit
3650    * @return digit which ch represents in radix, or -1 not a valid digit
3651    * @see #MIN_RADIX
3652    * @see #MAX_RADIX
3653    * @see #forDigit(int, int)
3654    * @see #isDigit(char)
3655    * @see #getNumericValue(char)
3656    */
digit(char ch, int radix)3657   public static int digit(char ch, int radix)
3658   {
3659     if (radix < MIN_RADIX || radix > MAX_RADIX)
3660       return -1;
3661     char attr = readCodePoint((int)ch);
3662     if (((1 << (attr & TYPE_MASK))
3663          & ((1 << UPPERCASE_LETTER)
3664             | (1 << LOWERCASE_LETTER)
3665             | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3666       {
3667         // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3668         int digit = numValue[0][attr >> 7];
3669         return (digit < radix) ? digit : -1;
3670       }
3671     return -1;
3672   }
3673 
3674   /**
3675    * Converts a character into a digit of the specified radix. If the radix
3676    * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
3677    * exceeds the radix, or if ch is not a decimal digit or in the case
3678    * insensitive set of 'a'-'z', the result is -1.
3679    * <br>
3680    * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3681    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3682    *
3683    * @param codePoint character to convert into a digit
3684    * @param radix radix in which ch is a digit
3685    * @return digit which ch represents in radix, or -1 not a valid digit
3686    * @see #MIN_RADIX
3687    * @see #MAX_RADIX
3688    * @see #forDigit(int, int)
3689    * @see #isDigit(char)
3690    * @see #getNumericValue(char)
3691    */
digit(int codePoint, int radix)3692   public static int digit(int codePoint, int radix)
3693   {
3694     if (radix < MIN_RADIX || radix > MAX_RADIX)
3695       return -1;
3696 
3697     // If the code point is unassigned or in one of the private use areas
3698     // then we delegate the call to the appropriate private static inner class.
3699     int plane = codePoint >>> 16;
3700     if (plane > 2 && plane < 14)
3701       return UnassignedCharacters.digit(codePoint, radix);
3702     if (plane > 14)
3703       return PrivateUseCharacters.digit(codePoint, radix);
3704     char attr = readCodePoint(codePoint);
3705     if (((1 << (attr & TYPE_MASK))
3706          & ((1 << UPPERCASE_LETTER)
3707             | (1 << LOWERCASE_LETTER)
3708             | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3709       {
3710         // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3711         int digit = numValue[plane][attr >> 7];
3712 
3713         // If digit is less than or equal to -3 then the numerical value was
3714         // too large to fit into numValue and is stored in CharData.LARGENUMS.
3715         if (digit <= -3)
3716           digit = CharData.LARGENUMS[-digit - 3];
3717         return (digit < radix) ? digit : -1;
3718       }
3719     return -1;
3720   }
3721 
3722   /**
3723    * Returns the Unicode numeric value property of a character. For example,
3724    * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3725    *
3726    * <p>This method also returns values for the letters A through Z, (not
3727    * specified by Unicode), in these ranges: <code>'\u0041'</code>
3728    * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3729    * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3730    * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3731    * <code>'\uFF5A'</code> (full width variants).
3732    *
3733    * <p>If the character lacks a numeric value property, -1 is returned.
3734    * If the character has a numeric value property which is not representable
3735    * as a nonnegative integer, such as a fraction, -2 is returned.
3736    *
3737    * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3738    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3739    *
3740    * @param ch character from which the numeric value property will
3741    *        be retrieved
3742    * @return the numeric value property of ch, or -1 if it does not exist, or
3743    *         -2 if it is not representable as a nonnegative integer
3744    * @see #forDigit(int, int)
3745    * @see #digit(char, int)
3746    * @see #isDigit(char)
3747    * @since 1.1
3748    */
getNumericValue(char ch)3749   public static int getNumericValue(char ch)
3750   {
3751     // Treat numValue as signed.
3752     return (short) numValue[0][readCodePoint((int)ch) >> 7];
3753   }
3754 
3755   /**
3756    * Returns the Unicode numeric value property of a character. For example,
3757    * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3758    *
3759    * <p>This method also returns values for the letters A through Z, (not
3760    * specified by Unicode), in these ranges: <code>'\u0041'</code>
3761    * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3762    * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3763    * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3764    * <code>'\uFF5A'</code> (full width variants).
3765    *
3766    * <p>If the character lacks a numeric value property, -1 is returned.
3767    * If the character has a numeric value property which is not representable
3768    * as a nonnegative integer, such as a fraction, -2 is returned.
3769    *
3770    * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3771    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3772    *
3773    * @param codePoint character from which the numeric value property will
3774    *        be retrieved
3775    * @return the numeric value property of ch, or -1 if it does not exist, or
3776    *         -2 if it is not representable as a nonnegative integer
3777    * @see #forDigit(int, int)
3778    * @see #digit(char, int)
3779    * @see #isDigit(char)
3780    * @since 1.5
3781    */
getNumericValue(int codePoint)3782   public static int getNumericValue(int codePoint)
3783   {
3784     // If the code point is unassigned or in one of the private use areas
3785     // then we delegate the call to the appropriate private static inner class.
3786     int plane = codePoint >>> 16;
3787     if (plane > 2 && plane < 14)
3788       return UnassignedCharacters.getNumericValue(codePoint);
3789     if (plane > 14)
3790       return PrivateUseCharacters.getNumericValue(codePoint);
3791 
3792     // If the value N found in numValue[plane] is less than or equal to -3
3793     // then the numeric value was too big to fit into 16 bits and is
3794     // stored in CharData.LARGENUMS at offset (-N - 3).
3795     short num = (short)numValue[plane][readCodePoint(codePoint) >> 7];
3796     if (num <= -3)
3797       return CharData.LARGENUMS[-num - 3];
3798     return num;
3799   }
3800 
3801   /**
3802    * Determines if a character is a ISO-LATIN-1 space. This is only the five
3803    * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
3804    * <code>'\r'</code>, and <code>' '</code>.
3805    * <br>
3806    * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
3807    *
3808    * @param ch character to test
3809    * @return true if ch is a space, else false
3810    * @deprecated Replaced by {@link #isWhitespace(char)}
3811    * @see #isSpaceChar(char)
3812    * @see #isWhitespace(char)
3813    */
isSpace(char ch)3814   public static boolean isSpace(char ch)
3815   {
3816     // Performing the subtraction up front alleviates need to compare longs.
3817     return ch-- <= ' ' && ((1 << ch)
3818                            & ((1 << (' ' - 1))
3819                               | (1 << ('\t' - 1))
3820                               | (1 << ('\n' - 1))
3821                               | (1 << ('\r' - 1))
3822                               | (1 << ('\f' - 1)))) != 0;
3823   }
3824 
3825   /**
3826    * Determines if a character is a Unicode space character. This includes
3827    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3828    * <br>
3829    * Unicode space = [Zs]|[Zp]|[Zl]
3830    *
3831    * @param ch character to test
3832    * @return true if ch is a Unicode space, else false
3833    * @see #isWhitespace(char)
3834    * @since 1.1
3835    */
isSpaceChar(char ch)3836   public static boolean isSpaceChar(char ch)
3837   {
3838     return isSpaceChar((int)ch);
3839   }
3840 
3841   /**
3842    * Determines if a character is a Unicode space character. This includes
3843    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3844    * <br>
3845    * Unicode space = [Zs]|[Zp]|[Zl]
3846    *
3847    * @param codePoint character to test
3848    * @return true if ch is a Unicode space, else false
3849    * @see #isWhitespace(char)
3850    * @since 1.5
3851    */
isSpaceChar(int codePoint)3852   public static boolean isSpaceChar(int codePoint)
3853   {
3854     return ((1 << getType(codePoint))
3855             & ((1 << SPACE_SEPARATOR)
3856                | (1 << LINE_SEPARATOR)
3857                | (1 << PARAGRAPH_SEPARATOR))) != 0;
3858   }
3859 
3860   /**
3861    * Determines if a character is Java whitespace. This includes Unicode
3862    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3863    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3864    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3865    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3866    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3867    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3868    * and <code>'\u001F'</code>.
3869    * <br>
3870    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3871    *
3872    * @param ch character to test
3873    * @return true if ch is Java whitespace, else false
3874    * @see #isSpaceChar(char)
3875    * @since 1.1
3876    */
isWhitespace(char ch)3877   public static boolean isWhitespace(char ch)
3878   {
3879     return isWhitespace((int) ch);
3880   }
3881 
3882   /**
3883    * Determines if a character is Java whitespace. This includes Unicode
3884    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3885    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3886    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3887    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3888    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3889    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3890    * and <code>'\u001F'</code>.
3891    * <br>
3892    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3893    *
3894    * @param codePoint character to test
3895    * @return true if ch is Java whitespace, else false
3896    * @see #isSpaceChar(char)
3897    * @since 1.5
3898    */
isWhitespace(int codePoint)3899   public static boolean isWhitespace(int codePoint)
3900   {
3901     int plane = codePoint >>> 16;
3902     if (plane > 2 && plane < 14)
3903       return UnassignedCharacters.isWhiteSpace(codePoint);
3904     if (plane > 14)
3905       return PrivateUseCharacters.isWhiteSpace(codePoint);
3906 
3907     int attr = readCodePoint(codePoint);
3908     return ((((1 << (attr & TYPE_MASK))
3909               & ((1 << SPACE_SEPARATOR)
3910                  | (1 << LINE_SEPARATOR)
3911                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
3912             && (attr & NO_BREAK_MASK) == 0)
3913       || (codePoint <= '\u001F' && ((1 << codePoint)
3914                              & ((1 << '\t')
3915                                 | (1 << '\n')
3916                                 | (1 << '\u000B')
3917                                 | (1 << '\u000C')
3918                                 | (1 << '\r')
3919                                 | (1 << '\u001C')
3920                                 | (1 << '\u001D')
3921                                 | (1 << '\u001E')
3922                                 | (1 << '\u001F'))) != 0);
3923   }
3924 
3925   /**
3926    * Determines if a character has the ISO Control property.
3927    * <br>
3928    * ISO Control = [Cc]
3929    *
3930    * @param ch character to test
3931    * @return true if ch is an ISO Control character, else false
3932    * @see #isSpaceChar(char)
3933    * @see #isWhitespace(char)
3934    * @since 1.1
3935    */
isISOControl(char ch)3936   public static boolean isISOControl(char ch)
3937   {
3938     return isISOControl((int)ch);
3939   }
3940 
3941   /**
3942    * Determines if the character is an ISO Control character.  This is true
3943    * if the code point is in the range [0, 0x001F] or if it is in the range
3944    * [0x007F, 0x009F].
3945    * @param codePoint the character to check
3946    * @return true if the character is in one of the above ranges
3947    *
3948    * @since 1.5
3949    */
isISOControl(int codePoint)3950   public static boolean isISOControl(int codePoint)
3951   {
3952     if ((codePoint >= 0 && codePoint <= 0x001F)
3953         || (codePoint >= 0x007F && codePoint <= 0x009F))
3954       return true;
3955     return false;
3956   }
3957 
3958   /**
3959    * Returns the Unicode general category property of a character.
3960    *
3961    * @param ch character from which the general category property will
3962    *        be retrieved
3963    * @return the character category property of ch as an integer
3964    * @see #UNASSIGNED
3965    * @see #UPPERCASE_LETTER
3966    * @see #LOWERCASE_LETTER
3967    * @see #TITLECASE_LETTER
3968    * @see #MODIFIER_LETTER
3969    * @see #OTHER_LETTER
3970    * @see #NON_SPACING_MARK
3971    * @see #ENCLOSING_MARK
3972    * @see #COMBINING_SPACING_MARK
3973    * @see #DECIMAL_DIGIT_NUMBER
3974    * @see #LETTER_NUMBER
3975    * @see #OTHER_NUMBER
3976    * @see #SPACE_SEPARATOR
3977    * @see #LINE_SEPARATOR
3978    * @see #PARAGRAPH_SEPARATOR
3979    * @see #CONTROL
3980    * @see #FORMAT
3981    * @see #PRIVATE_USE
3982    * @see #SURROGATE
3983    * @see #DASH_PUNCTUATION
3984    * @see #START_PUNCTUATION
3985    * @see #END_PUNCTUATION
3986    * @see #CONNECTOR_PUNCTUATION
3987    * @see #OTHER_PUNCTUATION
3988    * @see #MATH_SYMBOL
3989    * @see #CURRENCY_SYMBOL
3990    * @see #MODIFIER_SYMBOL
3991    * @see #INITIAL_QUOTE_PUNCTUATION
3992    * @see #FINAL_QUOTE_PUNCTUATION
3993    * @since 1.1
3994    */
getType(char ch)3995   public static int getType(char ch)
3996   {
3997     return getType((int)ch);
3998   }
3999 
4000   /**
4001    * Returns the Unicode general category property of a character.
4002    *
4003    * @param codePoint character from which the general category property will
4004    *        be retrieved
4005    * @return the character category property of ch as an integer
4006    * @see #UNASSIGNED
4007    * @see #UPPERCASE_LETTER
4008    * @see #LOWERCASE_LETTER
4009    * @see #TITLECASE_LETTER
4010    * @see #MODIFIER_LETTER
4011    * @see #OTHER_LETTER
4012    * @see #NON_SPACING_MARK
4013    * @see #ENCLOSING_MARK
4014    * @see #COMBINING_SPACING_MARK
4015    * @see #DECIMAL_DIGIT_NUMBER
4016    * @see #LETTER_NUMBER
4017    * @see #OTHER_NUMBER
4018    * @see #SPACE_SEPARATOR
4019    * @see #LINE_SEPARATOR
4020    * @see #PARAGRAPH_SEPARATOR
4021    * @see #CONTROL
4022    * @see #FORMAT
4023    * @see #PRIVATE_USE
4024    * @see #SURROGATE
4025    * @see #DASH_PUNCTUATION
4026    * @see #START_PUNCTUATION
4027    * @see #END_PUNCTUATION
4028    * @see #CONNECTOR_PUNCTUATION
4029    * @see #OTHER_PUNCTUATION
4030    * @see #MATH_SYMBOL
4031    * @see #CURRENCY_SYMBOL
4032    * @see #MODIFIER_SYMBOL
4033    * @see #INITIAL_QUOTE_PUNCTUATION
4034    * @see #FINAL_QUOTE_PUNCTUATION
4035    *
4036    * @since 1.5
4037    */
getType(int codePoint)4038   public static int getType(int codePoint)
4039   {
4040     // If the codePoint is unassigned or in one of the private use areas
4041     // then we delegate the call to the appropriate private static inner class.
4042     int plane = codePoint >>> 16;
4043     if (plane > 2 && plane < 14)
4044       return UnassignedCharacters.getType(codePoint);
4045     if (plane > 14)
4046       return PrivateUseCharacters.getType(codePoint);
4047 
4048     return readCodePoint(codePoint) & TYPE_MASK;
4049   }
4050 
4051   /**
4052    * Converts a digit into a character which represents that digit
4053    * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
4054    * or the digit exceeds the radix, then the null character <code>'\0'</code>
4055    * is returned.  Otherwise the return value is in '0'-'9' and 'a'-'z'.
4056    * <br>
4057    * return value boundary = U+0030-U+0039|U+0061-U+007A
4058    *
4059    * @param digit digit to be converted into a character
4060    * @param radix radix of digit
4061    * @return character representing digit in radix, or '\0'
4062    * @see #MIN_RADIX
4063    * @see #MAX_RADIX
4064    * @see #digit(char, int)
4065    */
forDigit(int digit, int radix)4066   public static char forDigit(int digit, int radix)
4067   {
4068     if (radix < MIN_RADIX || radix > MAX_RADIX
4069         || digit < 0 || digit >= radix)
4070       return '\0';
4071     return Number.digits[digit];
4072   }
4073 
4074   /**
4075    * Returns the Unicode directionality property of the character. This
4076    * is used in the visual ordering of text.
4077    *
4078    * @param ch the character to look up
4079    * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4080    * @see #DIRECTIONALITY_UNDEFINED
4081    * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4082    * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4083    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4084    * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4085    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4086    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4087    * @see #DIRECTIONALITY_ARABIC_NUMBER
4088    * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4089    * @see #DIRECTIONALITY_NONSPACING_MARK
4090    * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4091    * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4092    * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4093    * @see #DIRECTIONALITY_WHITESPACE
4094    * @see #DIRECTIONALITY_OTHER_NEUTRALS
4095    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4096    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4097    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4098    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4099    * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4100    * @since 1.4
4101    */
getDirectionality(char ch)4102   public static byte getDirectionality(char ch)
4103   {
4104     // The result will correctly be signed.
4105     return getDirectionality((int)ch);
4106   }
4107 
4108 
4109   /**
4110    * Returns the Unicode directionality property of the character. This
4111    * is used in the visual ordering of text.
4112    *
4113    * @param codePoint the character to look up
4114    * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4115    * @see #DIRECTIONALITY_UNDEFINED
4116    * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4117    * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4118    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4119    * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4120    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4121    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4122    * @see #DIRECTIONALITY_ARABIC_NUMBER
4123    * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4124    * @see #DIRECTIONALITY_NONSPACING_MARK
4125    * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4126    * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4127    * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4128    * @see #DIRECTIONALITY_WHITESPACE
4129    * @see #DIRECTIONALITY_OTHER_NEUTRALS
4130    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4131    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4132    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4133    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4134    * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4135    * @since 1.5
4136    */
getDirectionality(int codePoint)4137   public static byte getDirectionality(int codePoint)
4138   {
4139     // If the code point is unassigned or in one of the private use areas
4140     // then we delegate the call to the appropriate private static inner class.
4141     int plane = codePoint >>> 16;
4142     if (plane > 2 && plane < 14)
4143       return UnassignedCharacters.getDirectionality(codePoint);
4144     if (plane > 14)
4145       return PrivateUseCharacters.getDirectionality(codePoint);
4146 
4147     // The result will correctly be signed.
4148     return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2);
4149   }
4150 
4151   /**
4152    * Determines whether the character is mirrored according to Unicode. For
4153    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4154    * left-to-right text, but ')' in right-to-left text.
4155    *
4156    * @param ch the character to look up
4157    * @return true if the character is mirrored
4158    * @since 1.4
4159    */
isMirrored(char ch)4160   public static boolean isMirrored(char ch)
4161   {
4162     return (readCodePoint((int)ch) & MIRROR_MASK) != 0;
4163   }
4164 
4165   /**
4166    * Determines whether the character is mirrored according to Unicode. For
4167    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4168    * left-to-right text, but ')' in right-to-left text.
4169    *
4170    * @param codePoint the character to look up
4171    * @return true if the character is mirrored
4172    * @since 1.5
4173    */
isMirrored(int codePoint)4174   public static boolean isMirrored(int codePoint)
4175   {
4176     // If the code point is unassigned or part of one of the private use areas
4177     // then we delegate the call to the appropriate private static inner class.
4178     int plane = codePoint >>> 16;
4179     if (plane > 2 && plane < 14)
4180       return UnassignedCharacters.isMirrored(codePoint);
4181     if (plane > 14)
4182       return PrivateUseCharacters.isMirrored(codePoint);
4183 
4184     return (readCodePoint(codePoint) & MIRROR_MASK) != 0;
4185   }
4186 
4187   /**
4188    * Compares another Character to this Character, numerically.
4189    *
4190    * @param anotherCharacter Character to compare with this Character
4191    * @return a negative integer if this Character is less than
4192    *         anotherCharacter, zero if this Character is equal, and
4193    *         a positive integer if this Character is greater
4194    * @throws NullPointerException if anotherCharacter is null
4195    * @since 1.2
4196    */
compareTo(Character anotherCharacter)4197   public int compareTo(Character anotherCharacter)
4198   {
4199     return value - anotherCharacter.value;
4200   }
4201 
4202   /**
4203    * Compares two unboxed char values.
4204    * The result is positive if the first is greater, negative if the second
4205    * is greater, and 0 if the two are equal.
4206    *
4207    * @param x First value to compare.
4208    * @param y Second value to compare.
4209    *
4210    * @return positive int if the first value is greater, negative if the second
4211    * is greater, and 0 if the two are equal.
4212    * @since 1.7
4213    */
compare(char x, char y)4214   public static int compare(char x, char y)
4215   {
4216     return Character.valueOf(x).compareTo(Character.valueOf(y));
4217   }
4218 
4219   /**
4220    * Returns an <code>Character</code> object wrapping the value.
4221    * In contrast to the <code>Character</code> constructor, this method
4222    * will cache some values.  It is used by boxing conversion.
4223    *
4224    * @param val the value to wrap
4225    * @return the <code>Character</code>
4226    *
4227    * @since 1.5
4228    */
valueOf(char val)4229   public static Character valueOf(char val)
4230   {
4231     if (val > MAX_CACHE)
4232       return new Character(val);
4233     else
4234       return charCache[val - MIN_VALUE];
4235   }
4236 
4237   /**
4238    * Reverse the bytes in val.
4239    * @since 1.5
4240    */
reverseBytes(char val)4241   public static char reverseBytes(char val)
4242   {
4243     return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
4244   }
4245 
4246   /**
4247    * Converts a unicode code point to a UTF-16 representation of that
4248    * code point.
4249    *
4250    * @param codePoint the unicode code point
4251    *
4252    * @return the UTF-16 representation of that code point
4253    *
4254    * @throws IllegalArgumentException if the code point is not a valid
4255    *         unicode code point
4256    *
4257    * @since 1.5
4258    */
toChars(int codePoint)4259   public static char[] toChars(int codePoint)
4260   {
4261     if (!isValidCodePoint(codePoint))
4262       throw new IllegalArgumentException("Illegal Unicode code point : "
4263                                          + codePoint);
4264     char[] result = new char[charCount(codePoint)];
4265     int ignore = toChars(codePoint, result, 0);
4266     return result;
4267   }
4268 
4269   /**
4270    * Converts a unicode code point to its UTF-16 representation.
4271    *
4272    * @param codePoint the unicode code point
4273    * @param dst the target char array
4274    * @param dstIndex the start index for the target
4275    *
4276    * @return number of characters written to <code>dst</code>
4277    *
4278    * @throws IllegalArgumentException if <code>codePoint</code> is not a
4279    *         valid unicode code point
4280    * @throws NullPointerException if <code>dst</code> is <code>null</code>
4281    * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
4282    *         in <code>dst</code> or if the UTF-16 representation does not
4283    *         fit into <code>dst</code>
4284    *
4285    * @since 1.5
4286    */
toChars(int codePoint, char[] dst, int dstIndex)4287   public static int toChars(int codePoint, char[] dst, int dstIndex)
4288   {
4289     if (!isValidCodePoint(codePoint))
4290       {
4291         throw new IllegalArgumentException("not a valid code point: "
4292                                            + codePoint);
4293       }
4294 
4295     int result;
4296     if (isSupplementaryCodePoint(codePoint))
4297       {
4298         // Write second char first to cause IndexOutOfBoundsException
4299         // immediately.
4300         final int cp2 = codePoint - 0x10000;
4301         dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
4302         dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
4303         result = 2;
4304       }
4305     else
4306       {
4307         dst[dstIndex] = (char) codePoint;
4308         result = 1;
4309       }
4310     return result;
4311   }
4312 
4313   /**
4314    * Return number of 16-bit characters required to represent the given
4315    * code point.
4316    *
4317    * @param codePoint a unicode code point
4318    *
4319    * @return 2 if codePoint >= 0x10000, 1 otherwise.
4320    *
4321    * @since 1.5
4322    */
charCount(int codePoint)4323   public static int charCount(int codePoint)
4324   {
4325     return
4326       (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
4327       ? 2
4328       : 1;
4329   }
4330 
4331   /**
4332    * Determines whether the specified code point is
4333    * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
4334    * supplementary character range.
4335    *
4336    * @param codePoint a Unicode code point
4337    *
4338    * @return <code>true</code> if code point is in supplementary range
4339    *
4340    * @since 1.5
4341    */
isSupplementaryCodePoint(int codePoint)4342   public static boolean isSupplementaryCodePoint(int codePoint)
4343   {
4344     return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
4345       && codePoint <= MAX_CODE_POINT;
4346   }
4347 
4348   /**
4349    * Determines whether the specified code point is
4350    * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
4351    *
4352    * @param codePoint a Unicode code point
4353    *
4354    * @return <code>true</code> if code point is valid
4355    *
4356    * @since 1.5
4357    */
isValidCodePoint(int codePoint)4358   public static boolean isValidCodePoint(int codePoint)
4359   {
4360     return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
4361   }
4362 
4363   /**
4364    * Return true if the given character is a high surrogate.
4365    * @param ch the character
4366    * @return true if the character is a high surrogate character
4367    *
4368    * @since 1.5
4369    */
isHighSurrogate(char ch)4370   public static boolean isHighSurrogate(char ch)
4371   {
4372     return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
4373   }
4374 
4375   /**
4376    * Return true if the given character is a low surrogate.
4377    * @param ch the character
4378    * @return true if the character is a low surrogate character
4379    *
4380    * @since 1.5
4381    */
isLowSurrogate(char ch)4382   public static boolean isLowSurrogate(char ch)
4383   {
4384     return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
4385   }
4386 
4387   /**
4388    * Return true if the given characters compose a surrogate pair.
4389    * This is true if the first character is a high surrogate and the
4390    * second character is a low surrogate.
4391    * @param ch1 the first character
4392    * @param ch2 the first character
4393    * @return true if the characters compose a surrogate pair
4394    *
4395    * @since 1.5
4396    */
isSurrogatePair(char ch1, char ch2)4397   public static boolean isSurrogatePair(char ch1, char ch2)
4398   {
4399     return isHighSurrogate(ch1) && isLowSurrogate(ch2);
4400   }
4401 
4402   /**
4403    * Given a valid surrogate pair, this returns the corresponding
4404    * code point.
4405    * @param high the high character of the pair
4406    * @param low the low character of the pair
4407    * @return the corresponding code point
4408    *
4409    * @since 1.5
4410    */
toCodePoint(char high, char low)4411   public static int toCodePoint(char high, char low)
4412   {
4413     return ((high - MIN_HIGH_SURROGATE) * 0x400) +
4414       (low - MIN_LOW_SURROGATE) + 0x10000;
4415   }
4416 
4417   /**
4418    * Get the code point at the specified index in the CharSequence.
4419    * This is like CharSequence#charAt(int), but if the character is
4420    * the start of a surrogate pair, and there is a following
4421    * character, and this character completes the pair, then the
4422    * corresponding supplementary code point is returned.  Otherwise,
4423    * the character at the index is returned.
4424    *
4425    * @param sequence the CharSequence
4426    * @param index the index of the codepoint to get, starting at 0
4427    * @return the codepoint at the specified index
4428    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4429    * @since 1.5
4430    */
codePointAt(CharSequence sequence, int index)4431   public static int codePointAt(CharSequence sequence, int index)
4432   {
4433     int len = sequence.length();
4434     if (index < 0 || index >= len)
4435       throw new IndexOutOfBoundsException();
4436     char high = sequence.charAt(index);
4437     if (! isHighSurrogate(high) || ++index >= len)
4438       return high;
4439     char low = sequence.charAt(index);
4440     if (! isLowSurrogate(low))
4441       return high;
4442     return toCodePoint(high, low);
4443   }
4444 
4445   /**
4446    * Get the code point at the specified index in the CharSequence.
4447    * If the character is the start of a surrogate pair, and there is a
4448    * following character, and this character completes the pair, then
4449    * the corresponding supplementary code point is returned.
4450    * Otherwise, the character at the index is returned.
4451    *
4452    * @param chars the character array in which to look
4453    * @param index the index of the codepoint to get, starting at 0
4454    * @return the codepoint at the specified index
4455    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4456    * @since 1.5
4457    */
codePointAt(char[] chars, int index)4458   public static int codePointAt(char[] chars, int index)
4459   {
4460     return codePointAt(chars, index, chars.length);
4461   }
4462 
/**
 * Get the code point at the specified index in the character array.
 * If the character is the start of a surrogate pair, and there is a
 * following character within the specified range, and this
 * character completes the pair, then the corresponding
 * supplementary code point is returned.  Otherwise, the character
 * at the index is returned.
 *
 * @param chars the character array in which to look
 * @param index the index of the codepoint to get, starting at 0
 * @param limit the limit past which characters should not be examined
 * @return the codepoint at the specified index
 * @throws IndexOutOfBoundsException if index is negative or &gt;=
 * limit, or if limit is negative or &gt; the length of the array
 * @since 1.5
 */
codePointAt(char[] chars, int index, int limit)4479   public static int codePointAt(char[] chars, int index, int limit)
4480   {
4481     if (index < 0 || index >= limit || limit < 0 || limit > chars.length)
4482       throw new IndexOutOfBoundsException();
4483     char high = chars[index];
4484     if (! isHighSurrogate(high) || ++index >= limit)
4485       return high;
4486     char low = chars[index];
4487     if (! isLowSurrogate(low))
4488       return high;
4489     return toCodePoint(high, low);
4490   }
4491 
/**
 * Get the code point before the specified index.  This is like
 * #codePointAt(char[], int), but checks the characters at
 * <code>index-1</code> and <code>index-2</code> to see if they form
 * a supplementary code point.  If they do not, the character at
 * <code>index-1</code> is returned.
 *
 * @param chars the character array
 * @param index the index just past the codepoint to get, starting at 0
 * @return the codepoint before the specified index
 * @throws IndexOutOfBoundsException if index is less than 1 or &gt;
 * the length of the array
 * @since 1.5
 */
codePointBefore(char[] chars, int index)4505   public static int codePointBefore(char[] chars, int index)
4506   {
4507     return codePointBefore(chars, index, 1);
4508   }
4509 
4510   /**
4511    * Get the code point before the specified index.  This is like
4512    * #codePointAt(char[], int), but checks the characters at
4513    * <code>index-1</code> and <code>index-2</code> to see if they form
4514    * a supplementary code point.  If they do not, the character at
4515    * <code>index-1</code> is returned.  The start parameter is used to
4516    * limit the range of the array which may be examined.
4517    *
4518    * @param chars the character array
4519    * @param index the index just past the codepoint to get, starting at 0
4520    * @param start the index before which characters should not be examined
4521    * @return the codepoint at the specified index
4522    * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
4523    * the length of the array, or if limit is negative or &gt;= the
4524    * length of the array
4525    * @since 1.5
4526    */
codePointBefore(char[] chars, int index, int start)4527   public static int codePointBefore(char[] chars, int index, int start)
4528   {
4529     if (index < start || index > chars.length
4530         || start < 0 || start >= chars.length)
4531       throw new IndexOutOfBoundsException();
4532     --index;
4533     char low = chars[index];
4534     if (! isLowSurrogate(low) || --index < start)
4535       return low;
4536     char high = chars[index];
4537     if (! isHighSurrogate(high))
4538       return low;
4539     return toCodePoint(high, low);
4540   }
4541 
/**
 * Get the code point before the specified index.  This is like
 * #codePointAt(CharSequence, int), but checks the characters at
 * <code>index-1</code> and <code>index-2</code> to see if they form
 * a supplementary code point.  If they do not, the character at
 * <code>index-1</code> is returned.
 *
 * @param sequence the CharSequence
 * @param index the index just past the codepoint to get, starting at 0
 * @return the codepoint before the specified index
 * @throws IndexOutOfBoundsException if index is less than 1 or &gt;
 * length()
 * @since 1.5
 */
codePointBefore(CharSequence sequence, int index)4555   public static int codePointBefore(CharSequence sequence, int index)
4556   {
4557     int len = sequence.length();
4558     if (index < 1 || index > len)
4559       throw new IndexOutOfBoundsException();
4560     --index;
4561     char low = sequence.charAt(index);
4562     if (! isLowSurrogate(low) || --index < 0)
4563       return low;
4564     char high = sequence.charAt(index);
4565     if (! isHighSurrogate(high))
4566       return low;
4567     return toCodePoint(high, low);
4568   }
4569 } // class Character
4570