1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2 Copyright (C) 1998, 1999, 2001, 2002, 2004, 2005 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. 
*/ 37 38 39 package java.lang; 40 41 import gnu.java.lang.CharData; 42 43 import java.io.Serializable; 44 import java.text.Collator; 45 import java.util.Locale; 46 47 /** 48 * Wrapper class for the primitive char data type. In addition, this class 49 * allows one to retrieve property information and perform transformations 50 * on the defined characters in the Unicode Standard, Version 4.0.0. 51 * java.lang.Character is designed to be very dynamic, and as such, it 52 * retrieves information on the Unicode character set from a separate 53 * database, gnu.java.lang.CharData, which can be easily upgraded. 54 * 55 * <p>For predicates, boundaries are used to describe 56 * the set of characters for which the method will return true. 57 * This syntax uses fairly normal regular expression notation. 58 * See 5.13 of the Unicode Standard, Version 4.0, for the 59 * boundary specification. 60 * 61 * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 62 * for more information on the Unicode Standard. 63 * 64 * @author Tom Tromey (tromey@cygnus.com) 65 * @author Paul N. Fisher 66 * @author Jochen Hoenicke 67 * @author Eric Blake (ebb9@email.byu.edu) 68 * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 69 * @see CharData 70 * @since 1.0 71 * @status partly updated to 1.5; some things still missing 72 */ 73 public final class Character implements Serializable, Comparable<Character> 74 { 75 /** 76 * A subset of Unicode blocks. 77 * 78 * @author Paul N. Fisher 79 * @author Eric Blake (ebb9@email.byu.edu) 80 * @since 1.2 81 */ 82 public static class Subset 83 { 84 /** The name of the subset. */ 85 private final String name; 86 87 /** 88 * Construct a new subset of characters. 89 * 90 * @param name the name of the subset 91 * @throws NullPointerException if name is null 92 */ Subset(String name)93 protected Subset(String name) 94 { 95 // Note that name.toString() is name, unless name was null. 
96 this.name = name.toString(); 97 } 98 99 /** 100 * Compares two Subsets for equality. This is <code>final</code>, and 101 * restricts the comparison on the <code>==</code> operator, so it returns 102 * true only for the same object. 103 * 104 * @param o the object to compare 105 * @return true if o is this 106 */ equals(Object o)107 public final boolean equals(Object o) 108 { 109 return o == this; 110 } 111 112 /** 113 * Makes the original hashCode of Object final, to be consistent with 114 * equals. 115 * 116 * @return the hash code for this object 117 */ hashCode()118 public final int hashCode() 119 { 120 return super.hashCode(); 121 } 122 123 /** 124 * Returns the name of the subset. 125 * 126 * @return the name 127 */ toString()128 public final String toString() 129 { 130 return name; 131 } 132 } // class Subset 133 134 /** 135 * A family of character subsets in the Unicode specification. A character 136 * is in at most one of these blocks. 137 * 138 * This inner class was generated automatically from 139 * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts. 140 * This Unicode definition file can be found on the 141 * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 142 * JDK 1.5 uses Unicode version 4.0.0. 143 * 144 * @author scripts/unicode-blocks.pl (written by Eric Blake) 145 * @since 1.2 146 */ 147 public static final class UnicodeBlock extends Subset 148 { 149 /** The start of the subset. */ 150 private final int start; 151 152 /** The end of the subset. */ 153 private final int end; 154 155 /** The canonical name of the block according to the Unicode standard. */ 156 private final String canonicalName; 157 158 /** Enumeration for the <code>forName()</code> method */ 159 private enum NameType { CANONICAL, NO_SPACES, CONSTANT; } 160 161 /** 162 * Constructor for strictly defined blocks. 
163 * 164 * @param start the start character of the range 165 * @param end the end character of the range 166 * @param name the block name 167 * @param canonicalName the name of the block as defined in the Unicode 168 * standard. 169 */ UnicodeBlock(int start, int end, String name, String canonicalName)170 private UnicodeBlock(int start, int end, String name, 171 String canonicalName) 172 { 173 super(name); 174 this.start = start; 175 this.end = end; 176 this.canonicalName = canonicalName; 177 } 178 179 /** 180 * Returns the Unicode character block which a character belongs to. 181 * <strong>Note</strong>: This method does not support the use of 182 * supplementary characters. For such support, <code>of(int)</code> 183 * should be used instead. 184 * 185 * @param ch the character to look up 186 * @return the set it belongs to, or null if it is not in one 187 */ of(char ch)188 public static UnicodeBlock of(char ch) 189 { 190 return of((int) ch); 191 } 192 193 /** 194 * Returns the Unicode character block which a code point belongs to. 195 * 196 * @param codePoint the character to look up 197 * @return the set it belongs to, or null if it is not in one. 198 * @throws IllegalArgumentException if the specified code point is 199 * invalid. 200 * @since 1.5 201 */ of(int codePoint)202 public static UnicodeBlock of(int codePoint) 203 { 204 if (codePoint > MAX_CODE_POINT) 205 throw new IllegalArgumentException("The supplied integer value is " + 206 "too large to be a codepoint."); 207 // Simple binary search for the correct block. 208 int low = 0; 209 int hi = sets.length - 1; 210 while (low <= hi) 211 { 212 int mid = (low + hi) >> 1; 213 UnicodeBlock b = sets[mid]; 214 if (codePoint < b.start) 215 hi = mid - 1; 216 else if (codePoint > b.end) 217 low = mid + 1; 218 else 219 return b; 220 } 221 return null; 222 } 223 224 /** 225 * <p> 226 * Returns the <code>UnicodeBlock</code> with the given name, as defined 227 * by the Unicode standard. 
The version of Unicode in use is defined by 228 * the <code>Character</code> class, and the names are given in the 229 * <code>Blocks-<version>.txt</code> file corresponding to that version. 230 * The name may be specified in one of three ways: 231 * </p> 232 * <ol> 233 * <li>The canonical, human-readable name used by the Unicode standard. 234 * This is the name with all spaces and hyphens retained. For example, 235 * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 236 * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 237 * <li>The name used for the constants specified by this class, which 238 * is the canonical name with all spaces and hyphens replaced with 239 * underscores e.g. `BASIC_LATIN'</li> 240 * </ol> 241 * <p> 242 * The names are compared case-insensitively using the case comparison 243 * associated with the U.S. English locale. The method recognises the 244 * previous names used for blocks as well as the current ones. At 245 * present, this simply means that the deprecated `SURROGATES_AREA' 246 * will be recognised by this method (the <code>of()</code> methods 247 * only return one of the three new surrogate blocks). 248 * </p> 249 * 250 * @param blockName the name of the block to look up. 251 * @return the specified block. 252 * @throws NullPointerException if the <code>blockName</code> is 253 * <code>null</code>. 254 * @throws IllegalArgumentException if the name does not match any Unicode 255 * block. 
256 * @since 1.5 257 */ forName(String blockName)258 public static final UnicodeBlock forName(String blockName) 259 { 260 NameType type; 261 if (blockName.indexOf(' ') != -1) 262 type = NameType.CANONICAL; 263 else if (blockName.indexOf('_') != -1) 264 type = NameType.CONSTANT; 265 else 266 type = NameType.NO_SPACES; 267 Collator usCollator = Collator.getInstance(Locale.US); 268 usCollator.setStrength(Collator.PRIMARY); 269 /* Special case for deprecated blocks not in sets */ 270 switch (type) 271 { 272 case CANONICAL: 273 if (usCollator.compare(blockName, "Surrogates Area") == 0) 274 return SURROGATES_AREA; 275 break; 276 case NO_SPACES: 277 if (usCollator.compare(blockName, "SurrogatesArea") == 0) 278 return SURROGATES_AREA; 279 break; 280 case CONSTANT: 281 if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 282 return SURROGATES_AREA; 283 break; 284 } 285 /* Other cases */ 286 switch (type) 287 { 288 case CANONICAL: 289 for (UnicodeBlock block : sets) 290 if (usCollator.compare(blockName, block.canonicalName) == 0) 291 return block; 292 break; 293 case NO_SPACES: 294 for (UnicodeBlock block : sets) 295 { 296 String nsName = block.canonicalName.replaceAll(" ",""); 297 if (usCollator.compare(blockName, nsName) == 0) 298 return block; 299 } 300 break; 301 case CONSTANT: 302 for (UnicodeBlock block : sets) 303 if (usCollator.compare(blockName, block.toString()) == 0) 304 return block; 305 break; 306 } 307 throw new IllegalArgumentException("No Unicode block found for " + 308 blockName + "."); 309 } 310 311 /** 312 * Basic Latin. 313 * 0x0000 - 0x007F. 314 */ 315 public static final UnicodeBlock BASIC_LATIN 316 = new UnicodeBlock(0x0000, 0x007F, 317 "BASIC_LATIN", 318 "Basic Latin"); 319 320 /** 321 * Latin-1 Supplement. 322 * 0x0080 - 0x00FF. 323 */ 324 public static final UnicodeBlock LATIN_1_SUPPLEMENT 325 = new UnicodeBlock(0x0080, 0x00FF, 326 "LATIN_1_SUPPLEMENT", 327 "Latin-1 Supplement"); 328 329 /** 330 * Latin Extended-A. 331 * 0x0100 - 0x017F. 
332 */ 333 public static final UnicodeBlock LATIN_EXTENDED_A 334 = new UnicodeBlock(0x0100, 0x017F, 335 "LATIN_EXTENDED_A", 336 "Latin Extended-A"); 337 338 /** 339 * Latin Extended-B. 340 * 0x0180 - 0x024F. 341 */ 342 public static final UnicodeBlock LATIN_EXTENDED_B 343 = new UnicodeBlock(0x0180, 0x024F, 344 "LATIN_EXTENDED_B", 345 "Latin Extended-B"); 346 347 /** 348 * IPA Extensions. 349 * 0x0250 - 0x02AF. 350 */ 351 public static final UnicodeBlock IPA_EXTENSIONS 352 = new UnicodeBlock(0x0250, 0x02AF, 353 "IPA_EXTENSIONS", 354 "IPA Extensions"); 355 356 /** 357 * Spacing Modifier Letters. 358 * 0x02B0 - 0x02FF. 359 */ 360 public static final UnicodeBlock SPACING_MODIFIER_LETTERS 361 = new UnicodeBlock(0x02B0, 0x02FF, 362 "SPACING_MODIFIER_LETTERS", 363 "Spacing Modifier Letters"); 364 365 /** 366 * Combining Diacritical Marks. 367 * 0x0300 - 0x036F. 368 */ 369 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 370 = new UnicodeBlock(0x0300, 0x036F, 371 "COMBINING_DIACRITICAL_MARKS", 372 "Combining Diacritical Marks"); 373 374 /** 375 * Greek. 376 * 0x0370 - 0x03FF. 377 */ 378 public static final UnicodeBlock GREEK 379 = new UnicodeBlock(0x0370, 0x03FF, 380 "GREEK", 381 "Greek"); 382 383 /** 384 * Cyrillic. 385 * 0x0400 - 0x04FF. 386 */ 387 public static final UnicodeBlock CYRILLIC 388 = new UnicodeBlock(0x0400, 0x04FF, 389 "CYRILLIC", 390 "Cyrillic"); 391 392 /** 393 * Cyrillic Supplementary. 394 * 0x0500 - 0x052F. 395 * @since 1.5 396 */ 397 public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 398 = new UnicodeBlock(0x0500, 0x052F, 399 "CYRILLIC_SUPPLEMENTARY", 400 "Cyrillic Supplementary"); 401 402 /** 403 * Armenian. 404 * 0x0530 - 0x058F. 405 */ 406 public static final UnicodeBlock ARMENIAN 407 = new UnicodeBlock(0x0530, 0x058F, 408 "ARMENIAN", 409 "Armenian"); 410 411 /** 412 * Hebrew. 413 * 0x0590 - 0x05FF. 
414 */ 415 public static final UnicodeBlock HEBREW 416 = new UnicodeBlock(0x0590, 0x05FF, 417 "HEBREW", 418 "Hebrew"); 419 420 /** 421 * Arabic. 422 * 0x0600 - 0x06FF. 423 */ 424 public static final UnicodeBlock ARABIC 425 = new UnicodeBlock(0x0600, 0x06FF, 426 "ARABIC", 427 "Arabic"); 428 429 /** 430 * Syriac. 431 * 0x0700 - 0x074F. 432 * @since 1.4 433 */ 434 public static final UnicodeBlock SYRIAC 435 = new UnicodeBlock(0x0700, 0x074F, 436 "SYRIAC", 437 "Syriac"); 438 439 /** 440 * Thaana. 441 * 0x0780 - 0x07BF. 442 * @since 1.4 443 */ 444 public static final UnicodeBlock THAANA 445 = new UnicodeBlock(0x0780, 0x07BF, 446 "THAANA", 447 "Thaana"); 448 449 /** 450 * Devanagari. 451 * 0x0900 - 0x097F. 452 */ 453 public static final UnicodeBlock DEVANAGARI 454 = new UnicodeBlock(0x0900, 0x097F, 455 "DEVANAGARI", 456 "Devanagari"); 457 458 /** 459 * Bengali. 460 * 0x0980 - 0x09FF. 461 */ 462 public static final UnicodeBlock BENGALI 463 = new UnicodeBlock(0x0980, 0x09FF, 464 "BENGALI", 465 "Bengali"); 466 467 /** 468 * Gurmukhi. 469 * 0x0A00 - 0x0A7F. 470 */ 471 public static final UnicodeBlock GURMUKHI 472 = new UnicodeBlock(0x0A00, 0x0A7F, 473 "GURMUKHI", 474 "Gurmukhi"); 475 476 /** 477 * Gujarati. 478 * 0x0A80 - 0x0AFF. 479 */ 480 public static final UnicodeBlock GUJARATI 481 = new UnicodeBlock(0x0A80, 0x0AFF, 482 "GUJARATI", 483 "Gujarati"); 484 485 /** 486 * Oriya. 487 * 0x0B00 - 0x0B7F. 488 */ 489 public static final UnicodeBlock ORIYA 490 = new UnicodeBlock(0x0B00, 0x0B7F, 491 "ORIYA", 492 "Oriya"); 493 494 /** 495 * Tamil. 496 * 0x0B80 - 0x0BFF. 497 */ 498 public static final UnicodeBlock TAMIL 499 = new UnicodeBlock(0x0B80, 0x0BFF, 500 "TAMIL", 501 "Tamil"); 502 503 /** 504 * Telugu. 505 * 0x0C00 - 0x0C7F. 506 */ 507 public static final UnicodeBlock TELUGU 508 = new UnicodeBlock(0x0C00, 0x0C7F, 509 "TELUGU", 510 "Telugu"); 511 512 /** 513 * Kannada. 514 * 0x0C80 - 0x0CFF. 
515 */ 516 public static final UnicodeBlock KANNADA 517 = new UnicodeBlock(0x0C80, 0x0CFF, 518 "KANNADA", 519 "Kannada"); 520 521 /** 522 * Malayalam. 523 * 0x0D00 - 0x0D7F. 524 */ 525 public static final UnicodeBlock MALAYALAM 526 = new UnicodeBlock(0x0D00, 0x0D7F, 527 "MALAYALAM", 528 "Malayalam"); 529 530 /** 531 * Sinhala. 532 * 0x0D80 - 0x0DFF. 533 * @since 1.4 534 */ 535 public static final UnicodeBlock SINHALA 536 = new UnicodeBlock(0x0D80, 0x0DFF, 537 "SINHALA", 538 "Sinhala"); 539 540 /** 541 * Thai. 542 * 0x0E00 - 0x0E7F. 543 */ 544 public static final UnicodeBlock THAI 545 = new UnicodeBlock(0x0E00, 0x0E7F, 546 "THAI", 547 "Thai"); 548 549 /** 550 * Lao. 551 * 0x0E80 - 0x0EFF. 552 */ 553 public static final UnicodeBlock LAO 554 = new UnicodeBlock(0x0E80, 0x0EFF, 555 "LAO", 556 "Lao"); 557 558 /** 559 * Tibetan. 560 * 0x0F00 - 0x0FFF. 561 */ 562 public static final UnicodeBlock TIBETAN 563 = new UnicodeBlock(0x0F00, 0x0FFF, 564 "TIBETAN", 565 "Tibetan"); 566 567 /** 568 * Myanmar. 569 * 0x1000 - 0x109F. 570 * @since 1.4 571 */ 572 public static final UnicodeBlock MYANMAR 573 = new UnicodeBlock(0x1000, 0x109F, 574 "MYANMAR", 575 "Myanmar"); 576 577 /** 578 * Georgian. 579 * 0x10A0 - 0x10FF. 580 */ 581 public static final UnicodeBlock GEORGIAN 582 = new UnicodeBlock(0x10A0, 0x10FF, 583 "GEORGIAN", 584 "Georgian"); 585 586 /** 587 * Hangul Jamo. 588 * 0x1100 - 0x11FF. 589 */ 590 public static final UnicodeBlock HANGUL_JAMO 591 = new UnicodeBlock(0x1100, 0x11FF, 592 "HANGUL_JAMO", 593 "Hangul Jamo"); 594 595 /** 596 * Ethiopic. 597 * 0x1200 - 0x137F. 598 * @since 1.4 599 */ 600 public static final UnicodeBlock ETHIOPIC 601 = new UnicodeBlock(0x1200, 0x137F, 602 "ETHIOPIC", 603 "Ethiopic"); 604 605 /** 606 * Cherokee. 607 * 0x13A0 - 0x13FF. 608 * @since 1.4 609 */ 610 public static final UnicodeBlock CHEROKEE 611 = new UnicodeBlock(0x13A0, 0x13FF, 612 "CHEROKEE", 613 "Cherokee"); 614 615 /** 616 * Unified Canadian Aboriginal Syllabics. 617 * 0x1400 - 0x167F. 
618 * @since 1.4 619 */ 620 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 621 = new UnicodeBlock(0x1400, 0x167F, 622 "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 623 "Unified Canadian Aboriginal Syllabics"); 624 625 /** 626 * Ogham. 627 * 0x1680 - 0x169F. 628 * @since 1.4 629 */ 630 public static final UnicodeBlock OGHAM 631 = new UnicodeBlock(0x1680, 0x169F, 632 "OGHAM", 633 "Ogham"); 634 635 /** 636 * Runic. 637 * 0x16A0 - 0x16FF. 638 * @since 1.4 639 */ 640 public static final UnicodeBlock RUNIC 641 = new UnicodeBlock(0x16A0, 0x16FF, 642 "RUNIC", 643 "Runic"); 644 645 /** 646 * Tagalog. 647 * 0x1700 - 0x171F. 648 * @since 1.5 649 */ 650 public static final UnicodeBlock TAGALOG 651 = new UnicodeBlock(0x1700, 0x171F, 652 "TAGALOG", 653 "Tagalog"); 654 655 /** 656 * Hanunoo. 657 * 0x1720 - 0x173F. 658 * @since 1.5 659 */ 660 public static final UnicodeBlock HANUNOO 661 = new UnicodeBlock(0x1720, 0x173F, 662 "HANUNOO", 663 "Hanunoo"); 664 665 /** 666 * Buhid. 667 * 0x1740 - 0x175F. 668 * @since 1.5 669 */ 670 public static final UnicodeBlock BUHID 671 = new UnicodeBlock(0x1740, 0x175F, 672 "BUHID", 673 "Buhid"); 674 675 /** 676 * Tagbanwa. 677 * 0x1760 - 0x177F. 678 * @since 1.5 679 */ 680 public static final UnicodeBlock TAGBANWA 681 = new UnicodeBlock(0x1760, 0x177F, 682 "TAGBANWA", 683 "Tagbanwa"); 684 685 /** 686 * Khmer. 687 * 0x1780 - 0x17FF. 688 * @since 1.4 689 */ 690 public static final UnicodeBlock KHMER 691 = new UnicodeBlock(0x1780, 0x17FF, 692 "KHMER", 693 "Khmer"); 694 695 /** 696 * Mongolian. 697 * 0x1800 - 0x18AF. 698 * @since 1.4 699 */ 700 public static final UnicodeBlock MONGOLIAN 701 = new UnicodeBlock(0x1800, 0x18AF, 702 "MONGOLIAN", 703 "Mongolian"); 704 705 /** 706 * Limbu. 707 * 0x1900 - 0x194F. 708 * @since 1.5 709 */ 710 public static final UnicodeBlock LIMBU 711 = new UnicodeBlock(0x1900, 0x194F, 712 "LIMBU", 713 "Limbu"); 714 715 /** 716 * Tai Le. 717 * 0x1950 - 0x197F. 
718 * @since 1.5 719 */ 720 public static final UnicodeBlock TAI_LE 721 = new UnicodeBlock(0x1950, 0x197F, 722 "TAI_LE", 723 "Tai Le"); 724 725 /** 726 * Khmer Symbols. 727 * 0x19E0 - 0x19FF. 728 * @since 1.5 729 */ 730 public static final UnicodeBlock KHMER_SYMBOLS 731 = new UnicodeBlock(0x19E0, 0x19FF, 732 "KHMER_SYMBOLS", 733 "Khmer Symbols"); 734 735 /** 736 * Phonetic Extensions. 737 * 0x1D00 - 0x1D7F. 738 * @since 1.5 739 */ 740 public static final UnicodeBlock PHONETIC_EXTENSIONS 741 = new UnicodeBlock(0x1D00, 0x1D7F, 742 "PHONETIC_EXTENSIONS", 743 "Phonetic Extensions"); 744 745 /** 746 * Latin Extended Additional. 747 * 0x1E00 - 0x1EFF. 748 */ 749 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 750 = new UnicodeBlock(0x1E00, 0x1EFF, 751 "LATIN_EXTENDED_ADDITIONAL", 752 "Latin Extended Additional"); 753 754 /** 755 * Greek Extended. 756 * 0x1F00 - 0x1FFF. 757 */ 758 public static final UnicodeBlock GREEK_EXTENDED 759 = new UnicodeBlock(0x1F00, 0x1FFF, 760 "GREEK_EXTENDED", 761 "Greek Extended"); 762 763 /** 764 * General Punctuation. 765 * 0x2000 - 0x206F. 766 */ 767 public static final UnicodeBlock GENERAL_PUNCTUATION 768 = new UnicodeBlock(0x2000, 0x206F, 769 "GENERAL_PUNCTUATION", 770 "General Punctuation"); 771 772 /** 773 * Superscripts and Subscripts. 774 * 0x2070 - 0x209F. 775 */ 776 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 777 = new UnicodeBlock(0x2070, 0x209F, 778 "SUPERSCRIPTS_AND_SUBSCRIPTS", 779 "Superscripts and Subscripts"); 780 781 /** 782 * Currency Symbols. 783 * 0x20A0 - 0x20CF. 784 */ 785 public static final UnicodeBlock CURRENCY_SYMBOLS 786 = new UnicodeBlock(0x20A0, 0x20CF, 787 "CURRENCY_SYMBOLS", 788 "Currency Symbols"); 789 790 /** 791 * Combining Marks for Symbols. 792 * 0x20D0 - 0x20FF. 793 */ 794 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 795 = new UnicodeBlock(0x20D0, 0x20FF, 796 "COMBINING_MARKS_FOR_SYMBOLS", 797 "Combining Marks for Symbols"); 798 799 /** 800 * Letterlike Symbols. 
801 * 0x2100 - 0x214F. 802 */ 803 public static final UnicodeBlock LETTERLIKE_SYMBOLS 804 = new UnicodeBlock(0x2100, 0x214F, 805 "LETTERLIKE_SYMBOLS", 806 "Letterlike Symbols"); 807 808 /** 809 * Number Forms. 810 * 0x2150 - 0x218F. 811 */ 812 public static final UnicodeBlock NUMBER_FORMS 813 = new UnicodeBlock(0x2150, 0x218F, 814 "NUMBER_FORMS", 815 "Number Forms"); 816 817 /** 818 * Arrows. 819 * 0x2190 - 0x21FF. 820 */ 821 public static final UnicodeBlock ARROWS 822 = new UnicodeBlock(0x2190, 0x21FF, 823 "ARROWS", 824 "Arrows"); 825 826 /** 827 * Mathematical Operators. 828 * 0x2200 - 0x22FF. 829 */ 830 public static final UnicodeBlock MATHEMATICAL_OPERATORS 831 = new UnicodeBlock(0x2200, 0x22FF, 832 "MATHEMATICAL_OPERATORS", 833 "Mathematical Operators"); 834 835 /** 836 * Miscellaneous Technical. 837 * 0x2300 - 0x23FF. 838 */ 839 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 840 = new UnicodeBlock(0x2300, 0x23FF, 841 "MISCELLANEOUS_TECHNICAL", 842 "Miscellaneous Technical"); 843 844 /** 845 * Control Pictures. 846 * 0x2400 - 0x243F. 847 */ 848 public static final UnicodeBlock CONTROL_PICTURES 849 = new UnicodeBlock(0x2400, 0x243F, 850 "CONTROL_PICTURES", 851 "Control Pictures"); 852 853 /** 854 * Optical Character Recognition. 855 * 0x2440 - 0x245F. 856 */ 857 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 858 = new UnicodeBlock(0x2440, 0x245F, 859 "OPTICAL_CHARACTER_RECOGNITION", 860 "Optical Character Recognition"); 861 862 /** 863 * Enclosed Alphanumerics. 864 * 0x2460 - 0x24FF. 865 */ 866 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 867 = new UnicodeBlock(0x2460, 0x24FF, 868 "ENCLOSED_ALPHANUMERICS", 869 "Enclosed Alphanumerics"); 870 871 /** 872 * Box Drawing. 873 * 0x2500 - 0x257F. 874 */ 875 public static final UnicodeBlock BOX_DRAWING 876 = new UnicodeBlock(0x2500, 0x257F, 877 "BOX_DRAWING", 878 "Box Drawing"); 879 880 /** 881 * Block Elements. 882 * 0x2580 - 0x259F. 
883 */ 884 public static final UnicodeBlock BLOCK_ELEMENTS 885 = new UnicodeBlock(0x2580, 0x259F, 886 "BLOCK_ELEMENTS", 887 "Block Elements"); 888 889 /** 890 * Geometric Shapes. 891 * 0x25A0 - 0x25FF. 892 */ 893 public static final UnicodeBlock GEOMETRIC_SHAPES 894 = new UnicodeBlock(0x25A0, 0x25FF, 895 "GEOMETRIC_SHAPES", 896 "Geometric Shapes"); 897 898 /** 899 * Miscellaneous Symbols. 900 * 0x2600 - 0x26FF. 901 */ 902 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 903 = new UnicodeBlock(0x2600, 0x26FF, 904 "MISCELLANEOUS_SYMBOLS", 905 "Miscellaneous Symbols"); 906 907 /** 908 * Dingbats. 909 * 0x2700 - 0x27BF. 910 */ 911 public static final UnicodeBlock DINGBATS 912 = new UnicodeBlock(0x2700, 0x27BF, 913 "DINGBATS", 914 "Dingbats"); 915 916 /** 917 * Miscellaneous Mathematical Symbols-A. 918 * 0x27C0 - 0x27EF. 919 * @since 1.5 920 */ 921 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 922 = new UnicodeBlock(0x27C0, 0x27EF, 923 "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 924 "Miscellaneous Mathematical Symbols-A"); 925 926 /** 927 * Supplemental Arrows-A. 928 * 0x27F0 - 0x27FF. 929 * @since 1.5 930 */ 931 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 932 = new UnicodeBlock(0x27F0, 0x27FF, 933 "SUPPLEMENTAL_ARROWS_A", 934 "Supplemental Arrows-A"); 935 936 /** 937 * Braille Patterns. 938 * 0x2800 - 0x28FF. 939 * @since 1.4 940 */ 941 public static final UnicodeBlock BRAILLE_PATTERNS 942 = new UnicodeBlock(0x2800, 0x28FF, 943 "BRAILLE_PATTERNS", 944 "Braille Patterns"); 945 946 /** 947 * Supplemental Arrows-B. 948 * 0x2900 - 0x297F. 949 * @since 1.5 950 */ 951 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 952 = new UnicodeBlock(0x2900, 0x297F, 953 "SUPPLEMENTAL_ARROWS_B", 954 "Supplemental Arrows-B"); 955 956 /** 957 * Miscellaneous Mathematical Symbols-B. 958 * 0x2980 - 0x29FF. 
959 * @since 1.5 960 */ 961 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 962 = new UnicodeBlock(0x2980, 0x29FF, 963 "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 964 "Miscellaneous Mathematical Symbols-B"); 965 966 /** 967 * Supplemental Mathematical Operators. 968 * 0x2A00 - 0x2AFF. 969 * @since 1.5 970 */ 971 public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 972 = new UnicodeBlock(0x2A00, 0x2AFF, 973 "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 974 "Supplemental Mathematical Operators"); 975 976 /** 977 * Miscellaneous Symbols and Arrows. 978 * 0x2B00 - 0x2BFF. 979 * @since 1.5 980 */ 981 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 982 = new UnicodeBlock(0x2B00, 0x2BFF, 983 "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 984 "Miscellaneous Symbols and Arrows"); 985 986 /** 987 * CJK Radicals Supplement. 988 * 0x2E80 - 0x2EFF. 989 * @since 1.4 990 */ 991 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 992 = new UnicodeBlock(0x2E80, 0x2EFF, 993 "CJK_RADICALS_SUPPLEMENT", 994 "CJK Radicals Supplement"); 995 996 /** 997 * Kangxi Radicals. 998 * 0x2F00 - 0x2FDF. 999 * @since 1.4 1000 */ 1001 public static final UnicodeBlock KANGXI_RADICALS 1002 = new UnicodeBlock(0x2F00, 0x2FDF, 1003 "KANGXI_RADICALS", 1004 "Kangxi Radicals"); 1005 1006 /** 1007 * Ideographic Description Characters. 1008 * 0x2FF0 - 0x2FFF. 1009 * @since 1.4 1010 */ 1011 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1012 = new UnicodeBlock(0x2FF0, 0x2FFF, 1013 "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1014 "Ideographic Description Characters"); 1015 1016 /** 1017 * CJK Symbols and Punctuation. 1018 * 0x3000 - 0x303F. 1019 */ 1020 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1021 = new UnicodeBlock(0x3000, 0x303F, 1022 "CJK_SYMBOLS_AND_PUNCTUATION", 1023 "CJK Symbols and Punctuation"); 1024 1025 /** 1026 * Hiragana. 1027 * 0x3040 - 0x309F. 
1028 */ 1029 public static final UnicodeBlock HIRAGANA 1030 = new UnicodeBlock(0x3040, 0x309F, 1031 "HIRAGANA", 1032 "Hiragana"); 1033 1034 /** 1035 * Katakana. 1036 * 0x30A0 - 0x30FF. 1037 */ 1038 public static final UnicodeBlock KATAKANA 1039 = new UnicodeBlock(0x30A0, 0x30FF, 1040 "KATAKANA", 1041 "Katakana"); 1042 1043 /** 1044 * Bopomofo. 1045 * 0x3100 - 0x312F. 1046 */ 1047 public static final UnicodeBlock BOPOMOFO 1048 = new UnicodeBlock(0x3100, 0x312F, 1049 "BOPOMOFO", 1050 "Bopomofo"); 1051 1052 /** 1053 * Hangul Compatibility Jamo. 1054 * 0x3130 - 0x318F. 1055 */ 1056 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1057 = new UnicodeBlock(0x3130, 0x318F, 1058 "HANGUL_COMPATIBILITY_JAMO", 1059 "Hangul Compatibility Jamo"); 1060 1061 /** 1062 * Kanbun. 1063 * 0x3190 - 0x319F. 1064 */ 1065 public static final UnicodeBlock KANBUN 1066 = new UnicodeBlock(0x3190, 0x319F, 1067 "KANBUN", 1068 "Kanbun"); 1069 1070 /** 1071 * Bopomofo Extended. 1072 * 0x31A0 - 0x31BF. 1073 * @since 1.4 1074 */ 1075 public static final UnicodeBlock BOPOMOFO_EXTENDED 1076 = new UnicodeBlock(0x31A0, 0x31BF, 1077 "BOPOMOFO_EXTENDED", 1078 "Bopomofo Extended"); 1079 1080 /** 1081 * Katakana Phonetic Extensions. 1082 * 0x31F0 - 0x31FF. 1083 * @since 1.5 1084 */ 1085 public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1086 = new UnicodeBlock(0x31F0, 0x31FF, 1087 "KATAKANA_PHONETIC_EXTENSIONS", 1088 "Katakana Phonetic Extensions"); 1089 1090 /** 1091 * Enclosed CJK Letters and Months. 1092 * 0x3200 - 0x32FF. 1093 */ 1094 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1095 = new UnicodeBlock(0x3200, 0x32FF, 1096 "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1097 "Enclosed CJK Letters and Months"); 1098 1099 /** 1100 * CJK Compatibility. 1101 * 0x3300 - 0x33FF. 
1102 */ 1103 public static final UnicodeBlock CJK_COMPATIBILITY 1104 = new UnicodeBlock(0x3300, 0x33FF, 1105 "CJK_COMPATIBILITY", 1106 "CJK Compatibility"); 1107 1108 /** 1109 * CJK Unified Ideographs Extension A. 1110 * 0x3400 - 0x4DBF. 1111 * @since 1.4 1112 */ 1113 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1114 = new UnicodeBlock(0x3400, 0x4DBF, 1115 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1116 "CJK Unified Ideographs Extension A"); 1117 1118 /** 1119 * Yijing Hexagram Symbols. 1120 * 0x4DC0 - 0x4DFF. 1121 * @since 1.5 1122 */ 1123 public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1124 = new UnicodeBlock(0x4DC0, 0x4DFF, 1125 "YIJING_HEXAGRAM_SYMBOLS", 1126 "Yijing Hexagram Symbols"); 1127 1128 /** 1129 * CJK Unified Ideographs. 1130 * 0x4E00 - 0x9FFF. 1131 */ 1132 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1133 = new UnicodeBlock(0x4E00, 0x9FFF, 1134 "CJK_UNIFIED_IDEOGRAPHS", 1135 "CJK Unified Ideographs"); 1136 1137 /** 1138 * Yi Syllables. 1139 * 0xA000 - 0xA48F. 1140 * @since 1.4 1141 */ 1142 public static final UnicodeBlock YI_SYLLABLES 1143 = new UnicodeBlock(0xA000, 0xA48F, 1144 "YI_SYLLABLES", 1145 "Yi Syllables"); 1146 1147 /** 1148 * Yi Radicals. 1149 * 0xA490 - 0xA4CF. 1150 * @since 1.4 1151 */ 1152 public static final UnicodeBlock YI_RADICALS 1153 = new UnicodeBlock(0xA490, 0xA4CF, 1154 "YI_RADICALS", 1155 "Yi Radicals"); 1156 1157 /** 1158 * Hangul Syllables. 1159 * 0xAC00 - 0xD7AF. 1160 */ 1161 public static final UnicodeBlock HANGUL_SYLLABLES 1162 = new UnicodeBlock(0xAC00, 0xD7AF, 1163 "HANGUL_SYLLABLES", 1164 "Hangul Syllables"); 1165 1166 /** 1167 * High Surrogates. 1168 * 0xD800 - 0xDB7F. 1169 * @since 1.5 1170 */ 1171 public static final UnicodeBlock HIGH_SURROGATES 1172 = new UnicodeBlock(0xD800, 0xDB7F, 1173 "HIGH_SURROGATES", 1174 "High Surrogates"); 1175 1176 /** 1177 * High Private Use Surrogates. 1178 * 0xDB80 - 0xDBFF. 
1179 * @since 1.5 1180 */ 1181 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1182 = new UnicodeBlock(0xDB80, 0xDBFF, 1183 "HIGH_PRIVATE_USE_SURROGATES", 1184 "High Private Use Surrogates"); 1185 1186 /** 1187 * Low Surrogates. 1188 * 0xDC00 - 0xDFFF. 1189 * @since 1.5 1190 */ 1191 public static final UnicodeBlock LOW_SURROGATES 1192 = new UnicodeBlock(0xDC00, 0xDFFF, 1193 "LOW_SURROGATES", 1194 "Low Surrogates"); 1195 1196 /** 1197 * Private Use Area. 1198 * 0xE000 - 0xF8FF. 1199 */ 1200 public static final UnicodeBlock PRIVATE_USE_AREA 1201 = new UnicodeBlock(0xE000, 0xF8FF, 1202 "PRIVATE_USE_AREA", 1203 "Private Use Area"); 1204 1205 /** 1206 * CJK Compatibility Ideographs. 1207 * 0xF900 - 0xFAFF. 1208 */ 1209 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1210 = new UnicodeBlock(0xF900, 0xFAFF, 1211 "CJK_COMPATIBILITY_IDEOGRAPHS", 1212 "CJK Compatibility Ideographs"); 1213 1214 /** 1215 * Alphabetic Presentation Forms. 1216 * 0xFB00 - 0xFB4F. 1217 */ 1218 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1219 = new UnicodeBlock(0xFB00, 0xFB4F, 1220 "ALPHABETIC_PRESENTATION_FORMS", 1221 "Alphabetic Presentation Forms"); 1222 1223 /** 1224 * Arabic Presentation Forms-A. 1225 * 0xFB50 - 0xFDFF. 1226 */ 1227 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1228 = new UnicodeBlock(0xFB50, 0xFDFF, 1229 "ARABIC_PRESENTATION_FORMS_A", 1230 "Arabic Presentation Forms-A"); 1231 1232 /** 1233 * Variation Selectors. 1234 * 0xFE00 - 0xFE0F. 1235 * @since 1.5 1236 */ 1237 public static final UnicodeBlock VARIATION_SELECTORS 1238 = new UnicodeBlock(0xFE00, 0xFE0F, 1239 "VARIATION_SELECTORS", 1240 "Variation Selectors"); 1241 1242 /** 1243 * Combining Half Marks. 1244 * 0xFE20 - 0xFE2F. 1245 */ 1246 public static final UnicodeBlock COMBINING_HALF_MARKS 1247 = new UnicodeBlock(0xFE20, 0xFE2F, 1248 "COMBINING_HALF_MARKS", 1249 "Combining Half Marks"); 1250 1251 /** 1252 * CJK Compatibility Forms. 1253 * 0xFE30 - 0xFE4F. 
1254 */ 1255 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1256 = new UnicodeBlock(0xFE30, 0xFE4F, 1257 "CJK_COMPATIBILITY_FORMS", 1258 "CJK Compatibility Forms"); 1259 1260 /** 1261 * Small Form Variants. 1262 * 0xFE50 - 0xFE6F. 1263 */ 1264 public static final UnicodeBlock SMALL_FORM_VARIANTS 1265 = new UnicodeBlock(0xFE50, 0xFE6F, 1266 "SMALL_FORM_VARIANTS", 1267 "Small Form Variants"); 1268 1269 /** 1270 * Arabic Presentation Forms-B. 1271 * 0xFE70 - 0xFEFF. 1272 */ 1273 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1274 = new UnicodeBlock(0xFE70, 0xFEFF, 1275 "ARABIC_PRESENTATION_FORMS_B", 1276 "Arabic Presentation Forms-B"); 1277 1278 /** 1279 * Halfwidth and Fullwidth Forms. 1280 * 0xFF00 - 0xFFEF. 1281 */ 1282 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1283 = new UnicodeBlock(0xFF00, 0xFFEF, 1284 "HALFWIDTH_AND_FULLWIDTH_FORMS", 1285 "Halfwidth and Fullwidth Forms"); 1286 1287 /** 1288 * Specials. 1289 * 0xFFF0 - 0xFFFF. 1290 */ 1291 public static final UnicodeBlock SPECIALS 1292 = new UnicodeBlock(0xFFF0, 0xFFFF, 1293 "SPECIALS", 1294 "Specials"); 1295 1296 /** 1297 * Linear B Syllabary. 1298 * 0x10000 - 0x1007F. 1299 * @since 1.5 1300 */ 1301 public static final UnicodeBlock LINEAR_B_SYLLABARY 1302 = new UnicodeBlock(0x10000, 0x1007F, 1303 "LINEAR_B_SYLLABARY", 1304 "Linear B Syllabary"); 1305 1306 /** 1307 * Linear B Ideograms. 1308 * 0x10080 - 0x100FF. 1309 * @since 1.5 1310 */ 1311 public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1312 = new UnicodeBlock(0x10080, 0x100FF, 1313 "LINEAR_B_IDEOGRAMS", 1314 "Linear B Ideograms"); 1315 1316 /** 1317 * Aegean Numbers. 1318 * 0x10100 - 0x1013F. 1319 * @since 1.5 1320 */ 1321 public static final UnicodeBlock AEGEAN_NUMBERS 1322 = new UnicodeBlock(0x10100, 0x1013F, 1323 "AEGEAN_NUMBERS", 1324 "Aegean Numbers"); 1325 1326 /** 1327 * Old Italic. 1328 * 0x10300 - 0x1032F. 
1329 * @since 1.5 1330 */ 1331 public static final UnicodeBlock OLD_ITALIC 1332 = new UnicodeBlock(0x10300, 0x1032F, 1333 "OLD_ITALIC", 1334 "Old Italic"); 1335 1336 /** 1337 * Gothic. 1338 * 0x10330 - 0x1034F. 1339 * @since 1.5 1340 */ 1341 public static final UnicodeBlock GOTHIC 1342 = new UnicodeBlock(0x10330, 0x1034F, 1343 "GOTHIC", 1344 "Gothic"); 1345 1346 /** 1347 * Ugaritic. 1348 * 0x10380 - 0x1039F. 1349 * @since 1.5 1350 */ 1351 public static final UnicodeBlock UGARITIC 1352 = new UnicodeBlock(0x10380, 0x1039F, 1353 "UGARITIC", 1354 "Ugaritic"); 1355 1356 /** 1357 * Deseret. 1358 * 0x10400 - 0x1044F. 1359 * @since 1.5 1360 */ 1361 public static final UnicodeBlock DESERET 1362 = new UnicodeBlock(0x10400, 0x1044F, 1363 "DESERET", 1364 "Deseret"); 1365 1366 /** 1367 * Shavian. 1368 * 0x10450 - 0x1047F. 1369 * @since 1.5 1370 */ 1371 public static final UnicodeBlock SHAVIAN 1372 = new UnicodeBlock(0x10450, 0x1047F, 1373 "SHAVIAN", 1374 "Shavian"); 1375 1376 /** 1377 * Osmanya. 1378 * 0x10480 - 0x104AF. 1379 * @since 1.5 1380 */ 1381 public static final UnicodeBlock OSMANYA 1382 = new UnicodeBlock(0x10480, 0x104AF, 1383 "OSMANYA", 1384 "Osmanya"); 1385 1386 /** 1387 * Cypriot Syllabary. 1388 * 0x10800 - 0x1083F. 1389 * @since 1.5 1390 */ 1391 public static final UnicodeBlock CYPRIOT_SYLLABARY 1392 = new UnicodeBlock(0x10800, 0x1083F, 1393 "CYPRIOT_SYLLABARY", 1394 "Cypriot Syllabary"); 1395 1396 /** 1397 * Byzantine Musical Symbols. 1398 * 0x1D000 - 0x1D0FF. 1399 * @since 1.5 1400 */ 1401 public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1402 = new UnicodeBlock(0x1D000, 0x1D0FF, 1403 "BYZANTINE_MUSICAL_SYMBOLS", 1404 "Byzantine Musical Symbols"); 1405 1406 /** 1407 * Musical Symbols. 1408 * 0x1D100 - 0x1D1FF. 1409 * @since 1.5 1410 */ 1411 public static final UnicodeBlock MUSICAL_SYMBOLS 1412 = new UnicodeBlock(0x1D100, 0x1D1FF, 1413 "MUSICAL_SYMBOLS", 1414 "Musical Symbols"); 1415 1416 /** 1417 * Tai Xuan Jing Symbols. 1418 * 0x1D300 - 0x1D35F. 
1419 * @since 1.5 1420 */ 1421 public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1422 = new UnicodeBlock(0x1D300, 0x1D35F, 1423 "TAI_XUAN_JING_SYMBOLS", 1424 "Tai Xuan Jing Symbols"); 1425 1426 /** 1427 * Mathematical Alphanumeric Symbols. 1428 * 0x1D400 - 0x1D7FF. 1429 * @since 1.5 1430 */ 1431 public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1432 = new UnicodeBlock(0x1D400, 0x1D7FF, 1433 "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1434 "Mathematical Alphanumeric Symbols"); 1435 1436 /** 1437 * CJK Unified Ideographs Extension B. 1438 * 0x20000 - 0x2A6DF. 1439 * @since 1.5 1440 */ 1441 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1442 = new UnicodeBlock(0x20000, 0x2A6DF, 1443 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1444 "CJK Unified Ideographs Extension B"); 1445 1446 /** 1447 * CJK Compatibility Ideographs Supplement. 1448 * 0x2F800 - 0x2FA1F. 1449 * @since 1.5 1450 */ 1451 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1452 = new UnicodeBlock(0x2F800, 0x2FA1F, 1453 "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1454 "CJK Compatibility Ideographs Supplement"); 1455 1456 /** 1457 * Tags. 1458 * 0xE0000 - 0xE007F. 1459 * @since 1.5 1460 */ 1461 public static final UnicodeBlock TAGS 1462 = new UnicodeBlock(0xE0000, 0xE007F, 1463 "TAGS", 1464 "Tags"); 1465 1466 /** 1467 * Variation Selectors Supplement. 1468 * 0xE0100 - 0xE01EF. 1469 * @since 1.5 1470 */ 1471 public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1472 = new UnicodeBlock(0xE0100, 0xE01EF, 1473 "VARIATION_SELECTORS_SUPPLEMENT", 1474 "Variation Selectors Supplement"); 1475 1476 /** 1477 * Supplementary Private Use Area-A. 1478 * 0xF0000 - 0xFFFFF. 1479 * @since 1.5 1480 */ 1481 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1482 = new UnicodeBlock(0xF0000, 0xFFFFF, 1483 "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1484 "Supplementary Private Use Area-A"); 1485 1486 /** 1487 * Supplementary Private Use Area-B. 
1488 * 0x100000 - 0x10FFFF. 1489 * @since 1.5 1490 */ 1491 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1492 = new UnicodeBlock(0x100000, 0x10FFFF, 1493 "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1494 "Supplementary Private Use Area-B"); 1495 1496 /** 1497 * Surrogates Area. 1498 * 'D800' - 'DFFF'. 1499 * @deprecated As of 1.5, the three areas, 1500 * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1501 * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1502 * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1503 * by the Unicode standard, should be used in preference to 1504 * this. These are also returned from calls to <code>of(int)</code> 1505 * and <code>of(char)</code>. 1506 */ 1507 @Deprecated 1508 public static final UnicodeBlock SURROGATES_AREA 1509 = new UnicodeBlock(0xD800, 0xDFFF, 1510 "SURROGATES_AREA", 1511 "Surrogates Area"); 1512 1513 /** 1514 * The defined subsets. 1515 */ 1516 private static final UnicodeBlock sets[] = { 1517 BASIC_LATIN, 1518 LATIN_1_SUPPLEMENT, 1519 LATIN_EXTENDED_A, 1520 LATIN_EXTENDED_B, 1521 IPA_EXTENSIONS, 1522 SPACING_MODIFIER_LETTERS, 1523 COMBINING_DIACRITICAL_MARKS, 1524 GREEK, 1525 CYRILLIC, 1526 CYRILLIC_SUPPLEMENTARY, 1527 ARMENIAN, 1528 HEBREW, 1529 ARABIC, 1530 SYRIAC, 1531 THAANA, 1532 DEVANAGARI, 1533 BENGALI, 1534 GURMUKHI, 1535 GUJARATI, 1536 ORIYA, 1537 TAMIL, 1538 TELUGU, 1539 KANNADA, 1540 MALAYALAM, 1541 SINHALA, 1542 THAI, 1543 LAO, 1544 TIBETAN, 1545 MYANMAR, 1546 GEORGIAN, 1547 HANGUL_JAMO, 1548 ETHIOPIC, 1549 CHEROKEE, 1550 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1551 OGHAM, 1552 RUNIC, 1553 TAGALOG, 1554 HANUNOO, 1555 BUHID, 1556 TAGBANWA, 1557 KHMER, 1558 MONGOLIAN, 1559 LIMBU, 1560 TAI_LE, 1561 KHMER_SYMBOLS, 1562 PHONETIC_EXTENSIONS, 1563 LATIN_EXTENDED_ADDITIONAL, 1564 GREEK_EXTENDED, 1565 GENERAL_PUNCTUATION, 1566 SUPERSCRIPTS_AND_SUBSCRIPTS, 1567 CURRENCY_SYMBOLS, 1568 COMBINING_MARKS_FOR_SYMBOLS, 1569 LETTERLIKE_SYMBOLS, 1570 NUMBER_FORMS, 1571 
ARROWS, 1572 MATHEMATICAL_OPERATORS, 1573 MISCELLANEOUS_TECHNICAL, 1574 CONTROL_PICTURES, 1575 OPTICAL_CHARACTER_RECOGNITION, 1576 ENCLOSED_ALPHANUMERICS, 1577 BOX_DRAWING, 1578 BLOCK_ELEMENTS, 1579 GEOMETRIC_SHAPES, 1580 MISCELLANEOUS_SYMBOLS, 1581 DINGBATS, 1582 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1583 SUPPLEMENTAL_ARROWS_A, 1584 BRAILLE_PATTERNS, 1585 SUPPLEMENTAL_ARROWS_B, 1586 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1587 SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1588 MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1589 CJK_RADICALS_SUPPLEMENT, 1590 KANGXI_RADICALS, 1591 IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1592 CJK_SYMBOLS_AND_PUNCTUATION, 1593 HIRAGANA, 1594 KATAKANA, 1595 BOPOMOFO, 1596 HANGUL_COMPATIBILITY_JAMO, 1597 KANBUN, 1598 BOPOMOFO_EXTENDED, 1599 KATAKANA_PHONETIC_EXTENSIONS, 1600 ENCLOSED_CJK_LETTERS_AND_MONTHS, 1601 CJK_COMPATIBILITY, 1602 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1603 YIJING_HEXAGRAM_SYMBOLS, 1604 CJK_UNIFIED_IDEOGRAPHS, 1605 YI_SYLLABLES, 1606 YI_RADICALS, 1607 HANGUL_SYLLABLES, 1608 HIGH_SURROGATES, 1609 HIGH_PRIVATE_USE_SURROGATES, 1610 LOW_SURROGATES, 1611 PRIVATE_USE_AREA, 1612 CJK_COMPATIBILITY_IDEOGRAPHS, 1613 ALPHABETIC_PRESENTATION_FORMS, 1614 ARABIC_PRESENTATION_FORMS_A, 1615 VARIATION_SELECTORS, 1616 COMBINING_HALF_MARKS, 1617 CJK_COMPATIBILITY_FORMS, 1618 SMALL_FORM_VARIANTS, 1619 ARABIC_PRESENTATION_FORMS_B, 1620 HALFWIDTH_AND_FULLWIDTH_FORMS, 1621 SPECIALS, 1622 LINEAR_B_SYLLABARY, 1623 LINEAR_B_IDEOGRAMS, 1624 AEGEAN_NUMBERS, 1625 OLD_ITALIC, 1626 GOTHIC, 1627 UGARITIC, 1628 DESERET, 1629 SHAVIAN, 1630 OSMANYA, 1631 CYPRIOT_SYLLABARY, 1632 BYZANTINE_MUSICAL_SYMBOLS, 1633 MUSICAL_SYMBOLS, 1634 TAI_XUAN_JING_SYMBOLS, 1635 MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1636 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1637 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1638 TAGS, 1639 VARIATION_SELECTORS_SUPPLEMENT, 1640 SUPPLEMENTARY_PRIVATE_USE_AREA_A, 1641 SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1642 }; 1643 } // class UnicodeBlock 1644 1645 /** 1646 * A class to encompass 
all the properties of characters in the 1647 * private use blocks in the Unicode standard. This class extends 1648 * UnassignedCharacters because the return type from getType() is 1649 * different. 1650 * @author Anthony Balkissoon abalkiss at redhat dot com 1651 * 1652 */ 1653 private static class PrivateUseCharacters extends UnassignedCharacters 1654 { 1655 /** 1656 * Returns the type of the character cp. 1657 */ getType(int cp)1658 static int getType(int cp) 1659 { 1660 // The upper 2 code points in any plane are considered unassigned, 1661 // even in the private-use planes. 1662 if ((cp & 0xffff) >= 0xfffe) 1663 return UnassignedCharacters.getType(cp); 1664 return PRIVATE_USE; 1665 } 1666 1667 /** 1668 * Returns true if the character cp is defined. 1669 */ isDefined(int cp)1670 static boolean isDefined(int cp) 1671 { 1672 // The upper 2 code points in any plane are considered unassigned, 1673 // even in the private-use planes. 1674 if ((cp & 0xffff) >= 0xfffe) 1675 return UnassignedCharacters.isDefined(cp); 1676 return true; 1677 } 1678 1679 /** 1680 * Gets the directionality for the character cp. 1681 */ getDirectionality(int cp)1682 static byte getDirectionality(int cp) 1683 { 1684 if ((cp & 0xffff) >= 0xfffe) 1685 return UnassignedCharacters.getDirectionality(cp); 1686 return DIRECTIONALITY_LEFT_TO_RIGHT; 1687 } 1688 } 1689 1690 /** 1691 * A class to encompass all the properties of code points that are 1692 * currently undefined in the Unicode standard. 1693 * @author Anthony Balkissoon abalkiss at redhat dot com 1694 * 1695 */ 1696 private static class UnassignedCharacters 1697 { 1698 /** 1699 * Returns the numeric value for the unassigned characters. 
1700 * @param cp the character 1701 * @param radix the radix (not used) 1702 * @return the numeric value of this character in this radix 1703 */ digit(int cp, int radix)1704 static int digit(int cp, int radix) 1705 { 1706 return -1; 1707 } 1708 1709 /** 1710 * Returns the Unicode directionality property for unassigned 1711 * characters. 1712 * @param cp the character 1713 * @return DIRECTIONALITY_UNDEFINED 1714 */ getDirectionality(int cp)1715 static byte getDirectionality(int cp) 1716 { 1717 return DIRECTIONALITY_UNDEFINED; 1718 } 1719 1720 /** 1721 * Returns -1, the numeric value for unassigned Unicode characters. 1722 * @param cp the character 1723 * @return -1 1724 */ getNumericValue(int cp)1725 static int getNumericValue(int cp) 1726 { 1727 return -1; 1728 } 1729 1730 /** 1731 * Returns UNASSIGNED, the type of unassigned Unicode characters. 1732 * @param cp the character 1733 * @return UNASSIGNED 1734 */ getType(int cp)1735 static int getType(int cp) 1736 { 1737 return UNASSIGNED; 1738 } 1739 1740 /** 1741 * Returns false to indiciate that the character is not defined in the 1742 * Unicode standard. 1743 * @param cp the character 1744 * @return false 1745 */ isDefined(int cp)1746 static boolean isDefined(int cp) 1747 { 1748 return false; 1749 } 1750 1751 /** 1752 * Returns false to indicate that the character is not a digit. 1753 * @param cp the character 1754 * @return false 1755 */ isDigit(int cp)1756 static boolean isDigit(int cp) 1757 { 1758 return false; 1759 } 1760 1761 /** 1762 * Returns false to indicate that the character cannot be ignored 1763 * within an identifier 1764 * @param cp the character 1765 * @return false 1766 */ isIdentifierIgnorable(int cp)1767 static boolean isIdentifierIgnorable(int cp) 1768 { 1769 return false; 1770 } 1771 1772 /** 1773 * Returns false to indicate that the character cannot be part of a 1774 * Java identifier. 
1775 * @param cp the character 1776 * @return false 1777 */ isJavaIdentifierPart(int cp)1778 static boolean isJavaIdentifierPart(int cp) 1779 { 1780 return false; 1781 } 1782 1783 /** 1784 * Returns false to indicate that the character cannot be start a 1785 * Java identifier. 1786 * @param cp the character 1787 * @return false 1788 */ isJavaIdentiferStart(int cp)1789 static boolean isJavaIdentiferStart(int cp) 1790 { 1791 return false; 1792 } 1793 1794 /** 1795 * Returns false to indicate that the character is not a letter. 1796 * @param cp the character 1797 * @return false 1798 */ isLetter(int cp)1799 static boolean isLetter(int cp) 1800 { 1801 return false; 1802 } 1803 1804 /** 1805 * Returns false to indicate that the character cannot is neither a letter 1806 * nor a digit. 1807 * @param cp the character 1808 * @return false 1809 */ isLetterOrDigit(int cp)1810 static boolean isLetterOrDigit(int cp) 1811 { 1812 return false; 1813 } 1814 1815 /** 1816 * Returns false to indicate that the character is not a lowercase letter. 1817 * @param cp the character 1818 * @return false 1819 */ isLowerCase(int cp)1820 static boolean isLowerCase(int cp) 1821 { 1822 return false; 1823 } 1824 1825 /** 1826 * Returns false to indicate that the character cannot is not mirrored. 1827 * @param cp the character 1828 * @return false 1829 */ isMirrored(int cp)1830 static boolean isMirrored(int cp) 1831 { 1832 return false; 1833 } 1834 1835 /** 1836 * Returns false to indicate that the character is not a space character. 1837 * @param cp the character 1838 * @return false 1839 */ isSpaceChar(int cp)1840 static boolean isSpaceChar(int cp) 1841 { 1842 return false; 1843 } 1844 1845 /** 1846 * Returns false to indicate that the character it not a titlecase letter. 
1847 * @param cp the character 1848 * @return false 1849 */ isTitleCase(int cp)1850 static boolean isTitleCase(int cp) 1851 { 1852 return false; 1853 } 1854 1855 /** 1856 * Returns false to indicate that the character cannot be part of a 1857 * Unicode identifier. 1858 * @param cp the character 1859 * @return false 1860 */ isUnicodeIdentifierPart(int cp)1861 static boolean isUnicodeIdentifierPart(int cp) 1862 { 1863 return false; 1864 } 1865 1866 /** 1867 * Returns false to indicate that the character cannot start a 1868 * Unicode identifier. 1869 * @param cp the character 1870 * @return false 1871 */ isUnicodeIdentifierStart(int cp)1872 static boolean isUnicodeIdentifierStart(int cp) 1873 { 1874 return false; 1875 } 1876 1877 /** 1878 * Returns false to indicate that the character is not an uppercase letter. 1879 * @param cp the character 1880 * @return false 1881 */ isUpperCase(int cp)1882 static boolean isUpperCase(int cp) 1883 { 1884 return false; 1885 } 1886 1887 /** 1888 * Returns false to indicate that the character is not a whitespace 1889 * character. 1890 * @param cp the character 1891 * @return false 1892 */ isWhiteSpace(int cp)1893 static boolean isWhiteSpace(int cp) 1894 { 1895 return false; 1896 } 1897 1898 /** 1899 * Returns cp to indicate this character has no lowercase conversion. 1900 * @param cp the character 1901 * @return cp 1902 */ toLowerCase(int cp)1903 static int toLowerCase(int cp) 1904 { 1905 return cp; 1906 } 1907 1908 /** 1909 * Returns cp to indicate this character has no titlecase conversion. 1910 * @param cp the character 1911 * @return cp 1912 */ toTitleCase(int cp)1913 static int toTitleCase(int cp) 1914 { 1915 return cp; 1916 } 1917 1918 /** 1919 * Returns cp to indicate this character has no uppercase conversion. 1920 * @param cp the character 1921 * @return cp 1922 */ toUpperCase(int cp)1923 static int toUpperCase(int cp) 1924 { 1925 return cp; 1926 } 1927 } 1928 1929 /** 1930 * The immutable value of this Character. 
*
 * @serial the value of this Character
 */
private final char value;

/**
 * Compatible with JDK 1.0+.
 */
private static final long serialVersionUID = 3786198910865385080L;

/**
 * Smallest value allowed for radix arguments in Java. This value is 2.
 *
 * @see #digit(char, int)
 * @see #forDigit(int, int)
 * @see Integer#toString(int, int)
 * @see Integer#valueOf(String)
 */
public static final int MIN_RADIX = 2;

/**
 * Largest value allowed for radix arguments in Java. This value is 36.
 *
 * @see #digit(char, int)
 * @see #forDigit(int, int)
 * @see Integer#toString(int, int)
 * @see Integer#valueOf(String)
 */
public static final int MAX_RADIX = 36;

/**
 * The minimum value the char data type can hold.
 * This value is <code>'\\u0000'</code>.
 */
public static final char MIN_VALUE = '\u0000';

/**
 * The maximum value the char data type can hold.
 * This value is <code>'\\uFFFF'</code>.
 */
public static final char MAX_VALUE = '\uFFFF';

/**
 * The minimum Unicode 4.0 code point. This value is <code>0</code>.
 * @since 1.5
 */
public static final int MIN_CODE_POINT = 0;

/**
 * The maximum Unicode 4.0 code point, which is greater than the range
 * of the char data type.
 * This value is <code>0x10FFFF</code>.
 * @since 1.5
 */
public static final int MAX_CODE_POINT = 0x10FFFF;

/**
 * The minimum Unicode high surrogate code unit, or
 * <em>leading-surrogate</em>, in the UTF-16 character encoding.
 * This value is <code>'\\uD800'</code>.
 * @since 1.5
 */
public static final char MIN_HIGH_SURROGATE = '\uD800';

/**
 * The maximum Unicode high surrogate code unit, or
 * <em>leading-surrogate</em>, in the UTF-16 character encoding.
 * This value is <code>'\\uDBFF'</code>.
 * @since 1.5
 */
public static final char MAX_HIGH_SURROGATE = '\uDBFF';

/**
 * The minimum Unicode low surrogate code unit, or
 * <em>trailing-surrogate</em>, in the UTF-16 character encoding.
 * This value is <code>'\\uDC00'</code>.
 * @since 1.5
 */
public static final char MIN_LOW_SURROGATE = '\uDC00';

/**
 * The maximum Unicode low surrogate code unit, or
 * <em>trailing-surrogate</em>, in the UTF-16 character encoding.
 * This value is <code>'\\uDFFF'</code>.
 * @since 1.5
 */
public static final char MAX_LOW_SURROGATE = '\uDFFF';

/**
 * The minimum Unicode surrogate code unit in the UTF-16 character encoding.
 * This value is <code>'\\uD800'</code>, the same as
 * {@link #MIN_HIGH_SURROGATE}.
 * @since 1.5
 */
public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;

/**
 * The maximum Unicode surrogate code unit in the UTF-16 character encoding.
 * This value is <code>'\\uDFFF'</code>, the same as
 * {@link #MAX_LOW_SURROGATE}.
 * @since 1.5
 */
public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;

/**
 * The lowest possible supplementary Unicode code point (the first code
 * point outside the basic multilingual plane (BMP)).
 * This value is <code>0x10000</code>.
 * @since 1.5
 */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

/**
 * Class object representing the primitive char data type.
 *
 * @since 1.1
 */
// NOTE(review): the cast is presumably unchecked because
// VMClassLoader.getPrimitiveClass is not declared to return
// Class<Character> -- confirm against VMClassLoader.
public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C');

/**
 * The number of bits needed to represent a <code>char</code>.
 * @since 1.5
 */
public static final int SIZE = 16;

// This caches some Character values, and is used by boxing
// conversions via valueOf().  We must cache at least 0..127;
// this constant controls how much we actually cache.
2056 private static final int MAX_CACHE = 127; 2057 private static Character[] charCache = new Character[MAX_CACHE + 1]; 2058 static 2059 { 2060 for (char i=0; i <= MAX_CACHE; i++) 2061 charCache[i] = new Character(i); 2062 } 2063 2064 /** 2065 * Lu = Letter, Uppercase (Informative). 2066 * 2067 * @since 1.1 2068 */ 2069 public static final byte UPPERCASE_LETTER = 1; 2070 2071 /** 2072 * Ll = Letter, Lowercase (Informative). 2073 * 2074 * @since 1.1 2075 */ 2076 public static final byte LOWERCASE_LETTER = 2; 2077 2078 /** 2079 * Lt = Letter, Titlecase (Informative). 2080 * 2081 * @since 1.1 2082 */ 2083 public static final byte TITLECASE_LETTER = 3; 2084 2085 /** 2086 * Mn = Mark, Non-Spacing (Normative). 2087 * 2088 * @since 1.1 2089 */ 2090 public static final byte NON_SPACING_MARK = 6; 2091 2092 /** 2093 * Mc = Mark, Spacing Combining (Normative). 2094 * 2095 * @since 1.1 2096 */ 2097 public static final byte COMBINING_SPACING_MARK = 8; 2098 2099 /** 2100 * Me = Mark, Enclosing (Normative). 2101 * 2102 * @since 1.1 2103 */ 2104 public static final byte ENCLOSING_MARK = 7; 2105 2106 /** 2107 * Nd = Number, Decimal Digit (Normative). 2108 * 2109 * @since 1.1 2110 */ 2111 public static final byte DECIMAL_DIGIT_NUMBER = 9; 2112 2113 /** 2114 * Nl = Number, Letter (Normative). 2115 * 2116 * @since 1.1 2117 */ 2118 public static final byte LETTER_NUMBER = 10; 2119 2120 /** 2121 * No = Number, Other (Normative). 2122 * 2123 * @since 1.1 2124 */ 2125 public static final byte OTHER_NUMBER = 11; 2126 2127 /** 2128 * Zs = Separator, Space (Normative). 2129 * 2130 * @since 1.1 2131 */ 2132 public static final byte SPACE_SEPARATOR = 12; 2133 2134 /** 2135 * Zl = Separator, Line (Normative). 2136 * 2137 * @since 1.1 2138 */ 2139 public static final byte LINE_SEPARATOR = 13; 2140 2141 /** 2142 * Zp = Separator, Paragraph (Normative). 2143 * 2144 * @since 1.1 2145 */ 2146 public static final byte PARAGRAPH_SEPARATOR = 14; 2147 2148 /** 2149 * Cc = Other, Control (Normative). 
*
 * @since 1.1
 */
public static final byte CONTROL = 15;

/**
 * Cf = Other, Format (Normative).
 *
 * @since 1.1
 */
public static final byte FORMAT = 16;

/**
 * Cs = Other, Surrogate (Normative).
 *
 * @since 1.1
 */
public static final byte SURROGATE = 19;

/**
 * Co = Other, Private Use (Normative).
 *
 * @since 1.1
 */
public static final byte PRIVATE_USE = 18;

/**
 * Cn = Other, Not Assigned (Normative).
 *
 * @since 1.1
 */
public static final byte UNASSIGNED = 0;

/**
 * Lm = Letter, Modifier (Informative).
 *
 * @since 1.1
 */
public static final byte MODIFIER_LETTER = 4;

/**
 * Lo = Letter, Other (Informative).
 *
 * @since 1.1
 */
public static final byte OTHER_LETTER = 5;

/**
 * Pc = Punctuation, Connector (Informative).
 *
 * @since 1.1
 */
public static final byte CONNECTOR_PUNCTUATION = 23;

/**
 * Pd = Punctuation, Dash (Informative).
 *
 * @since 1.1
 */
public static final byte DASH_PUNCTUATION = 20;

/**
 * Ps = Punctuation, Open (Informative).
 *
 * @since 1.1
 */
public static final byte START_PUNCTUATION = 21;

/**
 * Pe = Punctuation, Close (Informative).
 *
 * @since 1.1
 */
public static final byte END_PUNCTUATION = 22;

/**
 * Pi = Punctuation, Initial Quote (Informative).
 *
 * @since 1.4
 */
public static final byte INITIAL_QUOTE_PUNCTUATION = 29;

/**
 * Pf = Punctuation, Final Quote (Informative).
 *
 * @since 1.4
 */
public static final byte FINAL_QUOTE_PUNCTUATION = 30;

/**
 * Po = Punctuation, Other (Informative).
 *
 * @since 1.1
 */
public static final byte OTHER_PUNCTUATION = 24;

/**
 * Sm = Symbol, Math (Informative).
 *
 * @since 1.1
 */
public static final byte MATH_SYMBOL = 25;

/**
 * Sc = Symbol, Currency (Informative).
 *
 * @since 1.1
 */
public static final byte CURRENCY_SYMBOL = 26;

/**
 * Sk = Symbol, Modifier (Informative).
 *
 * @since 1.1
 */
public static final byte MODIFIER_SYMBOL = 27;

/**
 * So = Symbol, Other (Informative).
 *
 * @since 1.1
 */
public static final byte OTHER_SYMBOL = 28;

/**
 * Undefined bidirectional character type. Undefined char values have
 * undefined directionality in the Unicode specification.
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_UNDEFINED = -1;

/**
 * Strong bidirectional character type "L".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;

/**
 * Strong bidirectional character type "R".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;

/**
 * Strong bidirectional character type "AL".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;

/**
 * Weak bidirectional character type "EN".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;

/**
 * Weak bidirectional character type "ES".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;

/**
 * Weak bidirectional character type "ET".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;

/**
 * Weak bidirectional character type "AN".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;

/**
 * Weak bidirectional character type "CS".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;

/**
 * Weak bidirectional character type "NSM".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;

/**
 * Weak bidirectional character type "BN".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;

/**
 * Neutral bidirectional character type "B".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;

/**
 * Neutral bidirectional character type "S".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;

/**
 * Neutral bidirectional character type "WS".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_WHITESPACE = 12;

/**
 * Neutral bidirectional character type "ON".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;

/**
 * Strong bidirectional character type "LRE".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

/**
 * Strong bidirectional character type "LRO".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;

/**
 * Strong bidirectional character type "RLE".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;

/**
 * Strong bidirectional character type "RLO".
 *
 * @since 1.4
 */
public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;

/**
 * Weak bidirectional character type "PDF".
2410 * 2411 * @since 1.4 2412 */ 2413 public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2414 2415 /** 2416 * Stores unicode block offset lookup table. Exploit package visibility of 2417 * String.value to avoid copying the array. 2418 * @see #readCodePoint(int) 2419 * @see CharData#BLOCKS 2420 */ 2421 private static final char[][] blocks = 2422 new char[][]{ 2423 String.zeroBasedStringValue(CharData.BLOCKS[0]), 2424 String.zeroBasedStringValue(CharData.BLOCKS[1]), 2425 String.zeroBasedStringValue(CharData.BLOCKS[2]), 2426 String.zeroBasedStringValue(CharData.BLOCKS[3]), 2427 String.zeroBasedStringValue(CharData.BLOCKS[4]), 2428 String.zeroBasedStringValue(CharData.BLOCKS[5]), 2429 String.zeroBasedStringValue(CharData.BLOCKS[6]), 2430 String.zeroBasedStringValue(CharData.BLOCKS[7]), 2431 String.zeroBasedStringValue(CharData.BLOCKS[8]), 2432 String.zeroBasedStringValue(CharData.BLOCKS[9]), 2433 String.zeroBasedStringValue(CharData.BLOCKS[10]), 2434 String.zeroBasedStringValue(CharData.BLOCKS[11]), 2435 String.zeroBasedStringValue(CharData.BLOCKS[12]), 2436 String.zeroBasedStringValue(CharData.BLOCKS[13]), 2437 String.zeroBasedStringValue(CharData.BLOCKS[14]), 2438 String.zeroBasedStringValue(CharData.BLOCKS[15]), 2439 String.zeroBasedStringValue(CharData.BLOCKS[16])}; 2440 2441 /** 2442 * Stores unicode attribute offset lookup table. Exploit package visibility 2443 * of String.value to avoid copying the array. 
2444 * @see CharData#DATA 2445 */ 2446 private static final char[][] data = 2447 new char[][]{ 2448 String.zeroBasedStringValue(CharData.DATA[0]), 2449 String.zeroBasedStringValue(CharData.DATA[1]), 2450 String.zeroBasedStringValue(CharData.DATA[2]), 2451 String.zeroBasedStringValue(CharData.DATA[3]), 2452 String.zeroBasedStringValue(CharData.DATA[4]), 2453 String.zeroBasedStringValue(CharData.DATA[5]), 2454 String.zeroBasedStringValue(CharData.DATA[6]), 2455 String.zeroBasedStringValue(CharData.DATA[7]), 2456 String.zeroBasedStringValue(CharData.DATA[8]), 2457 String.zeroBasedStringValue(CharData.DATA[9]), 2458 String.zeroBasedStringValue(CharData.DATA[10]), 2459 String.zeroBasedStringValue(CharData.DATA[11]), 2460 String.zeroBasedStringValue(CharData.DATA[12]), 2461 String.zeroBasedStringValue(CharData.DATA[13]), 2462 String.zeroBasedStringValue(CharData.DATA[14]), 2463 String.zeroBasedStringValue(CharData.DATA[15]), 2464 String.zeroBasedStringValue(CharData.DATA[16])}; 2465 2466 /** 2467 * Stores unicode numeric value attribute table. Exploit package visibility 2468 * of String.value to avoid copying the array. 
2469 * @see CharData#NUM_VALUE 2470 */ 2471 private static final char[][] numValue = 2472 new char[][]{ 2473 String.zeroBasedStringValue(CharData.NUM_VALUE[0]), 2474 String.zeroBasedStringValue(CharData.NUM_VALUE[1]), 2475 String.zeroBasedStringValue(CharData.NUM_VALUE[2]), 2476 String.zeroBasedStringValue(CharData.NUM_VALUE[3]), 2477 String.zeroBasedStringValue(CharData.NUM_VALUE[4]), 2478 String.zeroBasedStringValue(CharData.NUM_VALUE[5]), 2479 String.zeroBasedStringValue(CharData.NUM_VALUE[6]), 2480 String.zeroBasedStringValue(CharData.NUM_VALUE[7]), 2481 String.zeroBasedStringValue(CharData.NUM_VALUE[8]), 2482 String.zeroBasedStringValue(CharData.NUM_VALUE[9]), 2483 String.zeroBasedStringValue(CharData.NUM_VALUE[10]), 2484 String.zeroBasedStringValue(CharData.NUM_VALUE[11]), 2485 String.zeroBasedStringValue(CharData.NUM_VALUE[12]), 2486 String.zeroBasedStringValue(CharData.NUM_VALUE[13]), 2487 String.zeroBasedStringValue(CharData.NUM_VALUE[14]), 2488 String.zeroBasedStringValue(CharData.NUM_VALUE[15]), 2489 String.zeroBasedStringValue(CharData.NUM_VALUE[16])}; 2490 2491 /** 2492 * Stores unicode uppercase attribute table. Exploit package visibility 2493 * of String.value to avoid copying the array. 
2494 * @see CharData#UPPER 2495 */ 2496 private static final char[][] upper = 2497 new char[][]{ 2498 String.zeroBasedStringValue(CharData.UPPER[0]), 2499 String.zeroBasedStringValue(CharData.UPPER[1]), 2500 String.zeroBasedStringValue(CharData.UPPER[2]), 2501 String.zeroBasedStringValue(CharData.UPPER[3]), 2502 String.zeroBasedStringValue(CharData.UPPER[4]), 2503 String.zeroBasedStringValue(CharData.UPPER[5]), 2504 String.zeroBasedStringValue(CharData.UPPER[6]), 2505 String.zeroBasedStringValue(CharData.UPPER[7]), 2506 String.zeroBasedStringValue(CharData.UPPER[8]), 2507 String.zeroBasedStringValue(CharData.UPPER[9]), 2508 String.zeroBasedStringValue(CharData.UPPER[10]), 2509 String.zeroBasedStringValue(CharData.UPPER[11]), 2510 String.zeroBasedStringValue(CharData.UPPER[12]), 2511 String.zeroBasedStringValue(CharData.UPPER[13]), 2512 String.zeroBasedStringValue(CharData.UPPER[14]), 2513 String.zeroBasedStringValue(CharData.UPPER[15]), 2514 String.zeroBasedStringValue(CharData.UPPER[16])}; 2515 2516 /** 2517 * Stores unicode lowercase attribute table. Exploit package visibility 2518 * of String.value to avoid copying the array. 
* @see CharData#LOWER
   */
  // Seventeen tables, CharData.LOWER[0] through CharData.LOWER[16];
  // toLowerCase(int) selects one with codePoint >>> 16.
  private static final char[][] lower =
    {
      String.zeroBasedStringValue(CharData.LOWER[0]),
      String.zeroBasedStringValue(CharData.LOWER[1]),
      String.zeroBasedStringValue(CharData.LOWER[2]),
      String.zeroBasedStringValue(CharData.LOWER[3]),
      String.zeroBasedStringValue(CharData.LOWER[4]),
      String.zeroBasedStringValue(CharData.LOWER[5]),
      String.zeroBasedStringValue(CharData.LOWER[6]),
      String.zeroBasedStringValue(CharData.LOWER[7]),
      String.zeroBasedStringValue(CharData.LOWER[8]),
      String.zeroBasedStringValue(CharData.LOWER[9]),
      String.zeroBasedStringValue(CharData.LOWER[10]),
      String.zeroBasedStringValue(CharData.LOWER[11]),
      String.zeroBasedStringValue(CharData.LOWER[12]),
      String.zeroBasedStringValue(CharData.LOWER[13]),
      String.zeroBasedStringValue(CharData.LOWER[14]),
      String.zeroBasedStringValue(CharData.LOWER[15]),
      String.zeroBasedStringValue(CharData.LOWER[16])
    };

  /**
   * Stores unicode direction attribute table. Exploit package visibility
   * of String.value to avoid copying the array.
   * @see CharData#DIRECTION
   */
  // Package visible for use by String.
2547 static final char[][] direction = 2548 new char[][]{ 2549 String.zeroBasedStringValue(CharData.DIRECTION[0]), 2550 String.zeroBasedStringValue(CharData.DIRECTION[1]), 2551 String.zeroBasedStringValue(CharData.DIRECTION[2]), 2552 String.zeroBasedStringValue(CharData.DIRECTION[3]), 2553 String.zeroBasedStringValue(CharData.DIRECTION[4]), 2554 String.zeroBasedStringValue(CharData.DIRECTION[5]), 2555 String.zeroBasedStringValue(CharData.DIRECTION[6]), 2556 String.zeroBasedStringValue(CharData.DIRECTION[7]), 2557 String.zeroBasedStringValue(CharData.DIRECTION[8]), 2558 String.zeroBasedStringValue(CharData.DIRECTION[9]), 2559 String.zeroBasedStringValue(CharData.DIRECTION[10]), 2560 String.zeroBasedStringValue(CharData.DIRECTION[11]), 2561 String.zeroBasedStringValue(CharData.DIRECTION[12]), 2562 String.zeroBasedStringValue(CharData.DIRECTION[13]), 2563 String.zeroBasedStringValue(CharData.DIRECTION[14]), 2564 String.zeroBasedStringValue(CharData.DIRECTION[15]), 2565 String.zeroBasedStringValue(CharData.DIRECTION[16])}; 2566 2567 /** 2568 * Stores unicode titlecase table. Exploit package visibility of 2569 * String.value to avoid copying the array. 2570 * @see CharData#TITLE 2571 */ 2572 private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 2573 2574 /** 2575 * Mask for grabbing the type out of the contents of data. 2576 * @see CharData#DATA 2577 */ 2578 private static final int TYPE_MASK = 0x1F; 2579 2580 /** 2581 * Mask for grabbing the non-breaking space flag out of the contents of 2582 * data. 2583 * @see CharData#DATA 2584 */ 2585 private static final int NO_BREAK_MASK = 0x20; 2586 2587 /** 2588 * Mask for grabbing the mirrored directionality flag out of the contents 2589 * of data. 2590 * @see CharData#DATA 2591 */ 2592 private static final int MIRROR_MASK = 0x40; 2593 2594 /** 2595 * Grabs an attribute offset from the Unicode attribute database. 
The lower 2596 * 5 bits are the character type, the next 2 bits are flags, and the top 2597 * 9 bits are the offset into the attribute tables. 2598 * 2599 * @param codePoint the character to look up 2600 * @return the character's attribute offset and type 2601 * @see #TYPE_MASK 2602 * @see #NO_BREAK_MASK 2603 * @see #MIRROR_MASK 2604 * @see CharData#DATA 2605 * @see CharData#SHIFT 2606 */ 2607 // Package visible for use in String. readCodePoint(int codePoint)2608 static char readCodePoint(int codePoint) 2609 { 2610 int plane = codePoint >>> 16; 2611 char offset = (char) (codePoint & 0xffff); 2612 return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)]; 2613 } 2614 2615 /** 2616 * Wraps up a character. 2617 * 2618 * @param value the character to wrap 2619 */ Character(char value)2620 public Character(char value) 2621 { 2622 this.value = value; 2623 } 2624 2625 /** 2626 * Returns the character which has been wrapped by this class. 2627 * 2628 * @return the character wrapped 2629 */ charValue()2630 public char charValue() 2631 { 2632 return value; 2633 } 2634 2635 /** 2636 * Returns the numerical value (unsigned) of the wrapped character. 2637 * Range of returned values: 0x0000-0xFFFF. 2638 * 2639 * @return the value of the wrapped character 2640 */ hashCode()2641 public int hashCode() 2642 { 2643 return value; 2644 } 2645 2646 /** 2647 * Determines if an object is equal to this object. This is only true for 2648 * another Character object wrapping the same value. 2649 * 2650 * @param o object to compare 2651 * @return true if o is a Character with the same value 2652 */ equals(Object o)2653 public boolean equals(Object o) 2654 { 2655 return o instanceof Character && value == ((Character) o).value; 2656 } 2657 2658 /** 2659 * Converts the wrapped character into a String. 
2660 * 2661 * @return a String containing one character -- the wrapped character 2662 * of this instance 2663 */ toString()2664 public String toString() 2665 { 2666 // Package constructor avoids an array copy. 2667 return new String(new char[] { value }, 0, 1, true); 2668 } 2669 2670 /** 2671 * Returns a String of length 1 representing the specified character. 2672 * 2673 * @param ch the character to convert 2674 * @return a String containing the character 2675 * @since 1.4 2676 */ toString(char ch)2677 public static String toString(char ch) 2678 { 2679 // Package constructor avoids an array copy. 2680 return new String(new char[] { ch }, 0, 1, true); 2681 } 2682 2683 /** 2684 * Determines if a character is a Unicode lowercase letter. For example, 2685 * <code>'a'</code> is lowercase. Returns true if getType() returns 2686 * LOWERCASE_LETTER. 2687 * <br> 2688 * lowercase = [Ll] 2689 * 2690 * @param ch character to test 2691 * @return true if ch is a Unicode lowercase letter, else false 2692 * @see #isUpperCase(char) 2693 * @see #isTitleCase(char) 2694 * @see #toLowerCase(char) 2695 * @see #getType(char) 2696 */ isLowerCase(char ch)2697 public static boolean isLowerCase(char ch) 2698 { 2699 return isLowerCase((int)ch); 2700 } 2701 2702 /** 2703 * Determines if a character is a Unicode lowercase letter. For example, 2704 * <code>'a'</code> is lowercase. Returns true if getType() returns 2705 * LOWERCASE_LETTER. 2706 * <br> 2707 * lowercase = [Ll] 2708 * 2709 * @param codePoint character to test 2710 * @return true if ch is a Unicode lowercase letter, else false 2711 * @see #isUpperCase(char) 2712 * @see #isTitleCase(char) 2713 * @see #toLowerCase(char) 2714 * @see #getType(char) 2715 * 2716 * @since 1.5 2717 */ isLowerCase(int codePoint)2718 public static boolean isLowerCase(int codePoint) 2719 { 2720 return getType(codePoint) == LOWERCASE_LETTER; 2721 } 2722 2723 /** 2724 * Determines if a character is a Unicode uppercase letter. 
For example, 2725 * <code>'A'</code> is uppercase. Returns true if getType() returns 2726 * UPPERCASE_LETTER. 2727 * <br> 2728 * uppercase = [Lu] 2729 * 2730 * @param ch character to test 2731 * @return true if ch is a Unicode uppercase letter, else false 2732 * @see #isLowerCase(char) 2733 * @see #isTitleCase(char) 2734 * @see #toUpperCase(char) 2735 * @see #getType(char) 2736 */ isUpperCase(char ch)2737 public static boolean isUpperCase(char ch) 2738 { 2739 return isUpperCase((int)ch); 2740 } 2741 2742 /** 2743 * Determines if a character is a Unicode uppercase letter. For example, 2744 * <code>'A'</code> is uppercase. Returns true if getType() returns 2745 * UPPERCASE_LETTER. 2746 * <br> 2747 * uppercase = [Lu] 2748 * 2749 * @param codePoint character to test 2750 * @return true if ch is a Unicode uppercase letter, else false 2751 * @see #isLowerCase(char) 2752 * @see #isTitleCase(char) 2753 * @see #toUpperCase(char) 2754 * @see #getType(char) 2755 * 2756 * @since 1.5 2757 */ isUpperCase(int codePoint)2758 public static boolean isUpperCase(int codePoint) 2759 { 2760 return getType(codePoint) == UPPERCASE_LETTER; 2761 } 2762 2763 /** 2764 * Determines if a character is a Unicode titlecase letter. For example, 2765 * the character "Lj" (Latin capital L with small letter j) is titlecase. 2766 * True if getType() returns TITLECASE_LETTER. 2767 * <br> 2768 * titlecase = [Lt] 2769 * 2770 * @param ch character to test 2771 * @return true if ch is a Unicode titlecase letter, else false 2772 * @see #isLowerCase(char) 2773 * @see #isUpperCase(char) 2774 * @see #toTitleCase(char) 2775 * @see #getType(char) 2776 */ isTitleCase(char ch)2777 public static boolean isTitleCase(char ch) 2778 { 2779 return isTitleCase((int)ch); 2780 } 2781 2782 /** 2783 * Determines if a character is a Unicode titlecase letter. For example, 2784 * the character "Lj" (Latin capital L with small letter j) is titlecase. 2785 * True if getType() returns TITLECASE_LETTER. 
2786 * <br> 2787 * titlecase = [Lt] 2788 * 2789 * @param codePoint character to test 2790 * @return true if ch is a Unicode titlecase letter, else false 2791 * @see #isLowerCase(char) 2792 * @see #isUpperCase(char) 2793 * @see #toTitleCase(char) 2794 * @see #getType(char) 2795 * 2796 * @since 1.5 2797 */ isTitleCase(int codePoint)2798 public static boolean isTitleCase(int codePoint) 2799 { 2800 return getType(codePoint) == TITLECASE_LETTER; 2801 } 2802 2803 2804 /** 2805 * Determines if a character is a Unicode decimal digit. For example, 2806 * <code>'0'</code> is a digit. A character is a Unicode digit if 2807 * getType() returns DECIMAL_DIGIT_NUMBER. 2808 * <br> 2809 * Unicode decimal digit = [Nd] 2810 * 2811 * @param ch character to test 2812 * @return true if ch is a Unicode decimal digit, else false 2813 * @see #digit(char, int) 2814 * @see #forDigit(int, int) 2815 * @see #getType(char) 2816 */ isDigit(char ch)2817 public static boolean isDigit(char ch) 2818 { 2819 return isDigit((int)ch); 2820 } 2821 2822 /** 2823 * Determines if a character is a Unicode decimal digit. For example, 2824 * <code>'0'</code> is a digit. A character is a Unicode digit if 2825 * getType() returns DECIMAL_DIGIT_NUMBER. 2826 * <br> 2827 * Unicode decimal digit = [Nd] 2828 * 2829 * @param codePoint character to test 2830 * @return true if ch is a Unicode decimal digit, else false 2831 * @see #digit(char, int) 2832 * @see #forDigit(int, int) 2833 * @see #getType(char) 2834 * 2835 * @since 1.5 2836 */ 2837 isDigit(int codePoint)2838 public static boolean isDigit(int codePoint) 2839 { 2840 return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2841 } 2842 2843 /** 2844 * Determines if a character is part of the Unicode Standard. This is an 2845 * evolving standard, but covers every character in the data file. 
2846 * <br> 2847 * defined = not [Cn] 2848 * 2849 * @param ch character to test 2850 * @return true if ch is a Unicode character, else false 2851 * @see #isDigit(char) 2852 * @see #isLetter(char) 2853 * @see #isLetterOrDigit(char) 2854 * @see #isLowerCase(char) 2855 * @see #isTitleCase(char) 2856 * @see #isUpperCase(char) 2857 */ isDefined(char ch)2858 public static boolean isDefined(char ch) 2859 { 2860 return isDefined((int)ch); 2861 } 2862 2863 /** 2864 * Determines if a character is part of the Unicode Standard. This is an 2865 * evolving standard, but covers every character in the data file. 2866 * <br> 2867 * defined = not [Cn] 2868 * 2869 * @param codePoint character to test 2870 * @return true if ch is a Unicode character, else false 2871 * @see #isDigit(char) 2872 * @see #isLetter(char) 2873 * @see #isLetterOrDigit(char) 2874 * @see #isLowerCase(char) 2875 * @see #isTitleCase(char) 2876 * @see #isUpperCase(char) 2877 * 2878 * @since 1.5 2879 */ isDefined(int codePoint)2880 public static boolean isDefined(int codePoint) 2881 { 2882 return getType(codePoint) != UNASSIGNED; 2883 } 2884 2885 /** 2886 * Determines if a character is a Unicode letter. Not all letters have case, 2887 * so this may return true when isLowerCase and isUpperCase return false. 2888 * A character is a Unicode letter if getType() returns one of 2889 * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2890 * or OTHER_LETTER. 
2891 * <br> 2892 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2893 * 2894 * @param ch character to test 2895 * @return true if ch is a Unicode letter, else false 2896 * @see #isDigit(char) 2897 * @see #isJavaIdentifierStart(char) 2898 * @see #isJavaLetter(char) 2899 * @see #isJavaLetterOrDigit(char) 2900 * @see #isLetterOrDigit(char) 2901 * @see #isLowerCase(char) 2902 * @see #isTitleCase(char) 2903 * @see #isUnicodeIdentifierStart(char) 2904 * @see #isUpperCase(char) 2905 */ isLetter(char ch)2906 public static boolean isLetter(char ch) 2907 { 2908 return isLetter((int)ch); 2909 } 2910 2911 /** 2912 * Determines if a character is a Unicode letter. Not all letters have case, 2913 * so this may return true when isLowerCase and isUpperCase return false. 2914 * A character is a Unicode letter if getType() returns one of 2915 * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2916 * or OTHER_LETTER. 2917 * <br> 2918 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2919 * 2920 * @param codePoint character to test 2921 * @return true if ch is a Unicode letter, else false 2922 * @see #isDigit(char) 2923 * @see #isJavaIdentifierStart(char) 2924 * @see #isJavaLetter(char) 2925 * @see #isJavaLetterOrDigit(char) 2926 * @see #isLetterOrDigit(char) 2927 * @see #isLowerCase(char) 2928 * @see #isTitleCase(char) 2929 * @see #isUnicodeIdentifierStart(char) 2930 * @see #isUpperCase(char) 2931 * 2932 * @since 1.5 2933 */ isLetter(int codePoint)2934 public static boolean isLetter(int codePoint) 2935 { 2936 return ((1 << getType(codePoint)) 2937 & ((1 << UPPERCASE_LETTER) 2938 | (1 << LOWERCASE_LETTER) 2939 | (1 << TITLECASE_LETTER) 2940 | (1 << MODIFIER_LETTER) 2941 | (1 << OTHER_LETTER))) != 0; 2942 } 2943 /** 2944 * Returns the index into the given CharSequence that is offset 2945 * <code>codePointOffset</code> code points from <code>index</code>. 
2946 * @param seq the CharSequence 2947 * @param index the start position in the CharSequence 2948 * @param codePointOffset the number of code points offset from the start 2949 * position 2950 * @return the index into the CharSequence that is codePointOffset code 2951 * points offset from index 2952 * 2953 * @throws NullPointerException if seq is null 2954 * @throws IndexOutOfBoundsException if index is negative or greater than the 2955 * length of the sequence. 2956 * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2957 * subsequence from index to the end of seq has fewer than codePointOffset 2958 * code points 2959 * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2960 * subsequence from the start of seq to index has fewer than 2961 * (-codePointOffset) code points 2962 * @since 1.5 2963 */ offsetByCodePoints(CharSequence seq, int index, int codePointOffset)2964 public static int offsetByCodePoints(CharSequence seq, 2965 int index, 2966 int codePointOffset) 2967 { 2968 int len = seq.length(); 2969 if (index < 0 || index > len) 2970 throw new IndexOutOfBoundsException(); 2971 2972 int numToGo = codePointOffset; 2973 int offset = index; 2974 int adjust = 1; 2975 if (numToGo >= 0) 2976 { 2977 for (; numToGo > 0; offset++) 2978 { 2979 numToGo--; 2980 if (Character.isHighSurrogate(seq.charAt(offset)) 2981 && (offset + 1) < len 2982 && Character.isLowSurrogate(seq.charAt(offset + 1))) 2983 offset++; 2984 } 2985 return offset; 2986 } 2987 else 2988 { 2989 numToGo *= -1; 2990 for (; numToGo > 0;) 2991 { 2992 numToGo--; 2993 offset--; 2994 if (Character.isLowSurrogate(seq.charAt(offset)) 2995 && (offset - 1) >= 0 2996 && Character.isHighSurrogate(seq.charAt(offset - 1))) 2997 offset--; 2998 } 2999 return offset; 3000 } 3001 } 3002 3003 /** 3004 * Returns the index into the given char subarray that is offset 3005 * <code>codePointOffset</code> code points from <code>index</code>. 
3006 * @param a the char array 3007 * @param start the start index of the subarray 3008 * @param count the length of the subarray 3009 * @param index the index to be offset 3010 * @param codePointOffset the number of code points offset from <code>index 3011 * </code> 3012 * @return the index into the char array 3013 * 3014 * @throws NullPointerException if a is null 3015 * @throws IndexOutOfBoundsException if start or count is negative or if 3016 * start + count is greater than the length of the array 3017 * @throws IndexOutOfBoundsException if index is less than start or larger 3018 * than start + count 3019 * @throws IndexOutOfBoundsException if codePointOffset is positive and the 3020 * subarray from index to start + count - 1 has fewer than codePointOffset 3021 * code points. 3022 * @throws IndexOutOfBoundsException if codePointOffset is negative and the 3023 * subarray from start to index - 1 has fewer than (-codePointOffset) code 3024 * points 3025 * 3026 * @since 1.5 3027 */ offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset)3028 public static int offsetByCodePoints(char[] a, 3029 int start, 3030 int count, 3031 int index, 3032 int codePointOffset) 3033 { 3034 int len = a.length; 3035 int end = start + count; 3036 if (start < 0 || count < 0 || end > len || index < start || index > end) 3037 throw new IndexOutOfBoundsException(); 3038 3039 int numToGo = codePointOffset; 3040 int offset = index; 3041 int adjust = 1; 3042 if (numToGo >= 0) 3043 { 3044 for (; numToGo > 0; offset++) 3045 { 3046 numToGo--; 3047 if (Character.isHighSurrogate(a[offset]) 3048 && (offset + 1) < len 3049 && Character.isLowSurrogate(a[offset + 1])) 3050 offset++; 3051 } 3052 return offset; 3053 } 3054 else 3055 { 3056 numToGo *= -1; 3057 for (; numToGo > 0;) 3058 { 3059 numToGo--; 3060 offset--; 3061 if (Character.isLowSurrogate(a[offset]) 3062 && (offset - 1) >= 0 3063 && Character.isHighSurrogate(a[offset - 1])) 3064 offset--; 3065 if (offset < start) 
3066 throw new IndexOutOfBoundsException(); 3067 } 3068 return offset; 3069 } 3070 3071 } 3072 3073 /** 3074 * Returns the number of Unicode code points in the specified range of the 3075 * given CharSequence. The first char in the range is at position 3076 * beginIndex and the last one is at position endIndex - 1. Paired 3077 * surrogates (supplementary characters are represented by a pair of chars - 3078 * one from the high surrogates and one from the low surrogates) 3079 * count as just one code point. 3080 * @param seq the CharSequence to inspect 3081 * @param beginIndex the beginning of the range 3082 * @param endIndex the end of the range 3083 * @return the number of Unicode code points in the given range of the 3084 * sequence 3085 * @throws NullPointerException if seq is null 3086 * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is 3087 * larger than the length of seq, or if beginIndex is greater than endIndex. 3088 * @since 1.5 3089 */ codePointCount(CharSequence seq, int beginIndex, int endIndex)3090 public static int codePointCount(CharSequence seq, int beginIndex, 3091 int endIndex) 3092 { 3093 int len = seq.length(); 3094 if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) 3095 throw new IndexOutOfBoundsException(); 3096 3097 int count = 0; 3098 for (int i = beginIndex; i < endIndex; i++) 3099 { 3100 count++; 3101 // If there is a pairing, count it only once. 3102 if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex 3103 && isLowSurrogate(seq.charAt(i + 1))) 3104 i ++; 3105 } 3106 return count; 3107 } 3108 3109 /** 3110 * Returns the number of Unicode code points in the specified range of the 3111 * given char array. The first char in the range is at position 3112 * offset and the length of the range is count. Paired surrogates 3113 * (supplementary characters are represented by a pair of chars - 3114 * one from the high surrogates and one from the low surrogates) 3115 * count as just one code point. 
3116 * @param a the char array to inspect 3117 * @param offset the beginning of the range 3118 * @param count the length of the range 3119 * @return the number of Unicode code points in the given range of the 3120 * array 3121 * @throws NullPointerException if a is null 3122 * @throws IndexOutOfBoundsException if offset or count is negative or if 3123 * offset + countendIndex is larger than the length of a. 3124 * @since 1.5 3125 */ codePointCount(char[] a, int offset, int count)3126 public static int codePointCount(char[] a, int offset, 3127 int count) 3128 { 3129 int len = a.length; 3130 int end = offset + count; 3131 if (offset < 0 || count < 0 || end > len) 3132 throw new IndexOutOfBoundsException(); 3133 3134 int counter = 0; 3135 for (int i = offset; i < end; i++) 3136 { 3137 counter++; 3138 // If there is a pairing, count it only once. 3139 if (isHighSurrogate(a[i]) && (i + 1) < end 3140 && isLowSurrogate(a[i + 1])) 3141 i ++; 3142 } 3143 return counter; 3144 } 3145 3146 /** 3147 * Determines if a character is a Unicode letter or a Unicode digit. This 3148 * is the combination of isLetter and isDigit. 3149 * <br> 3150 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3151 * 3152 * @param ch character to test 3153 * @return true if ch is a Unicode letter or a Unicode digit, else false 3154 * @see #isDigit(char) 3155 * @see #isJavaIdentifierPart(char) 3156 * @see #isJavaLetter(char) 3157 * @see #isJavaLetterOrDigit(char) 3158 * @see #isLetter(char) 3159 * @see #isUnicodeIdentifierPart(char) 3160 */ isLetterOrDigit(char ch)3161 public static boolean isLetterOrDigit(char ch) 3162 { 3163 return isLetterOrDigit((int)ch); 3164 } 3165 3166 /** 3167 * Determines if a character is a Unicode letter or a Unicode digit. This 3168 * is the combination of isLetter and isDigit. 
3169 * <br> 3170 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3171 * 3172 * @param codePoint character to test 3173 * @return true if ch is a Unicode letter or a Unicode digit, else false 3174 * @see #isDigit(char) 3175 * @see #isJavaIdentifierPart(char) 3176 * @see #isJavaLetter(char) 3177 * @see #isJavaLetterOrDigit(char) 3178 * @see #isLetter(char) 3179 * @see #isUnicodeIdentifierPart(char) 3180 * 3181 * @since 1.5 3182 */ isLetterOrDigit(int codePoint)3183 public static boolean isLetterOrDigit(int codePoint) 3184 { 3185 return ((1 << getType(codePoint)) 3186 & ((1 << UPPERCASE_LETTER) 3187 | (1 << LOWERCASE_LETTER) 3188 | (1 << TITLECASE_LETTER) 3189 | (1 << MODIFIER_LETTER) 3190 | (1 << OTHER_LETTER) 3191 | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 3192 } 3193 3194 /** 3195 * Determines if a character can start a Java identifier. This is the 3196 * combination of isLetter, any character where getType returns 3197 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3198 * (like '_'). 3199 * 3200 * @param ch character to test 3201 * @return true if ch can start a Java identifier, else false 3202 * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 3203 * @see #isJavaLetterOrDigit(char) 3204 * @see #isJavaIdentifierStart(char) 3205 * @see #isJavaIdentifierPart(char) 3206 * @see #isLetter(char) 3207 * @see #isLetterOrDigit(char) 3208 * @see #isUnicodeIdentifierStart(char) 3209 */ isJavaLetter(char ch)3210 public static boolean isJavaLetter(char ch) 3211 { 3212 return isJavaIdentifierStart(ch); 3213 } 3214 3215 /** 3216 * Determines if a character can follow the first letter in 3217 * a Java identifier. This is the combination of isJavaLetter (isLetter, 3218 * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3219 * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3220 * or isIdentifierIgnorable. 
3221 * 3222 * @param ch character to test 3223 * @return true if ch can follow the first letter in a Java identifier 3224 * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 3225 * @see #isJavaLetter(char) 3226 * @see #isJavaIdentifierStart(char) 3227 * @see #isJavaIdentifierPart(char) 3228 * @see #isLetter(char) 3229 * @see #isLetterOrDigit(char) 3230 * @see #isUnicodeIdentifierPart(char) 3231 * @see #isIdentifierIgnorable(char) 3232 */ isJavaLetterOrDigit(char ch)3233 public static boolean isJavaLetterOrDigit(char ch) 3234 { 3235 return isJavaIdentifierPart(ch); 3236 } 3237 3238 /** 3239 * Determines if a character can start a Java identifier. This is the 3240 * combination of isLetter, any character where getType returns 3241 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3242 * (like '_'). 3243 * <br> 3244 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3245 * 3246 * @param ch character to test 3247 * @return true if ch can start a Java identifier, else false 3248 * @see #isJavaIdentifierPart(char) 3249 * @see #isLetter(char) 3250 * @see #isUnicodeIdentifierStart(char) 3251 * @since 1.1 3252 */ isJavaIdentifierStart(char ch)3253 public static boolean isJavaIdentifierStart(char ch) 3254 { 3255 return isJavaIdentifierStart((int)ch); 3256 } 3257 3258 /** 3259 * Determines if a character can start a Java identifier. This is the 3260 * combination of isLetter, any character where getType returns 3261 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3262 * (like '_'). 
3263 * <br> 3264 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3265 * 3266 * @param codePoint character to test 3267 * @return true if ch can start a Java identifier, else false 3268 * @see #isJavaIdentifierPart(char) 3269 * @see #isLetter(char) 3270 * @see #isUnicodeIdentifierStart(char) 3271 * @since 1.5 3272 */ isJavaIdentifierStart(int codePoint)3273 public static boolean isJavaIdentifierStart(int codePoint) 3274 { 3275 return ((1 << getType(codePoint)) 3276 & ((1 << UPPERCASE_LETTER) 3277 | (1 << LOWERCASE_LETTER) 3278 | (1 << TITLECASE_LETTER) 3279 | (1 << MODIFIER_LETTER) 3280 | (1 << OTHER_LETTER) 3281 | (1 << LETTER_NUMBER) 3282 | (1 << CURRENCY_SYMBOL) 3283 | (1 << CONNECTOR_PUNCTUATION))) != 0; 3284 } 3285 3286 /** 3287 * Determines if a character can follow the first letter in 3288 * a Java identifier. This is the combination of isJavaLetter (isLetter, 3289 * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3290 * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3291 * or isIdentifierIgnorable. 3292 * <br> 3293 * Java identifier extender = 3294 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3295 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3296 * 3297 * @param ch character to test 3298 * @return true if ch can follow the first letter in a Java identifier 3299 * @see #isIdentifierIgnorable(char) 3300 * @see #isJavaIdentifierStart(char) 3301 * @see #isLetterOrDigit(char) 3302 * @see #isUnicodeIdentifierPart(char) 3303 * @since 1.1 3304 */ isJavaIdentifierPart(char ch)3305 public static boolean isJavaIdentifierPart(char ch) 3306 { 3307 return isJavaIdentifierPart((int)ch); 3308 } 3309 3310 /** 3311 * Determines if a character can follow the first letter in 3312 * a Java identifier. 
This is the combination of isJavaLetter (isLetter, 3313 * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3314 * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3315 * or isIdentifierIgnorable. 3316 * <br> 3317 * Java identifier extender = 3318 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3319 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3320 * 3321 * @param codePoint character to test 3322 * @return true if ch can follow the first letter in a Java identifier 3323 * @see #isIdentifierIgnorable(char) 3324 * @see #isJavaIdentifierStart(char) 3325 * @see #isLetterOrDigit(char) 3326 * @see #isUnicodeIdentifierPart(char) 3327 * @since 1.5 3328 */ isJavaIdentifierPart(int codePoint)3329 public static boolean isJavaIdentifierPart(int codePoint) 3330 { 3331 int category = getType(codePoint); 3332 return ((1 << category) 3333 & ((1 << UPPERCASE_LETTER) 3334 | (1 << LOWERCASE_LETTER) 3335 | (1 << TITLECASE_LETTER) 3336 | (1 << MODIFIER_LETTER) 3337 | (1 << OTHER_LETTER) 3338 | (1 << NON_SPACING_MARK) 3339 | (1 << COMBINING_SPACING_MARK) 3340 | (1 << DECIMAL_DIGIT_NUMBER) 3341 | (1 << LETTER_NUMBER) 3342 | (1 << CURRENCY_SYMBOL) 3343 | (1 << CONNECTOR_PUNCTUATION) 3344 | (1 << FORMAT))) != 0 3345 || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3346 } 3347 3348 /** 3349 * Determines if a character can start a Unicode identifier. Only 3350 * letters can start a Unicode identifier, but this includes characters 3351 * in LETTER_NUMBER. 
3352 * <br> 3353 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3354 * 3355 * @param ch character to test 3356 * @return true if ch can start a Unicode identifier, else false 3357 * @see #isJavaIdentifierStart(char) 3358 * @see #isLetter(char) 3359 * @see #isUnicodeIdentifierPart(char) 3360 * @since 1.1 3361 */ isUnicodeIdentifierStart(char ch)3362 public static boolean isUnicodeIdentifierStart(char ch) 3363 { 3364 return isUnicodeIdentifierStart((int)ch); 3365 } 3366 3367 /** 3368 * Determines if a character can start a Unicode identifier. Only 3369 * letters can start a Unicode identifier, but this includes characters 3370 * in LETTER_NUMBER. 3371 * <br> 3372 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3373 * 3374 * @param codePoint character to test 3375 * @return true if ch can start a Unicode identifier, else false 3376 * @see #isJavaIdentifierStart(char) 3377 * @see #isLetter(char) 3378 * @see #isUnicodeIdentifierPart(char) 3379 * @since 1.5 3380 */ isUnicodeIdentifierStart(int codePoint)3381 public static boolean isUnicodeIdentifierStart(int codePoint) 3382 { 3383 return ((1 << getType(codePoint)) 3384 & ((1 << UPPERCASE_LETTER) 3385 | (1 << LOWERCASE_LETTER) 3386 | (1 << TITLECASE_LETTER) 3387 | (1 << MODIFIER_LETTER) 3388 | (1 << OTHER_LETTER) 3389 | (1 << LETTER_NUMBER))) != 0; 3390 } 3391 3392 /** 3393 * Determines if a character can follow the first letter in 3394 * a Unicode identifier. This includes letters, connecting punctuation, 3395 * digits, numeric letters, combining marks, non-spacing marks, and 3396 * isIdentifierIgnorable. 
3397 * <br> 3398 * Unicode identifier extender = 3399 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3400 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3401 * 3402 * @param ch character to test 3403 * @return true if ch can follow the first letter in a Unicode identifier 3404 * @see #isIdentifierIgnorable(char) 3405 * @see #isJavaIdentifierPart(char) 3406 * @see #isLetterOrDigit(char) 3407 * @see #isUnicodeIdentifierStart(char) 3408 * @since 1.1 3409 */ isUnicodeIdentifierPart(char ch)3410 public static boolean isUnicodeIdentifierPart(char ch) 3411 { 3412 return isUnicodeIdentifierPart((int)ch); 3413 } 3414 3415 /** 3416 * Determines if a character can follow the first letter in 3417 * a Unicode identifier. This includes letters, connecting punctuation, 3418 * digits, numeric letters, combining marks, non-spacing marks, and 3419 * isIdentifierIgnorable. 3420 * <br> 3421 * Unicode identifier extender = 3422 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3423 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3424 * 3425 * @param codePoint character to test 3426 * @return true if ch can follow the first letter in a Unicode identifier 3427 * @see #isIdentifierIgnorable(char) 3428 * @see #isJavaIdentifierPart(char) 3429 * @see #isLetterOrDigit(char) 3430 * @see #isUnicodeIdentifierStart(char) 3431 * @since 1.5 3432 */ isUnicodeIdentifierPart(int codePoint)3433 public static boolean isUnicodeIdentifierPart(int codePoint) 3434 { 3435 int category = getType(codePoint); 3436 return ((1 << category) 3437 & ((1 << UPPERCASE_LETTER) 3438 | (1 << LOWERCASE_LETTER) 3439 | (1 << TITLECASE_LETTER) 3440 | (1 << MODIFIER_LETTER) 3441 | (1 << OTHER_LETTER) 3442 | (1 << NON_SPACING_MARK) 3443 | (1 << COMBINING_SPACING_MARK) 3444 | (1 << DECIMAL_DIGIT_NUMBER) 3445 | (1 << LETTER_NUMBER) 3446 | (1 << CONNECTOR_PUNCTUATION) 3447 | (1 << FORMAT))) != 0 3448 || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3449 } 3450 3451 /** 3452 * Determines if a character 
is ignorable in a Unicode identifier. This 3453 * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3454 * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3455 * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3456 * <code>'\u009F'</code>), and FORMAT characters. 3457 * <br> 3458 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3459 * |U+007F-U+009F 3460 * 3461 * @param ch character to test 3462 * @return true if ch is ignorable in a Unicode or Java identifier 3463 * @see #isJavaIdentifierPart(char) 3464 * @see #isUnicodeIdentifierPart(char) 3465 * @since 1.1 3466 */ isIdentifierIgnorable(char ch)3467 public static boolean isIdentifierIgnorable(char ch) 3468 { 3469 return isIdentifierIgnorable((int)ch); 3470 } 3471 3472 /** 3473 * Determines if a character is ignorable in a Unicode identifier. This 3474 * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3475 * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3476 * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3477 * <code>'\u009F'</code>), and FORMAT characters. 3478 * <br> 3479 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3480 * |U+007F-U+009F 3481 * 3482 * @param codePoint character to test 3483 * @return true if ch is ignorable in a Unicode or Java identifier 3484 * @see #isJavaIdentifierPart(char) 3485 * @see #isUnicodeIdentifierPart(char) 3486 * @since 1.5 3487 */ isIdentifierIgnorable(int codePoint)3488 public static boolean isIdentifierIgnorable(int codePoint) 3489 { 3490 if ((codePoint >= 0 && codePoint <= 0x0008) 3491 || (codePoint >= 0x000E && codePoint <= 0x001B) 3492 || (codePoint >= 0x007F && codePoint <= 0x009F) 3493 || getType(codePoint) == FORMAT) 3494 return true; 3495 return false; 3496 } 3497 3498 /** 3499 * Converts a Unicode character into its lowercase equivalent mapping. 3500 * If a mapping does not exist, then the character passed is returned. 
3501 * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3502 * 3503 * @param ch character to convert to lowercase 3504 * @return lowercase mapping of ch, or ch if lowercase mapping does 3505 * not exist 3506 * @see #isLowerCase(char) 3507 * @see #isUpperCase(char) 3508 * @see #toTitleCase(char) 3509 * @see #toUpperCase(char) 3510 */ toLowerCase(char ch)3511 public static char toLowerCase(char ch) 3512 { 3513 return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch); 3514 } 3515 3516 /** 3517 * Converts a Unicode character into its lowercase equivalent mapping. 3518 * If a mapping does not exist, then the character passed is returned. 3519 * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3520 * 3521 * @param codePoint character to convert to lowercase 3522 * @return lowercase mapping of ch, or ch if lowercase mapping does 3523 * not exist 3524 * @see #isLowerCase(char) 3525 * @see #isUpperCase(char) 3526 * @see #toTitleCase(char) 3527 * @see #toUpperCase(char) 3528 * 3529 * @since 1.5 3530 */ toLowerCase(int codePoint)3531 public static int toLowerCase(int codePoint) 3532 { 3533 // If the code point is unassigned or in one of the private use areas 3534 // then we delegate the call to the appropriate private static inner class. 3535 int plane = codePoint >>> 16; 3536 if (plane > 2 && plane < 14) 3537 return UnassignedCharacters.toLowerCase(codePoint); 3538 if (plane > 14) 3539 return PrivateUseCharacters.toLowerCase(codePoint); 3540 3541 // The short value stored in lower[plane] is the signed difference between 3542 // codePoint and its lowercase conversion. 3543 return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3544 } 3545 3546 /** 3547 * Converts a Unicode character into its uppercase equivalent mapping. 3548 * If a mapping does not exist, then the character passed is returned. 3549 * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
3550 * 3551 * @param ch character to convert to uppercase 3552 * @return uppercase mapping of ch, or ch if uppercase mapping does 3553 * not exist 3554 * @see #isLowerCase(char) 3555 * @see #isUpperCase(char) 3556 * @see #toLowerCase(char) 3557 * @see #toTitleCase(char) 3558 */ toUpperCase(char ch)3559 public static char toUpperCase(char ch) 3560 { 3561 return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch); 3562 } 3563 3564 /** 3565 * Converts a Unicode character into its uppercase equivalent mapping. 3566 * If a mapping does not exist, then the character passed is returned. 3567 * Note that isUpperCase(toUpperCase(ch)) does not always return true. 3568 * 3569 * @param codePoint character to convert to uppercase 3570 * @return uppercase mapping of ch, or ch if uppercase mapping does 3571 * not exist 3572 * @see #isLowerCase(char) 3573 * @see #isUpperCase(char) 3574 * @see #toLowerCase(char) 3575 * @see #toTitleCase(char) 3576 * 3577 * @since 1.5 3578 */ toUpperCase(int codePoint)3579 public static int toUpperCase(int codePoint) 3580 { 3581 // If the code point is unassigned or in one of the private use areas 3582 // then we delegate the call to the appropriate private static inner class. 3583 int plane = codePoint >>> 16; 3584 if (plane > 2 && plane < 14) 3585 return UnassignedCharacters.toUpperCase(codePoint); 3586 if (plane > 14) 3587 return PrivateUseCharacters.toUpperCase(codePoint); 3588 3589 // The short value stored in upper[plane] is the signed difference between 3590 // codePoint and its uppercase conversion. 3591 return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3592 } 3593 3594 /** 3595 * Converts a Unicode character into its titlecase equivalent mapping. 3596 * If a mapping does not exist, then the character passed is returned. 3597 * Note that isTitleCase(toTitleCase(ch)) does not always return true. 
3598 * 3599 * @param ch character to convert to titlecase 3600 * @return titlecase mapping of ch, or ch if titlecase mapping does 3601 * not exist 3602 * @see #isTitleCase(char) 3603 * @see #toLowerCase(char) 3604 * @see #toUpperCase(char) 3605 */ toTitleCase(char ch)3606 public static char toTitleCase(char ch) 3607 { 3608 // As title is short, it doesn't hurt to exhaustively iterate over it. 3609 for (int i = title.length - 2; i >= 0; i -= 2) 3610 if (title[i] == ch) 3611 return title[i + 1]; 3612 return toUpperCase(ch); 3613 } 3614 3615 /** 3616 * Converts a Unicode character into its titlecase equivalent mapping. 3617 * If a mapping does not exist, then the character passed is returned. 3618 * Note that isTitleCase(toTitleCase(ch)) does not always return true. 3619 * 3620 * @param codePoint character to convert to titlecase 3621 * @return titlecase mapping of ch, or ch if titlecase mapping does 3622 * not exist 3623 * @see #isTitleCase(char) 3624 * @see #toLowerCase(char) 3625 * @see #toUpperCase(char) 3626 * 3627 * @since 1.5 3628 */ toTitleCase(int codePoint)3629 public static int toTitleCase(int codePoint) 3630 { 3631 // As of Unicode 4.0.0 no characters outside of plane 0 have 3632 // titlecase mappings that are different from their uppercase 3633 // mapping. 3634 if (codePoint < 0x10000) 3635 return (int) toTitleCase((char)codePoint); 3636 return toUpperCase(codePoint); 3637 } 3638 3639 /** 3640 * Converts a character into a digit of the specified radix. If the radix 3641 * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3642 * exceeds the radix, or if ch is not a decimal digit or in the case 3643 * insensitive set of 'a'-'z', the result is -1. 
3644 * <br> 3645 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3646 * |U+FF21-U+FF3A|U+FF41-U+FF5A 3647 * 3648 * @param ch character to convert into a digit 3649 * @param radix radix in which ch is a digit 3650 * @return digit which ch represents in radix, or -1 not a valid digit 3651 * @see #MIN_RADIX 3652 * @see #MAX_RADIX 3653 * @see #forDigit(int, int) 3654 * @see #isDigit(char) 3655 * @see #getNumericValue(char) 3656 */ digit(char ch, int radix)3657 public static int digit(char ch, int radix) 3658 { 3659 if (radix < MIN_RADIX || radix > MAX_RADIX) 3660 return -1; 3661 char attr = readCodePoint((int)ch); 3662 if (((1 << (attr & TYPE_MASK)) 3663 & ((1 << UPPERCASE_LETTER) 3664 | (1 << LOWERCASE_LETTER) 3665 | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3666 { 3667 // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3668 int digit = numValue[0][attr >> 7]; 3669 return (digit < radix) ? digit : -1; 3670 } 3671 return -1; 3672 } 3673 3674 /** 3675 * Converts a character into a digit of the specified radix. If the radix 3676 * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3677 * exceeds the radix, or if ch is not a decimal digit or in the case 3678 * insensitive set of 'a'-'z', the result is -1. 
3679 * <br> 3680 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3681 * |U+FF21-U+FF3A|U+FF41-U+FF5A 3682 * 3683 * @param codePoint character to convert into a digit 3684 * @param radix radix in which ch is a digit 3685 * @return digit which ch represents in radix, or -1 not a valid digit 3686 * @see #MIN_RADIX 3687 * @see #MAX_RADIX 3688 * @see #forDigit(int, int) 3689 * @see #isDigit(char) 3690 * @see #getNumericValue(char) 3691 */ digit(int codePoint, int radix)3692 public static int digit(int codePoint, int radix) 3693 { 3694 if (radix < MIN_RADIX || radix > MAX_RADIX) 3695 return -1; 3696 3697 // If the code point is unassigned or in one of the private use areas 3698 // then we delegate the call to the appropriate private static inner class. 3699 int plane = codePoint >>> 16; 3700 if (plane > 2 && plane < 14) 3701 return UnassignedCharacters.digit(codePoint, radix); 3702 if (plane > 14) 3703 return PrivateUseCharacters.digit(codePoint, radix); 3704 char attr = readCodePoint(codePoint); 3705 if (((1 << (attr & TYPE_MASK)) 3706 & ((1 << UPPERCASE_LETTER) 3707 | (1 << LOWERCASE_LETTER) 3708 | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3709 { 3710 // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3711 int digit = numValue[plane][attr >> 7]; 3712 3713 // If digit is less than or equal to -3 then the numerical value was 3714 // too large to fit into numValue and is stored in CharData.LARGENUMS. 3715 if (digit <= -3) 3716 digit = CharData.LARGENUMS[-digit - 3]; 3717 return (digit < radix) ? digit : -1; 3718 } 3719 return -1; 3720 } 3721 3722 /** 3723 * Returns the Unicode numeric value property of a character. For example, 3724 * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
3725 * 3726 * <p>This method also returns values for the letters A through Z, (not 3727 * specified by Unicode), in these ranges: <code>'\u0041'</code> 3728 * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3729 * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3730 * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3731 * <code>'\uFF5A'</code> (full width variants). 3732 * 3733 * <p>If the character lacks a numeric value property, -1 is returned. 3734 * If the character has a numeric value property which is not representable 3735 * as a nonnegative integer, such as a fraction, -2 is returned. 3736 * 3737 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3738 * |U+FF21-U+FF3A|U+FF41-U+FF5A 3739 * 3740 * @param ch character from which the numeric value property will 3741 * be retrieved 3742 * @return the numeric value property of ch, or -1 if it does not exist, or 3743 * -2 if it is not representable as a nonnegative integer 3744 * @see #forDigit(int, int) 3745 * @see #digit(char, int) 3746 * @see #isDigit(char) 3747 * @since 1.1 3748 */ getNumericValue(char ch)3749 public static int getNumericValue(char ch) 3750 { 3751 // Treat numValue as signed. 3752 return (short) numValue[0][readCodePoint((int)ch) >> 7]; 3753 } 3754 3755 /** 3756 * Returns the Unicode numeric value property of a character. For example, 3757 * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 3758 * 3759 * <p>This method also returns values for the letters A through Z, (not 3760 * specified by Unicode), in these ranges: <code>'\u0041'</code> 3761 * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3762 * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3763 * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3764 * <code>'\uFF5A'</code> (full width variants). 3765 * 3766 * <p>If the character lacks a numeric value property, -1 is returned. 
3767 * If the character has a numeric value property which is not representable 3768 * as a nonnegative integer, such as a fraction, -2 is returned. 3769 * 3770 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3771 * |U+FF21-U+FF3A|U+FF41-U+FF5A 3772 * 3773 * @param codePoint character from which the numeric value property will 3774 * be retrieved 3775 * @return the numeric value property of ch, or -1 if it does not exist, or 3776 * -2 if it is not representable as a nonnegative integer 3777 * @see #forDigit(int, int) 3778 * @see #digit(char, int) 3779 * @see #isDigit(char) 3780 * @since 1.5 3781 */ getNumericValue(int codePoint)3782 public static int getNumericValue(int codePoint) 3783 { 3784 // If the code point is unassigned or in one of the private use areas 3785 // then we delegate the call to the appropriate private static inner class. 3786 int plane = codePoint >>> 16; 3787 if (plane > 2 && plane < 14) 3788 return UnassignedCharacters.getNumericValue(codePoint); 3789 if (plane > 14) 3790 return PrivateUseCharacters.getNumericValue(codePoint); 3791 3792 // If the value N found in numValue[plane] is less than or equal to -3 3793 // then the numeric value was too big to fit into 16 bits and is 3794 // stored in CharData.LARGENUMS at offset (-N - 3). 3795 short num = (short)numValue[plane][readCodePoint(codePoint) >> 7]; 3796 if (num <= -3) 3797 return CharData.LARGENUMS[-num - 3]; 3798 return num; 3799 } 3800 3801 /** 3802 * Determines if a character is a ISO-LATIN-1 space. This is only the five 3803 * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3804 * <code>'\r'</code>, and <code>' '</code>. 
3805 * <br> 3806 * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3807 * 3808 * @param ch character to test 3809 * @return true if ch is a space, else false 3810 * @deprecated Replaced by {@link #isWhitespace(char)} 3811 * @see #isSpaceChar(char) 3812 * @see #isWhitespace(char) 3813 */ isSpace(char ch)3814 public static boolean isSpace(char ch) 3815 { 3816 // Performing the subtraction up front alleviates need to compare longs. 3817 return ch-- <= ' ' && ((1 << ch) 3818 & ((1 << (' ' - 1)) 3819 | (1 << ('\t' - 1)) 3820 | (1 << ('\n' - 1)) 3821 | (1 << ('\r' - 1)) 3822 | (1 << ('\f' - 1)))) != 0; 3823 } 3824 3825 /** 3826 * Determines if a character is a Unicode space character. This includes 3827 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3828 * <br> 3829 * Unicode space = [Zs]|[Zp]|[Zl] 3830 * 3831 * @param ch character to test 3832 * @return true if ch is a Unicode space, else false 3833 * @see #isWhitespace(char) 3834 * @since 1.1 3835 */ isSpaceChar(char ch)3836 public static boolean isSpaceChar(char ch) 3837 { 3838 return isSpaceChar((int)ch); 3839 } 3840 3841 /** 3842 * Determines if a character is a Unicode space character. This includes 3843 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3844 * <br> 3845 * Unicode space = [Zs]|[Zp]|[Zl] 3846 * 3847 * @param codePoint character to test 3848 * @return true if ch is a Unicode space, else false 3849 * @see #isWhitespace(char) 3850 * @since 1.5 3851 */ isSpaceChar(int codePoint)3852 public static boolean isSpaceChar(int codePoint) 3853 { 3854 return ((1 << getType(codePoint)) 3855 & ((1 << SPACE_SEPARATOR) 3856 | (1 << LINE_SEPARATOR) 3857 | (1 << PARAGRAPH_SEPARATOR))) != 0; 3858 } 3859 3860 /** 3861 * Determines if a character is Java whitespace. 
This includes Unicode 3862 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3863 * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3864 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3865 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3866 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3867 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3868 * and <code>'\u001F'</code>. 3869 * <br> 3870 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3871 * 3872 * @param ch character to test 3873 * @return true if ch is Java whitespace, else false 3874 * @see #isSpaceChar(char) 3875 * @since 1.1 3876 */ isWhitespace(char ch)3877 public static boolean isWhitespace(char ch) 3878 { 3879 return isWhitespace((int) ch); 3880 } 3881 3882 /** 3883 * Determines if a character is Java whitespace. This includes Unicode 3884 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3885 * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3886 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3887 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3888 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3889 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3890 * and <code>'\u001F'</code>. 
3891 * <br> 3892 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3893 * 3894 * @param codePoint character to test 3895 * @return true if ch is Java whitespace, else false 3896 * @see #isSpaceChar(char) 3897 * @since 1.5 3898 */ isWhitespace(int codePoint)3899 public static boolean isWhitespace(int codePoint) 3900 { 3901 int plane = codePoint >>> 16; 3902 if (plane > 2 && plane < 14) 3903 return UnassignedCharacters.isWhiteSpace(codePoint); 3904 if (plane > 14) 3905 return PrivateUseCharacters.isWhiteSpace(codePoint); 3906 3907 int attr = readCodePoint(codePoint); 3908 return ((((1 << (attr & TYPE_MASK)) 3909 & ((1 << SPACE_SEPARATOR) 3910 | (1 << LINE_SEPARATOR) 3911 | (1 << PARAGRAPH_SEPARATOR))) != 0) 3912 && (attr & NO_BREAK_MASK) == 0) 3913 || (codePoint <= '\u001F' && ((1 << codePoint) 3914 & ((1 << '\t') 3915 | (1 << '\n') 3916 | (1 << '\u000B') 3917 | (1 << '\u000C') 3918 | (1 << '\r') 3919 | (1 << '\u001C') 3920 | (1 << '\u001D') 3921 | (1 << '\u001E') 3922 | (1 << '\u001F'))) != 0); 3923 } 3924 3925 /** 3926 * Determines if a character has the ISO Control property. 3927 * <br> 3928 * ISO Control = [Cc] 3929 * 3930 * @param ch character to test 3931 * @return true if ch is an ISO Control character, else false 3932 * @see #isSpaceChar(char) 3933 * @see #isWhitespace(char) 3934 * @since 1.1 3935 */ isISOControl(char ch)3936 public static boolean isISOControl(char ch) 3937 { 3938 return isISOControl((int)ch); 3939 } 3940 3941 /** 3942 * Determines if the character is an ISO Control character. This is true 3943 * if the code point is in the range [0, 0x001F] or if it is in the range 3944 * [0x007F, 0x009F]. 
3945 * @param codePoint the character to check 3946 * @return true if the character is in one of the above ranges 3947 * 3948 * @since 1.5 3949 */ isISOControl(int codePoint)3950 public static boolean isISOControl(int codePoint) 3951 { 3952 if ((codePoint >= 0 && codePoint <= 0x001F) 3953 || (codePoint >= 0x007F && codePoint <= 0x009F)) 3954 return true; 3955 return false; 3956 } 3957 3958 /** 3959 * Returns the Unicode general category property of a character. 3960 * 3961 * @param ch character from which the general category property will 3962 * be retrieved 3963 * @return the character category property of ch as an integer 3964 * @see #UNASSIGNED 3965 * @see #UPPERCASE_LETTER 3966 * @see #LOWERCASE_LETTER 3967 * @see #TITLECASE_LETTER 3968 * @see #MODIFIER_LETTER 3969 * @see #OTHER_LETTER 3970 * @see #NON_SPACING_MARK 3971 * @see #ENCLOSING_MARK 3972 * @see #COMBINING_SPACING_MARK 3973 * @see #DECIMAL_DIGIT_NUMBER 3974 * @see #LETTER_NUMBER 3975 * @see #OTHER_NUMBER 3976 * @see #SPACE_SEPARATOR 3977 * @see #LINE_SEPARATOR 3978 * @see #PARAGRAPH_SEPARATOR 3979 * @see #CONTROL 3980 * @see #FORMAT 3981 * @see #PRIVATE_USE 3982 * @see #SURROGATE 3983 * @see #DASH_PUNCTUATION 3984 * @see #START_PUNCTUATION 3985 * @see #END_PUNCTUATION 3986 * @see #CONNECTOR_PUNCTUATION 3987 * @see #OTHER_PUNCTUATION 3988 * @see #MATH_SYMBOL 3989 * @see #CURRENCY_SYMBOL 3990 * @see #MODIFIER_SYMBOL 3991 * @see #INITIAL_QUOTE_PUNCTUATION 3992 * @see #FINAL_QUOTE_PUNCTUATION 3993 * @since 1.1 3994 */ getType(char ch)3995 public static int getType(char ch) 3996 { 3997 return getType((int)ch); 3998 } 3999 4000 /** 4001 * Returns the Unicode general category property of a character. 
4002 * 4003 * @param codePoint character from which the general category property will 4004 * be retrieved 4005 * @return the character category property of ch as an integer 4006 * @see #UNASSIGNED 4007 * @see #UPPERCASE_LETTER 4008 * @see #LOWERCASE_LETTER 4009 * @see #TITLECASE_LETTER 4010 * @see #MODIFIER_LETTER 4011 * @see #OTHER_LETTER 4012 * @see #NON_SPACING_MARK 4013 * @see #ENCLOSING_MARK 4014 * @see #COMBINING_SPACING_MARK 4015 * @see #DECIMAL_DIGIT_NUMBER 4016 * @see #LETTER_NUMBER 4017 * @see #OTHER_NUMBER 4018 * @see #SPACE_SEPARATOR 4019 * @see #LINE_SEPARATOR 4020 * @see #PARAGRAPH_SEPARATOR 4021 * @see #CONTROL 4022 * @see #FORMAT 4023 * @see #PRIVATE_USE 4024 * @see #SURROGATE 4025 * @see #DASH_PUNCTUATION 4026 * @see #START_PUNCTUATION 4027 * @see #END_PUNCTUATION 4028 * @see #CONNECTOR_PUNCTUATION 4029 * @see #OTHER_PUNCTUATION 4030 * @see #MATH_SYMBOL 4031 * @see #CURRENCY_SYMBOL 4032 * @see #MODIFIER_SYMBOL 4033 * @see #INITIAL_QUOTE_PUNCTUATION 4034 * @see #FINAL_QUOTE_PUNCTUATION 4035 * 4036 * @since 1.5 4037 */ getType(int codePoint)4038 public static int getType(int codePoint) 4039 { 4040 // If the codePoint is unassigned or in one of the private use areas 4041 // then we delegate the call to the appropriate private static inner class. 4042 int plane = codePoint >>> 16; 4043 if (plane > 2 && plane < 14) 4044 return UnassignedCharacters.getType(codePoint); 4045 if (plane > 14) 4046 return PrivateUseCharacters.getType(codePoint); 4047 4048 return readCodePoint(codePoint) & TYPE_MASK; 4049 } 4050 4051 /** 4052 * Converts a digit into a character which represents that digit 4053 * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 4054 * or the digit exceeds the radix, then the null character <code>'\0'</code> 4055 * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
4056 * <br> 4057 * return value boundary = U+0030-U+0039|U+0061-U+007A 4058 * 4059 * @param digit digit to be converted into a character 4060 * @param radix radix of digit 4061 * @return character representing digit in radix, or '\0' 4062 * @see #MIN_RADIX 4063 * @see #MAX_RADIX 4064 * @see #digit(char, int) 4065 */ forDigit(int digit, int radix)4066 public static char forDigit(int digit, int radix) 4067 { 4068 if (radix < MIN_RADIX || radix > MAX_RADIX 4069 || digit < 0 || digit >= radix) 4070 return '\0'; 4071 return Number.digits[digit]; 4072 } 4073 4074 /** 4075 * Returns the Unicode directionality property of the character. This 4076 * is used in the visual ordering of text. 4077 * 4078 * @param ch the character to look up 4079 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4080 * @see #DIRECTIONALITY_UNDEFINED 4081 * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4082 * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4083 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4084 * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4085 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4086 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4087 * @see #DIRECTIONALITY_ARABIC_NUMBER 4088 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4089 * @see #DIRECTIONALITY_NONSPACING_MARK 4090 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4091 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4092 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4093 * @see #DIRECTIONALITY_WHITESPACE 4094 * @see #DIRECTIONALITY_OTHER_NEUTRALS 4095 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4096 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4097 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4098 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4099 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4100 * @since 1.4 4101 */ getDirectionality(char ch)4102 public static byte getDirectionality(char ch) 4103 { 4104 // The result will correctly be signed. 
4105 return getDirectionality((int)ch); 4106 } 4107 4108 4109 /** 4110 * Returns the Unicode directionality property of the character. This 4111 * is used in the visual ordering of text. 4112 * 4113 * @param codePoint the character to look up 4114 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4115 * @see #DIRECTIONALITY_UNDEFINED 4116 * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4117 * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4118 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4119 * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4120 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4121 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4122 * @see #DIRECTIONALITY_ARABIC_NUMBER 4123 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4124 * @see #DIRECTIONALITY_NONSPACING_MARK 4125 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4126 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4127 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4128 * @see #DIRECTIONALITY_WHITESPACE 4129 * @see #DIRECTIONALITY_OTHER_NEUTRALS 4130 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4131 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4132 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4133 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4134 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4135 * @since 1.5 4136 */ getDirectionality(int codePoint)4137 public static byte getDirectionality(int codePoint) 4138 { 4139 // If the code point is unassigned or in one of the private use areas 4140 // then we delegate the call to the appropriate private static inner class. 4141 int plane = codePoint >>> 16; 4142 if (plane > 2 && plane < 14) 4143 return UnassignedCharacters.getDirectionality(codePoint); 4144 if (plane > 14) 4145 return PrivateUseCharacters.getDirectionality(codePoint); 4146 4147 // The result will correctly be signed. 4148 return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2); 4149 } 4150 4151 /** 4152 * Determines whether the character is mirrored according to Unicode. 
For 4153 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4154 * left-to-right text, but ')' in right-to-left text. 4155 * 4156 * @param ch the character to look up 4157 * @return true if the character is mirrored 4158 * @since 1.4 4159 */ isMirrored(char ch)4160 public static boolean isMirrored(char ch) 4161 { 4162 return (readCodePoint((int)ch) & MIRROR_MASK) != 0; 4163 } 4164 4165 /** 4166 * Determines whether the character is mirrored according to Unicode. For 4167 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4168 * left-to-right text, but ')' in right-to-left text. 4169 * 4170 * @param codePoint the character to look up 4171 * @return true if the character is mirrored 4172 * @since 1.5 4173 */ isMirrored(int codePoint)4174 public static boolean isMirrored(int codePoint) 4175 { 4176 // If the code point is unassigned or part of one of the private use areas 4177 // then we delegate the call to the appropriate private static inner class. 4178 int plane = codePoint >>> 16; 4179 if (plane > 2 && plane < 14) 4180 return UnassignedCharacters.isMirrored(codePoint); 4181 if (plane > 14) 4182 return PrivateUseCharacters.isMirrored(codePoint); 4183 4184 return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 4185 } 4186 4187 /** 4188 * Compares another Character to this Character, numerically. 4189 * 4190 * @param anotherCharacter Character to compare with this Character 4191 * @return a negative integer if this Character is less than 4192 * anotherCharacter, zero if this Character is equal, and 4193 * a positive integer if this Character is greater 4194 * @throws NullPointerException if anotherCharacter is null 4195 * @since 1.2 4196 */ compareTo(Character anotherCharacter)4197 public int compareTo(Character anotherCharacter) 4198 { 4199 return value - anotherCharacter.value; 4200 } 4201 4202 /** 4203 * Compares two unboxed char values. 
4204 * The result is positive if the first is greater, negative if the second 4205 * is greater, and 0 if the two are equal. 4206 * 4207 * @param x First value to compare. 4208 * @param y Second value to compare. 4209 * 4210 * @return positive int if the first value is greater, negative if the second 4211 * is greater, and 0 if the two are equal. 4212 * @since 1.7 4213 */ compare(char x, char y)4214 public static int compare(char x, char y) 4215 { 4216 return Character.valueOf(x).compareTo(Character.valueOf(y)); 4217 } 4218 4219 /** 4220 * Returns an <code>Character</code> object wrapping the value. 4221 * In contrast to the <code>Character</code> constructor, this method 4222 * will cache some values. It is used by boxing conversion. 4223 * 4224 * @param val the value to wrap 4225 * @return the <code>Character</code> 4226 * 4227 * @since 1.5 4228 */ valueOf(char val)4229 public static Character valueOf(char val) 4230 { 4231 if (val > MAX_CACHE) 4232 return new Character(val); 4233 else 4234 return charCache[val - MIN_VALUE]; 4235 } 4236 4237 /** 4238 * Reverse the bytes in val. 4239 * @since 1.5 4240 */ reverseBytes(char val)4241 public static char reverseBytes(char val) 4242 { 4243 return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 4244 } 4245 4246 /** 4247 * Converts a unicode code point to a UTF-16 representation of that 4248 * code point. 
4249 * 4250 * @param codePoint the unicode code point 4251 * 4252 * @return the UTF-16 representation of that code point 4253 * 4254 * @throws IllegalArgumentException if the code point is not a valid 4255 * unicode code point 4256 * 4257 * @since 1.5 4258 */ toChars(int codePoint)4259 public static char[] toChars(int codePoint) 4260 { 4261 if (!isValidCodePoint(codePoint)) 4262 throw new IllegalArgumentException("Illegal Unicode code point : " 4263 + codePoint); 4264 char[] result = new char[charCount(codePoint)]; 4265 int ignore = toChars(codePoint, result, 0); 4266 return result; 4267 } 4268 4269 /** 4270 * Converts a unicode code point to its UTF-16 representation. 4271 * 4272 * @param codePoint the unicode code point 4273 * @param dst the target char array 4274 * @param dstIndex the start index for the target 4275 * 4276 * @return number of characters written to <code>dst</code> 4277 * 4278 * @throws IllegalArgumentException if <code>codePoint</code> is not a 4279 * valid unicode code point 4280 * @throws NullPointerException if <code>dst</code> is <code>null</code> 4281 * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 4282 * in <code>dst</code> or if the UTF-16 representation does not 4283 * fit into <code>dst</code> 4284 * 4285 * @since 1.5 4286 */ toChars(int codePoint, char[] dst, int dstIndex)4287 public static int toChars(int codePoint, char[] dst, int dstIndex) 4288 { 4289 if (!isValidCodePoint(codePoint)) 4290 { 4291 throw new IllegalArgumentException("not a valid code point: " 4292 + codePoint); 4293 } 4294 4295 int result; 4296 if (isSupplementaryCodePoint(codePoint)) 4297 { 4298 // Write second char first to cause IndexOutOfBoundsException 4299 // immediately. 
4300 final int cp2 = codePoint - 0x10000; 4301 dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 4302 dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 4303 result = 2; 4304 } 4305 else 4306 { 4307 dst[dstIndex] = (char) codePoint; 4308 result = 1; 4309 } 4310 return result; 4311 } 4312 4313 /** 4314 * Return number of 16-bit characters required to represent the given 4315 * code point. 4316 * 4317 * @param codePoint a unicode code point 4318 * 4319 * @return 2 if codePoint >= 0x10000, 1 otherwise. 4320 * 4321 * @since 1.5 4322 */ charCount(int codePoint)4323 public static int charCount(int codePoint) 4324 { 4325 return 4326 (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 4327 ? 2 4328 : 1; 4329 } 4330 4331 /** 4332 * Determines whether the specified code point is 4333 * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 4334 * supplementary character range. 4335 * 4336 * @param codePoint a Unicode code point 4337 * 4338 * @return <code>true</code> if code point is in supplementary range 4339 * 4340 * @since 1.5 4341 */ isSupplementaryCodePoint(int codePoint)4342 public static boolean isSupplementaryCodePoint(int codePoint) 4343 { 4344 return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 4345 && codePoint <= MAX_CODE_POINT; 4346 } 4347 4348 /** 4349 * Determines whether the specified code point is 4350 * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 4351 * 4352 * @param codePoint a Unicode code point 4353 * 4354 * @return <code>true</code> if code point is valid 4355 * 4356 * @since 1.5 4357 */ isValidCodePoint(int codePoint)4358 public static boolean isValidCodePoint(int codePoint) 4359 { 4360 return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 4361 } 4362 4363 /** 4364 * Return true if the given character is a high surrogate. 
4365 * @param ch the character 4366 * @return true if the character is a high surrogate character 4367 * 4368 * @since 1.5 4369 */ isHighSurrogate(char ch)4370 public static boolean isHighSurrogate(char ch) 4371 { 4372 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 4373 } 4374 4375 /** 4376 * Return true if the given character is a low surrogate. 4377 * @param ch the character 4378 * @return true if the character is a low surrogate character 4379 * 4380 * @since 1.5 4381 */ isLowSurrogate(char ch)4382 public static boolean isLowSurrogate(char ch) 4383 { 4384 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 4385 } 4386 4387 /** 4388 * Return true if the given characters compose a surrogate pair. 4389 * This is true if the first character is a high surrogate and the 4390 * second character is a low surrogate. 4391 * @param ch1 the first character 4392 * @param ch2 the first character 4393 * @return true if the characters compose a surrogate pair 4394 * 4395 * @since 1.5 4396 */ isSurrogatePair(char ch1, char ch2)4397 public static boolean isSurrogatePair(char ch1, char ch2) 4398 { 4399 return isHighSurrogate(ch1) && isLowSurrogate(ch2); 4400 } 4401 4402 /** 4403 * Given a valid surrogate pair, this returns the corresponding 4404 * code point. 4405 * @param high the high character of the pair 4406 * @param low the low character of the pair 4407 * @return the corresponding code point 4408 * 4409 * @since 1.5 4410 */ toCodePoint(char high, char low)4411 public static int toCodePoint(char high, char low) 4412 { 4413 return ((high - MIN_HIGH_SURROGATE) * 0x400) + 4414 (low - MIN_LOW_SURROGATE) + 0x10000; 4415 } 4416 4417 /** 4418 * Get the code point at the specified index in the CharSequence. 4419 * This is like CharSequence#charAt(int), but if the character is 4420 * the start of a surrogate pair, and there is a following 4421 * character, and this character completes the pair, then the 4422 * corresponding supplementary code point is returned. 
Otherwise, 4423 * the character at the index is returned. 4424 * 4425 * @param sequence the CharSequence 4426 * @param index the index of the codepoint to get, starting at 0 4427 * @return the codepoint at the specified index 4428 * @throws IndexOutOfBoundsException if index is negative or >= length() 4429 * @since 1.5 4430 */ codePointAt(CharSequence sequence, int index)4431 public static int codePointAt(CharSequence sequence, int index) 4432 { 4433 int len = sequence.length(); 4434 if (index < 0 || index >= len) 4435 throw new IndexOutOfBoundsException(); 4436 char high = sequence.charAt(index); 4437 if (! isHighSurrogate(high) || ++index >= len) 4438 return high; 4439 char low = sequence.charAt(index); 4440 if (! isLowSurrogate(low)) 4441 return high; 4442 return toCodePoint(high, low); 4443 } 4444 4445 /** 4446 * Get the code point at the specified index in the CharSequence. 4447 * If the character is the start of a surrogate pair, and there is a 4448 * following character, and this character completes the pair, then 4449 * the corresponding supplementary code point is returned. 4450 * Otherwise, the character at the index is returned. 4451 * 4452 * @param chars the character array in which to look 4453 * @param index the index of the codepoint to get, starting at 0 4454 * @return the codepoint at the specified index 4455 * @throws IndexOutOfBoundsException if index is negative or >= length() 4456 * @since 1.5 4457 */ codePointAt(char[] chars, int index)4458 public static int codePointAt(char[] chars, int index) 4459 { 4460 return codePointAt(chars, index, chars.length); 4461 } 4462 4463 /** 4464 * Get the code point at the specified index in the CharSequence. 4465 * If the character is the start of a surrogate pair, and there is a 4466 * following character within the specified range, and this 4467 * character completes the pair, then the corresponding 4468 * supplementary code point is returned. Otherwise, the character 4469 * at the index is returned. 
4470 * 4471 * @param chars the character array in which to look 4472 * @param index the index of the codepoint to get, starting at 0 4473 * @param limit the limit past which characters should not be examined 4474 * @return the codepoint at the specified index 4475 * @throws IndexOutOfBoundsException if index is negative or >= 4476 * limit, or if limit is negative or >= the length of the array 4477 * @since 1.5 4478 */ codePointAt(char[] chars, int index, int limit)4479 public static int codePointAt(char[] chars, int index, int limit) 4480 { 4481 if (index < 0 || index >= limit || limit < 0 || limit > chars.length) 4482 throw new IndexOutOfBoundsException(); 4483 char high = chars[index]; 4484 if (! isHighSurrogate(high) || ++index >= limit) 4485 return high; 4486 char low = chars[index]; 4487 if (! isLowSurrogate(low)) 4488 return high; 4489 return toCodePoint(high, low); 4490 } 4491 4492 /** 4493 * Get the code point before the specified index. This is like 4494 * #codePointAt(char[], int), but checks the characters at 4495 * <code>index-1</code> and <code>index-2</code> to see if they form 4496 * a supplementary code point. If they do not, the character at 4497 * <code>index-1</code> is returned. 4498 * 4499 * @param chars the character array 4500 * @param index the index just past the codepoint to get, starting at 0 4501 * @return the codepoint at the specified index 4502 * @throws IndexOutOfBoundsException if index is negative or >= length() 4503 * @since 1.5 4504 */ codePointBefore(char[] chars, int index)4505 public static int codePointBefore(char[] chars, int index) 4506 { 4507 return codePointBefore(chars, index, 1); 4508 } 4509 4510 /** 4511 * Get the code point before the specified index. This is like 4512 * #codePointAt(char[], int), but checks the characters at 4513 * <code>index-1</code> and <code>index-2</code> to see if they form 4514 * a supplementary code point. If they do not, the character at 4515 * <code>index-1</code> is returned. 
The start parameter is used to 4516 * limit the range of the array which may be examined. 4517 * 4518 * @param chars the character array 4519 * @param index the index just past the codepoint to get, starting at 0 4520 * @param start the index before which characters should not be examined 4521 * @return the codepoint at the specified index 4522 * @throws IndexOutOfBoundsException if index is > start or > 4523 * the length of the array, or if limit is negative or >= the 4524 * length of the array 4525 * @since 1.5 4526 */ codePointBefore(char[] chars, int index, int start)4527 public static int codePointBefore(char[] chars, int index, int start) 4528 { 4529 if (index < start || index > chars.length 4530 || start < 0 || start >= chars.length) 4531 throw new IndexOutOfBoundsException(); 4532 --index; 4533 char low = chars[index]; 4534 if (! isLowSurrogate(low) || --index < start) 4535 return low; 4536 char high = chars[index]; 4537 if (! isHighSurrogate(high)) 4538 return low; 4539 return toCodePoint(high, low); 4540 } 4541 4542 /** 4543 * Get the code point before the specified index. This is like 4544 * #codePointAt(CharSequence, int), but checks the characters at 4545 * <code>index-1</code> and <code>index-2</code> to see if they form 4546 * a supplementary code point. If they do not, the character at 4547 * <code>index-1</code> is returned. 4548 * 4549 * @param sequence the CharSequence 4550 * @param index the index just past the codepoint to get, starting at 0 4551 * @return the codepoint at the specified index 4552 * @throws IndexOutOfBoundsException if index is negative or >= length() 4553 * @since 1.5 4554 */ codePointBefore(CharSequence sequence, int index)4555 public static int codePointBefore(CharSequence sequence, int index) 4556 { 4557 int len = sequence.length(); 4558 if (index < 1 || index > len) 4559 throw new IndexOutOfBoundsException(); 4560 --index; 4561 char low = sequence.charAt(index); 4562 if (! 
isLowSurrogate(low) || --index < 0) 4563 return low; 4564 char high = sequence.charAt(index); 4565 if (! isHighSurrogate(high)) 4566 return low; 4567 return toCodePoint(high, low); 4568 } 4569 } // class Character 4570