1 /* Copyright 2002-2006, 2018 Elliotte Rusty Harold 2 3 This library is free software; you can redistribute it and/or modify 4 it under the terms of version 2.1 of the GNU Lesser General Public 5 License as published by the Free Software Foundation. 6 7 This library is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU Lesser General Public License for more details. 11 12 You should have received a copy of the GNU Lesser General Public 13 License along with this library; if not, write to the 14 Free Software Foundation, Inc., 59 Temple Place, Suite 330, 15 Boston, MA 02111-1307 USA 16 17 You can contact Elliotte Rusty Harold by sending e-mail to 18 elharo@ibiblio.org. Please include the word "XOM" in the 19 subject line. The XOM home page is located at http://www.xom.nu/ 20 */ 21 22 package nu.xom; 23 24 import java.io.DataInputStream; 25 import java.io.IOException; 26 import java.io.InputStream; 27 import java.io.Reader; 28 import java.io.StringReader; 29 import java.util.StringTokenizer; 30 31 import org.xml.sax.EntityResolver; 32 import org.xml.sax.InputSource; 33 import org.xml.sax.SAXException; 34 import org.xml.sax.XMLReader; 35 36 /** 37 * <p> 38 * <code>Verifier</code> checks names and data for 39 * compliance with XML 1.0 and Namespaces in XML rules. 40 * </p> 41 * 42 * @author Elliotte Rusty Harold 43 * @version 1.2.11 44 * 45 */ 46 final class Verifier { 47 Verifier()48 private Verifier() {} 49 50 // constants for the bit flags in the characters lookup table 51 private final static byte XML_CHARACTER = 1; 52 private final static byte NAME_CHARACTER = 2; 53 private final static byte NAME_START_CHARACTER = 4; 54 private final static byte NCNAME_CHARACTER = 8; 55 56 private static byte[] flags = null; 57 58 static { 59 60 ClassLoader loader = Verifier.class.getClassLoader(); 61 if (loader != null) loadFlags(loader); 62 // If that didn't work, try a different ClassLoader 63 if (flags == null) { 64 loader = Thread.currentThread().getContextClassLoader(); 65 loadFlags(loader); 66 } 67 68 } 69 70 loadFlags(ClassLoader loader)71 private static void loadFlags(ClassLoader loader) { 72 73 DataInputStream in = null; 74 try { 75 InputStream raw = loader.getResourceAsStream("nu/xom/characters.dat"); 76 if (raw == null) { 77 throw new RuntimeException("Broken XOM installation: " 78 + "could not load nu/xom/characters.dat"); 79 } 80 // buffer this???? 81 in = new DataInputStream(raw); 82 flags = new byte[65536]; 83 in.readFully(flags); 84 } 85 catch (IOException ex) { 86 throw new RuntimeException("Broken XOM installation: " 87 + "could not load nu/xom/characters.dat"); 88 } 89 finally { 90 try { 91 if (in != null) in.close(); 92 } 93 catch (IOException ex) { 94 // no big deal 95 } 96 } 97 98 } 99 100 101 /** 102 * <p> 103 * Check whether <code>name</code> is 104 * a non-colonized name as defined in 105 * <cite>Namespaces in XML</cite>. 106 * </p> 107 * 108 * @param name <code>String</code> name to check 109 * 110 * @throws IllegalNameException if <code>name</code> is not a 111 * non-colonized name 112 */ checkNCName(String name)113 static void checkNCName(String name) { 114 115 if (name == null) { 116 throwIllegalNameException(name, "NCNames cannot be null"); 117 } 118 119 int length = name.length(); 120 if (length == 0) { 121 throwIllegalNameException(name, "NCNames cannot be empty"); 122 } 123 124 char first = name.charAt(0); 125 if ((flags[first] & NAME_START_CHARACTER) == 0) { 126 throwIllegalNameException(name, "NCNames cannot start " + 127 "with the character " + Integer.toHexString(first)); 128 } 129 130 for (int i = 1; i < length; i++) { 131 char c = name.charAt(i); 132 if ((flags[c] & NCNAME_CHARACTER) == 0) { 133 if (c == ':') { 134 throwIllegalNameException(name, "NCNames cannot contain colons"); 135 } 136 else { 137 throwIllegalNameException(name, "0x" 138 + Integer.toHexString(c) + " is not a legal NCName character"); 139 } 140 } 141 } 142 143 } 144 145 throwIllegalNameException(String name, String message)146 private static void throwIllegalNameException(String name, String message) { 147 IllegalNameException ex = new IllegalNameException(message); 148 ex.setData(name); 149 throw ex; 150 } 151 152 throwIllegalCharacterDataException(String data, String message)153 private static void throwIllegalCharacterDataException(String data, String message) { 154 IllegalDataException ex = new IllegalCharacterDataException(message); 155 ex.setData(data); 156 throw ex; 157 } 158 159 throwMalformedURIException(String uri, String message)160 private static void throwMalformedURIException(String uri, String message) { 161 MalformedURIException ex = new MalformedURIException(message); 162 ex.setData(uri); 163 throw ex; 164 } 165 166 167 /** 168 * <p> 169 * This methods checks whether a string contains only 170 * characters allowed by the XML 1.0 specification. 171 * </p> 172 * 173 * @param text <code>String</code> value to verify 174 * 175 * @throws IllegalCharacterDataException if <code>text</code> is 176 * not legal PCDATA 177 */ checkPCDATA(String text)178 static void checkPCDATA(String text) { 179 180 if (text == null) throw new IllegalCharacterDataException("Null text"); 181 182 char[] data = text.toCharArray(); 183 for (int i = 0, len = data.length; i < len; i++) { 184 int result = data[i]; 185 if (result >= 0xD800 && result <= 0xDBFF) { 186 try { 187 int low = data[i+1]; 188 if (low < 0xDC00 || low > 0xDFFF) { 189 IllegalCharacterDataException ex 190 = new IllegalCharacterDataException("Bad surrogate pair"); 191 ex.setData(text); 192 throw ex; 193 } 194 i++; // increment past low surrogate 195 } 196 catch (ArrayIndexOutOfBoundsException ex) { 197 IllegalCharacterDataException ide 198 = new IllegalCharacterDataException("Bad Surrogate Pair", ex); 199 ide.setData(text); 200 throw ide; 201 } 202 // all properly matched surrogate pairs are legal in PCDATA 203 } // end if 204 else if ((flags[result] & XML_CHARACTER) == 0) { 205 throwIllegalCharacterDataException(text, "0x" 206 + Integer.toHexString(result) 207 + " is not allowed in XML content"); 208 } 209 210 } 211 212 } 213 214 215 /** 216 * <p> 217 * Checks a string to see if it is a syntactically correct 218 * RFC 3986 URI reference. Both absolute and relative 219 * URIs are supported, as are URIs with fragment identifiers. 220 * </p> 221 * 222 * @param uri <code>String</code> containing the potential URI 223 * 224 * @throws MalformedURIException if this is not a 225 * legal URI reference 226 */ checkURIReference(String uri)227 static void checkURIReference(String uri) { 228 229 if ((uri == null) || uri.length() == 0) return; 230 231 URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri); 232 try { 233 if (parsed.scheme != null) checkScheme(parsed.scheme); 234 if (parsed.authority != null) checkAuthority(parsed.authority); 235 checkPath(parsed.path); 236 if (parsed.fragment != null) checkFragment(parsed.fragment); 237 if (parsed.query != null) checkQuery(parsed.query); 238 } 239 catch (MalformedURIException ex) { 240 ex.setData(uri); 241 throw ex; 242 } 243 244 } 245 246 checkQuery(final String query)247 private static void checkQuery(final String query) { 248 249 int length = query.length(); 250 for (int i = 0; i < length; i++) { 251 char c = query.charAt(i); 252 if (c == '%') { 253 try { 254 if (!isHexDigit(query.charAt(i+1)) || !isHexDigit(query.charAt(i+2))) { 255 throwMalformedURIException(query, 256 "Bad percent escape sequence"); 257 } 258 } 259 catch (StringIndexOutOfBoundsException ex) { 260 throwMalformedURIException(query, 261 "Bad percent escape sequence"); 262 } 263 i += 2; 264 } 265 else if (!isQueryCharacter(c)) { 266 throw new MalformedURIException( 267 "Illegal query character " + c 268 ); 269 } 270 } 271 272 } 273 274 275 // same for fragment ID isQueryCharacter(char c)276 private static boolean isQueryCharacter(char c) { 277 278 switch(c) { 279 case '!': return true; 280 case '"': return false; 281 case '#': return false; 282 case '$': return true; 283 case '%': return false; // tested in checkQuery 284 case '&': return true; 285 case '\'': return true; 286 case '(': return true; 287 case ')': return true; 288 case '*': return true; 289 case '+': return true; 290 case ',': return true; 291 case '-': return true; 292 case '.': return true; 293 case '/': return true; 294 case '0': return true; 295 case '1': return true; 296 case '2': return true; 297 case '3': return true; 298 case '4': return true; 299 case '5': return true; 300 case '6': return true; 301 case '7': return true; 302 case '8': return true; 303 case '9': return true; 304 case ':': return true; 305 case ';': return true; 306 case '<': return false; 307 case '=': return true; 308 case '>': return false; 309 case '?': return true; 310 case '@': return true; 311 case 'A': return true; 312 case 'B': return true; 313 case 'C': return true; 314 case 'D': return true; 315 case 'E': return true; 316 case 'F': return true; 317 case 'G': return true; 318 case 'H': return true; 319 case 'I': return true; 320 case 'J': return true; 321 case 'K': return true; 322 case 'L': return true; 323 case 'M': return true; 324 case 'N': return true; 325 case 'O': return true; 326 case 'P': return true; 327 case 'Q': return true; 328 case 'R': return true; 329 case 'S': return true; 330 case 'T': return true; 331 case 'U': return true; 332 case 'V': return true; 333 case 'W': return true; 334 case 'X': return true; 335 case 'Y': return true; 336 case 'Z': return true; 337 case '[': return false; 338 case '\\': return false; 339 case ']': return false; 340 case '^': return false; 341 case '_': return true; 342 case '`': return false; 343 case 'a': return true; 344 case 'b': return true; 345 case 'c': return true; 346 case 'd': return true; 347 case 'e': return true; 348 case 'f': return true; 349 case 'g': return true; 350 case 'h': return true; 351 case 'i': return true; 352 case 'j': return true; 353 case 'k': return true; 354 case 'l': return true; 355 case 'm': return true; 356 case 'n': return true; 357 case 'o': return true; 358 case 'p': return true; 359 case 'q': return true; 360 case 'r': return true; 361 case 's': return true; 362 case 't': return true; 363 case 'u': return true; 364 case 'v': return true; 365 case 'w': return true; 366 case 'x': return true; 367 case 'y': return true; 368 case 'z': return true; 369 case '{': return false; 370 case '|': return false; 371 case '}': return false; 372 case '~': return true; 373 } 374 return false; 375 376 } 377 378 checkFragment(String fragment)379 private static void checkFragment(String fragment) { 380 // The BNF for fragments is the same as for query strings 381 checkQuery(fragment); 382 } 383 384 385 // Besides the legal characters issues, a path must 386 // not contain two consecutive forward slashes checkPath(final String path)387 private static void checkPath(final String path) { 388 389 int length = path.length(); 390 char[] text = path.toCharArray(); 391 for (int i = 0; i < length; i++) { 392 char c = text[i]; 393 if (c == '/') { 394 if (i < length-1) { 395 if (text[i+1] == '/') { 396 throwMalformedURIException(path, 397 "Double slash (//) in path"); 398 } 399 } 400 } 401 else if (c == '%') { 402 try { 403 if (!isHexDigit(text[i+1]) 404 || !isHexDigit(text[i+2])) { 405 throwMalformedURIException(path, 406 "Bad percent escape sequence"); 407 } 408 } 409 catch (ArrayIndexOutOfBoundsException ex) { 410 throwMalformedURIException(path, 411 "Bad percent escape sequence"); 412 } 413 i += 2; 414 } 415 else if (!isPathCharacter(c)) { 416 throwMalformedURIException(path, 417 "Illegal path character " + c 418 ); 419 } 420 } 421 422 } 423 424 checkAuthority(String authority)425 private static void checkAuthority(String authority) { 426 427 String userInfo = null; 428 String host = null; 429 String port = null; 430 431 int atSign = authority.indexOf('@'); 432 if (atSign != -1) { 433 userInfo = authority.substring(0, atSign); 434 authority = authority.substring(atSign+1); 435 } 436 437 int colon; 438 if (authority.startsWith("[")) { 439 colon = authority.indexOf("]:"); 440 if (colon != -1) colon = colon+1; 441 } 442 else colon = authority.indexOf(':'); 443 444 if (colon != -1) { 445 host = authority.substring(0, colon); 446 port = authority.substring(colon+1); 447 } 448 else { 449 host = authority; 450 } 451 452 if (userInfo != null) checkUserInfo(userInfo); 453 if (port != null) checkPort(port); 454 checkHost(host); 455 456 } 457 458 checkHost(final String host)459 private static void checkHost(final String host) { 460 461 int length = host.length(); 462 if (length == 0) return; // file URI 463 464 char[] text = host.toCharArray(); 465 if (text[0] == '[') { 466 if (text[length-1] != ']') { 467 throw new MalformedURIException("Missing closing ]"); 468 } 469 // trim [ and ] from ends of host 470 checkIPv6Address(host.substring(1, length-1)); 471 } 472 else { 473 if (length > 255) { 474 throw new MalformedURIException("Host name too long: " + host); 475 } 476 477 for (int i = 0; i < length; i++) { 478 char c = text[i]; 479 if (c == '%') { 480 try { 481 if (!isHexDigit(text[i+1]) || !isHexDigit(text[i+2])) { 482 throwMalformedURIException(host, 483 "Bad percent escape sequence"); 484 } 485 } 486 catch (ArrayIndexOutOfBoundsException ex) { 487 throwMalformedURIException(host, 488 "Bad percent escape sequence"); 489 } 490 i += 2; 491 } 492 else if (!isRegNameCharacter(c)) { 493 throwMalformedURIException(host, 494 "Illegal host character " + c 495 ); 496 } 497 } 498 } 499 } 500 501 isRegNameCharacter(char c)502 private static boolean isRegNameCharacter(char c) { 503 504 switch(c) { 505 case '!': return true; 506 case '"': return false; 507 case '#': return false; 508 case '$': return true; 509 case '%': return false; // checked separately 510 case '&': return true; 511 case '\'': return true; 512 case '(': return true; 513 case ')': return true; 514 case '*': return true; 515 case '+': return true; 516 case ',': return true; 517 case '-': return true; 518 case '.': return true; 519 case '/': return false; 520 case '0': return true; 521 case '1': return true; 522 case '2': return true; 523 case '3': return true; 524 case '4': return true; 525 case '5': return true; 526 case '6': return true; 527 case '7': return true; 528 case '8': return true; 529 case '9': return true; 530 case ':': return false; 531 case ';': return true; 532 case '<': return false; 533 case '=': return true; 534 case '>': return false; 535 case '?': return false; 536 case '@': return false; 537 case 'A': return true; 538 case 'B': return true; 539 case 'C': return true; 540 case 'D': return true; 541 case 'E': return true; 542 case 'F': return true; 543 case 'G': return true; 544 case 'H': return true; 545 case 'I': return true; 546 case 'J': return true; 547 case 'K': return true; 548 case 'L': return true; 549 case 'M': return true; 550 case 'N': return true; 551 case 'O': return true; 552 case 'P': return true; 553 case 'Q': return true; 554 case 'R': return true; 555 case 'S': return true; 556 case 'T': return true; 557 case 'U': return true; 558 case 'V': return true; 559 case 'W': return true; 560 case 'X': return true; 561 case 'Y': return true; 562 case 'Z': return true; 563 case '[': return false; 564 case '\\': return false; 565 case ']': return false; 566 case '^': return false; 567 case '_': return true; 568 case '`': return false; 569 case 'a': return true; 570 case 'b': return true; 571 case 'c': return true; 572 case 'd': return true; 573 case 'e': return true; 574 case 'f': return true; 575 case 'g': return true; 576 case 'h': return true; 577 case 'i': return true; 578 case 'j': return true; 579 case 'k': return true; 580 case 'l': return true; 581 case 'm': return true; 582 case 'n': return true; 583 case 'o': return true; 584 case 'p': return true; 585 case 'q': return true; 586 case 'r': return true; 587 case 's': return true; 588 case 't': return true; 589 case 'u': return true; 590 case 'v': return true; 591 case 'w': return true; 592 case 'x': return true; 593 case 'y': return true; 594 case 'z': return true; 595 case '{': return false; 596 case '|': return false; 597 case '}': return false; 598 case '~': return true; 599 } 600 return false; 601 602 } 603 604 checkPort(String port)605 private static void checkPort(String port) { 606 607 for (int i = port.length()-1; i >= 0; i--) { 608 char c = port.charAt(i); 609 if (c < '0' || c > '9') { 610 throw new MalformedURIException("Bad port: " + port); 611 } 612 } 613 614 } 615 616 checkUserInfo(String userInfo)617 private static void checkUserInfo(String userInfo) { 618 619 int length = userInfo.length(); 620 for (int i = 0; i < length; i++) { 621 char c = userInfo.charAt(i); 622 if (c == '%') { 623 try { 624 if (!isHexDigit(userInfo.charAt(i+1)) 625 || !isHexDigit(userInfo.charAt(i+2))) { 626 throwMalformedURIException(userInfo, 627 "Bad percent escape sequence"); 628 } 629 } 630 catch (StringIndexOutOfBoundsException ex) { 631 throwMalformedURIException(userInfo, 632 "Bad percent escape sequence"); 633 } 634 i += 2; 635 } 636 else if (!isUserInfoCharacter(c)) { 637 throw new MalformedURIException("Bad user info: " + userInfo); 638 } 639 } 640 641 } 642 643 checkScheme(String scheme)644 private static void checkScheme(String scheme) { 645 646 // http is probably 99% of cases so check it first 647 if ("http".equals(scheme)) return; 648 649 if (scheme.length() == 0) { 650 throw new MalformedURIException("URIs cannot begin with a colon"); 651 } 652 char c = scheme.charAt(0); 653 if (!isAlpha(c)) { 654 throw new MalformedURIException( 655 "Illegal initial scheme character " + c); 656 } 657 658 for (int i = scheme.length()-1; i >= 1; i--) { 659 c = scheme.charAt(i); 660 if (!isSchemeCharacter(c)) { 661 throw new MalformedURIException( 662 "Illegal scheme character " + c 663 ); 664 } 665 } 666 667 } 668 669 670 // http://www.faqs.org/rfcs/rfc2373.html 671 // http://www.faqs.org/rfcs/rfc2732.html checkIPv6Address(String ip6Address)672 private static void checkIPv6Address(String ip6Address) { 673 674 StringTokenizer st = new StringTokenizer(ip6Address, ":", true); 675 int numTokens = st.countTokens(); 676 if (numTokens > 15 || numTokens < 2) { 677 throw new MalformedURIException( 678 "Illegal IPv6 host address: " + ip6Address 679 ); 680 } 681 for (int i = 0; i < numTokens; i++) { 682 String hexPart = st.nextToken(); 683 if (":".equals(hexPart)) continue; 684 try { 685 int part = Integer.parseInt(hexPart, 16); 686 if (part < 0) { 687 throw new MalformedURIException( 688 "Illegal IPv6 host address: " + ip6Address 689 ); 690 } 691 } 692 catch (NumberFormatException ex) { 693 if (i == numTokens-1) { 694 checkIP4Address(hexPart, ip6Address); 695 } 696 else { 697 throwMalformedURIException(ip6Address, 698 "Illegal IPv6 host address: " + ip6Address 699 ); 700 } 701 } 702 } 703 704 if (ip6Address.indexOf("::") != ip6Address.lastIndexOf("::")) { 705 throw new MalformedURIException( 706 "Illegal IPv6 host address: " + ip6Address 707 ); 708 } 709 710 } 711 712 checkIP4Address(String address, String ip6Address)713 private static void checkIP4Address(String address, String ip6Address) { 714 715 StringTokenizer st = new StringTokenizer(address, "."); 716 int numTokens = st.countTokens(); 717 if (numTokens != 4) { 718 throw new MalformedURIException( 719 "Illegal IPv6 host address: " + ip6Address 720 ); 721 } 722 for (int i = 0; i < 4; i++) { 723 String decPart = st.nextToken(); 724 // https://github.com/elharo/xom/issues/12 725 if (decPart.startsWith("+")) { 726 throw new MalformedURIException( 727 "Illegal IPv6 host address: " + ip6Address 728 ); 729 } 730 try { 731 int dec = Integer.parseInt(decPart); 732 if (dec > 255 || dec < 0) { 733 throw new MalformedURIException( 734 "Illegal IPv6 host address: " + ip6Address 735 ); 736 } 737 } 738 catch (NumberFormatException ex) { 739 throw new MalformedURIException( 740 "Illegal IPv6 host address: " + ip6Address 741 ); 742 } 743 } 744 745 } 746 747 checkXMLName(String name)748 static void checkXMLName(String name) { 749 750 if (name == null) { 751 throwIllegalNameException(name, "XML names cannot be null"); 752 } 753 754 int length = name.length(); 755 if (length == 0) { 756 throwIllegalNameException(name, "XML names cannot be empty"); 757 } 758 759 char first = name.charAt(0); 760 if ((flags[first] & NAME_START_CHARACTER) == 0) { 761 throwIllegalNameException(name, "XML names cannot start " + 762 "with the character " + Integer.toHexString(first)); 763 } 764 765 for (int i = 1; i < length; i++) { 766 char c = name.charAt(i); 767 if ((flags[c] & NAME_CHARACTER) == 0) { 768 throwIllegalNameException(name, "0x" 769 + Integer.toHexString(c) 770 + " is not a legal name character"); 771 } 772 } 773 774 } 775 776 777 private static boolean[] C0Table = new boolean[0x21]; 778 static { 779 C0Table['\n'] = true; 780 C0Table['\r'] = true; 781 C0Table['\t'] = true; 782 C0Table[' '] = true; 783 } 784 785 isXMLSpaceCharacter(char c)786 static boolean isXMLSpaceCharacter(char c) { 787 if (c > ' ') return false; 788 return C0Table[c]; 789 } 790 791 isHexDigit(char c)792 private static boolean isHexDigit(char c) { 793 794 switch(c) { 795 case '0': return true; 796 case '1': return true; 797 case '2': return true; 798 case '3': return true; 799 case '4': return true; 800 case '5': return true; 801 case '6': return true; 802 case '7': return true; 803 case '8': return true; 804 case '9': return true; 805 case ':': return false; 806 case ';': return false; 807 case '<': return false; 808 case '=': return false; 809 case '>': return false; 810 case '?': return false; 811 case '@': return false; 812 case 'A': return true; 813 case 'B': return true; 814 case 'C': return true; 815 case 'D': return true; 816 case 'E': return true; 817 case 'F': return true; 818 case 'G': return false; 819 case 'H': return false; 820 case 'I': return false; 821 case 'J': return false; 822 case 'K': return false; 823 case 'L': return false; 824 case 'M': return false; 825 case 'N': return false; 826 case 'O': return false; 827 case 'P': return false; 828 case 'Q': return false; 829 case 'R': return false; 830 case 'S': return false; 831 case 'T': return false; 832 case 'U': return false; 833 case 'V': return false; 834 case 'W': return false; 835 case 'X': return false; 836 case 'Y': return false; 837 case 'Z': return false; 838 case '[': return false; 839 case '\\': return false; 840 case ']': return false; 841 case '^': return false; 842 case '_': return false; 843 case '`': return false; 844 case 'a': return true; 845 case 'b': return true; 846 case 'c': return true; 847 case 'd': return true; 848 case 'e': return true; 849 case 'f': return true; 850 } 851 return false; 852 } 853 854 855 // Since namespace URIs are commonly repeated, we can save a lot 856 // of redundant code by storing the ones we've seen before. 857 private static URICache cache = new URICache(); 858 859 private final static class URICache { 860 861 private final static int LOAD = 6; 862 private String[] cache = new String[LOAD]; 863 private int position = 0; 864 contains(String s)865 synchronized boolean contains(String s) { 866 867 for (int i = 0; i < LOAD; i++) { 868 // Here I'm assuming the namespace URIs are interned. 869 // This is commonly but not always true. This won't 870 // break if they haven't been. Using equals() instead 871 // of == is faster when the namespace URIs haven't been 872 // interned but slower if they have. 873 if (s == cache[i]) { 874 return true; 875 } 876 } 877 return false; 878 879 } 880 put(String s)881 synchronized void put(String s) { 882 cache[position] = s; 883 position++; 884 if (position == LOAD) position = 0; 885 } 886 887 } 888 889 890 /** 891 * <p> 892 * Checks a string to see if it is an RFC 3986 absolute 893 * URI reference. URI references can contain fragment identifiers. 894 * Absolute URI references must have a scheme. 895 * </p> 896 * 897 * @param uri <code>String</code> to check 898 * 899 * @throws MalformedURIException if this is not a legal 900 * URI reference 901 */ checkAbsoluteURIReference(String uri)902 static void checkAbsoluteURIReference(String uri) { 903 904 if (cache.contains(uri)) { 905 return; 906 } 907 URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri); 908 try { 909 if (parsed.scheme == null) { 910 throwMalformedURIException( 911 uri, "Missing scheme in absolute URI reference"); 912 } 913 checkScheme(parsed.scheme); 914 if (parsed.authority != null) checkAuthority(parsed.authority); 915 checkPath(parsed.path); 916 if (parsed.fragment != null) checkFragment(parsed.fragment); 917 if (parsed.query != null) checkQuery(parsed.query); 918 cache.put(uri); 919 } 920 catch (MalformedURIException ex) { 921 ex.setData(uri); 922 throw ex; 923 } 924 925 } 926 927 isAlpha(char c)928 static boolean isAlpha(char c) { 929 930 switch(c) { 931 case 'A': return true; 932 case 'B': return true; 933 case 'C': return true; 934 case 'D': return true; 935 case 'E': return true; 936 case 'F': return true; 937 case 'G': return true; 938 case 'H': return true; 939 case 'I': return true; 940 case 'J': return true; 941 case 'K': return true; 942 case 'L': return true; 943 case 'M': return true; 944 case 'N': return true; 945 case 'O': return true; 946 case 'P': return true; 947 case 'Q': return true; 948 case 'R': return true; 949 case 'S': return true; 950 case 'T': return true; 951 case 'U': return true; 952 case 'V': return true; 953 case 'W': return true; 954 case 'X': return true; 955 case 'Y': return true; 956 case 'Z': return true; 957 case '[': return false; 958 case '\\': return false; 959 case ']': return false; 960 case '^': return false; 961 case '_': return false; 962 case '`': return false; 963 case 'a': return true; 964 case 'b': return true; 965 case 'c': return true; 966 case 'd': return true; 967 case 'e': return true; 968 case 'f': return true; 969 case 'g': return true; 970 case 'h': return true; 971 case 'i': return true; 972 case 'j': return true; 973 case 'k': return true; 974 case 'l': return true; 975 case 'm': return true; 976 case 'n': return true; 977 case 'o': return true; 978 case 'p': return true; 979 case 'q': return true; 980 case 'r': return true; 981 case 's': return true; 982 case 't': return true; 983 case 'u': return true; 984 case 'v': return true; 985 case 'w': return true; 986 case 'x': return true; 987 case 'y': return true; 988 case 'z': return true; 989 } 990 991 return false; 992 993 } 994 995 isSchemeCharacter(char c)996 static boolean isSchemeCharacter(char c) { 997 998 /* The : and the ? cannot be reached here because they'll 999 * have been parsed out separately before this method is 1000 * called. They're included here strictly for alignment 1001 * so the compiler will generate a table lookup. 1002 */ 1003 switch(c) { 1004 case '+': return true; 1005 case ',': return false; 1006 case '-': return true; 1007 case '.': return true; 1008 case '/': return false; 1009 case '0': return true; 1010 case '1': return true; 1011 case '2': return true; 1012 case '3': return true; 1013 case '4': return true; 1014 case '5': return true; 1015 case '6': return true; 1016 case '7': return true; 1017 case '8': return true; 1018 case '9': return true; 1019 case ':': return false; // unreachable 1020 case ';': return false; 1021 case '<': return false; 1022 case '=': return false; 1023 case '>': return false; 1024 case '?': return false; // unreachable 1025 case '@': return false; 1026 case 'A': return true; 1027 case 'B': return true; 1028 case 'C': return true; 1029 case 'D': return true; 1030 case 'E': return true; 1031 case 'F': return true; 1032 case 'G': return true; 1033 case 'H': return true; 1034 case 'I': return true; 1035 case 'J': return true; 1036 case 'K': return true; 1037 case 'L': return true; 1038 case 'M': return true; 1039 case 'N': return true; 1040 case 'O': return true; 1041 case 'P': return true; 1042 case 'Q': return true; 1043 case 'R': return true; 1044 case 'S': return true; 1045 case 'T': return true; 1046 case 'U': return true; 1047 case 'V': return true; 1048 case 'W': return true; 1049 case 'X': return true; 1050 case 'Y': return true; 1051 case 'Z': return true; 1052 case '[': return false; 1053 case '\\': return false; 1054 case ']': return false; 1055 case '^': return false; 1056 case '_': return false; 1057 case '`': return false; 1058 case 'a': return true; 1059 case 'b': return true; 1060 case 'c': return true; 1061 case 'd': return true; 1062 case 'e': return true; 1063 case 'f': return true; 1064 case 'g': return true; 1065 case 'h': return true; 1066 case 'i': return true; 1067 case 'j': return true; 1068 case 'k': return true; 1069 case 'l': return true; 1070 case 'm': return true; 1071 case 'n': return true; 1072 case 'o': return true; 1073 case 'p': return true; 1074 case 'q': return true; 1075 case 'r': return true; 1076 case 's': return true; 1077 case 't': return true; 1078 case 'u': return true; 1079 case 'v': return true; 1080 case 'w': return true; 1081 case 'x': return true; 1082 case 'y': return true; 1083 case 'z': return true; 1084 } 1085 1086 return false; 1087 1088 } 1089 1090 isPathCharacter(char c)1091 private static boolean isPathCharacter(char c) { 1092 1093 switch(c) { 1094 case '!': return true; 1095 case '"': return false; 1096 case '#': return false; 1097 case '$': return true; 1098 case '%': return false; // checked separately 1099 case '&': return true; 1100 case '\'': return true; 1101 case '(': return true; 1102 case ')': return true; 1103 case '*': return true; 1104 case '+': return true; 1105 case ',': return true; 1106 case '-': return true; 1107 case '.': return true; 1108 case '/': return false; // handled separately 1109 case '0': return true; 1110 case '1': return true; 1111 case '2': return true; 1112 case '3': return true; 1113 case '4': return true; 1114 case '5': return true; 1115 case '6': return true; 1116 case '7': return true; 1117 case '8': return true; 1118 case '9': return true; 1119 case ':': return true; 1120 case ';': return true; 1121 case '<': return false; 1122 case '=': return true; 1123 case '>': return false; 1124 case '?': return false; 1125 case '@': return true; 1126 case 'A': return true; 1127 case 'B': return true; 1128 case 'C': return true; 1129 case 'D': return true; 1130 case 'E': return true; 1131 case 'F': return true; 1132 case 'G': return true; 1133 case 'H': return true; 1134 case 'I': return true; 1135 case 'J': return true; 1136 case 'K': return true; 1137 case 'L': return true; 1138 case 'M': return true; 1139 case 'N': return true; 1140 case 'O': return true; 1141 case 'P': return true; 1142 case 'Q': return true; 1143 case 'R': return true; 1144 case 'S': return true; 1145 case 'T': return true; 1146 case 'U': return true; 1147 case 'V': return true; 1148 case 'W': return true; 1149 case 'X': return true; 1150 case 'Y': return true; 1151 case 'Z': return true; 1152 case '[': return false; 1153 case '\\': return false; 1154 case ']': return false; 1155 case '^': return false; 1156 case '_': return true; 1157 case '`': return false; 1158 case 'a': return true; 1159 case 'b': return true; 1160 case 'c': return true; 1161 case 'd': return true; 1162 case 'e': return true; 1163 case 'f': return true; 1164 case 'g': return true; 1165 case 'h': return true; 1166 case 'i': return true; 1167 case 'j': return true; 1168 case 'k': return true; 1169 case 'l': return true; 1170 case 'm': return true; 1171 case 'n': return true; 1172 case 'o': return true; 1173 case 'p': return true; 1174 case 'q': return true; 1175 case 'r': return true; 1176 case 's': return true; 1177 case 't': return true; 1178 case 'u': return true; 1179 case 'v': return true; 1180 case 'w': return true; 1181 case 'x': return true; 1182 case 'y': return true; 1183 case 'z': return true; 1184 case '{': return false; 1185 case '|': return false; 1186 case '}': return false; 1187 case '~': return true; 1188 } 1189 1190 return false; 1191 1192 } 1193 1194 isUserInfoCharacter(char c)1195 private static boolean isUserInfoCharacter(char c) { 1196 1197 switch(c) { 1198 case '!': return true; 1199 case '"': return false; 1200 case '#': return false; 1201 case '$': return true; 1202 case '%': return false; // checked separately 1203 case '&': return true; 1204 case '\'': return true; 1205 case '(': return true; 1206 case ')': return true; 1207 case '*': return true; 1208 case '+': return true; 1209 case ',': return true; 1210 case '-': return true; 1211 case '.': return true; 1212 case '/': return true; 1213 case '0': return true; 1214 case '1': return true; 1215 case '2': return true; 1216 case '3': return true; 1217 case '4': return true; 1218 case '5': return true; 1219 case '6': return true; 1220 case '7': return true; 1221 case '8': return true; 1222 case '9': return true; 1223 case ':': return true; 1224 case ';': return true; 1225 case '<': return false; 1226 case '=': return true; 1227 case '>': return false; 1228 case '?': return false; 1229 case '@': return false; 1230 case 'A': return true; 1231 case 'B': return true; 1232 case 'C': return true; 1233 case 'D': return true; 1234 case 'E': return true; 1235 case 'F': return true; 1236 case 'G': return true; 1237 case 'H': return true; 1238 case 'I': return true; 1239 case 'J': return true; 1240 case 'K': return true; 1241 case 'L': return true; 1242 case 'M': return true; 1243 case 'N': return true; 1244 case 'O': return true; 1245 case 'P': return true; 1246 case 'Q': return true; 1247 case 'R': return true; 1248 case 'S': return true; 1249 case 'T': return true; 1250 case 'U': return true; 1251 case 'V': return true; 1252 case 'W': return true; 1253 case 'X': return true; 1254 case 'Y': return true; 1255 case 'Z': return true; 1256 case '[': return false; 1257 case '\\': return false; 1258 case ']': return false; 1259 case '^': return false; 1260 case '_': return true; 1261 case '`': return false; 1262 case 'a': return true; 1263 case 'b': return true; 1264 case 'c': return true; 1265 case 'd': return true; 1266 case 'e': return true; 1267 case 'f': return true; 1268 case 'g': return true; 1269 case 'h': return true; 1270 case 'i': return true; 1271 case 'j': return true; 1272 case 'k': return true; 1273 case 'l': return true; 1274 case 'm': return true; 1275 case 'n': return true; 1276 case 'o': return true; 1277 case 'p': return true; 1278 case 'q': return true; 1279 case 'r': return true; 1280 case 's': return true; 1281 case 't': return true; 1282 case 'u': return true; 1283 case 'v': return true; 1284 case 'w': return true; 1285 case 'x': return true; 1286 case 'y': return true; 1287 case 'z': return true; 1288 case '{': return false; 1289 case '|': return false; 1290 case '}': return false; 1291 case '~': return true; 1292 } 1293 1294 return false; 1295 1296 } 1297 1298 1299 /** 1300 * Check to see that this string is an absolute URI, 1301 * neither a relative URI nor a URI reference. 1302 * 1303 */ checkAbsoluteURI(String uri)1304 static void checkAbsoluteURI(String uri) { 1305 1306 URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri); 1307 try { 1308 if (parsed.scheme == null) { 1309 throwMalformedURIException(uri, "Missing scheme in absolute URI"); 1310 } 1311 checkScheme(parsed.scheme); 1312 if (parsed.authority != null) checkAuthority(parsed.authority); 1313 checkPath(parsed.path); 1314 if (parsed.fragment != null) { 1315 throwMalformedURIException(uri, "URIs cannot have fragment identifiers"); 1316 } 1317 if (parsed.query != null) checkQuery(parsed.query); 1318 } 1319 catch (MalformedURIException ex) { 1320 ex.setData(uri); 1321 throw ex; 1322 } 1323 1324 } 1325 1326 1327 // For use in checking internal DTD subsets 1328 private static XMLReader parser; 1329 checkInternalDTDSubset(String subset)1330 static synchronized void checkInternalDTDSubset(String subset) { 1331 1332 if (parser == null) { 1333 final InputSource empty = new InputSource(new EmptyReader()); 1334 parser = Builder.findParser(false); 1335 // parser = new org.apache.crimson.parser.XMLReaderImpl(); 1336 // Now let's stop this parser from loading any external 1337 // entities the subset references 1338 parser.setEntityResolver(new EntityResolver() { 1339 1340 public InputSource resolveEntity(String publicID, String systemID) { 1341 return empty; 1342 } 1343 1344 }); 1345 } 1346 1347 String doc = "<!DOCTYPE a [" + subset + "]><a/>"; 1348 try { 1349 InputSource source = new InputSource(new StringReader(doc)); 1350 // just to make sure relative URLs can be resolved; don't 1351 // actually need to connect to this; the EntityResolver 1352 // prevents that 1353 source.setSystemId("http://www.example.org/"); 1354 parser.parse(source); 1355 } 1356 catch (SAXException ex) { 1357 IllegalDataException idex = new IllegalDataException( 1358 "Malformed internal DTD subset: " + ex.getMessage(), ex); 1359 idex.setData(subset); 1360 throw idex; 1361 } 1362 catch (IOException ex) { 1363 throw new RuntimeException("BUG: I don't think this can happen"); 1364 } 1365 1366 } 1367 1368 1369 // A reader that immediately returns end of stream. This is a great 1370 // big hack to avoid reading anything when setting the internal 1371 // DTD subset. I could use the 1372 // http://xml.org/sax/features/external-parameter-entities SAX 1373 // feature, but many parsers don't reliably implement that so 1374 // instead we simply pretend that all URLs point to empty files. 1375 private static class EmptyReader extends Reader { 1376 read(char[] text, int start, int length)1377 public int read(char[] text, int start, int length) throws IOException { 1378 return -1; 1379 } 1380 close()1381 public void close() {} 1382 1383 } 1384 1385 1386 }