1 /* URI.java -- An URI class 2 Copyright (C) 2002, 2004, 2005, 2006, 2008 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 39 package java.net; 40 41 import gnu.java.lang.CPStringBuilder; 42 43 import java.io.IOException; 44 import java.io.ObjectInputStream; 45 import java.io.ObjectOutputStream; 46 import java.io.Serializable; 47 import java.util.regex.Matcher; 48 import java.util.regex.Pattern; 49 50 /** 51 * <p> 52 * A URI instance represents that defined by 53 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>, 54 * with some deviations. 55 * </p> 56 * <p> 57 * At its highest level, a URI consists of: 58 * </p> 59 * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em> 60 * [<strong>#</strong><em>fragment</em>]</code> 61 * </p> 62 * <p> 63 * where <strong>#</strong> and <strong>:</strong> are literal characters, 64 * and those parts enclosed in square brackets are optional. 65 * </p> 66 * <p> 67 * There are two main types of URI. An <em>opaque</em> URI is one 68 * which just consists of the above three parts, and is not further 69 * defined. An example of such a URI would be <em>mailto:</em> URI. 70 * In contrast, <em>hierarchical</em> URIs give further definition 71 * to the scheme-specific part, so as represent some part of a hierarchical 72 * structure. 73 * </p> 74 * <p> 75 * <code>[<strong>//</strong><em>authority</em>][<em>path</em>] 76 * [<strong>?</strong><em>query</em>]</code> 77 * </p> 78 * <p> 79 * with <strong>/</strong> and <strong>?</strong> being literal characters. 80 * When server-based, the authority section is further subdivided into: 81 * </p> 82 * <p> 83 * <code>[<em>user-info</em><strong>@</strong>]<em>host</em> 84 * [<strong>:</strong><em>port</em>]</code> 85 * </p> 86 * <p> 87 * with <strong>@</strong> and <strong>:</strong> as literal characters. 88 * Authority sections that are not server-based are said to be registry-based. 89 * </p> 90 * <p> 91 * Hierarchical URIs can be either relative or absolute. Absolute URIs 92 * always start with a `<strong>/</strong>', while relative URIs don't 93 * specify a scheme. Opaque URIs are always absolute. 94 * </p> 95 * <p> 96 * Each part of the URI may have one of three states: undefined, empty 97 * or containing some content. The former two of these are represented 98 * by <code>null</code> and the empty string in Java, respectively. 99 * The scheme-specific part may never be undefined. It also follows from 100 * this that the path sub-part may also not be undefined, so as to ensure 101 * the former. 102 * </p> 103 * <h2>Character Escaping and Quoting</h2> 104 * <p> 105 * The characters that can be used within a valid URI are restricted. 106 * There are two main classes of characters which can't be used as is 107 * within the URI: 108 * </p> 109 * <ol> 110 * <li><strong>Characters outside the US-ASCII character set</strong>. 111 * These have to be <strong>escaped</strong> in order to create 112 * an RFC-compliant URI; this means replacing the character with the 113 * appropriate hexadecimal value, preceded by a `%'.</li> 114 * <li><strong>Illegal characters</strong> (e.g. space characters, 115 * control characters) are quoted, which results in them being encoded 116 * in the same way as non-US-ASCII characters.</li> 117 * </ol> 118 * <p> 119 * The set of valid characters differs depending on the section of the URI: 120 * </p> 121 * <ul> 122 * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li> 123 * <li><strong>Authority</strong>:Composed of the username, host, port, `@' 124 * and `:'.</li> 125 * <li><strong>Username</strong>: Allows unreserved or percent-encoded 126 * characters, sub-delimiters and `:'.</li> 127 * <li><strong>Host</strong>: Allows unreserved or percent-encoded 128 * characters, sub-delimiters and square brackets (`[' and `]') for IPv6 129 * addresses.</li> 130 * <li><strong>Port</strong>: Digits only.</li> 131 * <li><strong>Path</strong>: Allows the path characters and `/'. 132 * <li><strong>Query</strong>: Allows the path characters, `?' and '/'. 133 * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'. 134 * </ul> 135 * <p> 136 * These definitions reference the following sets of characters: 137 * </p> 138 * <ul> 139 * <li><strong>Unreserved characters</strong>: The alphanumerics plus 140 * `-', `.', `_', and `~'.</li> 141 * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*', 142 * `+', `,', `;', `=' and the single-quote itself.</li> 143 * <li><strong>Path characters</strong>: Unreserved and percent-encoded 144 * characters and the sub-delimiters along with `@' and `:'.</li> 145 * </ul> 146 * <p> 147 * The constructors and accessor methods allow the use and retrieval of 148 * URI components which contain non-US-ASCII characters directly. 149 * They are only escaped when the <code>toASCIIString()</code> method 150 * is used. In contrast, illegal characters are always quoted, with the 151 * exception of the return values of the non-raw accessors. 152 * </p> 153 * 154 * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp) 155 * @author Dalibor Topic (robilad@kaffe.org) 156 * @author Michael Koch (konqueror@gmx.de) 157 * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 158 * @since 1.4 159 */ 160 public final class URI 161 implements Comparable<URI>, Serializable 162 { 163 /** 164 * For serialization compatability. 165 */ 166 static final long serialVersionUID = -6052424284110960213L; 167 168 /** 169 * Regular expression for parsing URIs. 170 * 171 * Taken from RFC 2396, Appendix B. 172 * This expression doesn't parse IPv6 addresses. 173 */ 174 private static final String URI_REGEXP = 175 "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?"; 176 177 /** 178 * Regular expression for parsing the authority segment. 179 */ 180 private static final String AUTHORITY_REGEXP = 181 "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?"; 182 183 /** 184 * Valid characters (taken from rfc2396/3986) 185 */ 186 private static final String RFC2396_DIGIT = "0123456789"; 187 private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz"; 188 private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; 189 private static final String RFC2396_ALPHA = 190 RFC2396_LOWALPHA + RFC2396_UPALPHA; 191 private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA; 192 private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~"; 193 private static final String RFC3986_SUBDELIMS = "!$&'()*+,;="; 194 private static final String RFC3986_REG_NAME = 195 RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%"; 196 private static final String RFC3986_PCHAR = RFC3986_UNRESERVED + 197 RFC3986_SUBDELIMS + ":@%"; 198 private static final String RFC3986_SEGMENT = RFC3986_PCHAR; 199 private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/"; 200 private static final String RFC3986_SSP = RFC3986_PCHAR + "?/"; 201 private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]"; 202 private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":"; 203 204 /** 205 * Index of scheme component in parsed URI. 206 */ 207 private static final int SCHEME_GROUP = 2; 208 209 /** 210 * Index of scheme-specific-part in parsed URI. 211 */ 212 private static final int SCHEME_SPEC_PART_GROUP = 3; 213 214 /** 215 * Index of authority component in parsed URI. 216 */ 217 private static final int AUTHORITY_GROUP = 5; 218 219 /** 220 * Index of path component in parsed URI. 221 */ 222 private static final int PATH_GROUP = 6; 223 224 /** 225 * Index of query component in parsed URI. 226 */ 227 private static final int QUERY_GROUP = 8; 228 229 /** 230 * Index of fragment component in parsed URI. 231 */ 232 private static final int FRAGMENT_GROUP = 10; 233 234 /** 235 * Index of userinfo component in parsed authority section. 236 */ 237 private static final int AUTHORITY_USERINFO_GROUP = 2; 238 239 /** 240 * Index of host component in parsed authority section. 241 */ 242 private static final int AUTHORITY_HOST_GROUP = 3; 243 244 /** 245 * Index of port component in parsed authority section. 246 */ 247 private static final int AUTHORITY_PORT_GROUP = 5; 248 249 /** 250 * The compiled version of the URI regular expression. 251 */ 252 private static final Pattern URI_PATTERN; 253 254 /** 255 * The compiled version of the authority regular expression. 256 */ 257 private static final Pattern AUTHORITY_PATTERN; 258 259 /** 260 * The set of valid hexadecimal characters. 261 */ 262 private static final String HEX = "0123456789ABCDEF"; 263 264 private transient String scheme; 265 private transient String rawSchemeSpecificPart; 266 private transient String schemeSpecificPart; 267 private transient String rawAuthority; 268 private transient String authority; 269 private transient String rawUserInfo; 270 private transient String userInfo; 271 private transient String rawHost; 272 private transient String host; 273 private transient int port = -1; 274 private transient String rawPath; 275 private transient String path; 276 private transient String rawQuery; 277 private transient String query; 278 private transient String rawFragment; 279 private transient String fragment; 280 private String string; 281 282 /** 283 * Static initializer to pre-compile the regular expressions. 284 */ 285 static 286 { 287 URI_PATTERN = Pattern.compile(URI_REGEXP); 288 AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP); 289 } 290 readObject(ObjectInputStream is)291 private void readObject(ObjectInputStream is) 292 throws ClassNotFoundException, IOException 293 { 294 this.string = (String) is.readObject(); 295 try 296 { 297 parseURI(this.string); 298 } 299 catch (URISyntaxException x) 300 { 301 // Should not happen. 302 throw new RuntimeException(x); 303 } 304 } 305 writeObject(ObjectOutputStream os)306 private void writeObject(ObjectOutputStream os) throws IOException 307 { 308 if (string == null) 309 string = toString(); 310 os.writeObject(string); 311 } 312 313 /** 314 * <p> 315 * Returns the string content of the specified group of the supplied 316 * matcher. The returned value is modified according to the following: 317 * </p> 318 * <ul> 319 * <li>If the resulting string has a length greater than 0, then 320 * that string is returned.</li> 321 * <li>If a string of zero length, is matched, then the content 322 * of the preceding group is considered. If this is also an empty 323 * string, then <code>null</code> is returned to indicate an undefined 324 * value. Otherwise, the value is truly the empty string and this is 325 * the returned value.</li> 326 * </ul> 327 * <p> 328 * This method is used for matching against all parts of the URI 329 * that may be either undefined or empty (i.e. all those but the 330 * scheme-specific part and the path). In each case, the preceding 331 * group is the content of the original group, along with some 332 * additional distinguishing feature. For example, the preceding 333 * group for the query includes the preceding question mark, 334 * while that of the fragment includes the hash symbol. The presence 335 * of these features enables disambiguation between the two cases 336 * of a completely unspecified value and a simple non-existant value. 337 * The scheme differs in that it will never return an empty string; 338 * the delimiter follows the scheme rather than preceding it, so 339 * it becomes part of the following section. The same is true 340 * of the user information. 341 * </p> 342 * 343 * @param match the matcher, which contains the results of the URI 344 * matched against the URI regular expression. 345 * @return either the matched content, <code>null</code> for undefined 346 * values, or an empty string for a URI part with empty content. 347 */ getURIGroup(Matcher match, int group)348 private static String getURIGroup(Matcher match, int group) 349 { 350 String matched = match.group(group); 351 if (matched == null || matched.length() == 0) 352 { 353 String prevMatched = match.group(group -1); 354 if (prevMatched == null || prevMatched.length() == 0) 355 return null; 356 else 357 return ""; 358 } 359 return matched; 360 } 361 362 /** 363 * Sets fields of this URI by parsing the given string. 364 * 365 * @param str The string to parse 366 * 367 * @exception URISyntaxException If the given string violates RFC 2396 368 */ parseURI(String str)369 private void parseURI(String str) throws URISyntaxException 370 { 371 Matcher matcher = URI_PATTERN.matcher(str); 372 373 if (matcher.matches()) 374 { 375 scheme = getURIGroup(matcher, SCHEME_GROUP); 376 rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP); 377 schemeSpecificPart = unquote(rawSchemeSpecificPart); 378 if (!isOpaque()) 379 { 380 rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP); 381 rawPath = matcher.group(PATH_GROUP); 382 rawQuery = getURIGroup(matcher, QUERY_GROUP); 383 } 384 rawFragment = getURIGroup(matcher, FRAGMENT_GROUP); 385 } 386 else 387 throw new URISyntaxException(str, 388 "doesn't match URI regular expression"); 389 parseServerAuthority(); 390 391 // We must eagerly unquote the parts, because this is the only time 392 // we may throw an exception. 393 authority = unquote(rawAuthority); 394 userInfo = unquote(rawUserInfo); 395 host = unquote(rawHost); 396 path = unquote(rawPath); 397 query = unquote(rawQuery); 398 fragment = unquote(rawFragment); 399 } 400 401 /** 402 * Unquote "%" + hex quotes characters 403 * 404 * @param str The string to unquote or null. 405 * 406 * @return The unquoted string or null if str was null. 407 * 408 * @exception URISyntaxException If the given string contains invalid 409 * escape sequences. 410 */ unquote(String str)411 private static String unquote(String str) throws URISyntaxException 412 { 413 if (str == null) 414 return null; 415 byte[] buf = new byte[str.length()]; 416 int pos = 0; 417 for (int i = 0; i < str.length(); i++) 418 { 419 char c = str.charAt(i); 420 if (c == '%') 421 { 422 if (i + 2 >= str.length()) 423 throw new URISyntaxException(str, "Invalid quoted character"); 424 int hi = Character.digit(str.charAt(++i), 16); 425 int lo = Character.digit(str.charAt(++i), 16); 426 if (lo < 0 || hi < 0) 427 throw new URISyntaxException(str, "Invalid quoted character"); 428 buf[pos++] = (byte) (hi * 16 + lo); 429 } 430 else 431 buf[pos++] = (byte) c; 432 } 433 try 434 { 435 return new String(buf, 0, pos, "utf-8"); 436 } 437 catch (java.io.UnsupportedEncodingException x2) 438 { 439 throw (Error) new InternalError().initCause(x2); 440 } 441 } 442 443 /** 444 * Quote characters illegal in URIs in given string. 445 * 446 * Replace illegal characters by encoding their UTF-8 447 * representation as "%" + hex code for each resulting 448 * UTF-8 character. 449 * 450 * @param str The string to quote 451 * 452 * @return The quoted string. 453 */ quote(String str)454 private static String quote(String str) 455 { 456 return quote(str, RFC3986_SSP); 457 } 458 459 /** 460 * Quote characters illegal in URI authorities in given string. 461 * 462 * Replace illegal characters by encoding their UTF-8 463 * representation as "%" + hex code for each resulting 464 * UTF-8 character. 465 * 466 * @param str The string to quote 467 * 468 * @return The quoted string. 469 */ quoteAuthority(String str)470 private static String quoteAuthority(String str) 471 { 472 // Technically, we should be using RFC2396_AUTHORITY, but 473 // it contains no additional characters. 474 return quote(str, RFC3986_REG_NAME); 475 } 476 477 /** 478 * Quotes the characters in the supplied string that are not part of 479 * the specified set of legal characters. 480 * 481 * @param str the string to quote 482 * @param legalCharacters the set of legal characters 483 * 484 * @return the quoted string. 485 */ quote(String str, String legalCharacters)486 private static String quote(String str, String legalCharacters) 487 { 488 CPStringBuilder sb = new CPStringBuilder(str.length()); 489 for (int i = 0; i < str.length(); i++) 490 { 491 char c = str.charAt(i); 492 if ((legalCharacters.indexOf(c) == -1) 493 && (c <= 127)) 494 { 495 sb.append('%'); 496 sb.append(HEX.charAt(c / 16)); 497 sb.append(HEX.charAt(c % 16)); 498 } 499 else 500 sb.append(c); 501 } 502 return sb.toString(); 503 } 504 505 /** 506 * Quote characters illegal in URI hosts in given string. 507 * 508 * Replace illegal characters by encoding their UTF-8 509 * representation as "%" + hex code for each resulting 510 * UTF-8 character. 511 * 512 * @param str The string to quote 513 * 514 * @return The quoted string. 515 */ quoteHost(String str)516 private static String quoteHost(String str) 517 { 518 return quote(str, RFC3986_HOST); 519 } 520 521 /** 522 * Quote characters illegal in URI paths in given string. 523 * 524 * Replace illegal characters by encoding their UTF-8 525 * representation as "%" + hex code for each resulting 526 * UTF-8 character. 527 * 528 * @param str The string to quote 529 * 530 * @return The quoted string. 531 */ quotePath(String str)532 private static String quotePath(String str) 533 { 534 // Technically, we should be using RFC2396_PATH, but 535 // it contains no additional characters. 536 return quote(str, RFC3986_PATH_SEGMENTS); 537 } 538 539 /** 540 * Quote characters illegal in URI user infos in given string. 541 * 542 * Replace illegal characters by encoding their UTF-8 543 * representation as "%" + hex code for each resulting 544 * UTF-8 character. 545 * 546 * @param str The string to quote 547 * 548 * @return The quoted string. 549 */ quoteUserInfo(String str)550 private static String quoteUserInfo(String str) 551 { 552 return quote(str, RFC3986_USERINFO); 553 } 554 555 /** 556 * Creates an URI from the given string 557 * 558 * @param str The string to create the URI from 559 * 560 * @exception URISyntaxException If the given string violates RFC 2396 561 * @exception NullPointerException If str is null 562 */ URI(String str)563 public URI(String str) throws URISyntaxException 564 { 565 this.string = str; 566 parseURI(str); 567 } 568 569 /** 570 * Create an URI from the given components 571 * 572 * @param scheme The scheme name 573 * @param userInfo The username and authorization info 574 * @param host The hostname 575 * @param port The port number 576 * @param path The path 577 * @param query The query 578 * @param fragment The fragment 579 * 580 * @exception URISyntaxException If the given string violates RFC 2396 581 */ URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)582 public URI(String scheme, String userInfo, String host, int port, 583 String path, String query, String fragment) 584 throws URISyntaxException 585 { 586 this((scheme == null ? "" : scheme + ":") 587 + (userInfo == null && host == null && port == -1 ? "" : "//") 588 + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@") 589 + (host == null ? "" : quoteHost(host)) 590 + (port == -1 ? "" : ":" + String.valueOf(port)) 591 + (path == null ? "" : quotePath(path)) 592 + (query == null ? "" : "?" + quote(query)) 593 + (fragment == null ? "" : "#" + quote(fragment))); 594 } 595 596 /** 597 * Create an URI from the given components 598 * 599 * @param scheme The scheme name 600 * @param authority The authority 601 * @param path The apth 602 * @param query The query 603 * @param fragment The fragment 604 * 605 * @exception URISyntaxException If the given string violates RFC 2396 606 */ URI(String scheme, String authority, String path, String query, String fragment)607 public URI(String scheme, String authority, String path, String query, 608 String fragment) throws URISyntaxException 609 { 610 this((scheme == null ? "" : scheme + ":") 611 + (authority == null ? "" : "//" + quoteAuthority(authority)) 612 + (path == null ? "" : quotePath(path)) 613 + (query == null ? "" : "?" + quote(query)) 614 + (fragment == null ? "" : "#" + quote(fragment))); 615 } 616 617 /** 618 * Create an URI from the given components 619 * 620 * @param scheme The scheme name 621 * @param host The hostname 622 * @param path The path 623 * @param fragment The fragment 624 * 625 * @exception URISyntaxException If the given string violates RFC 2396 626 */ URI(String scheme, String host, String path, String fragment)627 public URI(String scheme, String host, String path, String fragment) 628 throws URISyntaxException 629 { 630 this(scheme, null, host, -1, path, null, fragment); 631 } 632 633 /** 634 * Create an URI from the given components 635 * 636 * @param scheme The scheme name 637 * @param ssp The scheme specific part 638 * @param fragment The fragment 639 * 640 * @exception URISyntaxException If the given string violates RFC 2396 641 */ URI(String scheme, String ssp, String fragment)642 public URI(String scheme, String ssp, String fragment) 643 throws URISyntaxException 644 { 645 this((scheme == null ? "" : scheme + ":") 646 + (ssp == null ? "" : quote(ssp)) 647 + (fragment == null ? "" : "#" + quote(fragment))); 648 } 649 650 /** 651 * Create an URI from the given string 652 * 653 * @param str The string to create the URI from 654 * 655 * @exception IllegalArgumentException If the given string violates RFC 2396 656 * @exception NullPointerException If str is null 657 */ create(String str)658 public static URI create(String str) 659 { 660 try 661 { 662 return new URI(str); 663 } 664 catch (URISyntaxException e) 665 { 666 throw (IllegalArgumentException) new IllegalArgumentException() 667 .initCause(e); 668 } 669 } 670 671 /** 672 * Attempts to parse this URI's authority component, if defined, 673 * into user-information, host, and port components. The purpose 674 * of this method was to disambiguate between some authority sections, 675 * which form invalid server-based authories, but valid registry 676 * based authorities. In the updated RFC 3986, the authority section 677 * is defined differently, with registry-based authorities part of 678 * the host section. Thus, this method is now simply an explicit 679 * way of parsing any authority section. 680 * 681 * @return the URI, with the authority section parsed into user 682 * information, host and port components. 683 * @throws URISyntaxException if the given string violates RFC 2396 684 */ parseServerAuthority()685 public URI parseServerAuthority() throws URISyntaxException 686 { 687 if (rawAuthority != null) 688 { 689 Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority); 690 691 if (matcher.matches()) 692 { 693 rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP); 694 rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP); 695 696 String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP); 697 698 if (portStr != null && ! portStr.isEmpty()) 699 try 700 { 701 port = Integer.parseInt(portStr); 702 } 703 catch (NumberFormatException e) 704 { 705 URISyntaxException use = 706 new URISyntaxException 707 (string, "doesn't match URI regular expression"); 708 use.initCause(e); 709 throw use; 710 } 711 } 712 else 713 throw new URISyntaxException(string, 714 "doesn't match URI regular expression"); 715 } 716 return this; 717 } 718 719 /** 720 * <p> 721 * Returns a normalized version of the URI. If the URI is opaque, 722 * or its path is already in normal form, then this URI is simply 723 * returned. Otherwise, the following transformation of the path 724 * element takes place: 725 * </p> 726 * <ol> 727 * <li>All `.' segments are removed.</li> 728 * <li>Each `..' segment which can be paired with a prior non-`..' segment 729 * is removed along with the preceding segment.</li> 730 * <li>A `.' segment is added to the front if the first segment contains 731 * a colon (`:'). This is a deviation from the RFC, which prevents 732 * confusion between the path and the scheme.</li> 733 * </ol> 734 * <p> 735 * The resulting URI will be free of `.' and `..' segments, barring those 736 * that were prepended or which couldn't be paired, respectively. 737 * </p> 738 * 739 * @return the normalized URI. 740 */ normalize()741 public URI normalize() 742 { 743 if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1) 744 return this; 745 try 746 { 747 return new URI(scheme, authority, normalizePath(path), query, 748 fragment); 749 } 750 catch (URISyntaxException e) 751 { 752 throw (Error) new InternalError("Normalized URI variant could not "+ 753 "be constructed").initCause(e); 754 } 755 } 756 757 /** 758 * <p> 759 * Normalize the given path. The following transformation takes place: 760 * </p> 761 * <ol> 762 * <li>All `.' segments are removed.</li> 763 * <li>Each `..' segment which can be paired with a prior non-`..' segment 764 * is removed along with the preceding segment.</li> 765 * <li>A `.' segment is added to the front if the first segment contains 766 * a colon (`:'). This is a deviation from the RFC, which prevents 767 * confusion between the path and the scheme.</li> 768 * </ol> 769 * <p> 770 * The resulting URI will be free of `.' and `..' segments, barring those 771 * that were prepended or which couldn't be paired, respectively. 772 * </p> 773 * 774 * @param relativePath the relative path to be normalized. 775 * @return the normalized path. 776 */ normalizePath(String relativePath)777 private String normalizePath(String relativePath) 778 { 779 /* 780 This follows the algorithm in section 5.2.4. of RFC3986, 781 but doesn't modify the input buffer. 782 */ 783 CPStringBuilder input = new CPStringBuilder(relativePath); 784 CPStringBuilder output = new CPStringBuilder(); 785 int start = 0; 786 while (start < input.length()) 787 { 788 /* A */ 789 if (input.indexOf("../",start) == start) 790 { 791 start += 3; 792 continue; 793 } 794 if (input.indexOf("./",start) == start) 795 { 796 start += 2; 797 continue; 798 } 799 /* B */ 800 if (input.indexOf("/./",start) == start) 801 { 802 start += 2; 803 continue; 804 } 805 if (input.indexOf("/.",start) == start 806 && input.charAt(start + 2) != '.') 807 { 808 start += 1; 809 input.setCharAt(start,'/'); 810 continue; 811 } 812 /* C */ 813 if (input.indexOf("/../",start) == start) 814 { 815 start += 3; 816 removeLastSegment(output); 817 continue; 818 } 819 if (input.indexOf("/..",start) == start) 820 { 821 start += 2; 822 input.setCharAt(start,'/'); 823 removeLastSegment(output); 824 continue; 825 } 826 /* D */ 827 if (start == input.length() - 1 && input.indexOf(".",start) == start) 828 { 829 input.delete(0,1); 830 continue; 831 } 832 if (start == input.length() - 2 && input.indexOf("..",start) == start) 833 { 834 input.delete(0,2); 835 continue; 836 } 837 /* E */ 838 int indexOfSlash = input.indexOf("/",start); 839 while (indexOfSlash == start) 840 { 841 output.append("/"); 842 ++start; 843 indexOfSlash = input.indexOf("/",start); 844 } 845 if (indexOfSlash == -1) 846 indexOfSlash = input.length(); 847 output.append(input.substring(start, indexOfSlash)); 848 start = indexOfSlash; 849 } 850 return output.toString(); 851 } 852 853 /** 854 * Removes the last segment of the path from the specified buffer. 855 * 856 * @param buffer the buffer containing the path. 857 */ removeLastSegment(CPStringBuilder buffer)858 private void removeLastSegment(CPStringBuilder buffer) 859 { 860 int lastSlash = buffer.lastIndexOf("/"); 861 if (lastSlash == -1) 862 buffer.setLength(0); 863 else 864 buffer.setLength(lastSlash); 865 } 866 867 /** 868 * Resolves the given URI against this URI 869 * 870 * @param uri The URI to resolve against this URI 871 * 872 * @return The resulting URI, or null when it couldn't be resolved 873 * for some reason. 874 * 875 * @throws NullPointerException if uri is null 876 */ resolve(URI uri)877 public URI resolve(URI uri) 878 { 879 if (uri.isAbsolute()) 880 return uri; 881 if (uri.isOpaque()) 882 return uri; 883 884 String scheme = uri.getScheme(); 885 String schemeSpecificPart = uri.getSchemeSpecificPart(); 886 String authority = uri.getAuthority(); 887 String path = uri.getPath(); 888 String query = uri.getQuery(); 889 String fragment = uri.getFragment(); 890 891 try 892 { 893 if (fragment != null && path != null && path.equals("") 894 && scheme == null && authority == null && query == null) 895 return new URI(this.scheme, this.schemeSpecificPart, fragment); 896 897 if (authority == null) 898 { 899 authority = this.authority; 900 if (path == null) 901 path = ""; 902 if (! (path.startsWith("/"))) 903 { 904 CPStringBuilder basepath = new CPStringBuilder(this.path); 905 int i = this.path.lastIndexOf('/'); 906 907 if (i >= 0) 908 basepath.delete(i + 1, basepath.length()); 909 910 basepath.append(path); 911 path = normalizePath(basepath.toString()); 912 } 913 } 914 return new URI(this.scheme, authority, path, query, fragment); 915 } 916 catch (URISyntaxException e) 917 { 918 throw (Error) new InternalError("Resolved URI variant could not "+ 919 "be constructed").initCause(e); 920 } 921 } 922 923 /** 924 * Resolves the given URI string against this URI 925 * 926 * @param str The URI as string to resolve against this URI 927 * 928 * @return The resulting URI 929 * 930 * @throws IllegalArgumentException If the given URI string 931 * violates RFC 2396 932 * @throws NullPointerException If uri is null 933 */ resolve(String str)934 public URI resolve(String str) throws IllegalArgumentException 935 { 936 return resolve(create(str)); 937 } 938 939 /** 940 * <p> 941 * Relativizes the given URI against this URI. The following 942 * algorithm is used: 943 * </p> 944 * <ul> 945 * <li>If either URI is opaque, the given URI is returned.</li> 946 * <li>If the schemes of the URIs differ, the given URI is returned.</li> 947 * <li>If the authority components of the URIs differ, then the given 948 * URI is returned.</li> 949 * <li>If the path of this URI is not a prefix of the supplied URI, 950 * then the given URI is returned.</li> 951 * <li>If all the above conditions hold, a new URI is created using the 952 * query and fragment components of the given URI, along with a path 953 * computed by removing the path of this URI from the start of the path 954 * of the supplied URI.</li> 955 * </ul> 956 * 957 * @param uri the URI to relativize agsint this URI 958 * @return the resulting URI 959 * @throws NullPointerException if the uri is null 960 */ relativize(URI uri)961 public URI relativize(URI uri) 962 { 963 if (isOpaque() || uri.isOpaque()) 964 return uri; 965 if (scheme == null && uri.getScheme() != null) 966 return uri; 967 if (scheme != null && !(scheme.equals(uri.getScheme()))) 968 return uri; 969 if (rawAuthority == null && uri.getRawAuthority() != null) 970 return uri; 971 if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority()))) 972 return uri; 973 String basePath = rawPath; 974 if (!(uri.getRawPath().equals(rawPath))) 975 { 976 if (!(basePath.endsWith("/"))) 977 basePath = basePath.concat("/"); 978 if (!(uri.getRawPath().startsWith(basePath))) 979 return uri; 980 } 981 try 982 { 983 return new URI(null, null, 984 uri.getRawPath().substring(basePath.length()), 985 uri.getRawQuery(), uri.getRawFragment()); 986 } 987 catch (URISyntaxException e) 988 { 989 throw (Error) new InternalError("Relativized URI variant could not "+ 990 "be constructed").initCause(e); 991 } 992 } 993 994 /** 995 * Creates an URL from an URI 996 * 997 * @throws MalformedURLException If a protocol handler for the URL could 998 * not be found, or if some other error occurred while constructing the URL 999 * @throws IllegalArgumentException If the URI is not absolute 1000 */ toURL()1001 public URL toURL() throws IllegalArgumentException, MalformedURLException 1002 { 1003 if (isAbsolute()) 1004 return new URL(this.toString()); 1005 1006 throw new IllegalArgumentException("not absolute"); 1007 } 1008 1009 /** 1010 * Returns the scheme of the URI 1011 */ getScheme()1012 public String getScheme() 1013 { 1014 return scheme; 1015 } 1016 1017 /** 1018 * Tells whether this URI is absolute or not 1019 */ isAbsolute()1020 public boolean isAbsolute() 1021 { 1022 return scheme != null; 1023 } 1024 1025 /** 1026 * Tell whether this URI is opaque or not 1027 */ isOpaque()1028 public boolean isOpaque() 1029 { 1030 return ((scheme != null) && ! (schemeSpecificPart.startsWith("/"))); 1031 } 1032 1033 /** 1034 * Returns the raw scheme specific part of this URI. 1035 * The scheme-specific part is never undefined, though it may be empty 1036 */ getRawSchemeSpecificPart()1037 public String getRawSchemeSpecificPart() 1038 { 1039 return rawSchemeSpecificPart; 1040 } 1041 1042 /** 1043 * Returns the decoded scheme specific part of this URI. 1044 */ getSchemeSpecificPart()1045 public String getSchemeSpecificPart() 1046 { 1047 return schemeSpecificPart; 1048 } 1049 1050 /** 1051 * Returns the raw authority part of this URI 1052 */ getRawAuthority()1053 public String getRawAuthority() 1054 { 1055 return rawAuthority; 1056 } 1057 1058 /** 1059 * Returns the decoded authority part of this URI 1060 */ getAuthority()1061 public String getAuthority() 1062 { 1063 return authority; 1064 } 1065 1066 /** 1067 * Returns the raw user info part of this URI 1068 */ getRawUserInfo()1069 public String getRawUserInfo() 1070 { 1071 return rawUserInfo; 1072 } 1073 1074 /** 1075 * Returns the decoded user info part of this URI 1076 */ getUserInfo()1077 public String getUserInfo() 1078 { 1079 return userInfo; 1080 } 1081 1082 /** 1083 * Returns the hostname of the URI 1084 */ getHost()1085 public String getHost() 1086 { 1087 return host; 1088 } 1089 1090 /** 1091 * Returns the port number of the URI 1092 */ getPort()1093 public int getPort() 1094 { 1095 return port; 1096 } 1097 1098 /** 1099 * Returns the raw path part of this URI 1100 */ getRawPath()1101 public String getRawPath() 1102 { 1103 return rawPath; 1104 } 1105 1106 /** 1107 * Returns the path of the URI 1108 */ getPath()1109 public String getPath() 1110 { 1111 return path; 1112 } 1113 1114 /** 1115 * Returns the raw query part of this URI 1116 */ getRawQuery()1117 public String getRawQuery() 1118 { 1119 return rawQuery; 1120 } 1121 1122 /** 1123 * Returns the query of the URI 1124 */ getQuery()1125 public String getQuery() 1126 { 1127 return query; 1128 } 1129 1130 /** 1131 * Return the raw fragment part of this URI 1132 */ getRawFragment()1133 public String getRawFragment() 1134 { 1135 return rawFragment; 1136 } 1137 1138 /** 1139 * Returns the fragment of the URI 1140 */ getFragment()1141 public String getFragment() 1142 { 1143 return fragment; 1144 } 1145 1146 /** 1147 * <p> 1148 * Compares the URI with the given object for equality. If the 1149 * object is not a <code>URI</code>, then the method returns false. 1150 * Otherwise, the following criteria are observed: 1151 * </p> 1152 * <ul> 1153 * <li>The scheme of the URIs must either be null (undefined) in both cases, 1154 * or equal, ignorant of case.</li> 1155 * <li>The raw fragment of the URIs must either be null (undefined) in both 1156 * cases, or equal, ignorant of case.</li> 1157 * <li>Both URIs must be of the same type (opaque or hierarchial)</li> 1158 * <li><strong>For opaque URIs:</strong></li> 1159 * <ul> 1160 * <li>The raw scheme-specific parts must be equal.</li> 1161 * </ul> 1162 * <li>For hierarchical URIs:</li> 1163 * <ul> 1164 * <li>The raw paths must be equal, ignorant of case.</li> 1165 * <li>The raw queries are either both undefined or both equal, ignorant 1166 * of case.</li> 1167 * <li>The raw authority sections are either both undefined or:</li> 1168 * <li><strong>For registry-based authorities:</strong></li> 1169 * <ul><li>they are equal.</li></ul> 1170 * <li><strong>For server-based authorities:</strong></li> 1171 * <ul> 1172 * <li>the hosts are equal, ignoring case</li> 1173 * <li>the ports are equal</li> 1174 * <li>the user information components are equal</li> 1175 * </ul> 1176 * </ul> 1177 * </ul> 1178 * 1179 * @param obj the obj to compare the URI with. 1180 * @return <code>true</code> if the objects are equal, according to 1181 * the specification above. 1182 */ equals(Object obj)1183 public boolean equals(Object obj) 1184 { 1185 if (!(obj instanceof URI)) 1186 return false; 1187 URI uriObj = (URI) obj; 1188 if (scheme == null) 1189 { 1190 if (uriObj.getScheme() != null) 1191 return false; 1192 } 1193 else 1194 if (!(scheme.equalsIgnoreCase(uriObj.getScheme()))) 1195 return false; 1196 if (rawFragment == null) 1197 { 1198 if (uriObj.getRawFragment() != null) 1199 return false; 1200 } 1201 else 1202 if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment()))) 1203 return false; 1204 boolean opaqueThis = isOpaque(); 1205 boolean opaqueObj = uriObj.isOpaque(); 1206 if (opaqueThis && opaqueObj) 1207 return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart()); 1208 else if (!opaqueThis && !opaqueObj) 1209 { 1210 boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath()) 1211 && ((rawQuery == null && uriObj.getRawQuery() == null) 1212 || rawQuery.equalsIgnoreCase(uriObj.getRawQuery())); 1213 if (rawAuthority == null && uriObj.getRawAuthority() == null) 1214 return common; 1215 if (host == null) 1216 return common 1217 && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority()); 1218 return common 1219 && host.equalsIgnoreCase(uriObj.getHost()) 1220 && port == uriObj.getPort() 1221 && (rawUserInfo == null ? 1222 uriObj.getRawUserInfo() == null : 1223 rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo())); 1224 } 1225 else 1226 return false; 1227 } 1228 1229 /** 1230 * Computes the hashcode of the URI 1231 */ hashCode()1232 public int hashCode() 1233 { 1234 return (getScheme() == null ? 0 : 13 * getScheme().hashCode()) 1235 + 17 * getRawSchemeSpecificPart().hashCode() 1236 + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode()); 1237 } 1238 1239 /** 1240 * Compare the URI with another URI. 1241 * Undefined components are taken to be less than any other component. 1242 * The following criteria are observed: 1243 * </p> 1244 * <ul> 1245 * <li>Two URIs with different schemes are compared according to their 1246 * scheme, regardless of case.</li> 1247 * <li>A hierarchical URI is less than an opaque URI with the same 1248 * scheme.</li> 1249 * <li><strong>For opaque URIs:</strong></li> 1250 * <ul> 1251 * <li>URIs with differing scheme-specific parts are ordered according 1252 * to the ordering of the scheme-specific part.</li> 1253 * <li>URIs with the same scheme-specific part are ordered by the 1254 * raw fragment.</li> 1255 * </ul> 1256 * <li>For hierarchical URIs:</li> 1257 * <ul> 1258 * <li>URIs are ordered according to their raw authority sections, 1259 * if they are unequal.</li> 1260 * <li><strong>For registry-based authorities:</strong></li> 1261 * <ul><li>they are ordered according to the ordering of the authority 1262 * component.</li></ul> 1263 * <li><strong>For server-based authorities:</strong></li> 1264 * <ul> 1265 * <li>URIs are ordered according to the raw user information.</li> 1266 * <li>URIs with the same user information are ordered by the host, 1267 * ignoring case.</li> 1268 * <lI>URIs with the same host are ordered by the port.</li> 1269 * </ul> 1270 * <li>URIs with the same authority section are ordered by the raw path.</li> 1271 * <li>URIs with the same path are ordered by their raw query.</li> 1272 * <li>URIs with the same query are ordered by their raw fragments.</li> 1273 * </ul> 1274 * </ul> 1275 * 1276 * @param uri The other URI to compare this URI with 1277 * @return a negative integer, zero or a positive integer depending 1278 * on whether this URI is less than, equal to or greater 1279 * than that supplied, respectively. 1280 */ compareTo(URI uri)1281 public int compareTo(URI uri) 1282 throws ClassCastException 1283 { 1284 if (scheme == null && uri.getScheme() != null) 1285 return -1; 1286 if (scheme != null) 1287 { 1288 int sCompare = scheme.compareToIgnoreCase(uri.getScheme()); 1289 if (sCompare != 0) 1290 return sCompare; 1291 } 1292 boolean opaqueThis = isOpaque(); 1293 boolean opaqueObj = uri.isOpaque(); 1294 if (opaqueThis && !opaqueObj) 1295 return 1; 1296 if (!opaqueThis && opaqueObj) 1297 return -1; 1298 if (opaqueThis) 1299 { 1300 int ssCompare = 1301 rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart()); 1302 if (ssCompare == 0) 1303 return compareFragments(uri); 1304 else 1305 return ssCompare; 1306 } 1307 if (rawAuthority == null && uri.getRawAuthority() != null) 1308 return -1; 1309 if (rawAuthority != null) 1310 { 1311 int aCompare = rawAuthority.compareTo(uri.getRawAuthority()); 1312 if (aCompare != 0) 1313 { 1314 if (host == null) 1315 return aCompare; 1316 if (rawUserInfo == null && uri.getRawUserInfo() != null) 1317 return -1; 1318 int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo()); 1319 if (uCompare != 0) 1320 return uCompare; 1321 if (host == null && uri.getHost() != null) 1322 return -1; 1323 int hCompare = host.compareTo(uri.getHost()); 1324 if (hCompare != 0) 1325 return hCompare; 1326 int uriPort = uri.getPort(); 1327 return (uriPort == port) ? 0 : (uriPort > port) ? -1 : 1; 1328 } 1329 } 1330 if (rawPath == null && uri.getRawPath() != null) 1331 return -1; 1332 if (rawPath != null) 1333 { 1334 int pCompare = rawPath.compareTo(uri.getRawPath()); 1335 if (pCompare != 0) 1336 return pCompare; 1337 } 1338 if (rawQuery == null && uri.getRawQuery() != null) 1339 return -1; 1340 if (rawQuery != null) 1341 { 1342 int qCompare = rawQuery.compareTo(uri.getRawQuery()); 1343 if (qCompare != 0) 1344 return qCompare; 1345 } 1346 return compareFragments(uri); 1347 } 1348 1349 /** 1350 * Compares the fragment of this URI with that of the supplied URI. 1351 * 1352 * @param uri the URI to compare with this one. 1353 * @return a negative integer, zero or a positive integer depending 1354 * on whether this uri's fragment is less than, equal to 1355 * or greater than the fragment of the uri supplied, respectively. 1356 */ compareFragments(URI uri)1357 private int compareFragments(URI uri) 1358 { 1359 if (rawFragment == null && uri.getRawFragment() != null) 1360 return -1; 1361 else if (rawFragment == null) 1362 return 0; 1363 else 1364 return rawFragment.compareTo(uri.getRawFragment()); 1365 } 1366 1367 /** 1368 * Returns the URI as a String. If the URI was created using a constructor, 1369 * then this will be the same as the original input string. 1370 * 1371 * @return a string representation of the URI. 1372 */ toString()1373 public String toString() 1374 { 1375 return (scheme == null ? "" : scheme + ":") 1376 + rawSchemeSpecificPart 1377 + (rawFragment == null ? "" : "#" + rawFragment); 1378 } 1379 1380 /** 1381 * Returns the URI as US-ASCII string. This is the same as the result 1382 * from <code>toString()</code> for URIs that don't contain any non-US-ASCII 1383 * characters. Otherwise, the non-US-ASCII characters are replaced 1384 * by their percent-encoded representations. 1385 * 1386 * @return a string representation of the URI, containing only US-ASCII 1387 * characters. 1388 */ toASCIIString()1389 public String toASCIIString() 1390 { 1391 String strRep = toString(); 1392 boolean inNonAsciiBlock = false; 1393 CPStringBuilder buffer = new CPStringBuilder(); 1394 CPStringBuilder encBuffer = null; 1395 for (int i = 0; i < strRep.length(); i++) 1396 { 1397 char c = strRep.charAt(i); 1398 if (c <= 127) 1399 { 1400 if (inNonAsciiBlock) 1401 { 1402 buffer.append(escapeCharacters(encBuffer.toString())); 1403 inNonAsciiBlock = false; 1404 } 1405 buffer.append(c); 1406 } 1407 else 1408 { 1409 if (!inNonAsciiBlock) 1410 { 1411 encBuffer = new CPStringBuilder(); 1412 inNonAsciiBlock = true; 1413 } 1414 encBuffer.append(c); 1415 } 1416 } 1417 return buffer.toString(); 1418 } 1419 1420 /** 1421 * Converts the non-ASCII characters in the supplied string 1422 * to their equivalent percent-encoded representations. 1423 * That is, they are replaced by "%" followed by their hexadecimal value. 1424 * 1425 * @param str a string including non-ASCII characters. 1426 * @return the string with the non-ASCII characters converted to their 1427 * percent-encoded representations. 1428 */ escapeCharacters(String str)1429 private static String escapeCharacters(String str) 1430 { 1431 try 1432 { 1433 CPStringBuilder sb = new CPStringBuilder(); 1434 // this is far from optimal, but it works 1435 byte[] utf8 = str.getBytes("utf-8"); 1436 for (int j = 0; j < utf8.length; j++) 1437 { 1438 sb.append('%'); 1439 sb.append(HEX.charAt((utf8[j] & 0xff) / 16)); 1440 sb.append(HEX.charAt((utf8[j] & 0xff) % 16)); 1441 } 1442 return sb.toString(); 1443 } 1444 catch (java.io.UnsupportedEncodingException x) 1445 { 1446 throw (Error) new InternalError("Escaping error").initCause(x); 1447 } 1448 } 1449 1450 } 1451