1 /* URI.java -- An URI class
2    Copyright (C) 2002, 2004, 2005, 2006, 2008  Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package java.net;
40 
41 import gnu.java.lang.CPStringBuilder;
42 
43 import java.io.IOException;
44 import java.io.ObjectInputStream;
45 import java.io.ObjectOutputStream;
46 import java.io.Serializable;
47 import java.util.regex.Matcher;
48 import java.util.regex.Pattern;
49 
50 /**
51  * <p>
52  * A URI instance represents that defined by
53  * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>,
54  * with some deviations.
55  * </p>
56  * <p>
57  * At its highest level, a URI consists of:
58  * </p>
 * <p>
 * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em>
60  * [<strong>#</strong><em>fragment</em>]</code>
61  * </p>
62  * <p>
63  * where <strong>#</strong> and <strong>:</strong> are literal characters,
64  * and those parts enclosed in square brackets are optional.
65  * </p>
66  * <p>
67  * There are two main types of URI.  An <em>opaque</em> URI is one
68  * which just consists of the above three parts, and is not further
69  * defined.  An example of such a URI would be <em>mailto:</em> URI.
70  * In contrast, <em>hierarchical</em> URIs give further definition
 * to the scheme-specific part, so as to represent some part of a hierarchical
72  * structure.
73  * </p>
74  * <p>
75  * <code>[<strong>//</strong><em>authority</em>][<em>path</em>]
76  * [<strong>?</strong><em>query</em>]</code>
77  * </p>
78  * <p>
79  * with <strong>/</strong> and <strong>?</strong> being literal characters.
80  * When server-based, the authority section is further subdivided into:
81  * </p>
82  * <p>
83  * <code>[<em>user-info</em><strong>@</strong>]<em>host</em>
84  * [<strong>:</strong><em>port</em>]</code>
85  * </p>
86  * <p>
87  * with <strong>@</strong> and <strong>:</strong> as literal characters.
88  * Authority sections that are not server-based are said to be registry-based.
89  * </p>
90  * <p>
 * Hierarchical URIs can be either relative or absolute.  The scheme-specific
 * part of an absolute hierarchical URI always starts with a
 * `<strong>/</strong>', while relative URIs don't specify a scheme.
 * Opaque URIs are always absolute.
94  * </p>
95  * <p>
96  * Each part of the URI may have one of three states: undefined, empty
97  * or containing some content.  The former two of these are represented
98  * by <code>null</code> and the empty string in Java, respectively.
99  * The scheme-specific part may never be undefined.  It also follows from
100  * this that the path sub-part may also not be undefined, so as to ensure
101  * the former.
102  * </p>
103  * <h2>Character Escaping and Quoting</h2>
104  * <p>
105  * The characters that can be used within a valid URI are restricted.
106  * There are two main classes of characters which can't be used as is
107  * within the URI:
108  * </p>
109  * <ol>
110  * <li><strong>Characters outside the US-ASCII character set</strong>.
111  * These have to be <strong>escaped</strong> in order to create
112  * an RFC-compliant URI; this means replacing the character with the
113  * appropriate hexadecimal value, preceded by a `%'.</li>
114  * <li><strong>Illegal characters</strong> (e.g. space characters,
115  * control characters) are quoted, which results in them being encoded
116  * in the same way as non-US-ASCII characters.</li>
117  * </ol>
118  * <p>
119  * The set of valid characters differs depending on the section of the URI:
120  * </p>
121  * <ul>
122  * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li>
123  * <li><strong>Authority</strong>:Composed of the username, host, port, `@'
124  * and `:'.</li>
125  * <li><strong>Username</strong>: Allows unreserved or percent-encoded
126  * characters, sub-delimiters and `:'.</li>
127  * <li><strong>Host</strong>: Allows unreserved or percent-encoded
128  * characters, sub-delimiters and square brackets (`[' and `]') for IPv6
129  * addresses.</li>
130  * <li><strong>Port</strong>: Digits only.</li>
 * <li><strong>Path</strong>: Allows the path characters and `/'.</li>
 * <li><strong>Query</strong>: Allows the path characters, `?' and '/'.</li>
 * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'.</li>
134  * </ul>
135  * <p>
136  * These definitions reference the following sets of characters:
137  * </p>
138  * <ul>
139  * <li><strong>Unreserved characters</strong>: The alphanumerics plus
140  * `-', `.', `_', and `~'.</li>
141  * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*',
142  * `+', `,', `;', `=' and the single-quote itself.</li>
143  * <li><strong>Path characters</strong>: Unreserved and percent-encoded
144  * characters and the sub-delimiters along with `@' and `:'.</li>
145  * </ul>
146  * <p>
147  * The constructors and accessor methods allow the use and retrieval of
148  * URI components which contain non-US-ASCII characters directly.
149  * They are only escaped when the <code>toASCIIString()</code> method
150  * is used.  In contrast, illegal characters are always quoted, with the
151  * exception of the return values of the non-raw accessors.
152  * </p>
153  *
154  * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp)
155  * @author Dalibor Topic (robilad@kaffe.org)
156  * @author Michael Koch (konqueror@gmx.de)
157  * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
158  * @since 1.4
159  */
160 public final class URI
161   implements Comparable<URI>, Serializable
162 {
163   /**
   * For serialization compatibility.
165    */
166   static final long serialVersionUID = -6052424284110960213L;
167 
168   /**
169    * Regular expression for parsing URIs.
170    *
171    * Taken from RFC 2396, Appendix B.
172    * This expression doesn't parse IPv6 addresses.
173    */
174   private static final String URI_REGEXP =
175     "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?";
176 
177   /**
178    * Regular expression for parsing the authority segment.
179    */
180   private static final String AUTHORITY_REGEXP =
181     "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?";
182 
183   /**
184    * Valid characters (taken from rfc2396/3986)
185    */
  // Building blocks: decimal digits and the two ASCII letter ranges.
  private static final String RFC2396_DIGIT = "0123456789";
  private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz";
  private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  private static final String RFC2396_ALPHA =
    RFC2396_LOWALPHA + RFC2396_UPALPHA;
  private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA;
  // Unreserved characters: alphanumerics plus '-', '.', '_' and '~'.
  private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~";
  // Sub-delimiters.
  private static final String RFC3986_SUBDELIMS = "!$&'()*+,;=";
  // Registered-name characters.  '%' is part of the set, so the quote
  // methods leave a '%' found in their input untouched.
  private static final String RFC3986_REG_NAME =
    RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%";
  // Path characters: as above, with ':' and '@' added.
  private static final String RFC3986_PCHAR = RFC3986_UNRESERVED +
    RFC3986_SUBDELIMS + ":@%";
  // A single path segment may hold any path character.
  private static final String RFC3986_SEGMENT = RFC3986_PCHAR;
  // A whole path additionally holds the segment separator '/'.
  private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/";
  // Path characters plus '?' and '/'; the set the one-argument quote()
  // applies to queries, fragments and scheme-specific parts.
  private static final String RFC3986_SSP = RFC3986_PCHAR + "?/";
  // Host: registered-name characters plus the square brackets.
  private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]";
  // User information: registered-name characters plus ':'.
  private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":";
203 
204   /**
205    * Index of scheme component in parsed URI.
206    */
207   private static final int SCHEME_GROUP = 2;
208 
209   /**
210    * Index of scheme-specific-part in parsed URI.
211    */
212   private static final int SCHEME_SPEC_PART_GROUP = 3;
213 
214   /**
215    * Index of authority component in parsed URI.
216    */
217   private static final int AUTHORITY_GROUP = 5;
218 
219   /**
220    * Index of path component in parsed URI.
221    */
222   private static final int PATH_GROUP = 6;
223 
224   /**
225    * Index of query component in parsed URI.
226    */
227   private static final int QUERY_GROUP = 8;
228 
229   /**
230    * Index of fragment component in parsed URI.
231    */
232   private static final int FRAGMENT_GROUP = 10;
233 
234   /**
235    * Index of userinfo component in parsed authority section.
236    */
237   private static final int AUTHORITY_USERINFO_GROUP = 2;
238 
239   /**
240    * Index of host component in parsed authority section.
241    */
242   private static final int AUTHORITY_HOST_GROUP = 3;
243 
244   /**
245    * Index of port component in parsed authority section.
246    */
247   private static final int AUTHORITY_PORT_GROUP = 5;
248 
249   /**
250    * The compiled version of the URI regular expression.
251    */
252   private static final Pattern URI_PATTERN;
253 
254   /**
255    * The compiled version of the authority regular expression.
256    */
257   private static final Pattern AUTHORITY_PATTERN;
258 
259   /**
260    * The set of valid hexadecimal characters.
261    */
262   private static final String HEX = "0123456789ABCDEF";
263 
  // Every component below is transient: it is derived from the string
  // form by parseURI().  The "raw" variants hold the text exactly as it
  // was matched; their counterparts hold the result of unquote() on it.
  // A null value stands for an undefined component.
  private transient String scheme;
  private transient String rawSchemeSpecificPart;
  private transient String schemeSpecificPart;
  private transient String rawAuthority;
  private transient String authority;
  private transient String rawUserInfo;
  private transient String userInfo;
  private transient String rawHost;
  private transient String host;
  // -1 until parseServerAuthority() finds a non-empty port in the authority.
  private transient int port = -1;
  private transient String rawPath;
  private transient String path;
  private transient String rawQuery;
  private transient String query;
  private transient String rawFragment;
  private transient String fragment;
  // The textual form of the URI; the only field that is not transient.
  private String string;
281 
282   /**
283    * Static initializer to pre-compile the regular expressions.
284    */
  static
  {
    // Compile each expression a single time, when the class is
    // initialized; parseURI() and parseServerAuthority() only create
    // matchers from these shared patterns.
    URI_PATTERN = Pattern.compile(URI_REGEXP);
    AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP);
  }
290 
readObject(ObjectInputStream is)291   private void readObject(ObjectInputStream is)
292     throws ClassNotFoundException, IOException
293   {
294     this.string = (String) is.readObject();
295     try
296       {
297         parseURI(this.string);
298       }
299     catch (URISyntaxException x)
300       {
301         // Should not happen.
302         throw new RuntimeException(x);
303       }
304   }
305 
writeObject(ObjectOutputStream os)306   private void writeObject(ObjectOutputStream os) throws IOException
307   {
308     if (string == null)
309       string = toString();
310     os.writeObject(string);
311   }
312 
313   /**
314    * <p>
315    * Returns the string content of the specified group of the supplied
316    * matcher.  The returned value is modified according to the following:
317    * </p>
318    * <ul>
319    * <li>If the resulting string has a length greater than 0, then
320    * that string is returned.</li>
321    * <li>If a string of zero length, is matched, then the content
322    * of the preceding group is considered.  If this is also an empty
323    * string, then <code>null</code> is returned to indicate an undefined
324    * value.  Otherwise, the value is truly the empty string and this is
325    * the returned value.</li>
326    * </ul>
327    * <p>
328    * This method is used for matching against all parts of the URI
329    * that may be either undefined or empty (i.e. all those but the
330    * scheme-specific part and the path).  In each case, the preceding
331    * group is the content of the original group, along with some
332    * additional distinguishing feature.  For example, the preceding
333    * group for the query includes the preceding question mark,
334    * while that of the fragment includes the hash symbol.  The presence
335    * of these features enables disambiguation between the two cases
   * of a completely unspecified value and a simple non-existent value.
337    * The scheme differs in that it will never return an empty string;
338    * the delimiter follows the scheme rather than preceding it, so
339    * it becomes part of the following section.  The same is true
340    * of the user information.
341    * </p>
342    *
343    * @param match the matcher, which contains the results of the URI
344    *              matched against the URI regular expression.
345    * @return either the matched content, <code>null</code> for undefined
346    *         values, or an empty string for a URI part with empty content.
347    */
getURIGroup(Matcher match, int group)348   private static String getURIGroup(Matcher match, int group)
349   {
350     String matched = match.group(group);
351     if (matched == null || matched.length() == 0)
352       {
353         String prevMatched = match.group(group -1);
354         if (prevMatched == null || prevMatched.length() == 0)
355           return null;
356         else
357           return "";
358       }
359     return matched;
360   }
361 
362   /**
363    * Sets fields of this URI by parsing the given string.
364    *
365    * @param str The string to parse
366    *
367    * @exception URISyntaxException If the given string violates RFC 2396
368    */
parseURI(String str)369   private void parseURI(String str) throws URISyntaxException
370   {
371     Matcher matcher = URI_PATTERN.matcher(str);
372 
373     if (matcher.matches())
374       {
375         scheme = getURIGroup(matcher, SCHEME_GROUP);
376         rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP);
377         schemeSpecificPart = unquote(rawSchemeSpecificPart);
378         if (!isOpaque())
379           {
380             rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP);
381             rawPath = matcher.group(PATH_GROUP);
382             rawQuery = getURIGroup(matcher, QUERY_GROUP);
383           }
384         rawFragment = getURIGroup(matcher, FRAGMENT_GROUP);
385       }
386     else
387       throw new URISyntaxException(str,
388                                    "doesn't match URI regular expression");
389     parseServerAuthority();
390 
391     // We must eagerly unquote the parts, because this is the only time
392     // we may throw an exception.
393     authority = unquote(rawAuthority);
394     userInfo = unquote(rawUserInfo);
395     host = unquote(rawHost);
396     path = unquote(rawPath);
397     query = unquote(rawQuery);
398     fragment = unquote(rawFragment);
399   }
400 
401   /**
   * Decodes the "%" + hex escape sequences in the given string.
403    *
404    * @param str The string to unquote or null.
405    *
406    * @return The unquoted string or null if str was null.
407    *
408    * @exception URISyntaxException If the given string contains invalid
409    * escape sequences.
410    */
unquote(String str)411   private static String unquote(String str) throws URISyntaxException
412   {
413     if (str == null)
414       return null;
415     byte[] buf = new byte[str.length()];
416     int pos = 0;
417     for (int i = 0; i < str.length(); i++)
418       {
419         char c = str.charAt(i);
420         if (c == '%')
421           {
422             if (i + 2 >= str.length())
423               throw new URISyntaxException(str, "Invalid quoted character");
424             int hi = Character.digit(str.charAt(++i), 16);
425             int lo = Character.digit(str.charAt(++i), 16);
426             if (lo < 0 || hi < 0)
427               throw new URISyntaxException(str, "Invalid quoted character");
428             buf[pos++] = (byte) (hi * 16 + lo);
429           }
430         else
431           buf[pos++] = (byte) c;
432       }
433     try
434       {
435         return new String(buf, 0, pos, "utf-8");
436       }
437     catch (java.io.UnsupportedEncodingException x2)
438       {
439         throw (Error) new InternalError().initCause(x2);
440       }
441   }
442 
443   /**
444    * Quote characters illegal in URIs in given string.
445    *
446    * Replace illegal characters by encoding their UTF-8
447    * representation as "%" + hex code for each resulting
448    * UTF-8 character.
449    *
450    * @param str The string to quote
451    *
452    * @return The quoted string.
453    */
  private static String quote(String str)
  {
    // Delegate with RFC3986_SSP as the legal set: the path characters
    // plus '?' and '/'.  The set contains '%', so a '%' already present
    // in str is passed through unchanged.
    return quote(str, RFC3986_SSP);
  }
458 
459   /**
460    * Quote characters illegal in URI authorities in given string.
461    *
462    * Replace illegal characters by encoding their UTF-8
463    * representation as "%" + hex code for each resulting
464    * UTF-8 character.
465    *
466    * @param str The string to quote
467    *
468    * @return The quoted string.
469    */
quoteAuthority(String str)470   private static String quoteAuthority(String str)
471   {
472     // Technically, we should be using RFC2396_AUTHORITY, but
473     // it contains no additional characters.
474     return quote(str, RFC3986_REG_NAME);
475   }
476 
477   /**
478    * Quotes the characters in the supplied string that are not part of
479    * the specified set of legal characters.
480    *
481    * @param str the string to quote
482    * @param legalCharacters the set of legal characters
483    *
484    * @return the quoted string.
485    */
quote(String str, String legalCharacters)486   private static String quote(String str, String legalCharacters)
487   {
488     CPStringBuilder sb = new CPStringBuilder(str.length());
489     for (int i = 0; i < str.length(); i++)
490       {
491         char c = str.charAt(i);
492         if ((legalCharacters.indexOf(c) == -1)
493             && (c <= 127))
494           {
495             sb.append('%');
496             sb.append(HEX.charAt(c / 16));
497             sb.append(HEX.charAt(c % 16));
498           }
499         else
500           sb.append(c);
501       }
502     return sb.toString();
503   }
504 
505   /**
506    * Quote characters illegal in URI hosts in given string.
507    *
508    * Replace illegal characters by encoding their UTF-8
509    * representation as "%" + hex code for each resulting
510    * UTF-8 character.
511    *
512    * @param str The string to quote
513    *
514    * @return The quoted string.
515    */
  private static String quoteHost(String str)
  {
    // Legal set: the registered-name characters plus '[' and ']'.
    // ':' is not in the set, so a colon inside str is quoted.
    return quote(str, RFC3986_HOST);
  }
520 
521   /**
522    * Quote characters illegal in URI paths in given string.
523    *
524    * Replace illegal characters by encoding their UTF-8
525    * representation as "%" + hex code for each resulting
526    * UTF-8 character.
527    *
528    * @param str The string to quote
529    *
530    * @return The quoted string.
531    */
  private static String quotePath(String str)
  {
    // Technically, we should be using RFC2396_PATH, but
    // it contains no additional characters.
    // Legal set: the path characters plus the segment separator '/'.
    // '?' and '#' are outside the set and therefore quoted.
    return quote(str, RFC3986_PATH_SEGMENTS);
  }
538 
539   /**
540    * Quote characters illegal in URI user infos in given string.
541    *
542    * Replace illegal characters by encoding their UTF-8
543    * representation as "%" + hex code for each resulting
544    * UTF-8 character.
545    *
546    * @param str The string to quote
547    *
548    * @return The quoted string.
549    */
  private static String quoteUserInfo(String str)
  {
    // Legal set: the registered-name characters plus ':'.  '@' is not
    // in the set, so an '@' inside the user information is quoted.
    return quote(str, RFC3986_USERINFO);
  }
554 
555   /**
556    * Creates an URI from the given string
557    *
558    * @param str The string to create the URI from
559    *
560    * @exception URISyntaxException If the given string violates RFC 2396
561    * @exception NullPointerException If str is null
562    */
  public URI(String str) throws URISyntaxException
  {
    // Keep the text exactly as supplied; writeObject() serializes it.
    this.string = str;
    // Fill in all component fields, or fail with URISyntaxException.
    parseURI(str);
  }
568 
569   /**
570    * Create an URI from the given components
571    *
572    * @param scheme The scheme name
573    * @param userInfo The username and authorization info
574    * @param host The hostname
575    * @param port The port number
576    * @param path The path
577    * @param query The query
578    * @param fragment The fragment
579    *
580    * @exception URISyntaxException If the given string violates RFC 2396
581    */
  public URI(String scheme, String userInfo, String host, int port,
             String path, String query, String fragment)
    throws URISyntaxException
  {
    // Assemble the textual form and hand it to URI(String) for parsing.
    // A null component, or a port of -1, contributes nothing.  The
    // scheme is appended as given; the other textual components are
    // first quoted with the character set of their own section.
    this((scheme == null ? "" : scheme + ":")
         // "//" is emitted as soon as any authority component is present.
         + (userInfo == null && host == null && port == -1 ? "" : "//")
         + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@")
         + (host == null ? "" : quoteHost(host))
         + (port == -1 ? "" : ":" + String.valueOf(port))
         + (path == null ? "" : quotePath(path))
         + (query == null ? "" : "?" + quote(query))
         + (fragment == null ? "" : "#" + quote(fragment)));
  }
595 
596   /**
597    * Create an URI from the given components
598    *
599    * @param scheme The scheme name
600    * @param authority The authority
   * @param path The path
602    * @param query The query
603    * @param fragment The fragment
604    *
605    * @exception URISyntaxException If the given string violates RFC 2396
606    */
  public URI(String scheme, String authority, String path, String query,
             String fragment) throws URISyntaxException
  {
    // Assemble the textual form and hand it to URI(String) for parsing.
    // A null component contributes nothing; a non-null authority is
    // preceded by "//", a query by "?" and a fragment by "#".  The
    // scheme is appended as given, the other components are quoted.
    this((scheme == null ? "" : scheme + ":")
         + (authority == null ? "" : "//" + quoteAuthority(authority))
         + (path == null ? "" : quotePath(path))
         + (query == null ? "" : "?" + quote(query))
         + (fragment == null ? "" : "#" + quote(fragment)));
  }
616 
617   /**
618    * Create an URI from the given components
619    *
620    * @param scheme The scheme name
621    * @param host The hostname
622    * @param path The path
623    * @param fragment The fragment
624    *
625    * @exception URISyntaxException If the given string violates RFC 2396
626    */
  public URI(String scheme, String host, String path, String fragment)
    throws URISyntaxException
  {
    // Same as the seven-argument constructor with no user information
    // (null), no port (-1) and no query (null).
    this(scheme, null, host, -1, path, null, fragment);
  }
632 
633   /**
634    * Create an URI from the given components
635    *
636    * @param scheme The scheme name
637    * @param ssp The scheme specific part
638    * @param fragment The fragment
639    *
640    * @exception URISyntaxException If the given string violates RFC 2396
641    */
  public URI(String scheme, String ssp, String fragment)
    throws URISyntaxException
  {
    // Assemble "scheme:ssp#fragment", leaving out every null part, and
    // hand the result to URI(String) for parsing.  The scheme is
    // appended as given; the scheme-specific part and the fragment are
    // quoted with the same character set.
    this((scheme == null ? "" : scheme + ":")
         + (ssp == null ? "" : quote(ssp))
         + (fragment == null ? "" : "#" + quote(fragment)));
  }
649 
650   /**
651    * Create an URI from the given string
652    *
653    * @param str The string to create the URI from
654    *
655    * @exception IllegalArgumentException If the given string violates RFC 2396
656    * @exception NullPointerException If str is null
657    */
create(String str)658   public static URI create(String str)
659   {
660     try
661       {
662         return new URI(str);
663       }
664     catch (URISyntaxException e)
665       {
666         throw (IllegalArgumentException) new IllegalArgumentException()
667               .initCause(e);
668       }
669   }
670 
671   /**
672    * Attempts to parse this URI's authority component, if defined,
673    * into user-information, host, and port components.  The purpose
674    * of this method was to disambiguate between some authority sections,
   * which form invalid server-based authorities, but valid registry
676    * based authorities.  In the updated RFC 3986, the authority section
677    * is defined differently, with registry-based authorities part of
678    * the host section.  Thus, this method is now simply an explicit
679    * way of parsing any authority section.
680    *
681    * @return the URI, with the authority section parsed into user
682    *         information, host and port components.
683    * @throws URISyntaxException if the given string violates RFC 2396
684    */
parseServerAuthority()685   public URI parseServerAuthority() throws URISyntaxException
686   {
687     if (rawAuthority != null)
688       {
689         Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority);
690 
691         if (matcher.matches())
692           {
693             rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP);
694             rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP);
695 
696             String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP);
697 
698             if (portStr != null && ! portStr.isEmpty())
699               try
700                 {
701                   port = Integer.parseInt(portStr);
702                 }
703               catch (NumberFormatException e)
704                 {
705                   URISyntaxException use =
706                     new URISyntaxException
707                       (string, "doesn't match URI regular expression");
708                   use.initCause(e);
709                   throw use;
710                 }
711           }
712         else
713           throw new URISyntaxException(string,
714                                        "doesn't match URI regular expression");
715       }
716     return this;
717   }
718 
719   /**
720    * <p>
721    * Returns a normalized version of the URI.  If the URI is opaque,
722    * or its path is already in normal form, then this URI is simply
723    * returned.  Otherwise, the following transformation of the path
724    * element takes place:
725    * </p>
726    * <ol>
727    * <li>All `.' segments are removed.</li>
728    * <li>Each `..' segment which can be paired with a prior non-`..' segment
729    * is removed along with the preceding segment.</li>
730    * <li>A `.' segment is added to the front if the first segment contains
731    * a colon (`:').  This is a deviation from the RFC, which prevents
732    * confusion between the path and the scheme.</li>
733    * </ol>
734    * <p>
735    * The resulting URI will be free of `.' and `..' segments, barring those
736    * that were prepended or which couldn't be paired, respectively.
737    * </p>
738    *
739    * @return the normalized URI.
740    */
normalize()741   public URI normalize()
742   {
743     if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
744       return this;
745     try
746       {
747         return new URI(scheme, authority, normalizePath(path), query,
748                        fragment);
749       }
750     catch (URISyntaxException e)
751       {
752         throw (Error) new InternalError("Normalized URI variant could not "+
753                                         "be constructed").initCause(e);
754       }
755   }
756 
757   /**
758    * <p>
759    * Normalize the given path.  The following transformation takes place:
760    * </p>
761    * <ol>
762    * <li>All `.' segments are removed.</li>
763    * <li>Each `..' segment which can be paired with a prior non-`..' segment
764    * is removed along with the preceding segment.</li>
765    * <li>A `.' segment is added to the front if the first segment contains
766    * a colon (`:').  This is a deviation from the RFC, which prevents
767    * confusion between the path and the scheme.</li>
768    * </ol>
769    * <p>
770    * The resulting URI will be free of `.' and `..' segments, barring those
771    * that were prepended or which couldn't be paired, respectively.
772    * </p>
773    *
774    * @param relativePath the relative path to be normalized.
775    * @return the normalized path.
776    */
normalizePath(String relativePath)777   private String normalizePath(String relativePath)
778   {
779     /*
780        This follows the algorithm in section 5.2.4. of RFC3986,
781        but doesn't modify the input buffer.
782     */
783     CPStringBuilder input = new CPStringBuilder(relativePath);
784     CPStringBuilder output = new CPStringBuilder();
785     int start = 0;
786     while (start < input.length())
787       {
788         /* A */
789         if (input.indexOf("../",start) == start)
790           {
791             start += 3;
792             continue;
793           }
794         if (input.indexOf("./",start) == start)
795           {
796             start += 2;
797             continue;
798           }
799         /* B */
800         if (input.indexOf("/./",start) == start)
801           {
802             start += 2;
803             continue;
804           }
805         if (input.indexOf("/.",start) == start
806             && input.charAt(start + 2) != '.')
807           {
808             start += 1;
809             input.setCharAt(start,'/');
810             continue;
811           }
812         /* C */
813         if (input.indexOf("/../",start) == start)
814           {
815             start += 3;
816             removeLastSegment(output);
817             continue;
818           }
819         if (input.indexOf("/..",start) == start)
820           {
821             start += 2;
822             input.setCharAt(start,'/');
823             removeLastSegment(output);
824             continue;
825           }
826         /* D */
827         if (start == input.length() - 1 && input.indexOf(".",start) == start)
828           {
829             input.delete(0,1);
830             continue;
831           }
832         if (start == input.length() - 2 && input.indexOf("..",start) == start)
833           {
834             input.delete(0,2);
835             continue;
836           }
837         /* E */
838         int indexOfSlash = input.indexOf("/",start);
839         while (indexOfSlash == start)
840           {
841             output.append("/");
842             ++start;
843             indexOfSlash = input.indexOf("/",start);
844           }
845         if (indexOfSlash == -1)
846           indexOfSlash = input.length();
847         output.append(input.substring(start, indexOfSlash));
848         start = indexOfSlash;
849       }
850     return output.toString();
851   }
852 
853   /**
854    * Removes the last segment of the path from the specified buffer.
855    *
856    * @param buffer the buffer containing the path.
857    */
removeLastSegment(CPStringBuilder buffer)858   private void removeLastSegment(CPStringBuilder buffer)
859   {
860     int lastSlash = buffer.lastIndexOf("/");
861     if (lastSlash == -1)
862       buffer.setLength(0);
863     else
864       buffer.setLength(lastSlash);
865   }
866 
867   /**
868    * Resolves the given URI against this URI
869    *
870    * @param uri The URI to resolve against this URI
871    *
872    * @return The resulting URI, or null when it couldn't be resolved
873    * for some reason.
874    *
875    * @throws NullPointerException if uri is null
876    */
resolve(URI uri)877   public URI resolve(URI uri)
878   {
879     if (uri.isAbsolute())
880       return uri;
881     if (uri.isOpaque())
882       return uri;
883 
884     String scheme = uri.getScheme();
885     String schemeSpecificPart = uri.getSchemeSpecificPart();
886     String authority = uri.getAuthority();
887     String path = uri.getPath();
888     String query = uri.getQuery();
889     String fragment = uri.getFragment();
890 
891     try
892       {
893         if (fragment != null && path != null && path.equals("")
894             && scheme == null && authority == null && query == null)
895           return new URI(this.scheme, this.schemeSpecificPart, fragment);
896 
897         if (authority == null)
898           {
899             authority = this.authority;
900             if (path == null)
901               path = "";
902             if (! (path.startsWith("/")))
903               {
904                 CPStringBuilder basepath = new CPStringBuilder(this.path);
905                 int i = this.path.lastIndexOf('/');
906 
907                 if (i >= 0)
908                   basepath.delete(i + 1, basepath.length());
909 
910                 basepath.append(path);
911                 path = normalizePath(basepath.toString());
912               }
913           }
914         return new URI(this.scheme, authority, path, query, fragment);
915       }
916     catch (URISyntaxException e)
917       {
918         throw (Error) new InternalError("Resolved URI variant could not "+
919                                         "be constructed").initCause(e);
920       }
921   }
922 
923   /**
924    * Resolves the given URI string against this URI
925    *
926    * @param str The URI as string to resolve against this URI
927    *
928    * @return The resulting URI
929    *
930    * @throws IllegalArgumentException If the given URI string
931    * violates RFC 2396
932    * @throws NullPointerException If uri is null
933    */
resolve(String str)934   public URI resolve(String str) throws IllegalArgumentException
935   {
936     return resolve(create(str));
937   }
938 
939   /**
940    * <p>
941    * Relativizes the given URI against this URI.  The following
942    * algorithm is used:
943    * </p>
944    * <ul>
945    * <li>If either URI is opaque, the given URI is returned.</li>
946    * <li>If the schemes of the URIs differ, the given URI is returned.</li>
947    * <li>If the authority components of the URIs differ, then the given
948    * URI is returned.</li>
949    * <li>If the path of this URI is not a prefix of the supplied URI,
950    * then the given URI is returned.</li>
951    * <li>If all the above conditions hold, a new URI is created using the
952    * query and fragment components of the given URI, along with a path
953    * computed by removing the path of this URI from the start of the path
954    * of the supplied URI.</li>
955    * </ul>
956    *
957    * @param uri the URI to relativize agsint this URI
958    * @return the resulting URI
959    * @throws NullPointerException if the uri is null
960    */
relativize(URI uri)961   public URI relativize(URI uri)
962   {
963     if (isOpaque() || uri.isOpaque())
964       return uri;
965     if (scheme == null && uri.getScheme() != null)
966       return uri;
967     if (scheme != null && !(scheme.equals(uri.getScheme())))
968       return uri;
969     if (rawAuthority == null && uri.getRawAuthority() != null)
970       return uri;
971     if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority())))
972       return uri;
973     String basePath = rawPath;
974     if (!(uri.getRawPath().equals(rawPath)))
975       {
976         if (!(basePath.endsWith("/")))
977           basePath = basePath.concat("/");
978         if (!(uri.getRawPath().startsWith(basePath)))
979           return uri;
980       }
981     try
982       {
983         return new URI(null, null,
984                        uri.getRawPath().substring(basePath.length()),
985                        uri.getRawQuery(), uri.getRawFragment());
986       }
987     catch (URISyntaxException e)
988       {
989         throw (Error) new InternalError("Relativized URI variant could not "+
990                                         "be constructed").initCause(e);
991       }
992   }
993 
994   /**
995    * Creates an URL from an URI
996    *
997    * @throws MalformedURLException If a protocol handler for the URL could
998    * not be found, or if some other error occurred while constructing the URL
999    * @throws IllegalArgumentException If the URI is not absolute
1000    */
toURL()1001   public URL toURL() throws IllegalArgumentException, MalformedURLException
1002   {
1003     if (isAbsolute())
1004       return new URL(this.toString());
1005 
1006     throw new IllegalArgumentException("not absolute");
1007   }
1008 
1009   /**
1010    * Returns the scheme of the URI
1011    */
getScheme()1012   public String getScheme()
1013   {
1014     return scheme;
1015   }
1016 
1017   /**
1018    * Tells whether this URI is absolute or not
1019    */
isAbsolute()1020   public boolean isAbsolute()
1021   {
1022     return scheme != null;
1023   }
1024 
1025   /**
1026    * Tell whether this URI is opaque or not
1027    */
isOpaque()1028   public boolean isOpaque()
1029   {
1030     return ((scheme != null) && ! (schemeSpecificPart.startsWith("/")));
1031   }
1032 
1033   /**
1034    * Returns the raw scheme specific part of this URI.
1035    * The scheme-specific part is never undefined, though it may be empty
1036    */
getRawSchemeSpecificPart()1037   public String getRawSchemeSpecificPart()
1038   {
1039     return rawSchemeSpecificPart;
1040   }
1041 
1042   /**
1043    * Returns the decoded scheme specific part of this URI.
1044    */
getSchemeSpecificPart()1045   public String getSchemeSpecificPart()
1046   {
1047     return schemeSpecificPart;
1048   }
1049 
1050   /**
1051    * Returns the raw authority part of this URI
1052    */
getRawAuthority()1053   public String getRawAuthority()
1054   {
1055     return rawAuthority;
1056   }
1057 
1058   /**
1059    * Returns the decoded authority part of this URI
1060    */
getAuthority()1061   public String getAuthority()
1062   {
1063     return authority;
1064   }
1065 
1066   /**
1067    * Returns the raw user info part of this URI
1068    */
getRawUserInfo()1069   public String getRawUserInfo()
1070   {
1071     return rawUserInfo;
1072   }
1073 
1074   /**
1075    * Returns the decoded user info part of this URI
1076    */
getUserInfo()1077   public String getUserInfo()
1078   {
1079     return userInfo;
1080   }
1081 
1082   /**
1083    * Returns the hostname of the URI
1084    */
getHost()1085   public String getHost()
1086   {
1087     return host;
1088   }
1089 
1090   /**
1091    * Returns the port number of the URI
1092    */
getPort()1093   public int getPort()
1094   {
1095     return port;
1096   }
1097 
1098   /**
1099    * Returns the raw path part of this URI
1100    */
getRawPath()1101   public String getRawPath()
1102   {
1103     return rawPath;
1104   }
1105 
1106   /**
1107    * Returns the path of the URI
1108    */
getPath()1109   public String getPath()
1110   {
1111     return path;
1112   }
1113 
1114   /**
1115    * Returns the raw query part of this URI
1116    */
getRawQuery()1117   public String getRawQuery()
1118   {
1119     return rawQuery;
1120   }
1121 
1122   /**
1123    * Returns the query of the URI
1124    */
getQuery()1125   public String getQuery()
1126   {
1127     return query;
1128   }
1129 
1130   /**
1131    * Return the raw fragment part of this URI
1132    */
getRawFragment()1133   public String getRawFragment()
1134   {
1135     return rawFragment;
1136   }
1137 
1138   /**
1139    * Returns the fragment of the URI
1140    */
getFragment()1141   public String getFragment()
1142   {
1143     return fragment;
1144   }
1145 
1146   /**
1147    * <p>
1148    * Compares the URI with the given object for equality.  If the
1149    * object is not a <code>URI</code>, then the method returns false.
1150    * Otherwise, the following criteria are observed:
1151    * </p>
1152    * <ul>
1153    * <li>The scheme of the URIs must either be null (undefined) in both cases,
1154    * or equal, ignorant of case.</li>
1155    * <li>The raw fragment of the URIs must either be null (undefined) in both
1156    * cases, or equal, ignorant of case.</li>
1157    * <li>Both URIs must be of the same type (opaque or hierarchial)</li>
1158    * <li><strong>For opaque URIs:</strong></li>
1159    * <ul>
1160    * <li>The raw scheme-specific parts must be equal.</li>
1161    * </ul>
1162    * <li>For hierarchical URIs:</li>
1163    * <ul>
1164    * <li>The raw paths must be equal, ignorant of case.</li>
1165    * <li>The raw queries are either both undefined or both equal, ignorant
1166    * of case.</li>
1167    * <li>The raw authority sections are either both undefined or:</li>
1168    * <li><strong>For registry-based authorities:</strong></li>
1169    * <ul><li>they are equal.</li></ul>
1170    * <li><strong>For server-based authorities:</strong></li>
1171    * <ul>
1172    * <li>the hosts are equal, ignoring case</li>
1173    * <li>the ports are equal</li>
1174    * <li>the user information components are equal</li>
1175    * </ul>
1176    * </ul>
1177    * </ul>
1178    *
1179    * @param obj the obj to compare the URI with.
1180    * @return <code>true</code> if the objects are equal, according to
1181    *         the specification above.
1182    */
equals(Object obj)1183   public boolean equals(Object obj)
1184   {
1185     if (!(obj instanceof URI))
1186       return false;
1187     URI uriObj = (URI) obj;
1188     if (scheme == null)
1189       {
1190         if (uriObj.getScheme() != null)
1191           return false;
1192       }
1193     else
1194       if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
1195         return false;
1196     if (rawFragment == null)
1197       {
1198         if (uriObj.getRawFragment() != null)
1199           return false;
1200       }
1201     else
1202       if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
1203         return false;
1204     boolean opaqueThis = isOpaque();
1205     boolean opaqueObj = uriObj.isOpaque();
1206     if (opaqueThis && opaqueObj)
1207       return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
1208     else if (!opaqueThis && !opaqueObj)
1209       {
1210         boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
1211           && ((rawQuery == null && uriObj.getRawQuery() == null)
1212               || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
1213         if (rawAuthority == null && uriObj.getRawAuthority() == null)
1214           return common;
1215         if (host == null)
1216           return common
1217             && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
1218         return common
1219           && host.equalsIgnoreCase(uriObj.getHost())
1220           && port == uriObj.getPort()
1221           && (rawUserInfo == null ?
1222               uriObj.getRawUserInfo() == null :
1223               rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
1224       }
1225     else
1226       return false;
1227   }
1228 
1229   /**
1230    * Computes the hashcode of the URI
1231    */
hashCode()1232   public int hashCode()
1233   {
1234     return (getScheme() == null ? 0 : 13 * getScheme().hashCode())
1235       + 17 * getRawSchemeSpecificPart().hashCode()
1236       + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode());
1237   }
1238 
1239   /**
1240    * Compare the URI with another URI.
1241    * Undefined components are taken to be less than any other component.
1242    * The following criteria are observed:
1243    * </p>
1244    * <ul>
1245    * <li>Two URIs with different schemes are compared according to their
1246    * scheme, regardless of case.</li>
1247    * <li>A hierarchical URI is less than an opaque URI with the same
1248    * scheme.</li>
1249    * <li><strong>For opaque URIs:</strong></li>
1250    * <ul>
1251    * <li>URIs with differing scheme-specific parts are ordered according
1252    * to the ordering of the scheme-specific part.</li>
1253    * <li>URIs with the same scheme-specific part are ordered by the
1254    * raw fragment.</li>
1255    * </ul>
1256    * <li>For hierarchical URIs:</li>
1257    * <ul>
1258    * <li>URIs are ordered according to their raw authority sections,
1259    * if they are unequal.</li>
1260    * <li><strong>For registry-based authorities:</strong></li>
1261    * <ul><li>they are ordered according to the ordering of the authority
1262    * component.</li></ul>
1263    * <li><strong>For server-based authorities:</strong></li>
1264    * <ul>
1265    * <li>URIs are ordered according to the raw user information.</li>
1266    * <li>URIs with the same user information are ordered by the host,
1267    * ignoring case.</li>
1268    * <lI>URIs with the same host are ordered by the port.</li>
1269    * </ul>
1270    * <li>URIs with the same authority section are ordered by the raw path.</li>
1271    * <li>URIs with the same path are ordered by their raw query.</li>
1272    * <li>URIs with the same query are ordered by their raw fragments.</li>
1273    * </ul>
1274    * </ul>
1275    *
1276    * @param uri The other URI to compare this URI with
1277    * @return a negative integer, zero or a positive integer depending
1278    *         on whether this URI is less than, equal to or greater
1279    *         than that supplied, respectively.
1280    */
compareTo(URI uri)1281   public int compareTo(URI uri)
1282     throws ClassCastException
1283   {
1284     if (scheme == null && uri.getScheme() != null)
1285       return -1;
1286     if (scheme != null)
1287       {
1288         int sCompare = scheme.compareToIgnoreCase(uri.getScheme());
1289         if (sCompare != 0)
1290           return sCompare;
1291       }
1292     boolean opaqueThis = isOpaque();
1293     boolean opaqueObj = uri.isOpaque();
1294     if (opaqueThis && !opaqueObj)
1295       return 1;
1296     if (!opaqueThis && opaqueObj)
1297       return -1;
1298     if (opaqueThis)
1299       {
1300         int ssCompare =
1301           rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart());
1302         if (ssCompare == 0)
1303           return compareFragments(uri);
1304         else
1305           return ssCompare;
1306       }
1307     if (rawAuthority == null && uri.getRawAuthority() != null)
1308       return -1;
1309     if (rawAuthority != null)
1310       {
1311         int aCompare = rawAuthority.compareTo(uri.getRawAuthority());
1312         if (aCompare != 0)
1313           {
1314             if (host == null)
1315               return aCompare;
1316             if (rawUserInfo == null && uri.getRawUserInfo() != null)
1317               return -1;
1318             int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo());
1319             if (uCompare != 0)
1320               return uCompare;
1321             if (host == null && uri.getHost() != null)
1322               return -1;
1323             int hCompare = host.compareTo(uri.getHost());
1324             if (hCompare != 0)
1325               return hCompare;
1326             int uriPort = uri.getPort();
1327             return (uriPort == port) ? 0 : (uriPort > port) ? -1 : 1;
1328           }
1329       }
1330     if (rawPath == null && uri.getRawPath() != null)
1331       return -1;
1332     if (rawPath != null)
1333       {
1334         int pCompare = rawPath.compareTo(uri.getRawPath());
1335         if (pCompare != 0)
1336           return pCompare;
1337       }
1338     if (rawQuery == null && uri.getRawQuery() != null)
1339       return -1;
1340     if (rawQuery != null)
1341       {
1342         int qCompare = rawQuery.compareTo(uri.getRawQuery());
1343         if (qCompare != 0)
1344           return qCompare;
1345       }
1346     return compareFragments(uri);
1347   }
1348 
1349   /**
1350    * Compares the fragment of this URI with that of the supplied URI.
1351    *
1352    * @param uri the URI to compare with this one.
1353    * @return a negative integer, zero or a positive integer depending
1354    *         on whether this uri's fragment is less than, equal to
1355    *         or greater than the fragment of the uri supplied, respectively.
1356    */
compareFragments(URI uri)1357   private int compareFragments(URI uri)
1358   {
1359     if (rawFragment == null && uri.getRawFragment() != null)
1360       return -1;
1361     else if (rawFragment == null)
1362       return 0;
1363     else
1364       return rawFragment.compareTo(uri.getRawFragment());
1365   }
1366 
1367   /**
1368    * Returns the URI as a String.  If the URI was created using a constructor,
1369    * then this will be the same as the original input string.
1370    *
1371    * @return a string representation of the URI.
1372    */
toString()1373   public String toString()
1374   {
1375     return (scheme == null ? "" : scheme + ":")
1376       + rawSchemeSpecificPart
1377       + (rawFragment == null ? "" : "#" + rawFragment);
1378   }
1379 
1380   /**
1381    * Returns the URI as US-ASCII string.  This is the same as the result
1382    * from <code>toString()</code> for URIs that don't contain any non-US-ASCII
1383    * characters.  Otherwise, the non-US-ASCII characters are replaced
1384    * by their percent-encoded representations.
1385    *
1386    * @return a string representation of the URI, containing only US-ASCII
1387    *         characters.
1388    */
toASCIIString()1389   public String toASCIIString()
1390   {
1391     String strRep = toString();
1392     boolean inNonAsciiBlock = false;
1393     CPStringBuilder buffer = new CPStringBuilder();
1394     CPStringBuilder encBuffer = null;
1395     for (int i = 0; i < strRep.length(); i++)
1396       {
1397         char c = strRep.charAt(i);
1398         if (c <= 127)
1399           {
1400             if (inNonAsciiBlock)
1401               {
1402                 buffer.append(escapeCharacters(encBuffer.toString()));
1403                 inNonAsciiBlock = false;
1404               }
1405             buffer.append(c);
1406           }
1407         else
1408           {
1409             if (!inNonAsciiBlock)
1410               {
1411                 encBuffer = new CPStringBuilder();
1412                 inNonAsciiBlock = true;
1413               }
1414             encBuffer.append(c);
1415           }
1416       }
1417     return buffer.toString();
1418   }
1419 
1420   /**
1421    * Converts the non-ASCII characters in the supplied string
1422    * to their equivalent percent-encoded representations.
1423    * That is, they are replaced by "%" followed by their hexadecimal value.
1424    *
1425    * @param str a string including non-ASCII characters.
1426    * @return the string with the non-ASCII characters converted to their
1427    *         percent-encoded representations.
1428    */
escapeCharacters(String str)1429   private static String escapeCharacters(String str)
1430   {
1431     try
1432       {
1433         CPStringBuilder sb = new CPStringBuilder();
1434         // this is far from optimal, but it works
1435         byte[] utf8 = str.getBytes("utf-8");
1436         for (int j = 0; j < utf8.length; j++)
1437           {
1438             sb.append('%');
1439             sb.append(HEX.charAt((utf8[j] & 0xff) / 16));
1440             sb.append(HEX.charAt((utf8[j] & 0xff) % 16));
1441           }
1442         return sb.toString();
1443       }
1444     catch (java.io.UnsupportedEncodingException x)
1445       {
1446         throw (Error) new InternalError("Escaping error").initCause(x);
1447       }
1448   }
1449 
1450 }
1451