1 /* Copyright 2004-2006, 2009 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.UnsupportedEncodingException;
25 
26 
27 /**
28  * These methods are not fully general.
29  * You would need to uncomment some lines to make this a
30  * public API. Certain preconditions for these methods to
31  * operate correctly are true in the context of XOM,
32  * but may well not be true in a more general context.
33  *
34  * @author Elliotte Rusty Harold
35  * @version 1.2.3
36  *
37  */
38 class URIUtil {
39 
40     // We assume the URI has already been verified as a potentially
41     // legal URI. Thus we don't have to check everything here.
isOpaque(String uri)42     static boolean isOpaque(String uri) {
43 
44         int colon = uri.indexOf(':');
45         // if (colon < 1) return false;
46         // This next line is the difference between absolute and opaque
47         if (uri.substring(colon+1).startsWith("/")) return false;
48         if (!Verifier.isAlpha(uri.charAt(0))) return false;
49         /* for (int i = 1; i < colon; i++) {
50              if (!Verifier.isSchemeCharacter(uri.charAt(i))) {
51                  return false;
52              }
53         } */
54         return true;
55 
56     }
57 
58 
isAbsolute(String uri)59     static boolean isAbsolute(String uri) {
60 
61         int colon = uri.indexOf(':');
62         if (colon < 1) return false;
63         // We assume the URI has already been verified as a potentially
64         // legal URI. Thus we don't have to check everything here.
65         /*if (!Verifier.isAlpha(uri.charAt(0))) return false;
66         for (int i = 1; i < colon; i++) {
67              if (!Verifier.isSchemeCharacter(uri.charAt(i))) return false;
68         } */
69         return true;
70 
71     }
72 
73 
74     // This doesn't do enough error checking to be a public API.
absolutize(String baseURI, String spec)75     static String absolutize(String baseURI, String spec) {
76 
77         if ("".equals(baseURI) || baseURI == null) return spec;
78 
79         ParsedURI base = new ParsedURI(baseURI);
80 
81         // This seems to be necessary to handle base URLs like
82         // http://www.example.com/test/data/..
83         // but I don't think it's part of the 3986 algorithm.
84         // ???? It may be a bug in that algorithm. Check.
85         if (base.path.endsWith("/..")) base.path += '/';
86 
87         // The variable names R and T violate Java naming conventions.
88         // They are taken from the pseudo-code in the RFC 3986 spec.
89         ParsedURI R = new ParsedURI(spec);
90         ParsedURI T = new ParsedURI();
91 
92         // We should be able to skip this check. basically it
93         // asserts that the spec is not an absolute URI already
94         /* if (R.scheme != null) {
95             T.scheme    = R.scheme;
96             T.authority = R.authority;
97             T.query     = R.query;
98             T.path      = removeDotSegments(R.path);
99         }
100         else { */
101         if (R.authority != null) {
102             T.authority = R.authority;
103             T.query     = R.query;
104             T.path      = removeDotSegments(R.path);
105         }
106         else {
107             if ("".equals(R.path)) {
108                 T.path = base.path;
109                 if (R.query != null) {
110                     T.query = R.query;
111                 }
112                 else {
113                     T.query = base.query;
114                 }
115             }
116             else {
117                 if (R.path.startsWith("/")) {
118                    T.path = removeDotSegments(R.path);
119                 }
120                 else {
121                    T.path = merge(base, R.path);
122                    T.path = removeDotSegments(T.path);
123                 }
124                 T.query = R.query;
125             }
126             T.authority = base.authority;
127         }
128         T.scheme = base.scheme;
129         // }
130         // Fragment ID of base URI is never considered
131         T.fragment = R.fragment;
132 
133         return T.toString();
134 
135     }
136 
137 
merge(ParsedURI base, String relativePath)138     private static String merge(ParsedURI base, String relativePath) {
139 
140         if (base.authority != null && "".equals(base.path)
141           && !"".equals(base.authority)) {
142             return "/" + relativePath;
143         }
144 
145         int lastSlash = base.path.lastIndexOf('/');
146         if (lastSlash == -1) return relativePath;
147         String topPath = base.path.substring(0, lastSlash+1);
148         return topPath + relativePath;
149 
150     }
151 
152 
removeDotSegments(String path)153     static String removeDotSegments(String path) {
154 
155         StringBuffer output = new StringBuffer();
156 
157         while (path.length() > 0) {
158             if (path.startsWith("/./")) {
159                 path = '/' + path.substring(3);
160             }
161             else if (path.equals("/.")) {
162                 path = "/";
163             }
164             else if (path.startsWith("/../")) {
165                 path = '/' + path.substring(4);
166                 int lastSlash = output.toString().lastIndexOf('/');
167                 if (lastSlash != -1) output.setLength(lastSlash);
168             }
169             else if (path.equals("/..")) {
170                 path = "/";
171                 int lastSlash = output.toString().lastIndexOf('/');
172                 if (lastSlash != -1) output.setLength(lastSlash);
173             }
174             // These next three cases are unreachable in the context of XOM.
175             // They may be needed in a more general public URIUtil.
176             // ???? need to consider whether these are still unreachable now that
177             // Builder.canonicalizeURL is calling this method.
178             /* else if (path.equals(".") || path.equals("..")) {
179                 path = "";
180             }
181             else if (path.startsWith("../")) {
182                 path = path.substring(3);
183             }
184             else if (path.startsWith("./")) {
185                 path = path.substring(2);
186             } */
187             else {
188                 int nextSlash = path.indexOf('/');
189                 if (nextSlash == 0) nextSlash = path.indexOf('/', 1);
190                 if (nextSlash == -1) {
191                     output.append(path);
192                     path = "";
193                 }
194                 else {
195                     output.append(path.substring(0, nextSlash));
196                     path = path.substring(nextSlash);
197                 }
198             }
199         }
200 
201         return output.toString();
202 
203     }
204 
205 
206     // really just a struct
207     static class ParsedURI {
208 
209         String scheme;
210         String schemeSpecificPart;
211         String query;
212         String fragment;
213         String authority;
214         String path = "";
215 
ParsedURI(String spec)216         ParsedURI(String spec) {
217 
218             int colon = spec.indexOf(':');
219             int question;
220 
221             // URIs can only contain one sharp sign
222             int sharp = spec.lastIndexOf('#');
223 
224             // Fragment IDs can contain question marks so we only read
225             // the question mark before the fragment ID, if any
226             if (sharp == -1) question = spec.indexOf('?');
227             else question = spec.substring(0, sharp).indexOf('?');
228 
229             if (colon != -1) scheme = spec.substring(0, colon);
230 
231             if (question == -1 && sharp == -1) {
232                 schemeSpecificPart = spec.substring(colon+1);
233             }
234             else if (question != -1) {
235                 if (question < colon) {
236                     MalformedURIException ex
237                       = new MalformedURIException("Unparseable URI");
238                     ex.setData(spec);
239                     throw ex;
240                 }
241                 schemeSpecificPart = spec.substring(colon+1, question);
242             }
243             else {
244                 if (sharp < colon) {
245                     MalformedURIException ex
246                       = new MalformedURIException("Unparseable URI");
247                     ex.setData(spec);
248                     throw ex;
249                 }
250                 schemeSpecificPart = spec.substring(colon+1, sharp);
251             }
252 
253             if (sharp != -1) {
254                 fragment = spec.substring(sharp+1);
255             }
256 
257             if (question != -1) {
258                 if (sharp == -1) {
259                     query = spec.substring(question+1);
260                 }
261                 else {
262                     query = spec.substring(question+1, sharp);
263                 }
264             }
265 
266             if (schemeSpecificPart.startsWith("//")) {
267                 int authorityBegin = 2;
268                 int authorityEnd = schemeSpecificPart.indexOf('/', authorityBegin);
269                 if (authorityEnd == -1) {
270                     authority = schemeSpecificPart.substring(2);
271                     path = "";
272                 }
273                 else {
274                     authority = schemeSpecificPart.substring(authorityBegin, authorityEnd);
275                     path = schemeSpecificPart.substring(authorityEnd);
276                 }
277             }
278             else {
279                 path = schemeSpecificPart;
280             }
281 
282         }
283 
ParsedURI()284         ParsedURI() {}
285 
toString()286         public String toString() {
287 
288             StringBuffer result = new StringBuffer(30);
289 
290             if (scheme != null) {
291                 result.append(scheme);
292                 result.append(':');
293             }
294 
295             if (schemeSpecificPart != null) {
296                 result.append(schemeSpecificPart);
297             }
298             else {
299                 result.append("//");
300                 if (authority != null) result.append(authority);
301                 result.append(path);
302             }
303 
304             if (query != null) {
305                 result.append('?');
306                 result.append(query);
307             }
308 
309             if (fragment != null) {
310                 result.append('#');
311                 result.append(fragment);
312             }
313 
314             return result.toString();
315 
316         }
317 
318     }
319 
320 
toURI(String iri)321     static String toURI(String iri) {
322 
323         int length = iri.length();
324         StringBuffer uri = new StringBuffer(length);
325         for (int i = 0; i < length; i++) {
326             char c = iri.charAt(i);
327             switch(c) {
328                 case ' ':
329                     uri.append("%20");
330                     break;
331                 case '!':
332                     uri.append(c);
333                     break;
334                 case '"':
335                     uri.append("%22");
336                     break;
337                 case '#':
338                     uri.append(c);
339                     break;
340                 case '$':
341                     uri.append(c);
342                     break;
343                 case '%':
344                     uri.append(c);
345                     break;
346                 case '&':
347                     uri.append(c);
348                     break;
349                 case '\'':
350                     uri.append(c);
351                     break;
352                 case '(':
353                     uri.append(c);
354                     break;
355                 case ')':
356                     uri.append(c);
357                     break;
358                 case '*':
359                     uri.append(c);
360                     break;
361                 case '+':
362                     uri.append(c);
363                     break;
364                 case ',':
365                     uri.append(c);
366                     break;
367                 case '-':
368                     uri.append(c);
369                     break;
370                 case '.':
371                     uri.append(c);
372                     break;
373                 case '/':
374                     uri.append(c);
375                     break;
376                 case '0':
377                     uri.append(c);
378                     break;
379                 case '1':
380                     uri.append(c);
381                     break;
382                 case '2':
383                     uri.append(c);
384                     break;
385                 case '3':
386                     uri.append(c);
387                     break;
388                 case '4':
389                     uri.append(c);
390                     break;
391                 case '5':
392                     uri.append(c);
393                     break;
394                 case '6':
395                     uri.append(c);
396                     break;
397                 case '7':
398                     uri.append(c);
399                     break;
400                 case '8':
401                     uri.append(c);
402                     break;
403                 case '9':
404                     uri.append(c);
405                     break;
406                 case ':':
407                     uri.append(c);
408                     break;
409                 case ';':
410                     uri.append(c);
411                     break;
412                 case '<':
413                     uri.append("%3C");
414                     break;
415                 case '=':
416                     uri.append(c);
417                     break;
418                 case '>':
419                     uri.append("%3E");
420                     break;
421                 case '?':
422                     uri.append(c);
423                     break;
424                 case '@':
425                     uri.append(c);
426                     break;
427                 case 'A':
428                     uri.append(c);
429                     break;
430                 case 'B':
431                     uri.append(c);
432                     break;
433                 case 'C':
434                     uri.append(c);
435                     break;
436                 case 'D':
437                     uri.append(c);
438                     break;
439                 case 'E':
440                     uri.append(c);
441                     break;
442                 case 'F':
443                     uri.append(c);
444                     break;
445                 case 'G':
446                     uri.append(c);
447                     break;
448                 case 'H':
449                     uri.append(c);
450                     break;
451                 case 'I':
452                     uri.append(c);
453                     break;
454                 case 'J':
455                     uri.append(c);
456                     break;
457                 case 'K':
458                     uri.append(c);
459                     break;
460                 case 'L':
461                     uri.append(c);
462                     break;
463                 case 'M':
464                     uri.append(c);
465                     break;
466                 case 'N':
467                     uri.append(c);
468                     break;
469                 case 'O':
470                     uri.append(c);
471                     break;
472                 case 'P':
473                     uri.append(c);
474                     break;
475                 case 'Q':
476                     uri.append(c);
477                     break;
478                 case 'R':
479                     uri.append(c);
480                     break;
481                 case 'S':
482                     uri.append(c);
483                     break;
484                 case 'T':
485                     uri.append(c);
486                     break;
487                 case 'U':
488                     uri.append(c);
489                     break;
490                 case 'V':
491                     uri.append(c);
492                     break;
493                 case 'W':
494                     uri.append(c);
495                     break;
496                 case 'X':
497                     uri.append(c);
498                     break;
499                 case 'Y':
500                     uri.append(c);
501                     break;
502                 case 'Z':
503                     uri.append(c);
504                     break;
505                 case '[':
506                     uri.append(c);
507                     break;
508                 case '\\':
509                     uri.append("%5C");
510                     break;
511                 case ']':
512                     uri.append(c);
513                     break;
514                 case '^':
515                     uri.append("%5E");
516                     break;
517                 case '_':
518                     uri.append(c);
519                     break;
520                 case '`':
521                     uri.append("%60");
522                     break;
523                 case 'a':
524                     uri.append(c);
525                     break;
526                 case 'b':
527                     uri.append(c);
528                     break;
529                 case 'c':
530                     uri.append(c);
531                     break;
532                 case 'd':
533                     uri.append(c);
534                     break;
535                 case 'e':
536                     uri.append(c);
537                     break;
538                 case 'f':
539                     uri.append(c);
540                     break;
541                 case 'g':
542                     uri.append(c);
543                     break;
544                 case 'h':
545                     uri.append(c);
546                     break;
547                 case 'i':
548                     uri.append(c);
549                     break;
550                 case 'j':
551                     uri.append(c);
552                     break;
553                 case 'k':
554                     uri.append(c);
555                     break;
556                 case 'l':
557                     uri.append(c);
558                     break;
559                 case 'm':
560                     uri.append(c);
561                     break;
562                 case 'n':
563                     uri.append(c);
564                     break;
565                 case 'o':
566                     uri.append(c);
567                     break;
568                 case 'p':
569                     uri.append(c);
570                     break;
571                 case 'q':
572                     uri.append(c);
573                     break;
574                 case 'r':
575                     uri.append(c);
576                     break;
577                 case 's':
578                     uri.append(c);
579                     break;
580                 case 't':
581                     uri.append(c);
582                     break;
583                 case 'u':
584                     uri.append(c);
585                     break;
586                 case 'v':
587                     uri.append(c);
588                     break;
589                 case 'w':
590                     uri.append(c);
591                     break;
592                 case 'x':
593                     uri.append(c);
594                     break;
595                 case 'y':
596                     uri.append(c);
597                     break;
598                 case 'z':
599                     uri.append(c);
600                     break;
601                 case '{':
602                     uri.append("%7B");
603                     break;
604                 case '|':
605                     uri.append("%7C");
606                     break;
607                 case '}':
608                     uri.append("%7D");
609                     break;
610                 case '~':
611                     uri.append(c);
612                     break;
613                 default:
614                     uri.append(percentEscape(c));
615             }
616         }
617         return uri.toString();
618 
619     }
620 
621 
percentEscape(char c)622     static String percentEscape(char c) {
623 
624         StringBuffer result = new StringBuffer(3);
625         String s = String.valueOf(c);
626         try {
627             byte[] data = s.getBytes("UTF8");
628             for (int i = 0; i < data.length; i++) {
629                 result.append('%');
630                 String hex = Integer.toHexString(data[i]).toUpperCase();
631                 if (c < 16) {
632                     result.append('0');
633                     result.append(hex);
634                 }
635                 else {
636                     // When c is negative as a byte, (e.g. greater
637                     // than 128) the hex strings come out as 8
638                     // characters rather than 2.
639                     result.append(hex.substring(hex.length()-2));
640                 }
641             }
642             return result.toString();
643         }
644         catch (UnsupportedEncodingException ex) {
645             throw new RuntimeException(
646               "Broken VM: does not recognize UTF-8 encoding");
647         }
648 
649     }
650 
651 
relativize(String base, String abs)652     static String relativize(String base, String abs) {
653 
654         ParsedURI parsedBase = new ParsedURI(base);
655         ParsedURI parsedAbs  = new ParsedURI(abs);
656 
657         parsedBase.path = removeDotSegments(parsedBase.path);
658 
659         if (parsedBase.scheme.equals(parsedAbs.scheme)
660           && parsedBase.authority.equals(parsedAbs.authority)) {
661 
662             String basePath = parsedBase.path;
663             String relPath = parsedAbs.path;
664 
665             while (basePath.length() > 1) {
666                 basePath = basePath.substring(0, basePath.lastIndexOf('/'));
667                 if (relPath.startsWith(basePath)) {
668                     return relPath.substring(basePath.length()+1);
669                 }
670             }
671 
672             return relPath;
673         }
674         else {
675             return abs;
676         }
677 
678     }
679 
680 
681 }
682