1 /* Copyright 2004-2006, 2009 Elliotte Rusty Harold 2 3 This library is free software; you can redistribute it and/or modify 4 it under the terms of version 2.1 of the GNU Lesser General Public 5 License as published by the Free Software Foundation. 6 7 This library is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU Lesser General Public License for more details. 11 12 You should have received a copy of the GNU Lesser General Public 13 License along with this library; if not, write to the 14 Free Software Foundation, Inc., 59 Temple Place, Suite 330, 15 Boston, MA 02111-1307 USA 16 17 You can contact Elliotte Rusty Harold by sending e-mail to 18 elharo@ibiblio.org. Please include the word "XOM" in the 19 subject line. The XOM home page is located at http://www.xom.nu/ 20 */ 21 22 package nu.xom; 23 24 import java.io.UnsupportedEncodingException; 25 26 27 /** 28 * These methods are not fully general. 29 * You would need to uncomment some lines to make this a 30 * public API. Certain preconditions for these methods to 31 * operate correctly are true in the context of XOM, 32 * but may well not be true in a more general context. 33 * 34 * @author Elliotte Rusty Harold 35 * @version 1.2.3 36 * 37 */ 38 class URIUtil { 39 40 // We assume the URI has already been verified as a potentially 41 // legal URI. Thus we don't have to check everything here. isOpaque(String uri)42 static boolean isOpaque(String uri) { 43 44 int colon = uri.indexOf(':'); 45 // if (colon < 1) return false; 46 // This next line is the difference between absolute and opaque 47 if (uri.substring(colon+1).startsWith("/")) return false; 48 if (!Verifier.isAlpha(uri.charAt(0))) return false; 49 /* for (int i = 1; i < colon; i++) { 50 if (!Verifier.isSchemeCharacter(uri.charAt(i))) { 51 return false; 52 } 53 } */ 54 return true; 55 56 } 57 58 isAbsolute(String uri)59 static boolean isAbsolute(String uri) { 60 61 int colon = uri.indexOf(':'); 62 if (colon < 1) return false; 63 // We assume the URI has already been verified as a potentially 64 // legal URI. Thus we don't have to check everything here. 65 /*if (!Verifier.isAlpha(uri.charAt(0))) return false; 66 for (int i = 1; i < colon; i++) { 67 if (!Verifier.isSchemeCharacter(uri.charAt(i))) return false; 68 } */ 69 return true; 70 71 } 72 73 74 // This doesn't do enough error checking to be a public API. absolutize(String baseURI, String spec)75 static String absolutize(String baseURI, String spec) { 76 77 if ("".equals(baseURI) || baseURI == null) return spec; 78 79 ParsedURI base = new ParsedURI(baseURI); 80 81 // This seems to be necessary to handle base URLs like 82 // http://www.example.com/test/data/.. 83 // but I don't think it's part of the 3986 algorithm. 84 // ???? It may be a bug in that algorithm. Check. 85 if (base.path.endsWith("/..")) base.path += '/'; 86 87 // The variable names R and T violate Java naming conventions. 88 // They are taken from the pseudo-code in the RFC 3986 spec. 89 ParsedURI R = new ParsedURI(spec); 90 ParsedURI T = new ParsedURI(); 91 92 // We should be able to skip this check. basically it 93 // asserts that the spec is not an absolute URI already 94 /* if (R.scheme != null) { 95 T.scheme = R.scheme; 96 T.authority = R.authority; 97 T.query = R.query; 98 T.path = removeDotSegments(R.path); 99 } 100 else { */ 101 if (R.authority != null) { 102 T.authority = R.authority; 103 T.query = R.query; 104 T.path = removeDotSegments(R.path); 105 } 106 else { 107 if ("".equals(R.path)) { 108 T.path = base.path; 109 if (R.query != null) { 110 T.query = R.query; 111 } 112 else { 113 T.query = base.query; 114 } 115 } 116 else { 117 if (R.path.startsWith("/")) { 118 T.path = removeDotSegments(R.path); 119 } 120 else { 121 T.path = merge(base, R.path); 122 T.path = removeDotSegments(T.path); 123 } 124 T.query = R.query; 125 } 126 T.authority = base.authority; 127 } 128 T.scheme = base.scheme; 129 // } 130 // Fragment ID of base URI is never considered 131 T.fragment = R.fragment; 132 133 return T.toString(); 134 135 } 136 137 merge(ParsedURI base, String relativePath)138 private static String merge(ParsedURI base, String relativePath) { 139 140 if (base.authority != null && "".equals(base.path) 141 && !"".equals(base.authority)) { 142 return "/" + relativePath; 143 } 144 145 int lastSlash = base.path.lastIndexOf('/'); 146 if (lastSlash == -1) return relativePath; 147 String topPath = base.path.substring(0, lastSlash+1); 148 return topPath + relativePath; 149 150 } 151 152 removeDotSegments(String path)153 static String removeDotSegments(String path) { 154 155 StringBuffer output = new StringBuffer(); 156 157 while (path.length() > 0) { 158 if (path.startsWith("/./")) { 159 path = '/' + path.substring(3); 160 } 161 else if (path.equals("/.")) { 162 path = "/"; 163 } 164 else if (path.startsWith("/../")) { 165 path = '/' + path.substring(4); 166 int lastSlash = output.toString().lastIndexOf('/'); 167 if (lastSlash != -1) output.setLength(lastSlash); 168 } 169 else if (path.equals("/..")) { 170 path = "/"; 171 int lastSlash = output.toString().lastIndexOf('/'); 172 if (lastSlash != -1) output.setLength(lastSlash); 173 } 174 // These next three cases are unreachable in the context of XOM. 175 // They may be needed in a more general public URIUtil. 176 // ???? need to consider whether these are still unreachable now that 177 // Builder.canonicalizeURL is calling this method. 178 /* else if (path.equals(".") || path.equals("..")) { 179 path = ""; 180 } 181 else if (path.startsWith("../")) { 182 path = path.substring(3); 183 } 184 else if (path.startsWith("./")) { 185 path = path.substring(2); 186 } */ 187 else { 188 int nextSlash = path.indexOf('/'); 189 if (nextSlash == 0) nextSlash = path.indexOf('/', 1); 190 if (nextSlash == -1) { 191 output.append(path); 192 path = ""; 193 } 194 else { 195 output.append(path.substring(0, nextSlash)); 196 path = path.substring(nextSlash); 197 } 198 } 199 } 200 201 return output.toString(); 202 203 } 204 205 206 // really just a struct 207 static class ParsedURI { 208 209 String scheme; 210 String schemeSpecificPart; 211 String query; 212 String fragment; 213 String authority; 214 String path = ""; 215 ParsedURI(String spec)216 ParsedURI(String spec) { 217 218 int colon = spec.indexOf(':'); 219 int question; 220 221 // URIs can only contain one sharp sign 222 int sharp = spec.lastIndexOf('#'); 223 224 // Fragment IDs can contain question marks so we only read 225 // the question mark before the fragment ID, if any 226 if (sharp == -1) question = spec.indexOf('?'); 227 else question = spec.substring(0, sharp).indexOf('?'); 228 229 if (colon != -1) scheme = spec.substring(0, colon); 230 231 if (question == -1 && sharp == -1) { 232 schemeSpecificPart = spec.substring(colon+1); 233 } 234 else if (question != -1) { 235 if (question < colon) { 236 MalformedURIException ex 237 = new MalformedURIException("Unparseable URI"); 238 ex.setData(spec); 239 throw ex; 240 } 241 schemeSpecificPart = spec.substring(colon+1, question); 242 } 243 else { 244 if (sharp < colon) { 245 MalformedURIException ex 246 = new MalformedURIException("Unparseable URI"); 247 ex.setData(spec); 248 throw ex; 249 } 250 schemeSpecificPart = spec.substring(colon+1, sharp); 251 } 252 253 if (sharp != -1) { 254 fragment = spec.substring(sharp+1); 255 } 256 257 if (question != -1) { 258 if (sharp == -1) { 259 query = spec.substring(question+1); 260 } 261 else { 262 query = spec.substring(question+1, sharp); 263 } 264 } 265 266 if (schemeSpecificPart.startsWith("//")) { 267 int authorityBegin = 2; 268 int authorityEnd = schemeSpecificPart.indexOf('/', authorityBegin); 269 if (authorityEnd == -1) { 270 authority = schemeSpecificPart.substring(2); 271 path = ""; 272 } 273 else { 274 authority = schemeSpecificPart.substring(authorityBegin, authorityEnd); 275 path = schemeSpecificPart.substring(authorityEnd); 276 } 277 } 278 else { 279 path = schemeSpecificPart; 280 } 281 282 } 283 ParsedURI()284 ParsedURI() {} 285 toString()286 public String toString() { 287 288 StringBuffer result = new StringBuffer(30); 289 290 if (scheme != null) { 291 result.append(scheme); 292 result.append(':'); 293 } 294 295 if (schemeSpecificPart != null) { 296 result.append(schemeSpecificPart); 297 } 298 else { 299 result.append("//"); 300 if (authority != null) result.append(authority); 301 result.append(path); 302 } 303 304 if (query != null) { 305 result.append('?'); 306 result.append(query); 307 } 308 309 if (fragment != null) { 310 result.append('#'); 311 result.append(fragment); 312 } 313 314 return result.toString(); 315 316 } 317 318 } 319 320 toURI(String iri)321 static String toURI(String iri) { 322 323 int length = iri.length(); 324 StringBuffer uri = new StringBuffer(length); 325 for (int i = 0; i < length; i++) { 326 char c = iri.charAt(i); 327 switch(c) { 328 case ' ': 329 uri.append("%20"); 330 break; 331 case '!': 332 uri.append(c); 333 break; 334 case '"': 335 uri.append("%22"); 336 break; 337 case '#': 338 uri.append(c); 339 break; 340 case '$': 341 uri.append(c); 342 break; 343 case '%': 344 uri.append(c); 345 break; 346 case '&': 347 uri.append(c); 348 break; 349 case '\'': 350 uri.append(c); 351 break; 352 case '(': 353 uri.append(c); 354 break; 355 case ')': 356 uri.append(c); 357 break; 358 case '*': 359 uri.append(c); 360 break; 361 case '+': 362 uri.append(c); 363 break; 364 case ',': 365 uri.append(c); 366 break; 367 case '-': 368 uri.append(c); 369 break; 370 case '.': 371 uri.append(c); 372 break; 373 case '/': 374 uri.append(c); 375 break; 376 case '0': 377 uri.append(c); 378 break; 379 case '1': 380 uri.append(c); 381 break; 382 case '2': 383 uri.append(c); 384 break; 385 case '3': 386 uri.append(c); 387 break; 388 case '4': 389 uri.append(c); 390 break; 391 case '5': 392 uri.append(c); 393 break; 394 case '6': 395 uri.append(c); 396 break; 397 case '7': 398 uri.append(c); 399 break; 400 case '8': 401 uri.append(c); 402 break; 403 case '9': 404 uri.append(c); 405 break; 406 case ':': 407 uri.append(c); 408 break; 409 case ';': 410 uri.append(c); 411 break; 412 case '<': 413 uri.append("%3C"); 414 break; 415 case '=': 416 uri.append(c); 417 break; 418 case '>': 419 uri.append("%3E"); 420 break; 421 case '?': 422 uri.append(c); 423 break; 424 case '@': 425 uri.append(c); 426 break; 427 case 'A': 428 uri.append(c); 429 break; 430 case 'B': 431 uri.append(c); 432 break; 433 case 'C': 434 uri.append(c); 435 break; 436 case 'D': 437 uri.append(c); 438 break; 439 case 'E': 440 uri.append(c); 441 break; 442 case 'F': 443 uri.append(c); 444 break; 445 case 'G': 446 uri.append(c); 447 break; 448 case 'H': 449 uri.append(c); 450 break; 451 case 'I': 452 uri.append(c); 453 break; 454 case 'J': 455 uri.append(c); 456 break; 457 case 'K': 458 uri.append(c); 459 break; 460 case 'L': 461 uri.append(c); 462 break; 463 case 'M': 464 uri.append(c); 465 break; 466 case 'N': 467 uri.append(c); 468 break; 469 case 'O': 470 uri.append(c); 471 break; 472 case 'P': 473 uri.append(c); 474 break; 475 case 'Q': 476 uri.append(c); 477 break; 478 case 'R': 479 uri.append(c); 480 break; 481 case 'S': 482 uri.append(c); 483 break; 484 case 'T': 485 uri.append(c); 486 break; 487 case 'U': 488 uri.append(c); 489 break; 490 case 'V': 491 uri.append(c); 492 break; 493 case 'W': 494 uri.append(c); 495 break; 496 case 'X': 497 uri.append(c); 498 break; 499 case 'Y': 500 uri.append(c); 501 break; 502 case 'Z': 503 uri.append(c); 504 break; 505 case '[': 506 uri.append(c); 507 break; 508 case '\\': 509 uri.append("%5C"); 510 break; 511 case ']': 512 uri.append(c); 513 break; 514 case '^': 515 uri.append("%5E"); 516 break; 517 case '_': 518 uri.append(c); 519 break; 520 case '`': 521 uri.append("%60"); 522 break; 523 case 'a': 524 uri.append(c); 525 break; 526 case 'b': 527 uri.append(c); 528 break; 529 case 'c': 530 uri.append(c); 531 break; 532 case 'd': 533 uri.append(c); 534 break; 535 case 'e': 536 uri.append(c); 537 break; 538 case 'f': 539 uri.append(c); 540 break; 541 case 'g': 542 uri.append(c); 543 break; 544 case 'h': 545 uri.append(c); 546 break; 547 case 'i': 548 uri.append(c); 549 break; 550 case 'j': 551 uri.append(c); 552 break; 553 case 'k': 554 uri.append(c); 555 break; 556 case 'l': 557 uri.append(c); 558 break; 559 case 'm': 560 uri.append(c); 561 break; 562 case 'n': 563 uri.append(c); 564 break; 565 case 'o': 566 uri.append(c); 567 break; 568 case 'p': 569 uri.append(c); 570 break; 571 case 'q': 572 uri.append(c); 573 break; 574 case 'r': 575 uri.append(c); 576 break; 577 case 's': 578 uri.append(c); 579 break; 580 case 't': 581 uri.append(c); 582 break; 583 case 'u': 584 uri.append(c); 585 break; 586 case 'v': 587 uri.append(c); 588 break; 589 case 'w': 590 uri.append(c); 591 break; 592 case 'x': 593 uri.append(c); 594 break; 595 case 'y': 596 uri.append(c); 597 break; 598 case 'z': 599 uri.append(c); 600 break; 601 case '{': 602 uri.append("%7B"); 603 break; 604 case '|': 605 uri.append("%7C"); 606 break; 607 case '}': 608 uri.append("%7D"); 609 break; 610 case '~': 611 uri.append(c); 612 break; 613 default: 614 uri.append(percentEscape(c)); 615 } 616 } 617 return uri.toString(); 618 619 } 620 621 percentEscape(char c)622 static String percentEscape(char c) { 623 624 StringBuffer result = new StringBuffer(3); 625 String s = String.valueOf(c); 626 try { 627 byte[] data = s.getBytes("UTF8"); 628 for (int i = 0; i < data.length; i++) { 629 result.append('%'); 630 String hex = Integer.toHexString(data[i]).toUpperCase(); 631 if (c < 16) { 632 result.append('0'); 633 result.append(hex); 634 } 635 else { 636 // When c is negative as a byte, (e.g. greater 637 // than 128) the hex strings come out as 8 638 // characters rather than 2. 639 result.append(hex.substring(hex.length()-2)); 640 } 641 } 642 return result.toString(); 643 } 644 catch (UnsupportedEncodingException ex) { 645 throw new RuntimeException( 646 "Broken VM: does not recognize UTF-8 encoding"); 647 } 648 649 } 650 651 relativize(String base, String abs)652 static String relativize(String base, String abs) { 653 654 ParsedURI parsedBase = new ParsedURI(base); 655 ParsedURI parsedAbs = new ParsedURI(abs); 656 657 parsedBase.path = removeDotSegments(parsedBase.path); 658 659 if (parsedBase.scheme.equals(parsedAbs.scheme) 660 && parsedBase.authority.equals(parsedAbs.authority)) { 661 662 String basePath = parsedBase.path; 663 String relPath = parsedAbs.path; 664 665 while (basePath.length() > 1) { 666 basePath = basePath.substring(0, basePath.lastIndexOf('/')); 667 if (relPath.startsWith(basePath)) { 668 return relPath.substring(basePath.length()+1); 669 } 670 } 671 672 return relPath; 673 } 674 else { 675 return abs; 676 } 677 678 } 679 680 681 } 682