1 /* Copyright 2002-2006, 2009, 2010, 2013, 2018 Elliotte Rusty Harold 2 3 This library is free software; you can redistribute it and/or modify 4 it under the terms of version 2.1 of the GNU Lesser General Public 5 License as published by the Free Software Foundation. 6 7 This library is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU Lesser General Public License for more details. 11 12 You should have received a copy of the GNU Lesser General Public 13 License along with this library; if not, write to the 14 Free Software Foundation, Inc., 59 Temple Place, Suite 330, 15 Boston, MA 02111-1307 USA 16 17 You can contact Elliotte Rusty Harold by sending e-mail to 18 elharo@ibiblio.org. Please include the word "XOM" in the 19 subject line. The XOM home page is located at http://www.xom.nu/ 20 */ 21 22 package nu.xom; 23 24 import java.io.CharConversionException; 25 import java.io.File; 26 import java.io.FileInputStream; 27 import java.io.IOException; 28 import java.io.InputStream; 29 import java.io.Reader; 30 import java.io.StringReader; 31 import java.io.UTFDataFormatException; 32 import java.net.MalformedURLException; 33 import java.net.URL; 34 35 import org.xml.sax.ErrorHandler; 36 import org.xml.sax.InputSource; 37 import org.xml.sax.SAXException; 38 import org.xml.sax.SAXNotRecognizedException; 39 import org.xml.sax.SAXNotSupportedException; 40 import org.xml.sax.SAXParseException; 41 import org.xml.sax.XMLFilter; 42 import org.xml.sax.XMLReader; 43 import org.xml.sax.helpers.XMLReaderFactory; 44 45 import org.apache.xerces.impl.Version; 46 47 /** 48 * <p> 49 * This class is responsible for creating XOM <code>Document</code> 50 * objects from a URL, file, string, or input stream by reading 51 * an XML document. A SAX parser is used to read the 52 * document and report any well-formedness errors. 
53 * </p> 54 * 55 * @author Elliotte Rusty Harold 56 * @version 1.2.11 57 * 58 */ 59 public class Builder { 60 61 62 private XMLReader parser; 63 private NodeFactory factory; 64 65 private static double xercesVersion = 2.6; 66 67 static { 68 69 try { 70 String x = Version.getVersion(); 71 String versionString = x.substring(9); 72 int firstPeriod = versionString.indexOf("."); 73 int secondPeriod = versionString.lastIndexOf("."); 74 String major = versionString.substring(0, firstPeriod); 75 String minor = versionString.substring(firstPeriod+1, secondPeriod); 76 if (Integer.parseInt(minor) < 10 && Integer.parseInt(major) < 3) { 77 xercesVersion = Double.parseDouble(x.substring(9,12)); 78 } 79 // else it's 2.6 or later which is all we really need to know 80 } 81 catch (Exception ex) { 82 // The version string format changed so presumably it's 83 // 2.6 or later 84 } 85 catch (Error err) { 86 // Xerces not installed, so none of this matters 87 } 88 89 } 90 91 92 /** 93 * <p> 94 * Creates a <code>Builder</code> that uses the default node 95 * factory and chooses among any available SAX2 parsers. 96 * In order of preference, it looks for: 97 * </p> 98 * 99 * <ol> 100 * <li>Xerces 2.x (a.k.a. IBM XML parser for Java)</li> 101 * <li>GNU Ælfred</li> 102 * <li>Crimson</li> 103 * <li>Piccolo</li> 104 * <li>Oracle</li> 105 * <li>XP</li> 106 * <li>Saxon's Ælfred</li> 107 * <li>dom4j's Ælfred</li> 108 * <li>The platform default specified by the 109 * <code>org.xml.sax.driver</code> system property</li> 110 * </ol> 111 * 112 * <p> 113 * Parsers must implicitly or explicitly support the 114 * http://xml.org/sax/features/external-general-entities 115 * and 116 * http://xml.org/sax/features/external-parameter-entities 117 * features XOM requires. Parsers that don't are rejected 118 * automatically. 
* </p>
 *
 * @throws XMLException if no satisfactory parser is
 *     installed in the local class path
 */
public Builder() {
    this(false);
}


/**
 * <p>
 * Creates a <code>Builder</code> on top of whichever parser
 * <code>findParser</code> locates, optionally switching on
 * validation. When <code>validate</code> is true, a validity
 * error found while parsing is treated as fatal: the build fails
 * with a <code>ValidityException</code>.
 * </p>
 *
 * @param validate true if the parser should
 *     validate the document while parsing
 *
 * @throws XMLException if no satisfactory parser
 *     is installed in the local class path
 */
public Builder(boolean validate) {
    this(findParser(validate), validate, null);
}


/**
 * <p>
 * Creates a <code>Builder</code> on top of whichever parser
 * <code>findParser</code> locates, optionally switching on
 * validation, and builds the tree with the supplied factory.
 * When <code>validate</code> is true, a validity error found
 * while parsing is treated as fatal: the build fails with a
 * <code>ValidityException</code>.
 * </p>
 *
 * @param validate true if the parser should
 *     validate the document while parsing
 * @param factory the <code>NodeFactory</code> that creates
 *     the node objects for this <code>Builder</code>
 *
 * @throws XMLException if no satisfactory parser
 *     is installed in the local class path
 */
public Builder(boolean validate, NodeFactory factory) {
    this(findParser(validate), validate, factory);
}


// These are stored in the order of preference.
172 private static String[] parsers = { 173 "nu.xom.XML1_0Parser", 174 "nu.xom.JDK15XML1_0Parser", 175 "org.apache.xerces.parsers.SAXParser", 176 "org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser", // xerces-2.9.x 177 "com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser", // JDK 1.6 178 "com.sun.org.apache.xerces.internal.parsers.SAXParser", 179 "gnu.xml.aelfred2.XmlReader", 180 "org.apache.crimson.parser.XMLReaderImpl", 181 "com.bluecast.xml.Piccolo", 182 "oracle.xml.parser.v2.SAXParser", 183 "com.jclark.xml.sax.SAX2Driver", 184 "net.sf.saxon.aelfred.SAXDriver", 185 "com.icl.saxon.aelfred.SAXDriver", 186 "org.dom4j.io.aelfred2.SAXDriver", 187 "org.dom4j.io.aelfred.SAXDriver", 188 "org.xmlpull.v1.sax2.Driver" // android 189 }; 190 191 findParser(boolean validate)192 static XMLReader findParser(boolean validate) { 193 194 // first look for Xerces; we only trust Xerces if 195 // we set it up; and we need to configure it specially 196 // so we can't load it with the XMLReaderFactory 197 XMLReader parser; 198 try { 199 parser = new XML1_0Parser(); 200 setupParser(parser, validate); 201 return parser; 202 } 203 catch (SAXException ex) { 204 // look for next one 205 } 206 catch (NoClassDefFoundError err) { 207 // Xerces is not available; look for next one 208 } 209 210 try { 211 parser = (XMLReader) Class.forName( 212 "nu.xom.JDK15XML1_0Parser").newInstance(); 213 setupParser(parser, validate); 214 return parser; 215 } 216 catch (SAXException ex) { 217 // look for next one 218 } 219 catch (InstantiationException ex) { 220 // look for next one 221 } 222 catch (ClassNotFoundException ex) { 223 // look for next one 224 } 225 catch (IllegalAccessException ex) { 226 // look for next one 227 } 228 catch (NoClassDefFoundError err) { 229 // Xerces is not available; look for next one 230 } 231 232 // XMLReaderFactory.createXMLReader never returns 233 // null. If it can't locate the parser, it throws 234 // a SAXException. 
235 for (int i = 2; i < parsers.length; i++) { 236 try { 237 parser = XMLReaderFactory.createXMLReader(parsers[i]); 238 setupParser(parser, validate); 239 return parser; 240 } 241 catch (SAXException ex) { 242 // try the next one 243 } 244 catch (NoClassDefFoundError err) { 245 // try the next one 246 } 247 } 248 249 try { // default 250 parser = XMLReaderFactory.createXMLReader(); 251 setupParser(parser, validate); 252 return parser; 253 } 254 catch (SAXException ex) { 255 throw new XMLException( 256 "Could not find a suitable SAX2 parser", ex); 257 } 258 259 } 260 261 setupParser(XMLReader parser, boolean validate)262 private static void setupParser(XMLReader parser, boolean validate) 263 throws SAXNotRecognizedException, SAXNotSupportedException { 264 265 // General configuration for all parsers 266 parser.setFeature( 267 "http://xml.org/sax/features/namespace-prefixes", true); 268 parser.setFeature( 269 "http://xml.org/sax/features/namespaces", true); 270 271 // Parser specific configuration 272 XMLReader baseParser = parser; 273 while (baseParser instanceof XMLFilter) { 274 XMLReader parent = ((XMLFilter) baseParser).getParent(); 275 if (parent == null) break; 276 baseParser = parent; 277 } 278 279 String parserName = baseParser.getClass().getName(); 280 if (!validate) { 281 if (parserName.equals( // Crimson workaround 282 "org.apache.crimson.parser.XMLReaderImpl")) { 283 parser.setErrorHandler( 284 new NamespaceWellformednessRequired() 285 ); 286 } 287 else { 288 parser.setFeature( 289 "http://xml.org/sax/features/external-general-entities", 290 true 291 ); 292 parser.setFeature( 293 "http://xml.org/sax/features/external-parameter-entities", 294 true 295 ); 296 } 297 } 298 else { 299 parser.setFeature( 300 "http://xml.org/sax/features/validation", true); 301 parser.setErrorHandler(new ValidityRequired()); 302 } 303 304 try { 305 parser.setFeature( 306 "http://xml.org/sax/features/string-interning", true); 307 } 308 catch (SAXException ex) { 309 // This parser 
does not support string interning. 310 // We can live without that. 311 } 312 313 // A couple of Xerces specific properties 314 if (parserName.equals("nu.xom.XML1_0Parser") 315 || parserName.equals("nu.xom.JDK15XML1_0Parser") 316 || parserName.equals("org.apache.xerces.parsers.SAXParser") 317 || parserName.equals("com.sun.org.apache.xerces.internal.parsers.SAXParser") 318 || parserName.equals("org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser") // xerces-2.9.x 319 || parserName.equals("com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser")) // JDK 1.6 320 { 321 try { 322 parser.setFeature( 323 "http://apache.org/xml/features/allow-java-encodings", true); 324 } 325 catch (SAXException ex) { 326 // Possibly an earlier version of Xerces; no big deal. 327 // We can live without this feature. 328 } 329 // See http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23768 330 // if you care to know why this line breaks unit tests on 331 // versions of Xerces prior to 2.6.1 332 try { 333 parser.setFeature( 334 "http://apache.org/xml/features/standard-uri-conformant", 335 true); 336 } 337 catch (SAXException ex) { 338 // Possibly an earlier version of Xerces, or a 339 // or a non-Xerces parser; no big deal. 340 // We can live without this. 341 } 342 } 343 344 } 345 346 347 /** 348 * <p> 349 * Creates a <code>Builder</code> that uses 350 * the specified SAX <code>XMLReader</code>. 351 * Custom SAX features and properties such as 352 * schema validation can be set on this <code>XMLReader</code> 353 * before passing it to this method. 
354 * </p> 355 * 356 * @param parser the SAX2 <code>XMLReader</code> that 357 * parses the document 358 * 359 * @throws XMLException if <code>parser</code> does not support the 360 * features XOM requires 361 */ Builder(XMLReader parser)362 public Builder(XMLReader parser) { 363 this(parser, false); 364 } 365 366 367 /** 368 * <p> 369 * Creates a <code>Builder</code> that uses 370 * the specified <code>NodeFactory</code> to create 371 * node objects. 372 * </p> 373 * 374 * @param factory the <code>NodeFactory</code> that creates 375 * the node objects for this <code>Builder</code> 376 * 377 * @throws XMLException if no satisfactory parser is 378 * installed in the local class path 379 */ Builder(NodeFactory factory)380 public Builder(NodeFactory factory) { 381 this(findParser(false), false, factory); 382 } 383 384 385 /** 386 * <p> 387 * Creates a optionally validating <code>Builder</code> based 388 * on the specified parser object. Custom SAX features and 389 * properties such as schema validation can be set on this 390 * <code>XMLReader</code> before passing it to this method. 391 * </p> 392 * 393 * <p> 394 * If the validate argument is true, then a validity error 395 * while parsing will cause a fatal error; that is, it 396 * will throw a <code>ParsingException</code> 397 * </p> 398 * 399 * @param parser the SAX2 <code>XMLReader</code> that parses 400 * the document 401 * @param validate true if the parser should validate 402 * the document while parsing 403 * 404 */ Builder(XMLReader parser, boolean validate)405 public Builder(XMLReader parser, boolean validate) { 406 this(parser, validate, null); 407 } 408 409 410 /** 411 * <p> 412 * Creates an optionally validating <code>Builder</code> that reads 413 * data from the specified parser object and constructs new nodes 414 * using the specified factory object. Custom SAX features and 415 * properties such as schema validation can be set on this 416 * <code>XMLReader</code> before passing it to this method. 
417 * </p> 418 * 419 * <p> 420 * If the <code>validate</code> argument is true, then a validity 421 * error while parsing will throw a <code>ParsingException</code>. 422 * </p> 423 * 424 * @param parser the SAX2 <code>XMLReader</code> that parses 425 * the document 426 * @param validate true if the parser should validate the 427 * document while parsing 428 * @param factory the <code>NodeFactory</code> 429 * this builder uses to create objects in the tree 430 * 431 * @throws XMLException if <code>parser</code> does not support 432 * the features XOM requires 433 * 434 */ Builder( XMLReader parser, boolean validate, NodeFactory factory)435 public Builder( 436 XMLReader parser, boolean validate, NodeFactory factory) { 437 438 try { 439 setupParser(parser, validate); 440 } 441 catch (SAXException ex) { 442 if (validate) { 443 throw new XMLException(parser.getClass().getName() 444 + " does not support validation.", ex); 445 } 446 else { 447 throw new XMLException(parser.getClass().getName() 448 + " does not support the entity resolution" 449 + " features XOM requires.", ex); 450 } 451 } 452 453 // setup the handlers 454 this.parser = parser; 455 this.factory = factory; 456 setHandlers(); 457 458 } 459 460 knownGoodParser(XMLReader parser)461 private static boolean knownGoodParser(XMLReader parser) { 462 463 String parserName = parser.getClass().getName(); 464 465 // In general, a filter may violate the constraints of XML 1.0. 466 // However, I specifically trust Norm Walsh not to do that, so 467 // if his filters are being used we look at the parent instead. 468 if (parserName.equals("org.apache.xml.resolver.tools.ResolvingXMLFilter")) { 469 XMLFilter filter = (XMLFilter) parser; 470 parserName = filter.getParent().getClass().getName(); 471 } 472 473 // These parsers are known to not make all the checks 474 // they're supposed to. 
:-( 475 if (parserName.equals("gnu.xml.aelfred2.XmlReader")) return false; 476 if (parserName.equals("net.sf.saxon.aelfred.SAXDriver")) return false; 477 if (parserName.equals("com.icl.saxon.aelfred.SAXDriver")) return false; 478 479 if (parserName.equals("org.apache.xerces.parsers.SAXParser") 480 && xercesVersion >= 2.4) { 481 return false; 482 } 483 484 for (int i = 0; i < parsers.length; i++) { 485 if (parserName.equals(parsers[i])) return true; 486 } 487 return false; 488 489 } 490 491 setHandlers()492 private void setHandlers() { 493 XOMHandler handler; 494 if ((factory == null 495 || factory.getClass().getName().equals("nu.xom.NodeFactory")) 496 && knownGoodParser(parser)) { 497 // If no factory is supplied by user, don't 498 // return one 499 NodeFactory tempFactory = factory; 500 if (tempFactory == null) tempFactory = new NodeFactory(); 501 handler = new NonVerifyingHandler(tempFactory); 502 } 503 else { 504 if (factory == null) factory = new NodeFactory(); 505 handler = new XOMHandler(factory); 506 } 507 508 parser.setContentHandler(handler); 509 parser.setDTDHandler(handler); 510 511 try { 512 parser.setProperty( 513 "http://xml.org/sax/properties/lexical-handler", 514 handler); 515 } 516 catch (SAXException ex) { 517 // This parser does not support lexical events. 518 // We can live without them, though it does mean 519 // there won't be any comments or a DOCTYPE declaration 520 // in the tree. 521 } 522 523 try { 524 parser.setProperty( 525 "http://xml.org/sax/properties/declaration-handler", 526 handler); 527 // Due to Crimson bugs in misidentifying the internal and 528 // external DTD subsets, we only build the internal DTD 529 // subset if there is no external DTD subset. 530 if (parser.getClass().getName().equals( 531 "org.apache.crimson.parser.XMLReaderImpl")) { 532 handler.usingCrimson = true; 533 } 534 } 535 catch (SAXException ex) { 536 // This parser does not support declaration events. 
537 // We can live without them, though it does mean 538 // they won't be any internal DTD subset. 539 } 540 541 } 542 543 544 /** 545 * <p> 546 * Parses the document at the specified URL. 547 * </p> 548 * 549 * <p> 550 * Note that relative URLs generally do not work here, as 551 * there's no base to resolve them against. This includes 552 * relative URLs that point into the file system, though this 553 * is somewhat platform dependent. Furthermore, <code>file</code> 554 * URLs often only work when they adhere exactly to RFC 2396 555 * syntax. URLs that work in Internet Explorer often fail when 556 * used in Java. If you're reading XML from a file, more reliable 557 * results are obtained by using the <code>build</code> method 558 * that takes a <code>java.io.File</code> object as an argument. 559 * </p> 560 * 561 * @param systemID an absolute URL from which the document is read. 562 * The URL's scheme must be one supported by the Java VM. 563 * 564 * @return the parsed <code>Document</code> 565 * 566 * @throws ValidityException if a validity error is detected. This 567 * is only thrown if the builder has been instructed to validate. 568 * @throws ParsingException if a well-formedness error is detected 569 * @throws IOException if an I/O error such as a broken socket 570 * prevents the document from being fully read 571 */ build(String systemID)572 public Document build(String systemID) 573 throws ParsingException, ValidityException, IOException { 574 575 systemID = canonicalizeURL(systemID); 576 InputSource source = new InputSource(systemID); 577 return build(source); 578 579 } 580 581 582 /** 583 * <p> 584 * Reads the document from an input stream. 
585 * </p> 586 * 587 * @param in the input stream from which the document is read 588 * 589 * @return the parsed <code>Document</code> 590 * 591 * @throws ValidityException if a validity error is detected; 592 * only thrown if the builder has been instructed to validate 593 * @throws ParsingException if a well-formedness error is detected 594 * @throws IOException if an I/O error such as a broken 595 * socket prevents the document from being fully read 596 * @throws NullPointerException if <code>in</code> is null 597 */ build(InputStream in)598 public Document build(InputStream in) 599 throws ParsingException, ValidityException, IOException { 600 601 if (in == null) throw new NullPointerException("Null InputStream"); 602 InputSource source = new InputSource(in); 603 return build(source); 604 605 } 606 607 608 /** 609 * <p> 610 * Reads the document from an input stream while specifying 611 * a base URI (which need not be the stream's actual URI). 612 * </p> 613 * 614 * @param in the input stream from which the document is read 615 * @param baseURI an absolute URI for this document; may be null 616 * 617 * @return the parsed <code>Document</code> 618 * 619 * @throws ValidityException if a validity error is detected; 620 * only thrown if the builder has been instructed to validate 621 * @throws ParsingException if a well-formedness error is detected 622 * @throws IOException if an I/O error such as a broken 623 * socket prevents the document from being fully read 624 */ build(InputStream in, String baseURI)625 public Document build(InputStream in, String baseURI) 626 throws ParsingException, ValidityException, IOException { 627 628 InputSource source = new InputSource(in); 629 if (baseURI != null) { 630 baseURI = canonicalizeURL(baseURI); 631 source.setSystemId(baseURI); 632 } 633 return build(source); 634 635 } 636 637 638 // Nasty hack to make sure we get the right form 639 // of file URLs on Windows 640 private static String fileURLPrefix = "file://"; 641 642 
static { 643 String os = System.getProperty("os.name", "Unix"); 644 // I could do System.setProperty("os.name" "Windows") to test 645 // this, but I'd need to use a fresh ClassLoader to rerun the 646 // static initializer block. 647 if (os.indexOf("Windows") >= 0) { 648 fileURLPrefix = "file:/"; 649 } 650 } 651 652 653 /** 654 * <p> 655 * Reads the document from a file. 656 * The base URI of the document is set to the 657 * location of the file. 658 * </p> 659 * 660 * @param in the file from which the document is read 661 * 662 * @return the parsed <code>Document</code> 663 * 664 * @throws ValidityException if a validity error is detected. This 665 * is only thrown if the builder has been instructed to validate. 666 * @throws ParsingException if a well-formedness error is detected 667 * @throws IOException if an I/O error such as a bad disk 668 * prevents the file from being read 669 */ build(File in)670 public Document build(File in) 671 throws ParsingException, ValidityException, IOException { 672 673 InputStream fin = new FileInputStream(in); 674 // Java's toURL method doesn't properly escape file 675 // names so we have to do it manually 676 String absolute = in.getAbsolutePath(); 677 StringBuffer url = new StringBuffer(fileURLPrefix); 678 int length = absolute.length(); 679 char separatorChar = File.separatorChar; 680 for (int i = 0; i < length; i++) { 681 char c = absolute.charAt(i); 682 if (c == separatorChar) url.append('/'); 683 else { 684 switch(c) { 685 case ' ': 686 url.append("%20"); 687 break; 688 case '!': 689 url.append(c); 690 break; 691 case '"': 692 url.append("%22"); 693 break; 694 case '#': 695 url.append("%23"); 696 break; 697 case '$': 698 url.append(c); 699 break; 700 case '%': 701 url.append("%25"); 702 break; 703 case '&': 704 // ampersand does not need to be encoded in 705 // path part of URL 706 url.append('&'); 707 break; 708 case '\'': 709 url.append(c); 710 break; 711 case '(': 712 url.append(c); 713 break; 714 case ')': 715 
url.append(c); 716 break; 717 case '*': 718 url.append(c); 719 break; 720 case '+': 721 url.append("%2B"); 722 break; 723 case ',': 724 url.append(c); 725 break; 726 case '-': 727 url.append(c); 728 break; 729 case '.': 730 url.append(c); 731 break; 732 case '/': 733 url.append("%2F"); 734 break; 735 case '0': 736 url.append(c); 737 break; 738 case '1': 739 url.append(c); 740 break; 741 case '2': 742 url.append(c); 743 break; 744 case '3': 745 url.append(c); 746 break; 747 case '4': 748 url.append(c); 749 break; 750 case '5': 751 url.append(c); 752 break; 753 case '6': 754 url.append(c); 755 break; 756 case '7': 757 url.append(c); 758 break; 759 case '8': 760 url.append(c); 761 break; 762 case '9': 763 url.append(c); 764 break; 765 case ':': 766 url.append(c); 767 break; 768 case ';': 769 url.append(c); 770 break; 771 case '<': 772 url.append("%3C"); 773 break; 774 case '=': 775 url.append(c); 776 break; 777 case '>': 778 url.append("%3E"); 779 break; 780 case '?': 781 url.append("%3F"); 782 break; 783 case '@': 784 url.append("%40"); 785 break; 786 case 'A': 787 url.append(c); 788 break; 789 case 'B': 790 url.append(c); 791 break; 792 case 'C': 793 url.append(c); 794 break; 795 case 'D': 796 url.append(c); 797 break; 798 case 'E': 799 url.append(c); 800 break; 801 case 'F': 802 url.append(c); 803 break; 804 case 'G': 805 url.append(c); 806 break; 807 case 'H': 808 url.append(c); 809 break; 810 case 'I': 811 url.append(c); 812 break; 813 case 'J': 814 url.append(c); 815 break; 816 case 'K': 817 url.append(c); 818 break; 819 case 'L': 820 url.append(c); 821 break; 822 case 'M': 823 url.append(c); 824 break; 825 case 'N': 826 url.append(c); 827 break; 828 case 'O': 829 url.append(c); 830 break; 831 case 'P': 832 url.append(c); 833 break; 834 case 'Q': 835 url.append(c); 836 break; 837 case 'R': 838 url.append(c); 839 break; 840 case 'S': 841 url.append(c); 842 break; 843 case 'T': 844 url.append(c); 845 break; 846 case 'U': 847 url.append(c); 848 break; 849 case 'V': 
850 url.append(c); 851 break; 852 case 'W': 853 url.append(c); 854 break; 855 case 'X': 856 url.append(c); 857 break; 858 case 'Y': 859 url.append(c); 860 break; 861 case 'Z': 862 url.append(c); 863 break; 864 case '[': 865 url.append("%5B"); 866 break; 867 case '\\': 868 url.append("%5C"); 869 break; 870 case ']': 871 url.append("%5D"); 872 break; 873 case '^': 874 url.append("%5E"); 875 break; 876 case '_': 877 url.append(c); 878 break; 879 case '`': 880 url.append("%60"); 881 break; 882 case 'a': 883 url.append(c); 884 break; 885 case 'b': 886 url.append(c); 887 break; 888 case 'c': 889 url.append(c); 890 break; 891 case 'd': 892 url.append(c); 893 break; 894 case 'e': 895 url.append(c); 896 break; 897 case 'f': 898 url.append(c); 899 break; 900 case 'g': 901 url.append(c); 902 break; 903 case 'h': 904 url.append(c); 905 break; 906 case 'i': 907 url.append(c); 908 break; 909 case 'j': 910 url.append(c); 911 break; 912 case 'k': 913 url.append(c); 914 break; 915 case 'l': 916 url.append(c); 917 break; 918 case 'm': 919 url.append(c); 920 break; 921 case 'n': 922 url.append(c); 923 break; 924 case 'o': 925 url.append(c); 926 break; 927 case 'p': 928 url.append(c); 929 break; 930 case 'q': 931 url.append(c); 932 break; 933 case 'r': 934 url.append(c); 935 break; 936 case 's': 937 url.append(c); 938 break; 939 case 't': 940 url.append(c); 941 break; 942 case 'u': 943 url.append(c); 944 break; 945 case 'v': 946 url.append(c); 947 break; 948 case 'w': 949 url.append(c); 950 break; 951 case 'x': 952 url.append(c); 953 break; 954 case 'y': 955 url.append(c); 956 break; 957 case 'z': 958 url.append(c); 959 break; 960 case '{': 961 url.append("%7B"); 962 break; 963 case '|': 964 url.append("%7C"); 965 break; 966 case '}': 967 url.append("%7D"); 968 break; 969 case '~': 970 url.append(c); 971 break; 972 default: 973 if (c < 0xD800 || c > 0xDFFF) { 974 url.append(URIUtil.percentEscape(c)); 975 } 976 else if (c <= 0xDBFF) { 977 // high surrogate; therefore we need to 978 // 
grab the next half before encoding 979 i++; 980 try { 981 char low = absolute.charAt(i); 982 String character = String.valueOf(c)+low; 983 byte[] data = character.getBytes("UTF8"); 984 // Always exactly 4 bytes, unless the encoder is buggy 985 for (int j=0; j < 4; j++) { 986 url.append('%'); 987 String hex = Integer.toHexString(data[j]).toUpperCase(); 988 url.append(hex.substring(hex.length()-2)); 989 } 990 } 991 catch (IndexOutOfBoundsException ex) { 992 // file name contains a high half and not a low half 993 url = new StringBuffer(0); 994 break; 995 } 996 } 997 else { 998 // low half not preceded by high half 999 // Can't create a base URI 1000 url = new StringBuffer(0); 1001 break; 1002 } 1003 } 1004 } 1005 } 1006 1007 String base = url.toString(); 1008 try { 1009 Document doc = build(fin, base); 1010 return doc; 1011 } 1012 finally { 1013 fin.close(); 1014 } 1015 1016 } 1017 1018 1019 /** 1020 * <p> 1021 * Reads the document from a reader. 1022 * </p> 1023 * 1024 * @param in the reader from which the document is read 1025 * 1026 * @return the parsed <code>Document</code> 1027 * 1028 * @throws ValidityException if a validity error is detected. This 1029 * is only thrown if the builder has been instructed to validate. 1030 * @throws ParsingException if a well-formedness error is detected 1031 * @throws IOException if an I/O error such as a bad disk 1032 * prevents the document from being fully read 1033 */ build(Reader in)1034 public Document build(Reader in) 1035 throws ParsingException, ValidityException, IOException { 1036 1037 if (in == null) throw new NullPointerException("Attempted to build from null reader"); 1038 InputSource source = new InputSource(in); 1039 return build(source); 1040 1041 } 1042 1043 1044 /** 1045 * <p> 1046 * Reads the document from a character stream while 1047 * specifying a base URI. 
1048 * </p> 1049 * 1050 * @param in the reader from which the document 1051 * is read 1052 * @param baseURI the base URI for this document; may be null 1053 * 1054 * @return the parsed <code>Document</code> 1055 * 1056 * @throws ValidityException if a validity error is detected. This 1057 * is only thrown if the builder has been instructed to 1058 * validate. 1059 * @throws ParsingException if a well-formedness error is detected 1060 * @throws IOException if an I/O error such as a bad disk 1061 * prevents the document from being completely read 1062 */ build(Reader in, String baseURI)1063 public Document build(Reader in, String baseURI) 1064 throws ParsingException, ValidityException, IOException { 1065 1066 InputSource source = new InputSource(in); 1067 if (baseURI != null) { 1068 baseURI = canonicalizeURL(baseURI); 1069 source.setSystemId(baseURI); 1070 } 1071 return build(source); 1072 1073 } 1074 1075 1076 /** 1077 * <p> 1078 * Reads the document from the contents of a string. 1079 * </p> 1080 * 1081 * @param document the string that contains the XML document 1082 * @param baseURI the base URI for this document; may be null 1083 * 1084 * @return the parsed <code>Document</code> 1085 * 1086 * @throws ValidityException if a validity error is detected. This 1087 * is only thrown if the builder has been instructed to 1088 * validate. 1089 * @throws ParsingException if a well-formedness error is detected 1090 * @throws IOException if an I/O error such as a bad disk 1091 * prevents the document's external DTD subset from being read 1092 */ build(String document, String baseURI)1093 public Document build(String document, String baseURI) 1094 throws ParsingException, ValidityException, IOException { 1095 1096 Reader reader = new StringReader(document); 1097 return build(reader, baseURI); 1098 1099 } 1100 1101 // needed to work around a bug in Xerces and Crimson 1102 // for URLs with no trailing slashes (no path part) 1103 // such as http://www.cafeconleche.org. 
1104 // Also needed to work around a VM bug involving file URLs such as 1105 // file:///tmp/nosuchdirectory/../foo.xml 1106 // where "nosuchdirectory" does not exist. canonicalizeURL(String uri)1107 private String canonicalizeURL(String uri) { 1108 1109 try { 1110 URL u = new URL(uri); 1111 String path = u.getPath(); 1112 String scheme = u.getProtocol(); 1113 String authority = u.getHost(); 1114 String query = u.getQuery(); 1115 int port = u.getPort(); 1116 // fragment ID not needed 1117 if (path == null || path.length() == 0) { 1118 // We handle here the case where we have a URL such as 1119 // http://www.cafeaulait.org with no trailing slash. 1120 path = "/"; 1121 } 1122 // If this proves to be a hot spot we could probably take this path 1123 // only if the scheme is file; not in the more common case where 1124 // it's http 1125 path = URIUtil.removeDotSegments(path); 1126 StringBuffer canonicalForm = new StringBuffer(uri.length()); 1127 canonicalForm.append(scheme); 1128 canonicalForm.append("://"); 1129 if (authority != null) canonicalForm.append(authority); 1130 if (port >= 0) canonicalForm.append(":" + port); 1131 canonicalForm.append(path); 1132 if (query != null) canonicalForm.append("?" + query); 1133 return canonicalForm.toString(); 1134 } 1135 catch (MalformedURLException ex) { 1136 return uri; 1137 } 1138 } 1139 1140 1141 /** 1142 * <p> 1143 * Reads the document from a SAX <code>InputSource</code>. 1144 * </p> 1145 * 1146 * @param in the input source from which the document is read 1147 * 1148 * @return the parsed <code>Document</code> 1149 * 1150 * @throws ValidityException if a validity error is detected. This 1151 * is only thrown if the builder has been instructed to 1152 * validate. 
1153 * @throws ParsingException if a well-formedness error is detected 1154 * @throws IOException if an I/O error such as a bad disk 1155 * prevents the document from being read 1156 */ build(InputSource in)1157 private Document build(InputSource in) 1158 throws ParsingException, ValidityException, IOException { 1159 1160 XOMHandler handler = (XOMHandler) parser.getContentHandler(); 1161 Document result = null; 1162 try { 1163 parser.parse(in); 1164 result = handler.getDocument(); 1165 } 1166 catch (SAXParseException ex) { 1167 ParsingException pex = new ParsingException( 1168 ex.getMessage(), 1169 ex.getSystemId(), 1170 ex.getLineNumber(), 1171 ex.getColumnNumber(), 1172 ex); 1173 throw pex; 1174 } 1175 catch (SAXException ex) { 1176 ParsingException pex 1177 = new ParsingException(ex.getMessage(), in.getSystemId(), ex); 1178 throw pex; 1179 } 1180 catch (XMLException ex) { 1181 throw new ParsingException(ex.getMessage(), ex); 1182 } 1183 catch (RuntimeException ex) { 1184 // Work-around for non-conformant parsers, especially Piccolo 1185 ParsingException pex 1186 = new ParsingException(ex.getMessage(), in.getSystemId(), ex); 1187 throw pex; 1188 } 1189 catch (UTFDataFormatException ex) { 1190 // Work-around for non-conformant parsers, especially Xerces 1191 // http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583 1192 ParsingException pex 1193 = new ParsingException(ex.getMessage(), in.getSystemId(), ex); 1194 throw pex; 1195 } 1196 catch (CharConversionException ex) { 1197 // Work-around for non-conformant parsers, especially Xerces 1198 // http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583 1199 ParsingException pex 1200 = new ParsingException(ex.getMessage(), in.getSystemId(), ex); 1201 throw pex; 1202 } 1203 catch (IOException ex) { 1204 // Work-around for Xerces; I don't want to just catch 1205 // org.apache.xerces.util.URI.MalformedURIException 1206 // because that would introduce a dependence on Xerces 1207 if (ex.getClass().getName().equals( 1208 
"org.apache.xerces.util.URI$MalformedURIException")) { 1209 throw new ParsingException(ex.getMessage(), in.getSystemId(), ex); 1210 } 1211 else { 1212 throw ex; 1213 } 1214 } 1215 finally { 1216 handler.freeMemory(); 1217 } 1218 1219 if (result == null) { 1220 ParsingException ex = new ParsingException( 1221 "Parser did not build document", 1222 in.getSystemId(), -1, -1 1223 ); 1224 throw ex; 1225 } 1226 1227 if ("".equals(result.getBaseURI())) { 1228 result.setBaseURI(in.getSystemId()); 1229 } 1230 1231 ErrorHandler errorHandler = parser.getErrorHandler(); 1232 if (errorHandler instanceof ValidityRequired) { 1233 ValidityRequired validityHandler 1234 = (ValidityRequired) errorHandler; 1235 if (!validityHandler.isValid()) { 1236 ValidityException vex = validityHandler.vexception; 1237 vex.setDocument(result); 1238 validityHandler.reset(); 1239 throw vex; 1240 } 1241 } 1242 return result; 1243 1244 } 1245 1246 1247 private static class ValidityRequired implements ErrorHandler { 1248 1249 ValidityException vexception = null; 1250 reset()1251 void reset() { 1252 vexception = null; 1253 } 1254 warning(SAXParseException exception)1255 public void warning(SAXParseException exception) { 1256 // ignore warnings 1257 } 1258 error(SAXParseException exception)1259 public void error(SAXParseException exception) { 1260 1261 if (vexception == null) { 1262 vexception = new ValidityException( 1263 exception.getMessage(), 1264 exception.getSystemId(), 1265 exception.getLineNumber(), 1266 exception.getColumnNumber(), 1267 exception); 1268 } 1269 vexception.addError(exception); 1270 } 1271 fatalError(SAXParseException exception)1272 public void fatalError(SAXParseException exception) 1273 throws SAXParseException { 1274 throw exception; 1275 } 1276 isValid()1277 boolean isValid() { 1278 return vexception == null; 1279 } 1280 1281 } 1282 1283 1284 // Because Crimson doesn't report namespace errors as fatal 1285 private static class NamespaceWellformednessRequired 1286 implements 
ErrorHandler { 1287 warning(SAXParseException exception)1288 public void warning(SAXParseException exception) { 1289 // ignore warnings 1290 } 1291 error(SAXParseException exception)1292 public void error(SAXParseException exception) 1293 throws SAXParseException { 1294 1295 if (exception.getMessage().equals("Illegal Namespace prefix: \"xml\".")) { 1296 return; 1297 } 1298 1299 throw exception; 1300 1301 } 1302 fatalError(SAXParseException exception)1303 public void fatalError(SAXParseException exception) 1304 throws SAXParseException { 1305 throw exception; 1306 } 1307 1308 } 1309 1310 1311 // I added this because XIncluder needed it. 1312 /** 1313 * <p> 1314 * Returns this builder's <code>NodeFactory</code>. It returns 1315 * null if a factory was not supplied when the builder was created. 1316 * </p> 1317 * 1318 * @return the node factory that was specified in the constructor 1319 */ getNodeFactory()1320 public NodeFactory getNodeFactory() { 1321 return factory; 1322 } 1323 1324 1325 }