1 /* Copyright 2002-2006, 2009, 2010, 2013, 2018 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.CharConversionException;
25 import java.io.File;
26 import java.io.FileInputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.Reader;
30 import java.io.StringReader;
31 import java.io.UTFDataFormatException;
32 import java.net.MalformedURLException;
33 import java.net.URL;
34 
35 import org.xml.sax.ErrorHandler;
36 import org.xml.sax.InputSource;
37 import org.xml.sax.SAXException;
38 import org.xml.sax.SAXNotRecognizedException;
39 import org.xml.sax.SAXNotSupportedException;
40 import org.xml.sax.SAXParseException;
41 import org.xml.sax.XMLFilter;
42 import org.xml.sax.XMLReader;
43 import org.xml.sax.helpers.XMLReaderFactory;
44 
45 import org.apache.xerces.impl.Version;
46 
47 /**
48  * <p>
49  * This class is responsible for creating XOM <code>Document</code>
50  * objects  from a URL, file, string, or input stream by reading
51  * an XML document. A SAX parser is used to read the
52  * document and report any well-formedness errors.
53  * </p>
54  *
55  * @author Elliotte Rusty Harold
56  * @version 1.2.11
57  *
58  */
59 public class Builder {
60 
61 
62     private XMLReader   parser;
63     private NodeFactory factory;
64 
65     private static double xercesVersion = 2.6;
66 
67     static {
68 
69         try {
70             String x = Version.getVersion();
71             String versionString = x.substring(9);
72             int firstPeriod = versionString.indexOf(".");
73             int secondPeriod = versionString.lastIndexOf(".");
74             String major = versionString.substring(0, firstPeriod);
75             String minor = versionString.substring(firstPeriod+1, secondPeriod);
76             if (Integer.parseInt(minor) < 10 && Integer.parseInt(major) < 3) {
77                 xercesVersion = Double.parseDouble(x.substring(9,12));
78             }
79             // else it's 2.6 or later which is all we really need to know
80         }
81         catch (Exception ex) {
82             // The version string format changed so presumably it's
83             // 2.6 or later
84         }
85         catch (Error err) {
86             // Xerces not installed, so none of this matters
87         }
88 
89     }
90 
91 
92     /**
93      * <p>
94      * Creates a <code>Builder</code> that uses the default node
95      * factory and chooses among any available SAX2 parsers.
96      * In order of preference, it looks for:
97      * </p>
98      *
99      * <ol>
100      * <li>Xerces 2.x (a.k.a. IBM XML parser for Java)</li>
101      * <li>GNU &AElig;lfred</li>
102      * <li>Crimson</li>
103      * <li>Piccolo</li>
104      * <li>Oracle</li>
105      * <li>XP</li>
106      * <li>Saxon's &AElig;lfred</li>
107      * <li>dom4j's &AElig;lfred</li>
108      * <li>The platform default specified by the
109      *     <code>org.xml.sax.driver</code> system property</li>
110      * </ol>
111      *
112      * <p>
113      * Parsers must implicitly or explicitly support the
114      * http://xml.org/sax/features/external-general-entities
115      * and
116      * http://xml.org/sax/features/external-parameter-entities
117      * features XOM requires. Parsers that don't are rejected
118      * automatically.
119      * </p>
120      *
121      * @throws XMLException if no satisfactory parser is
122      *     installed in the local class path
123      */
Builder()124     public Builder() {
125         this(false);
126     }
127 
128 
129     /**
130      * <p>
131      * Creates a <code>Builder</code> based on an optionally validating
132      * parser. If the <code>validate</code> argument
133      * is true, then a validity error while
134      * parsing will cause a fatal error; that is,
135      * it will throw a <code>ValidityException</code>.
136      * </p>
137      *
138      * @param validate true if the parser should
139      *     validate the document while parsing
140      *
141      * @throws XMLException if no satisfactory parser
142      *     is installed in the local class path
143      */
Builder(boolean validate)144     public Builder(boolean validate) {
145         this(findParser(validate), validate, null);
146     }
147 
148 
149     /**
150      * <p>
151      * Creates a <code>Builder</code> based on an optionally
152      * validating parser that builds node objects with the supplied
153      * factory. If the <code>validate</code> argument is true, then
154      * a validity error while parsing will cause a fatal error; that
155      * is, it will throw a <code>ValidityException</code>.
156      * </p>
157      *
158      * @param validate true if the parser should
159      *     validate the document while parsing
160      * @param factory the <code>NodeFactory</code> that creates
161      *     the node objects for this <code>Builder</code>
162      *
163      * @throws XMLException if no satisfactory parser
164      *     is installed in the local class path
165      */
Builder(boolean validate, NodeFactory factory)166     public Builder(boolean validate, NodeFactory factory) {
167         this(findParser(validate), validate, factory);
168     }
169 
170 
171     // These are stored in the order of preference.
172     private static String[] parsers = {
173         "nu.xom.XML1_0Parser",
174         "nu.xom.JDK15XML1_0Parser",
175         "org.apache.xerces.parsers.SAXParser",
176         "org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser", // xerces-2.9.x
177         "com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser", // JDK 1.6
178         "com.sun.org.apache.xerces.internal.parsers.SAXParser",
179         "gnu.xml.aelfred2.XmlReader",
180         "org.apache.crimson.parser.XMLReaderImpl",
181         "com.bluecast.xml.Piccolo",
182         "oracle.xml.parser.v2.SAXParser",
183         "com.jclark.xml.sax.SAX2Driver",
184         "net.sf.saxon.aelfred.SAXDriver",
185         "com.icl.saxon.aelfred.SAXDriver",
186         "org.dom4j.io.aelfred2.SAXDriver",
187         "org.dom4j.io.aelfred.SAXDriver",
188         "org.xmlpull.v1.sax2.Driver" // android
189     };
190 
191 
findParser(boolean validate)192     static XMLReader findParser(boolean validate) {
193 
194         // first look for Xerces; we only trust Xerces if
195         // we set it up; and we need to configure it specially
196         // so we can't load it with the XMLReaderFactory
197         XMLReader parser;
198         try {
199             parser = new XML1_0Parser();
200             setupParser(parser, validate);
201             return parser;
202         }
203         catch (SAXException ex) {
204             // look for next one
205         }
206         catch (NoClassDefFoundError err) {
207             // Xerces is not available; look for next one
208         }
209 
210         try {
211             parser = (XMLReader) Class.forName(
212               "nu.xom.JDK15XML1_0Parser").newInstance();
213             setupParser(parser, validate);
214             return parser;
215         }
216         catch (SAXException ex) {
217             // look for next one
218         }
219         catch (InstantiationException ex) {
220             // look for next one
221         }
222         catch (ClassNotFoundException ex) {
223             // look for next one
224         }
225         catch (IllegalAccessException ex) {
226             // look for next one
227         }
228         catch (NoClassDefFoundError err) {
229             // Xerces is not available; look for next one
230         }
231 
232         // XMLReaderFactory.createXMLReader never returns
233         // null. If it can't locate the parser, it throws
234         // a SAXException.
235         for (int i = 2; i < parsers.length; i++) {
236             try {
237                 parser = XMLReaderFactory.createXMLReader(parsers[i]);
238                 setupParser(parser, validate);
239                 return parser;
240             }
241             catch (SAXException ex) {
242                 // try the next one
243             }
244             catch (NoClassDefFoundError err) {
245                 // try the next one
246             }
247         }
248 
249         try { // default
250             parser = XMLReaderFactory.createXMLReader();
251             setupParser(parser, validate);
252             return parser;
253         }
254         catch (SAXException ex) {
255             throw new XMLException(
256               "Could not find a suitable SAX2 parser", ex);
257         }
258 
259     }
260 
261 
setupParser(XMLReader parser, boolean validate)262     private static void setupParser(XMLReader parser, boolean validate)
263       throws SAXNotRecognizedException, SAXNotSupportedException {
264 
265         // General configuration for all parsers
266         parser.setFeature(
267                 "http://xml.org/sax/features/namespace-prefixes", true);
268         parser.setFeature(
269                 "http://xml.org/sax/features/namespaces", true);
270 
271         // Parser specific configuration
272         XMLReader baseParser = parser;
273         while (baseParser instanceof XMLFilter) {
274              XMLReader parent = ((XMLFilter) baseParser).getParent();
275              if (parent == null) break;
276              baseParser = parent;
277         }
278 
279         String parserName = baseParser.getClass().getName();
280         if (!validate) {
281             if (parserName.equals(  // Crimson workaround
282               "org.apache.crimson.parser.XMLReaderImpl")) {
283                 parser.setErrorHandler(
284                   new NamespaceWellformednessRequired()
285                 );
286             }
287             else {
288                 parser.setFeature(
289                   "http://xml.org/sax/features/external-general-entities",
290                   true
291                 );
292                 parser.setFeature(
293                  "http://xml.org/sax/features/external-parameter-entities",
294                   true
295                 );
296             }
297         }
298         else {
299             parser.setFeature(
300               "http://xml.org/sax/features/validation", true);
301             parser.setErrorHandler(new ValidityRequired());
302         }
303 
304         try {
305             parser.setFeature(
306               "http://xml.org/sax/features/string-interning", true);
307         }
308         catch (SAXException ex) {
309             // This parser does not support string interning.
310             // We can live without that.
311         }
312 
313         // A couple of Xerces specific properties
314         if (parserName.equals("nu.xom.XML1_0Parser")
315          || parserName.equals("nu.xom.JDK15XML1_0Parser")
316          || parserName.equals("org.apache.xerces.parsers.SAXParser")
317          || parserName.equals("com.sun.org.apache.xerces.internal.parsers.SAXParser")
318          || parserName.equals("org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser") // xerces-2.9.x
319          || parserName.equals("com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser")) // JDK 1.6
320         {
321             try {
322                 parser.setFeature(
323                  "http://apache.org/xml/features/allow-java-encodings", true);
324             }
325             catch (SAXException ex) {
326                 // Possibly an earlier version of Xerces; no big deal.
327                 // We can live without this feature.
328             }
329             // See http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23768
330             // if you care to know why this line breaks unit tests on
331             // versions of Xerces prior to 2.6.1
332             try {
333                 parser.setFeature(
334                  "http://apache.org/xml/features/standard-uri-conformant",
335                  true);
336             }
337             catch (SAXException ex) {
338                 // Possibly an earlier version of Xerces, or a
339                 // or a non-Xerces parser;  no big deal.
340                 // We can live without this.
341             }
342         }
343 
344     }
345 
346 
347     /**
348      * <p>
349      * Creates a <code>Builder</code> that uses
350      * the specified SAX <code>XMLReader</code>.
351      * Custom SAX features and properties such as
352      * schema validation can be set on this <code>XMLReader</code>
353      * before passing it to this method.
354      * </p>
355      *
356      * @param parser the SAX2 <code>XMLReader</code> that
357      *     parses the document
358      *
359      * @throws XMLException if <code>parser</code> does not support the
360      *     features XOM requires
361      */
Builder(XMLReader parser)362     public Builder(XMLReader parser) {
363         this(parser, false);
364     }
365 
366 
367     /**
368      * <p>
369      * Creates a <code>Builder</code> that uses
370      * the specified <code>NodeFactory</code> to create
371      * node objects.
372      * </p>
373      *
374      * @param factory the <code>NodeFactory</code> that creates
375      *     the node objects for this <code>Builder</code>
376      *
377      * @throws XMLException if no satisfactory parser is
378      *     installed in the local class path
379      */
Builder(NodeFactory factory)380     public Builder(NodeFactory factory) {
381         this(findParser(false), false, factory);
382     }
383 
384 
385     /**
386      * <p>
     * Creates an optionally validating <code>Builder</code> based
388      * on the specified parser object. Custom SAX features and
389      * properties such as schema validation can be set on this
390      * <code>XMLReader</code> before passing it to this method.
391      * </p>
392      *
393      * <p>
394      * If the validate argument is true, then a validity error
395      * while parsing will cause a fatal error; that is, it
     * will throw a <code>ParsingException</code>.
397      * </p>
398      *
399      * @param parser the SAX2 <code>XMLReader</code> that parses
400      *     the document
401      * @param validate true if the parser should validate
402      *     the document while parsing
403      *
404      */
Builder(XMLReader parser, boolean validate)405     public Builder(XMLReader parser, boolean validate) {
406         this(parser, validate, null);
407     }
408 
409 
410     /**
411      * <p>
412      * Creates an optionally validating <code>Builder</code> that reads
413      * data from the specified parser object and constructs new nodes
414      * using the specified factory object. Custom SAX features and
415      * properties such as schema validation can be set on this
416      * <code>XMLReader</code> before passing it to this method.
417      * </p>
418      *
419      * <p>
420      * If the <code>validate</code> argument is true, then a validity
421      * error while parsing will throw a <code>ParsingException</code>.
422      * </p>
423      *
424      * @param parser the SAX2 <code>XMLReader</code> that parses
425      *     the document
426      * @param validate true if the parser should validate the
427      *     document while parsing
428      * @param factory the <code>NodeFactory</code>
429      *     this builder uses to create objects in the tree
430      *
431      * @throws XMLException if <code>parser</code> does not support
432      *     the features XOM requires
433      *
434      */
Builder( XMLReader parser, boolean validate, NodeFactory factory)435     public Builder(
436       XMLReader parser, boolean validate, NodeFactory factory) {
437 
438         try {
439             setupParser(parser, validate);
440         }
441         catch (SAXException ex) {
442             if (validate) {
443                 throw new XMLException(parser.getClass().getName()
444                   + " does not support validation.", ex);
445             }
446             else {
447                 throw new XMLException(parser.getClass().getName()
448                   + " does not support the entity resolution"
449                   + " features XOM requires.", ex);
450             }
451         }
452 
453         // setup the handlers
454         this.parser = parser;
455         this.factory = factory;
456         setHandlers();
457 
458     }
459 
460 
knownGoodParser(XMLReader parser)461     private static boolean knownGoodParser(XMLReader parser) {
462 
463         String parserName = parser.getClass().getName();
464 
465         // In general, a filter may violate the constraints of XML 1.0.
466         // However, I specifically trust Norm Walsh not to do that, so
467         // if his filters are being used we look at the parent instead.
468         if (parserName.equals("org.apache.xml.resolver.tools.ResolvingXMLFilter")) {
469             XMLFilter filter = (XMLFilter) parser;
470             parserName = filter.getParent().getClass().getName();
471         }
472 
473         // These parsers are known to not make all the checks
474         // they're supposed to. :-(
475         if (parserName.equals("gnu.xml.aelfred2.XmlReader")) return false;
476         if (parserName.equals("net.sf.saxon.aelfred.SAXDriver")) return false;
477         if (parserName.equals("com.icl.saxon.aelfred.SAXDriver")) return false;
478 
479         if (parserName.equals("org.apache.xerces.parsers.SAXParser")
480             && xercesVersion >= 2.4) {
481             return false;
482         }
483 
484         for (int i = 0; i < parsers.length; i++) {
485             if (parserName.equals(parsers[i])) return true;
486         }
487         return false;
488 
489     }
490 
491 
setHandlers()492     private void setHandlers() {
493         XOMHandler handler;
494         if ((factory == null
495           || factory.getClass().getName().equals("nu.xom.NodeFactory"))
496           && knownGoodParser(parser)) {
497             // If no factory is supplied by user, don't
498             // return one
499             NodeFactory tempFactory = factory;
500             if (tempFactory == null) tempFactory = new NodeFactory();
501             handler = new NonVerifyingHandler(tempFactory);
502         }
503         else {
504             if (factory == null) factory = new NodeFactory();
505             handler = new XOMHandler(factory);
506         }
507 
508         parser.setContentHandler(handler);
509         parser.setDTDHandler(handler);
510 
511         try {
512             parser.setProperty(
513               "http://xml.org/sax/properties/lexical-handler",
514               handler);
515         }
516         catch (SAXException ex) {
517             // This parser does not support lexical events.
518             // We can live without them, though it does mean
519             // there won't be any comments or a DOCTYPE declaration
520             // in the tree.
521         }
522 
523         try {
524             parser.setProperty(
525               "http://xml.org/sax/properties/declaration-handler",
526               handler);
527             // Due to Crimson bugs in misidentifying the internal and
528             // external DTD subsets, we only build the internal DTD
529             // subset if there is no external DTD subset.
530             if (parser.getClass().getName().equals(
531               "org.apache.crimson.parser.XMLReaderImpl")) {
532                 handler.usingCrimson = true;
533             }
534         }
535         catch (SAXException ex) {
536             // This parser does not support declaration events.
537             // We can live without them, though it does mean
538             // they won't be any internal DTD subset.
539         }
540 
541     }
542 
543 
544     /**
545      * <p>
546      * Parses the document at the specified URL.
547      * </p>
548      *
549      * <p>
550      * Note that relative URLs generally do not work here, as
551      * there's no base to resolve them against. This includes
552      * relative URLs that point into the file system, though this
553      * is somewhat platform dependent. Furthermore, <code>file</code>
554      * URLs often only work when they adhere exactly to RFC 2396
555      * syntax. URLs that work in Internet Explorer often fail when
556      * used in Java. If you're reading XML from a file, more reliable
557      * results are obtained by using the <code>build</code> method
558      * that takes a <code>java.io.File</code> object as an argument.
559      * </p>
560      *
561      * @param systemID an absolute URL from which the document is read.
562      *     The URL's scheme must be one supported by the Java VM.
563      *
564      * @return the parsed <code>Document</code>
565      *
566      * @throws ValidityException if a validity error is detected. This
567      *     is only thrown if the builder has been instructed to validate.
568      * @throws ParsingException if a well-formedness error is detected
569      * @throws IOException if an I/O error such as a broken socket
570      *     prevents the document from being fully read
571      */
build(String systemID)572     public Document build(String systemID)
573       throws ParsingException, ValidityException, IOException {
574 
575         systemID = canonicalizeURL(systemID);
576         InputSource source = new InputSource(systemID);
577         return build(source);
578 
579     }
580 
581 
582     /**
583      * <p>
584      * Reads the document from an input stream.
585      * </p>
586      *
587      * @param in the input stream from which the document is read
588      *
589      * @return the parsed <code>Document</code>
590      *
591      * @throws ValidityException if a validity error is detected;
592      *     only thrown if the builder has been instructed to validate
593      * @throws ParsingException  if a well-formedness error is detected
594      * @throws IOException       if an I/O error such as a broken
595      *     socket prevents the document from being fully read
596      * @throws NullPointerException  if <code>in</code> is null
597      */
build(InputStream in)598     public Document build(InputStream in)
599       throws ParsingException, ValidityException, IOException {
600 
601         if (in == null) throw new NullPointerException("Null InputStream");
602         InputSource source = new InputSource(in);
603         return build(source);
604 
605     }
606 
607 
608     /**
609      * <p>
610      * Reads the document from an input stream while specifying
611      * a base URI (which need not be the stream's actual URI).
612      * </p>
613      *
614      * @param in the input stream from which the document is read
615      * @param baseURI an absolute URI for this document; may be null
616      *
617      * @return the parsed <code>Document</code>
618      *
619      * @throws ValidityException if a validity error is detected;
620      *     only thrown if the builder has been instructed to validate
621      * @throws ParsingException if a well-formedness error is detected
622      * @throws IOException if an I/O error such as a broken
623      *       socket prevents the document from being fully read
624      */
build(InputStream in, String baseURI)625     public Document build(InputStream in, String baseURI)
626       throws ParsingException, ValidityException, IOException {
627 
628         InputSource source = new InputSource(in);
629         if (baseURI != null) {
630             baseURI = canonicalizeURL(baseURI);
631             source.setSystemId(baseURI);
632         }
633         return build(source);
634 
635     }
636 
637 
638     // Nasty hack to make sure we get the right form
639     // of file URLs on Windows
640     private static String fileURLPrefix = "file://";
641 
642     static {
643         String os = System.getProperty("os.name", "Unix");
644         // I could do System.setProperty("os.name" "Windows") to test
645         // this, but I'd need to use a fresh ClassLoader to rerun the
646         // static initializer block.
647         if (os.indexOf("Windows") >= 0) {
648             fileURLPrefix = "file:/";
649         }
650     }
651 
652 
653     /**
654      * <p>
655      * Reads the document from a file.
656      * The base URI of the document is set to the
657      * location of the file.
658      * </p>
659      *
660      * @param in the file from which the document is read
661      *
662      * @return the parsed <code>Document</code>
663      *
664      * @throws ValidityException if a validity error is detected. This
665      *   is only thrown if the builder has been instructed to validate.
666      * @throws ParsingException if a well-formedness error is detected
667      * @throws IOException if an I/O error such as a bad disk
668      *     prevents the file from being read
669      */
build(File in)670     public Document build(File in)
671       throws ParsingException, ValidityException, IOException {
672 
673         InputStream fin = new FileInputStream(in);
674         // Java's toURL method doesn't properly escape file
675         // names so we have to do it manually
676         String absolute = in.getAbsolutePath();
677         StringBuffer url = new StringBuffer(fileURLPrefix);
678         int length = absolute.length();
679         char separatorChar = File.separatorChar;
680         for (int i = 0; i < length; i++) {
681             char c = absolute.charAt(i);
682             if (c == separatorChar) url.append('/');
683             else {
684                 switch(c) {
685                     case ' ':
686                         url.append("%20");
687                         break;
688                     case '!':
689                         url.append(c);
690                         break;
691                     case '"':
692                         url.append("%22");
693                         break;
694                     case '#':
695                         url.append("%23");
696                         break;
697                     case '$':
698                         url.append(c);
699                         break;
700                     case '%':
701                         url.append("%25");
702                         break;
703                     case '&':
704                         // ampersand does not need to be encoded in
705                         // path part of URL
706                         url.append('&');
707                         break;
708                     case '\'':
709                         url.append(c);
710                         break;
711                     case '(':
712                         url.append(c);
713                         break;
714                     case ')':
715                         url.append(c);
716                         break;
717                     case '*':
718                         url.append(c);
719                         break;
720                     case '+':
721                         url.append("%2B");
722                         break;
723                     case ',':
724                         url.append(c);
725                         break;
726                     case '-':
727                         url.append(c);
728                         break;
729                     case '.':
730                         url.append(c);
731                         break;
732                     case '/':
733                         url.append("%2F");
734                         break;
735                     case '0':
736                         url.append(c);
737                         break;
738                     case '1':
739                         url.append(c);
740                         break;
741                     case '2':
742                         url.append(c);
743                         break;
744                     case '3':
745                         url.append(c);
746                         break;
747                     case '4':
748                         url.append(c);
749                         break;
750                     case '5':
751                         url.append(c);
752                         break;
753                     case '6':
754                         url.append(c);
755                         break;
756                     case '7':
757                         url.append(c);
758                         break;
759                     case '8':
760                         url.append(c);
761                         break;
762                     case '9':
763                         url.append(c);
764                         break;
765                     case ':':
766                         url.append(c);
767                         break;
768                     case ';':
769                         url.append(c);
770                         break;
771                     case '<':
772                         url.append("%3C");
773                         break;
774                     case '=':
775                         url.append(c);
776                         break;
777                     case '>':
778                         url.append("%3E");
779                         break;
780                     case '?':
781                         url.append("%3F");
782                         break;
783                     case '@':
784                         url.append("%40");
785                         break;
786                     case 'A':
787                         url.append(c);
788                         break;
789                     case 'B':
790                         url.append(c);
791                         break;
792                     case 'C':
793                         url.append(c);
794                         break;
795                     case 'D':
796                         url.append(c);
797                         break;
798                     case 'E':
799                         url.append(c);
800                         break;
801                     case 'F':
802                         url.append(c);
803                         break;
804                     case 'G':
805                         url.append(c);
806                         break;
807                     case 'H':
808                         url.append(c);
809                         break;
810                     case 'I':
811                         url.append(c);
812                         break;
813                     case 'J':
814                         url.append(c);
815                         break;
816                     case 'K':
817                         url.append(c);
818                         break;
819                     case 'L':
820                         url.append(c);
821                         break;
822                     case 'M':
823                         url.append(c);
824                         break;
825                     case 'N':
826                         url.append(c);
827                         break;
828                     case 'O':
829                         url.append(c);
830                         break;
831                     case 'P':
832                         url.append(c);
833                         break;
834                     case 'Q':
835                         url.append(c);
836                         break;
837                     case 'R':
838                         url.append(c);
839                         break;
840                     case 'S':
841                         url.append(c);
842                         break;
843                     case 'T':
844                         url.append(c);
845                         break;
846                     case 'U':
847                         url.append(c);
848                         break;
849                     case 'V':
850                         url.append(c);
851                         break;
852                     case 'W':
853                         url.append(c);
854                         break;
855                     case 'X':
856                         url.append(c);
857                         break;
858                     case 'Y':
859                         url.append(c);
860                         break;
861                     case 'Z':
862                         url.append(c);
863                         break;
864                     case '[':
865                         url.append("%5B");
866                         break;
867                     case '\\':
868                         url.append("%5C");
869                         break;
870                     case ']':
871                         url.append("%5D");
872                         break;
873                     case '^':
874                         url.append("%5E");
875                         break;
876                     case '_':
877                         url.append(c);
878                         break;
879                     case '`':
880                         url.append("%60");
881                         break;
882                     case 'a':
883                         url.append(c);
884                         break;
885                     case 'b':
886                         url.append(c);
887                         break;
888                     case 'c':
889                         url.append(c);
890                         break;
891                     case 'd':
892                         url.append(c);
893                         break;
894                     case 'e':
895                         url.append(c);
896                         break;
897                     case 'f':
898                         url.append(c);
899                         break;
900                     case 'g':
901                         url.append(c);
902                         break;
903                     case 'h':
904                         url.append(c);
905                         break;
906                     case 'i':
907                         url.append(c);
908                         break;
909                     case 'j':
910                         url.append(c);
911                         break;
912                     case 'k':
913                         url.append(c);
914                         break;
915                     case 'l':
916                         url.append(c);
917                         break;
918                     case 'm':
919                         url.append(c);
920                         break;
921                     case 'n':
922                         url.append(c);
923                         break;
924                     case 'o':
925                         url.append(c);
926                         break;
927                     case 'p':
928                         url.append(c);
929                         break;
930                     case 'q':
931                         url.append(c);
932                         break;
933                     case 'r':
934                         url.append(c);
935                         break;
936                     case 's':
937                         url.append(c);
938                         break;
939                     case 't':
940                         url.append(c);
941                         break;
942                     case 'u':
943                         url.append(c);
944                         break;
945                     case 'v':
946                         url.append(c);
947                         break;
948                     case 'w':
949                         url.append(c);
950                         break;
951                     case 'x':
952                         url.append(c);
953                         break;
954                     case 'y':
955                         url.append(c);
956                         break;
957                     case 'z':
958                         url.append(c);
959                         break;
960                     case '{':
961                         url.append("%7B");
962                         break;
963                     case '|':
964                         url.append("%7C");
965                         break;
966                     case '}':
967                         url.append("%7D");
968                         break;
969                     case '~':
970                         url.append(c);
971                         break;
972                     default:
973                         if (c < 0xD800 || c > 0xDFFF) {
974                             url.append(URIUtil.percentEscape(c));
975                         }
976                         else if (c <= 0xDBFF) {
977                             // high surrogate; therefore we need to
978                             // grab the next half before encoding
979                             i++;
980                             try {
981                                 char low = absolute.charAt(i);
982                                 String character = String.valueOf(c)+low;
983                                 byte[] data = character.getBytes("UTF8");
984                                 // Always exactly 4 bytes, unless the encoder is buggy
985                                 for (int j=0; j < 4; j++) {
986                                     url.append('%');
987                                     String hex = Integer.toHexString(data[j]).toUpperCase();
988                                     url.append(hex.substring(hex.length()-2));
989                                 }
990                             }
991                             catch (IndexOutOfBoundsException ex) {
992                                 // file name contains a high half and not a low half
993                                 url = new StringBuffer(0);
994                                 break;
995                             }
996                         }
997                         else {
998                             // low half not preceded by high half
999                             // Can't create a base URI
1000                             url = new StringBuffer(0);
1001                             break;
1002                         }
1003                 }
1004             }
1005         }
1006 
1007         String base = url.toString();
1008         try {
1009             Document doc = build(fin, base);
1010             return doc;
1011         }
1012         finally {
1013             fin.close();
1014         }
1015 
1016     }
1017 
1018 
1019     /**
1020      * <p>
1021      * Reads the document from a reader.
1022      * </p>
1023      *
1024      * @param in the reader from which the document is read
1025      *
1026      * @return the parsed <code>Document</code>
1027      *
1028      * @throws ValidityException if a validity error is detected. This
1029      *   is only thrown if the builder has been instructed to validate.
1030      * @throws ParsingException  if a well-formedness error is detected
1031      * @throws IOException       if an I/O error such as a bad disk
1032      *     prevents the document from being fully read
1033      */
build(Reader in)1034     public Document build(Reader in)
1035       throws ParsingException, ValidityException, IOException {
1036 
1037         if (in == null) throw new NullPointerException("Attempted to build from null reader");
1038         InputSource source = new InputSource(in);
1039         return build(source);
1040 
1041     }
1042 
1043 
1044     /**
1045      * <p>
1046      * Reads the document from a character stream while
1047      * specifying a base URI.
1048      * </p>
1049      *
1050      * @param in the reader from which the document
1051      *     is read
1052      * @param baseURI the base URI for this document; may be null
1053      *
1054      * @return the parsed <code>Document</code>
1055      *
1056      * @throws ValidityException if a validity error is detected. This
1057      *     is only thrown if the builder has been instructed to
1058      *     validate.
1059      * @throws ParsingException  if a well-formedness error is detected
1060      * @throws IOException       if an I/O error such as a bad disk
1061      *     prevents the document from being completely read
1062      */
build(Reader in, String baseURI)1063     public Document build(Reader in, String baseURI)
1064       throws ParsingException, ValidityException, IOException {
1065 
1066         InputSource source = new InputSource(in);
1067         if (baseURI != null) {
1068             baseURI = canonicalizeURL(baseURI);
1069             source.setSystemId(baseURI);
1070         }
1071         return build(source);
1072 
1073     }
1074 
1075 
1076     /**
1077      * <p>
1078      * Reads the document from the contents of a string.
1079      * </p>
1080      *
1081      * @param document the string that contains the XML document
1082      * @param baseURI the base URI for this document; may be null
1083      *
1084      * @return  the parsed <code>Document</code>
1085      *
1086      * @throws ValidityException if a validity error is detected. This
1087      *     is only thrown if the builder has been instructed to
1088      *     validate.
1089      * @throws ParsingException  if a well-formedness error is detected
1090      * @throws IOException       if an I/O error such as a bad disk
1091      *     prevents the document's external DTD subset from being read
1092      */
build(String document, String baseURI)1093     public Document build(String document, String baseURI)
1094       throws ParsingException, ValidityException, IOException {
1095 
1096         Reader reader = new StringReader(document);
1097         return build(reader, baseURI);
1098 
1099     }
1100 
1101     // needed to work around a bug in Xerces and Crimson
1102     // for URLs with no trailing slashes (no path part)
1103     // such as http://www.cafeconleche.org.
1104     // Also needed to work around a VM bug involving file URLs such as
1105     // file:///tmp/nosuchdirectory/../foo.xml
1106     // where "nosuchdirectory" does not exist.
canonicalizeURL(String uri)1107     private String canonicalizeURL(String uri) {
1108 
1109         try {
1110             URL u = new URL(uri);
1111             String path = u.getPath();
1112             String scheme = u.getProtocol();
1113             String authority = u.getHost();
1114             String query = u.getQuery();
1115             int port = u.getPort();
1116             // fragment ID not needed
1117             if (path == null || path.length() == 0) {
1118               // We handle here the case where we have a URL such as
1119               // http://www.cafeaulait.org with no trailing slash.
1120                 path = "/";
1121             }
1122             // If this proves to be a hot spot we could probably take this path
1123             // only if the scheme is file; not in the more common case where
1124             // it's http
1125             path = URIUtil.removeDotSegments(path);
1126             StringBuffer canonicalForm = new StringBuffer(uri.length());
1127             canonicalForm.append(scheme);
1128             canonicalForm.append("://");
1129             if (authority != null) canonicalForm.append(authority);
1130             if (port >= 0) canonicalForm.append(":" + port);
1131             canonicalForm.append(path);
1132             if (query != null) canonicalForm.append("?" + query);
1133             return canonicalForm.toString();
1134         }
1135         catch (MalformedURLException ex) {
1136             return uri;
1137         }
1138     }
1139 
1140 
1141     /**
1142      * <p>
1143      * Reads the document from a SAX <code>InputSource</code>.
1144      * </p>
1145      *
1146      * @param in the input source from which the document is read
1147      *
1148      * @return the parsed <code>Document</code>
1149      *
1150      * @throws ValidityException if a validity error is detected. This
1151      *     is only thrown if the builder has been instructed to
1152      *     validate.
1153      * @throws ParsingException  if a well-formedness error is detected
1154      * @throws IOException       if an I/O error such as a bad disk
1155      *     prevents the document from being read
1156      */
build(InputSource in)1157     private Document build(InputSource in)
1158       throws ParsingException, ValidityException, IOException {
1159 
1160         XOMHandler handler = (XOMHandler) parser.getContentHandler();
1161         Document result = null;
1162         try {
1163             parser.parse(in);
1164             result = handler.getDocument();
1165         }
1166         catch (SAXParseException ex) {
1167             ParsingException pex = new ParsingException(
1168                 ex.getMessage(),
1169                 ex.getSystemId(),
1170                 ex.getLineNumber(),
1171                 ex.getColumnNumber(),
1172                 ex);
1173             throw pex;
1174         }
1175         catch (SAXException ex) {
1176             ParsingException pex
1177               = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1178             throw pex;
1179         }
1180         catch (XMLException ex) {
1181             throw new ParsingException(ex.getMessage(), ex);
1182         }
1183         catch (RuntimeException ex) {
1184             // Work-around for non-conformant parsers, especially Piccolo
1185             ParsingException pex
1186               = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1187             throw pex;
1188         }
1189         catch (UTFDataFormatException ex) {
1190             // Work-around for non-conformant parsers, especially Xerces
1191             // http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583
1192             ParsingException pex
1193               = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1194             throw pex;
1195         }
1196         catch (CharConversionException ex) {
1197             // Work-around for non-conformant parsers, especially Xerces
1198             // http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583
1199             ParsingException pex
1200               = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1201             throw pex;
1202         }
1203         catch (IOException ex) {
1204             // Work-around for Xerces; I don't want to just catch
1205             // org.apache.xerces.util.URI.MalformedURIException
1206             // because that would introduce a dependence on Xerces
1207             if (ex.getClass().getName().equals(
1208               "org.apache.xerces.util.URI$MalformedURIException")) {
1209                 throw new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1210             }
1211             else {
1212                 throw ex;
1213             }
1214         }
1215         finally {
1216             handler.freeMemory();
1217         }
1218 
1219         if (result == null) {
1220             ParsingException ex = new ParsingException(
1221               "Parser did not build document",
1222               in.getSystemId(), -1, -1
1223             );
1224             throw ex;
1225         }
1226 
1227         if ("".equals(result.getBaseURI())) {
1228             result.setBaseURI(in.getSystemId());
1229         }
1230 
1231         ErrorHandler errorHandler = parser.getErrorHandler();
1232         if (errorHandler instanceof ValidityRequired) {
1233             ValidityRequired validityHandler
1234               = (ValidityRequired) errorHandler;
1235             if (!validityHandler.isValid())  {
1236                 ValidityException vex = validityHandler.vexception;
1237                 vex.setDocument(result);
1238                 validityHandler.reset();
1239                 throw vex;
1240             }
1241         }
1242         return result;
1243 
1244     }
1245 
1246 
1247     private static class ValidityRequired implements ErrorHandler {
1248 
1249         ValidityException vexception = null;
1250 
reset()1251         void reset() {
1252             vexception = null;
1253         }
1254 
warning(SAXParseException exception)1255         public void warning(SAXParseException exception) {
1256             // ignore warnings
1257         }
1258 
error(SAXParseException exception)1259         public void error(SAXParseException exception) {
1260 
1261             if (vexception == null) {
1262                 vexception = new ValidityException(
1263                   exception.getMessage(),
1264                   exception.getSystemId(),
1265                   exception.getLineNumber(),
1266                   exception.getColumnNumber(),
1267                   exception);
1268             }
1269             vexception.addError(exception);
1270         }
1271 
fatalError(SAXParseException exception)1272         public void fatalError(SAXParseException exception)
1273           throws SAXParseException {
1274             throw exception;
1275         }
1276 
isValid()1277         boolean isValid() {
1278             return vexception == null;
1279         }
1280 
1281     }
1282 
1283 
1284     // Because Crimson doesn't report namespace errors as fatal
1285     private static class NamespaceWellformednessRequired
1286       implements ErrorHandler {
1287 
warning(SAXParseException exception)1288         public void warning(SAXParseException exception) {
1289             // ignore warnings
1290         }
1291 
error(SAXParseException exception)1292         public void error(SAXParseException exception)
1293           throws SAXParseException {
1294 
1295             if (exception.getMessage().equals("Illegal Namespace prefix: \"xml\".")) {
1296                 return;
1297             }
1298 
1299             throw exception;
1300 
1301         }
1302 
fatalError(SAXParseException exception)1303         public void fatalError(SAXParseException exception)
1304           throws SAXParseException {
1305             throw exception;
1306         }
1307 
1308     }
1309 
1310 
1311     // I added this because XIncluder needed it.
1312     /**
1313      * <p>
1314      * Returns this builder's <code>NodeFactory</code>. It returns
1315      * null if a factory was not supplied when the builder was created.
1316      * </p>
1317      *
1318      * @return the node factory that was specified in the constructor
1319      */
getNodeFactory()1320     public NodeFactory getNodeFactory() {
1321         return factory;
1322     }
1323 
1324 
1325 }