1 /* Copyright 2002-2006 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.IOException;
25 import java.io.OutputStream;
26 import java.io.OutputStreamWriter;
27 import java.io.UnsupportedEncodingException;
28 import java.io.Writer;
29 import java.util.Locale;
30 import org.xml.sax.helpers.NamespaceSupport;
31 
32 /**
33  * <p>
34  *  Outputs a <code>Document</code> object in a specific encoding using
35  *  various options for controlling white space, normalization,
36  *  indenting, line breaking, and base URIs. However, in general these
37  *  options do affect the document's infoset. In particular, if you set
38  *  either the maximum line length or the indent size to a positive
39  *  value, then the serializer will not respect input white space. It
40  *  may trim leading and trailing space, condense runs of white
41  *  space to a single space, convert carriage returns and linefeeds
42  *  to spaces, add extra space where none was present before,
43  *  and otherwise muck with the document's white space.
44  *  The defaults, however, preserve all significant white space
45  *  including ignorable white space and boundary white space.
46  * </p>
47  *
48  * @author Elliotte Rusty Harold
49  * @version 1.2d1
50  *
51  */
52 public class Serializer {
53 
54     private TextWriter escaper;
55     private boolean preserveBaseURI = false;
56     // ???? reset when exception is thrown?
57     private NamespaceSupport namespaces = new NamespaceSupport();
58 
59 
60     /**
61      * <p>
62      * Create a new serializer that uses the UTF-8 encoding.
63      * </p>
64      *
65      * @param out the output stream to write the document on
66      *
67      * @throws NullPointerException if <code>out</code> is null
68      */
Serializer(OutputStream out)69     public Serializer(OutputStream out) {
70 
71         try {
72             this.setOutputStream(out, "UTF-8");
73         }
74         catch (UnsupportedEncodingException ex) {
75             throw new RuntimeException(
76               "The VM is broken. It does not understand UTF-8.");
77         }
78 
79     }
80 
81 
82     /**
83      * <p>
84      * Create a new serializer that uses the specified encoding.
85      * The encoding must be recognized by the Java virtual machine. If
86      * you attempt to use an encoding that the local Java virtual
87      * machine does not support, the constructor will throw an
88      * <code>UnsupportedEncodingException</code>.
89      * Currently the following encodings are recognized by XOM:
90      * </p>
91      *
92      * <ul>
93      *   <li>UTF-8</li>
94      *   <li>UTF-16</li>
95      *   <li>UTF-16BE</li>
96      *   <li>UTF-16LE</li>
97      *   <li>ISO-10646-UCS-2</li>
98      *   <li>ISO-8859-1</li>
99      *   <li>ISO-8859-2</li>
100      *   <li>ISO-8859-3</li>
101      *   <li>ISO-8859-4</li>
102      *   <li>ISO-8859-5</li>
103      *   <li>ISO-8859-6</li>
104      *   <li>ISO-8859-7</li>
105      *   <li>ISO-8859-8</li>
106      *   <li>ISO-8859-9</li>
107      *   <li>ISO-8859-10</li>
108      *   <li>ISO-8859-11 (a.k.a. TIS-620)</li>
109      *   <li>ISO-8859-13</li>
110      *   <li>ISO-8859-14</li>
111      *   <li>ISO-8859-15</li>
112      *   <li>ISO-8859-16</li>
113      *   <li>IBM037 (a.k.a. CP037, EBCDIC-CP-US, EBCDIC-CP-CA,
114      *         EBCDIC-CP-WA, EBCDIC-CP-NL, and CSIBM037)</li>
115      *   <li>GB18030</li>
116      * </ul>
117      *
118      * <p>
119      * You can use encodings not in this list if the virtual
120      * machine supports them. However, they may be
121      * significantly slower than the encodings in this list.
122      * </p>
123      *
124      * <p>
125      * I've noticed Java has significant bugs in its handling of some
126      * of these encodings. In some cases such as 0x80 in Big5, XOM
127      * will escape a character that should not need to be escaped
128      * because Java can't output that character in the specified
129      * encoding, even though the output character set does contain it.
130      * :-(
131      * </p>
132      *
133      * @param out the output stream to write the document on
134      * @param encoding the character encoding for the serialization
135 
136      * @throws NullPointerException if <code>out</code>
137      *     or <code>encoding</code> is null
138      * @throws UnsupportedEncodingException if the VM does not
139      *     support the requested encoding
140      *
141      */
Serializer(OutputStream out, String encoding)142     public Serializer(OutputStream out, String encoding)
143       throws UnsupportedEncodingException {
144 
145         if (encoding == null) {
146             throw new NullPointerException("Null encoding");
147         }
148         this.setOutputStream(out, encoding);
149 
150     }
151 
152 
153     /**
154      * <p>
155      * Flushes the previous output stream and
156      * redirects further output to the new output stream.
157      * </p>
158      *
159      *
160      * @param out the output stream to write the document on
161 
162      * @throws NullPointerException if <code>out</code> is null
163      * @throws IOException if the previous output stream
164      *     encounters an I/O error when flushed
165      *
166      */
setOutputStream(OutputStream out)167     public void setOutputStream(OutputStream out)
168       throws IOException {
169 
170         // flush any data onto the old output stream
171         this.flush();
172         int maxLength = getMaxLength();
173         int indent = this.getIndent();
174         String lineSeparator = getLineSeparator();
175         boolean nfc = getUnicodeNormalizationFormC();
176         String encoding = escaper.getEncoding();
177         boolean lineSeparatorSet = escaper.lineSeparatorSet;
178         setOutputStream(out, encoding);
179         setIndent(indent);
180         setMaxLength(maxLength);
181         setUnicodeNormalizationFormC(nfc);
182         if (lineSeparatorSet) setLineSeparator(lineSeparator);
183 
184     }
185 
186 
setOutputStream(OutputStream out, String encoding)187     private void setOutputStream(OutputStream out, String encoding)
188         throws UnsupportedEncodingException {
189 
190         if (out == null) {
191             throw new NullPointerException("Null OutputStream");
192         }
193         Writer writer;
194         String encodingUpperCase = encoding.toUpperCase(Locale.ENGLISH);
195         if (encodingUpperCase.equals("UTF-8")) {
196            writer = new OutputStreamWriter(out, "UTF-8");
197         }
198         else if (encodingUpperCase.equals("UTF-16")
199           || encodingUpperCase.equals("ISO-10646-UCS-2")) {
200            // For compatibility with Java 1.2 and earlier
201            writer = new OutputStreamWriter(out, "UnicodeBig");
202         }
203         // Java's Cp037 encoding is broken, so we have to
204         // provide our own.
205         else if (encodingUpperCase.equals("IBM037")
206           || encodingUpperCase.equals("CP037")
207           || encodingUpperCase.equals("EBCDIC-CP-US")
208           || encodingUpperCase.equals("EBCDIC-CP-CA")
209           || encodingUpperCase.equals("EBCDIC-CP-WA")
210           || encodingUpperCase.equals("EBCDIC-CP-NL")
211           || encodingUpperCase.equals("CSIBM037")) {
212             writer = new EBCDICWriter(out);
213         }
214         else if (encodingUpperCase.equals("ISO-8859-11")
215           || encodingUpperCase.equals("TIS-620")) {
216            // Java doesn't recognize the name ISO-8859-11 and
217            // Java 1.3 and earlier don't recognize TIS-620
218            writer = new OutputStreamWriter(out, "TIS620");
219         }
220         else writer = new OutputStreamWriter(out, encoding);
221 
222         writer = new UnsynchronizedBufferedWriter(writer);
223         this.escaper = TextWriterFactory.getTextWriter(writer, encoding);
224 
225     }
226 
227 
228     /**
229      * <p>
230      * Serializes a document onto the output
231      * stream using the current options.
232      * </p>
233      *
234      * @param doc the <code>Document</code> to serialize
235      *
236      * @throws IOException if the underlying output stream
237      *      encounters an I/O error
238      * @throws NullPointerException if <code>doc</code> is null
239      * @throws UnavailableCharacterException if the document contains
240      *     an unescapable character (e.g. in an element name) that is
241      *     not available in the current encoding
242      */
write(Document doc)243     public void write(Document doc) throws IOException {
244 
245         escaper.reset();
246         namespaces.reset();
247         namespaces.declarePrefix("", "");
248         // The OutputStreamWriter automatically inserts
249         // the byte order mark if necessary.
250         writeXMLDeclaration();
251         int childCount = doc.getChildCount();
252         for (int i = 0; i < childCount; i++) {
253             writeChild(doc.getChild(i));
254 
255             // Might want to remove this line break in a
256             // non-XML serializer where it's not guaranteed to be
257             // OK to add extra line breaks in the prolog
258             escaper.breakLine();
259         }
260         escaper.flush();
261 
262     }
263 
264 
265     /**
266      * <p>
267      * Writes the XML declaration onto the output stream,
268      * followed by a line break.
269      * </p>
270      *
271      * @throws IOException if the underlying output stream
272      *      encounters an I/O error
273      */
writeXMLDeclaration()274     protected void writeXMLDeclaration() throws IOException {
275 
276         escaper.writeUncheckedMarkup("<?xml version=\"1.0\" encoding=\"");
277         escaper.writeUncheckedMarkup(escaper.getEncoding());
278         escaper.writeUncheckedMarkup("\"?>");
279         escaper.breakLine();
280 
281     }
282 
283 
284     /**
285      * <p>
286      * Serializes an element onto the output stream using the current
287      * options. The result is guaranteed to be well-formed.
288      * </p>
289      *
290      * <p>
291      * If the element is empty, this method invokes
292      * <code>writeEmptyElementTag</code>. If the element is not
293      * empty, then:
294      * </p>
295      *
296      * <ol>
297      *   <li>It calls <code>writeStartTag</code>.</li>
298      *   <li>It passes each of the element's children to
299      *       <code>writeChild</code> in order.</li>
300      *   <li>It calls <code>writeEndTag</code>.</li>
301      * </ol>
302      *
303      * <p>
304      * It may break lines or add white space if the serializer has
305      * been configured to indent or use a maximum line length.
306      * </p>
307      *
308      * @param element the <code>Element</code> to serialize
309      *
310      * @throws IOException if the underlying output stream
311      *     encounters an I/O error
312      * @throws UnavailableCharacterException if the element name
313      *     contains a character that is not available in the
314      *     current encoding
315      */
write(Element element)316     protected void write(Element element) throws IOException {
317 
318         // workaround for case where only children are empty text nodes
319         boolean hasRealChildren = false;
320         int childCount = element.getChildCount();
321         for (int i = 0; i < childCount; i++) {
322             Node child = element.getChild(i);
323             if (child.isText()) {
324                 Text t = (Text) child;
325                 if (t.isEmpty()) continue;
326             }
327             hasRealChildren = true;
328             break;
329         }
330 
331         if (hasRealChildren) {
332             boolean wasPreservingWhiteSpace = escaper.isPreserveSpace();
333             writeStartTag(element);
334 
335             // children
336             for (int i = 0; i < childCount; i++) {
337                 Node child = element.getChild(i);
338                 // need to work around a very tricky case here where
339                 // denormalized characters cross boundaries of
340                 // consecutive text nodes
341                 if (escaper.getNFC() && child.isText()) {
342                     Text t = (Text) child;
343                     while (i < childCount-1) { // not the last node
344                         Node next = element.getChild(i+1);
345                         if (next.isText()) {
346                             t = new Text(t.getValue() + next.getValue());
347                             i++;
348                         }
349                         else break;
350                     }
351                     writeChild(t);
352                 }
353                 else {
354                     writeChild(child);
355                 }
356             }
357             writeEndTag(element);
358 
359             // restore parent value
360             escaper.setPreserveSpace(wasPreservingWhiteSpace);
361         }
362         else {
363             writeEmptyElementTag(element);
364         }
365 
366     }
367 
368 
hasNonTextChildren(Element element)369     private boolean hasNonTextChildren(Element element) {
370 
371         int childCount = element.getChildCount();
372         for (int i = 0; i < childCount; i++) {
373             if (! element.getChild(i).isText()) return true;
374         }
375         return false;
376 
377     }
378 
379 
380     // writeEndTag should not normally throw UnavailableCharacterException
381     // because that would already have been thrown for the
382     // corresponding start-tag.
383     /**
384      * <p>
385      *   Writes the end-tag for an element in the form
386      *   <code>&lt;/<i>name</i>&gt;</code>.
387      * </p>
388      *
389      * @param element the element whose end-tag is written
390      *
391      * @throws IOException if the underlying output stream
392      *     encounters an I/O error
393      */
writeEndTag(Element element)394     protected void writeEndTag(Element element) throws IOException {
395 
396         escaper.decrementIndent();
397         if (escaper.getIndent() > 0 && !escaper.isPreserveSpace()) {
398             if (hasNonTextChildren(element)) {
399                 escaper.breakLine();
400             }
401         }
402         escaper.write('<');
403         escaper.write('/');
404         escaper.writeName(element.getQualifiedName());
405         escaper.write('>');
406         namespaces.popContext();
407 
408     }
409 
410 
411     /**
412      *
413      * <p>
414      *  Writes the start-tag for the element including
415      *  all its namespace declarations and attributes.
416      * </p>
417      *
418      * <p>
419      *   The <code>writeAttributes</code> method is called to write
420      *   all the non-namespace-declaration attributes.
421      *   The <code>writeNamespaceDeclarations</code> method
422      *   is called to write all the namespace declaration attributes.
423      * </p>
424      *
425      * @param element the element whose start-tag is written
426      *
427      * @throws IOException if the underlying output stream
428      *     encounters an I/O error
429      * @throws UnavailableCharacterException if the name of the element
430      *     or the name of any of its attributes contains a character
431      *     that is not available in the current encoding
432      */
writeStartTag(Element element)433     protected void writeStartTag(Element element) throws IOException {
434 
435         writeTagBeginning(element);
436         escaper.write('>');
437         escaper.incrementIndent();
438         String xmlSpaceValue = element.getAttributeValue(
439            "space", "http://www.w3.org/XML/1998/namespace");
440         if  (xmlSpaceValue != null) {
441             if ("preserve".equals(xmlSpaceValue)){
442                 escaper.setPreserveSpace(true);
443             }
444             else if ("default".equals(xmlSpaceValue)){
445                 escaper.setPreserveSpace(false);
446             }
447         }
448 
449     }
450 
451 
452     /**
453      *
454      * <p>
455      *  Writes an empty-element tag for the element
456      *  including all its namespace declarations and attributes.
457      * </p>
458      *
459      * <p>
460      *   The <code>writeAttributes</code> method is called to write
461      *   all the non-namespace-declaration attributes.
462      *   The <code>writeNamespaceDeclarations</code> method
463      *   is called to write all the namespace declaration attributes.
464      * </p>
465      *
466      * <p>
467      *   If subclasses don't wish empty-element tags to be used,
468      *   they can override this method to simply invoke
469      *   <code>writeStartTag</code> followed by
470      *   <code>writeEndTag</code>.
471      * </p>
472      *
473      * @param element the element whose empty-element tag is written
474      *
475      * @throws IOException if the underlying output stream
476      *     encounters an I/O error
477      * @throws UnavailableCharacterException if the name of the element or the name of
478      *     any of its attributes contains a character that is not
479      *     available in the current encoding
480      */
writeEmptyElementTag(Element element)481     protected void writeEmptyElementTag(Element element)
482       throws IOException {
483         writeTagBeginning(element);
484         escaper.write('/');
485         escaper.write('>');
486         namespaces.popContext();
487     }
488 
489 
490     // This just extracts the commonality between writeStartTag
491     // and writeEmptyElementTag
writeTagBeginning(Element element)492     private void writeTagBeginning(Element element)
493       throws IOException {
494 
495         namespaces.pushContext();
496 
497         if (escaper.isIndenting()
498           && !escaper.isPreserveSpace()
499           && !escaper.justBroke()) {
500             escaper.breakLine();
501         }
502         escaper.write('<');
503         escaper.writeName(element.getQualifiedName());
504         writeAttributes(element);
505         writeNamespaceDeclarations(element);
506 
507     }
508 
509 
510     /**
511      * <p>
512      *   Writes all the attributes of the specified
513      *   element onto the output stream, one at a time, separated
514      *   by white space. If preserveBaseURI is true, and it is
515      *   necessary to add an <code>xml:base</code> attribute
516      *   to the element in order to preserve the base URI, then
517      *   that attribute is also written here.
518      *   Each individual attribute is written by invoking
519      *   <code>write(Attribute)</code>.
520      * </p>
521      *
522      * @param element the <code>Element</code> whose attributes are
523      *     written
524      * @throws IOException if the underlying output stream
525      *     encounters an I/O error
526      * @throws UnavailableCharacterException if the name of any of
527      *     the element's attributes contains a character that is not
528      *     available in the current encoding
529      */
writeAttributes(Element element)530     protected void writeAttributes(Element element)
531       throws IOException {
532 
533         // check to see if we need an xml:base attribute
534         if (preserveBaseURI) {
535             ParentNode parent = element.getParent();
536             if (element.getAttribute("base",
537               "http://www.w3.org/XML/1998/namespace") == null) {
538                 String baseValue = element.getBaseURI();
539                 if (parent == null
540                   || parent.isDocument()
541                   || !element.getBaseURI()
542                        .equals(parent.getBaseURI())) {
543 
544                     escaper.write(' ');
545                     Attribute baseAttribute = new Attribute(
546                       "xml:base",
547                       "http://www.w3.org/XML/1998/namespace",
548                       baseValue);
549                     write(baseAttribute);
550                 }
551             }
552         }
553 
554         int attributeCount = element.getAttributeCount();
555         for (int i = 0; i < attributeCount; i++) {
556             Attribute attribute = element.getAttribute(i);
557             escaper.write(' ');
558             write(attribute);
559         }
560     }
561 
562 
563     /**
564      * <p>
565      *   Writes all the namespace declaration
566      *   attributes of the specified element onto the output stream,
567      *   one at a time, separated by white space. Each individual
568      *   declaration is written by invoking
569      *   <code>writeNamespaceDeclaration</code>.
570      * </p>
571      *
572      * @param element the <code>Element</code> whose namespace
573      *     declarations are written
574      * @throws IOException if the underlying output stream
575      *     encounters an I/O error
576      * @throws UnavailableCharacterException if any of the element's
577      *     namespace prefixes contains a character that is not
578      *     available in the current encoding
579      */
writeNamespaceDeclarations(Element element)580     protected void writeNamespaceDeclarations(Element element)
581       throws IOException {
582 
583         String prefix = element.getNamespacePrefix();
584         if (!("xml".equals(prefix))) {
585             writeNamespaceDeclarationIfNecessary(prefix, element.getNamespaceURI());
586         }
587 
588         // write attribute namespaces
589         int attCount = element.getAttributeCount();
590         for (int i = 0; i < attCount; i++) {
591             Attribute att = element.getAttribute(i);
592             String attPrefix = att.getNamespacePrefix();
593             if (attPrefix.length() != 0 && !("xml".equals(attPrefix))) {
594                 writeNamespaceDeclarationIfNecessary(attPrefix, att.getNamespaceURI());
595             }
596         }
597 
598         // write additional namespaces
599         Namespaces namespaces = element.namespaces;
600         if (namespaces == null) return;
601         int namespaceCount = namespaces.size();
602         for (int i = 0; i < namespaceCount; i++) {
603             String additionalPrefix = namespaces.getPrefix(i);
604             String uri = namespaces.getURI(additionalPrefix);
605             writeNamespaceDeclarationIfNecessary(additionalPrefix, uri);
606         }
607 
608     }
609 
610 
writeNamespaceDeclarationIfNecessary(String prefix, String uri)611     private void writeNamespaceDeclarationIfNecessary(String prefix, String uri)
612       throws IOException {
613 
614         String currentValue = namespaces.getURI(prefix);
615         // NamespaceSupport returns null for no namespace, not the
616         // empty string like XOM does
617         if (currentValue == null && "".equals(uri)) {
618             return;
619         }
620         else if (uri.equals(currentValue)) {
621             return;
622         }
623 
624         escaper.write(' ');
625         writeNamespaceDeclaration(prefix, uri);
626 
627     }
628 
629 
630     /**
631      * <p>
632      *   Writes a namespace declaration in the form
633      *   <code>xmlns:<i>prefix</i>="<i>uri</i>"</code> or
634      *   <code>xmlns="<i>uri</i>"</code>. It does not write
635      *   the spaces on either side of the namespace declaration.
636      *   These are written by <code>writeNamespaceDeclarations</code>.
637      * </p>
638      *
639      * @param prefix the namespace prefix; the empty string for the
640      *     default namespace
641      * @param uri the namespace URI
642      *
643      * @throws IOException if the underlying output stream
644      *     encounters an I/O error
645      * @throws UnavailableCharacterException if the namespace prefix contains a
646      *     character that is not available in the current encoding
647      */
writeNamespaceDeclaration(String prefix, String uri)648     protected void writeNamespaceDeclaration(String prefix, String uri)
649       throws IOException {
650 
651         namespaces.declarePrefix(prefix, uri);
652         if ("".equals(prefix)) {
653             escaper.writeUncheckedMarkup("xmlns");
654         }
655         else {
656             escaper.writeUncheckedMarkup("xmlns:");
657             escaper.writeName(prefix);
658         }
659         escaper.write('=');
660         escaper.write('"');
661         escaper.writePCDATA(uri);
662         escaper.write('"');
663 
664     }
665 
666 
667     /**
668      * <p>
669      *   Writes an attribute in the form
670      *   <code><i>name</i>="<i>value</i>"</code>.
671      *   Characters in the attribute value are escaped as necessary.
672      * </p>
673      *
674      * @param attribute the <code>Attribute</code> to write
675      *
676      * @throws IOException if the underlying output stream
677      *     encounters an I/O error
678      * @throws UnavailableCharacterException if the attribute name contains a character
679      *     that is not available in the current encoding
680      *
681      */
write(Attribute attribute)682     protected void write(Attribute attribute) throws IOException {
683         escaper.writeName(attribute.getQualifiedName());
684         escaper.write('=');
685         escaper.write('"');
686         escaper.writeAttributeValue(attribute.getValue());
687         escaper.write('"');
688     }
689 
690 
691     /**
692      * <p>
693      * Writes a comment onto the output stream using the current
694      * options. Since character and entity references are not resolved
695      * in comments, comments can only be serialized when all
696      * characters they contain are available in the current
697      * encoding.
698      * </p>
699      *
700      * @param comment the <code>Comment</code> to serialize
701      *
702      * @throws IOException if the underlying output stream
703      *     encounters an I/O error
704      * @throws UnavailableCharacterException if the comment contains a
705      *     character that is not available in the current encoding
706      */
write(Comment comment)707     protected void write(Comment comment) throws IOException {
708         if (escaper.isIndenting()) escaper.breakLine();
709         escaper.writeUncheckedMarkup("<!--");
710         escaper.writeMarkup(comment.getValue());
711         escaper.writeUncheckedMarkup("-->");
712     }
713 
714 
715     /**
716      * <p>
717      * Writes a processing instruction
718      * onto the output stream using the current options.
719      * Since character and entity references are not resolved
720      * in processing instructions, processing instructions
721      * can only be serialized when all
722      * characters they contain are available in the current
723      * encoding.
724      * </p>
725      *
726      * @param instruction the <code>ProcessingInstruction</code>
727      *     to serialize
728      *
729      * @throws IOException if the underlying output stream
730      *     encounters an I/O error
731      * @throws UnavailableCharacterException if the comment contains a
732      *     character that is not available in the current encoding
733      */
write(ProcessingInstruction instruction)734     protected void write(ProcessingInstruction instruction)
735       throws IOException {
736 
737         if (escaper.isIndenting()) escaper.breakLine();
738         escaper.writeUncheckedMarkup("<?");
739         escaper.writeName(instruction.getTarget());
740         String value = instruction.getValue();
741         // for canonical XML, only output a space after the target
742         // if there is a value
743         if (!"".equals(value)) {
744             escaper.write(' ');
745             escaper.writeMarkup(value);
746         }
747         escaper.writeUncheckedMarkup("?>");
748 
749     }
750 
751     /**
752      * <p>
753      * Writes a <code>Text</code> object
754      * onto the output stream using the current options.
755      * Reserved characters such as &lt;, &gt; and "
756      * are escaped using the standard entity references
757      * such as <code>&amp;lt;</code>, <code>&amp;gt;</code>,
758      * and <code>&amp;quot;</code>.
759      * </p>
760      *
761      * <p>
762      * Characters which cannot be encoded in the current character set
763      * (for example, &Omega; in ISO-8859-1) are encoded using
764      * character references.
765      * </p>
766      *
767      * @param text the <code>Text</code> to serialize
768      *
769      * @throws IOException if the underlying output stream
770      *     encounters an I/O error
771      */
write(Text text)772     protected void write(Text text) throws IOException {
773 
774         // XXX Is there a shortcut that takes advantage of the
775         // data being stored in UTF-8 here? perhaps even if only
776         // when serializing to UTF-8?
777         String value = text.getValue();
778         if (text.isCDATASection()
779           && value.indexOf("]]>") == -1) {
780             if (!(escaper instanceof UnicodeWriter)) {
781                 int length = value.length();
782                 for (int i = 0; i < length; i++) {
783                    if (escaper.needsEscaping(value.charAt(i))) {
784                         // can't use CDATA section
785                         escaper.writePCDATA(value);
786                         return;
787                    }
788                 }
789             }
790             escaper.writeUncheckedMarkup("<![CDATA[");
791             escaper.writeMarkup(value);
792             escaper.writeUncheckedMarkup("]]>");
793         }
794         // is this boundary whitespace we can ignore?
795         else if (isBoundaryWhitespace(text, value)) {
796             return; // without writing node
797         }
798         else {
799             escaper.writePCDATA(value);
800         }
801 
802     }
803 
804 
isBoundaryWhitespace(Text text, String value)805     private boolean isBoundaryWhitespace(Text text, String value) {
806 
807         if (getIndent() <= 0) return false;
808 
809         ParentNode parent = text.getParent();
810         if (parent == null) {
811             return "".equals(value.trim());
812         }
813 
814         // ???? cutting next line only breaks a few tests; and what it does
815         // break might be better off if the breakage is accepted as correct behavior
816         int childCount = parent.getChildCount();
817         if (childCount == 1) return false;
818         if (! "".equals(value.trim())) return false;
819 
820         // ???? This is a huge Hotspot. maybe 12% of serialization time
821         // when indenting. Is there any way to eliminate this?
822         // We only actually need to test a couple of positions, 0 and
823         // parent.getChildCount()-1
824         // Instead of getting position we could get those two elements and compare
825         // to the text. But you still need the previous and next
826         int position = parent.indexOf(text);
827 
828         Node previous = null;
829         Node next = null;
830 
831         if (position != 0) previous = parent.getChild(position-1);
832         if (position != childCount-1) {
833             next = parent.getChild(position+1);
834         }
835         if (previous == null || !previous.isText()) {
836             if (next == null || !next.isText()) {
837                 return true;
838             }
839         }
840 
841         return false;
842 
843     }
844 
845 
846     /**
847      * <p>
848      * Writes a <code>DocType</code> object
849      * onto the output stream using the current options.
850      * </p>
851      *
852      * @param doctype the document type declaration to serialize
853      *
854      * @throws IOException if the underlying
855      *     output stream encounters an I/O error
856      * @throws UnavailableCharacterException if the document type
857      *     declaration contains a character that is not available
858      *     in the current encoding
859      */
write(DocType doctype)860     protected void write(DocType doctype) throws IOException {
861 
862         escaper.writeUncheckedMarkup("<!DOCTYPE ");
863         escaper.writeName(doctype.getRootElementName());
864         if (doctype.getPublicID() != null) {
865           escaper.writeMarkup(" PUBLIC \"" + doctype.getPublicID()
866            + "\" \"" + doctype.getSystemID() + "\"");
867         }
868         else if (doctype.getSystemID() != null) {
869           escaper.writeMarkup(
870             " SYSTEM \"" + doctype.getSystemID() + "\"");
871         }
872 
873         String internalDTDSubset = doctype.getInternalDTDSubset();
874         if (!internalDTDSubset.equals("")) {
875             escaper.writeUncheckedMarkup(" [");
876             escaper.breakLine();
877             escaper.setInDocType(true);
878             escaper.writeMarkup(internalDTDSubset);
879             escaper.setInDocType(false);
880             escaper.write(']');
881         }
882 
883         escaper.write('>');
884 
885     }
886 
887 
888     /**
889      * <p>
890      * Writes a child node onto the output stream using the
891      * current options. It is invoked when walking the tree to
892      * serialize the entire document. It is not called, and indeed
893      * should not be called, for either the <code>Document</code>
894      * node or for attributes.
895      * </p>
896      *
897      * @param node the <code>Node</code> to serialize
898      *
899      * @throws IOException if the underlying output stream
900      *     encounters an I/O error
901      * @throws XMLException if an <code>Attribute</code>, a
902      *     <code>Document</code>, or <code>Namespace</code>
903      *     is passed to this method
904      */
writeChild(Node node)905     protected void writeChild(Node node) throws IOException {
906 
907         if (node.isElement()) {
908             write((Element) node);
909         }
910         else if (node.isText()) {
911             write((Text) node);
912         }
913         else if (node.isComment()) {
914             write((Comment) node);
915         }
916         else if (node.isProcessingInstruction()) {
917             write((ProcessingInstruction) node);
918         }
919         else if (node.isDocType()) {
920             write((DocType) node);
921         }
922         else {
923             throw new XMLException("Cannot write a " +
924               node.getClass().getName() +
925               " from the writeChild() method");
926         }
927 
928     }
929 
930 
931     /** <p>
932      * Writes a string onto the underlying output stream.
933      * Non-ASCII characters that are not available in the
934      * current character set are encoded with numeric character
935      * references. The three reserved characters &lt;, &gt;, and &amp;
936      * are escaped using the standard entity references
937      * <code>&amp;lt;</code>, <code>&amp;gt;</code>,
938      * and <code>&amp;amp;</code>.
939      * Double and single quotes are not escaped.
940      * </p>
941      *
942      * @param text the parsed character data to serialize
943      *
944      * @throws IOException if the underlying output stream
945      *     encounters an I/O error
946      */
writeEscaped(String text)947     protected final void writeEscaped(String text) throws IOException {
948         escaper.writePCDATA(text);
949     }
950 
951     /** <p>
952      *   Writes a string onto the underlying output stream.
953      *   Non-ASCII characters that are not available in the
954      *   current character set are escaped using hexadecimal numeric
955      *   character references. Carriage returns, line feeds, and tabs
956      *   are also escaped using hexadecimal numeric character
957      *   references in order to ensure their preservation on a round
958      *   trip. The four reserved characters &lt;, &gt;, &amp;,
959      *   and &quot; are escaped using the standard entity references
960      *   <code>&amp;lt;</code>, <code>&amp;gt;</code>,
961      *   <code>&amp;amp;</code>, and <code>&amp;quot;</code>.
962      *   The single quote is not escaped.
963      * </p>
964      *
965      * @param value the attribute value to serialize
966      *
967      * @throws IOException if the underlying output stream
968      *     encounters an I/O error
969      */
writeAttributeValue(String value)970     protected final void writeAttributeValue(String value)
971       throws IOException {
972         escaper.writeAttributeValue(value);
973     }
974 
975 
976     /** <p>
     *   Writes a string onto the underlying output stream
     *   without escaping any characters.
979      *   Non-ASCII characters that are not available in the
980      *   current character set cause an <code>IOException</code>.
981      * </p>
982      *
983      * @param text the <code>String</code> to serialize
984      *
985      * @throws IOException if the underlying output stream
986      *     encounters an I/O error or <code>text</code> contains
987      *     characters not available in the current character set
988      */
writeRaw(String text)989     protected final void writeRaw(String text) throws IOException {
990         escaper.writeMarkup(text);
991     }
992 
993 
994     /** <p>
995      *   Writes the current line break string
996      *   onto the underlying output stream and indents
997      *   as specified by the current level and the indent property.
998      * </p>
999      *
1000      * @throws IOException if the underlying output stream
1001      *     encounters an I/O error
1002      */
breakLine()1003     protected final void breakLine() throws IOException {
1004         escaper.breakLine();
1005     }
1006 
1007 
1008     /**
1009      * <p>
1010      * Flushes the data onto the output stream.
1011      * It is not enough to flush the output stream.
1012      * You must flush the serializer object itself because it
1013      * uses some internal buffering.
1014      * The serializer will flush the underlying output stream.
1015      * </p>
1016      *
1017      * @throws IOException  if the underlying
1018      *     output stream encounters an I/O error
1019      */
flush()1020     public void flush() throws IOException {
1021         escaper.flush();
1022     }
1023 
1024 
1025     /**
1026      * <p>
1027      * Returns the number of spaces this serializer indents.
1028      * </p>
1029      *
1030      * @return the number of spaces this serializer indents
1031      *     each successive level beyond the previous one
1032      */
getIndent()1033     public int getIndent() {
1034         return escaper.getIndent();
1035     }
1036 
1037 
1038     /**
1039      * <p>
1040      * Sets the number of additional spaces to add to each successive
1041      * level in the hierarchy. Use 0 for no extra indenting. The
     * maximum indentation is limited to approximately half the
1043      * maximum line length. The serializer will not indent further
1044      * than that no matter how many levels deep the hierarchy is.
1045      * </p>
1046      *
1047      * <p>
1048      *   When this variable is set to a value greater than 0,
1049      *   the serializer does not preserve white space. Spaces,
1050      *   tabs, carriage returns, and line feeds can all be
1051      *   interchanged at the serializer's discretion, and additional
1052      *   white space may be added before and after tags.
1053      *   Carriage returns, line feeds, and tabs will not be
1054      *   escaped with numeric character references.
1055      * </p>
1056      *
1057      * <p>
1058      *   Inside elements with an <code>xml:space="preserve"</code>
1059      *   attribute, white space is preserved and no indenting
1060      *   takes place, regardless of the setting of the indent
1061      *   property, unless, of course, an
1062      *   <code>xml:space="default"</code> attribute overrides the
1063      *   <code>xml:space="preserve"</code> attribute.
1064      * </p>
1065      *
1066      * <p>
1067      *   The default value for indent is 0; that is, the default is
1068      *   not to add or subtract any white space from the source
1069      *   document.
1070      * </p>
1071      *
1072      * @param indent the number of spaces to indent
1073      *      each successive level of the hierarchy
1074      *
1075      * @throws IllegalArgumentException if indent is less than zero
1076      *
1077      */
setIndent(int indent)1078     public void setIndent(int indent) {
1079         if (indent < 0) {
1080             throw new IllegalArgumentException(
1081               "Indent cannot be negative"
1082             );
1083         }
1084         escaper.setIndent(indent);
1085     }
1086 
1087 
1088     /**
1089      * <p>
1090      * Returns the string used as a line separator.
1091      * This is always <code>"\n"</code>, <code>"\r"</code>,
1092      * or <code>"\r\n"</code>.
1093      * </p>
1094      *
1095      * @return the line separator
1096      */
getLineSeparator()1097     public String getLineSeparator() {
1098         return escaper.getLineSeparator();
1099     }
1100 
1101 
1102     /**
1103      * <p>
1104      * Sets the line separator. This can only be one of the
1105      * three strings <code>"\n"</code>, <code>"\r"</code>,
1106      * or <code>"\r\n"</code>. All other values are forbidden.
1107      * If this method is invoked, then
1108      * line separators in the character data will be changed to this
1109      * string. Line separators in attribute values will be changed
1110      * to the hexadecimal numeric character references corresponding
1111      * to this string.
1112      * </p>
1113      *
1114      * <p>
1115      *  The default line separator is <code>"\r\n"</code>. However,
1116      *  line separators in character data and attribute values are not
1117      *  changed to this string, unless this method is called first.
1118      * </p>
1119      *
1120      * @param lineSeparator the line separator to set
1121      *
1122      * @throws IllegalArgumentException if you attempt to use any line
1123      *    separator other than <code>"\n"</code>, <code>"\r"</code>,
1124      *    or <code>"\r\n"</code>.
1125      *
1126      */
setLineSeparator(String lineSeparator)1127     public void setLineSeparator(String lineSeparator) {
1128         escaper.setLineSeparator(lineSeparator);
1129     }
1130 
1131 
1132     /**
1133      * <p>
1134      * Returns the preferred maximum line length.
1135      * </p>
1136      *
1137      * @return the preferred maximum line length.
1138      */
getMaxLength()1139     public int getMaxLength() {
1140         return escaper.getMaxLength();
1141     }
1142 
1143 
1144     /**
1145      * <p>
1146      * Sets the suggested maximum line length for this serializer.
1147      * Setting this to 0 indicates that no automatic wrapping is to be
1148      * performed. When a line approaches this length, the serializer
1149      * begins looking for opportunities to break the line. Generally
1150      * it will break on any ASCII white space character (tab, carriage
1151      * return, linefeed, and space). In some circumstances the
1152      * serializer may not be able to break the line before the maximum
1153      * length is reached. For instance, if an element name is longer
1154      * than the maximum line length the only way to correctly
1155      * serialize it is to exceed the maximum line length. In this case,
1156      *  the serializer will exceed the maximum line length.
1157      * </p>
1158      *
1159      * <p>
1160      * The default value for maximum line length is 0, which is
1161      * interpreted as no maximum line length.
1162      * Setting this to a negative value just sets it to 0.
1163      * </p>
1164      *
1165      * <p>
1166      *   When this variable is set to a value greater than 0,
1167      *   the serializer does not preserve white space. Spaces,
1168      *   tabs, carriage returns, and line feeds can all be
1169      *   interchanged at the serializer's discretion.
1170      *   Carriage returns, line feeds, and tabs will not be
1171      *   escaped with numeric character references.
1172      * </p>
1173      *
1174      * <p>
1175      *   Inside elements with an <code>xml:space="preserve"</code>
1176      *   attribute, the maximum line length is not enforced,
     *   regardless of the setting of this property, unless,
1178      *   of course, an <code>xml:space="default"</code> attribute
1179      *   overrides the <code>xml:space="preserve"</code> attribute.
1180      * </p>
1181      *
1182      * @param maxLength the preferred maximum line length
1183      */
setMaxLength(int maxLength)1184     public void setMaxLength(int maxLength) {
1185         escaper.setMaxLength(maxLength);
1186     }
1187 
1188 
1189     /**
1190      * <p>
1191      * Returns true if this serializer preserves the original
1192      * base URIs by inserting extra <code>xml:base</code> attributes.
1193      * </p>
1194      *
1195      * @return true if this <code>Serializer</code> inserts
1196      *    extra <code>xml:base</code> attributes to attempt to
1197      *    preserve base URI information from the document.
1198      */
getPreserveBaseURI()1199     public boolean getPreserveBaseURI() {
1200         return preserveBaseURI;
1201     }
1202 
1203 
1204     /**
1205      * <p>
1206      * Determines whether this serializer inserts
1207      * extra <code>xml:base</code> attributes to attempt to
1208      * preserve base URI information from the document.
1209      * The default is false, do not preserve base URI information.
1210      * <code>xml:base</code> attributes that have been explicitly
1211      * added to an element are always output. This property only
1212      * determines whether or not extra <code>xml:base</code>
1213      * attributes are added.
1214      * </p>
1215      *
1216      * @param preserve true if <code>xml:base</code>
1217      *     attributes should be added as necessary
1218      *     to preserve base URI information
1219      */
setPreserveBaseURI(boolean preserve)1220     public void setPreserveBaseURI(boolean preserve) {
1221         this.preserveBaseURI = preserve;
1222     }
1223 
1224 
1225     /**
1226      * <p>
1227      *   Returns the name of the character encoding used by
1228      *   this serializer.
1229      * </p>
1230      *
1231      * @return the encoding used for the output document
1232      */
getEncoding()1233     public String getEncoding() {
1234         return escaper.getEncoding();
1235     }
1236 
1237 
1238     /**
1239      * <p>
1240      *   If true, this property indicates serialization will
1241      *   perform Unicode normalization on all data using normalization
1242      *   form C (NFC). Performing Unicode normalization may change the
1243      *   document's infoset. The default is false; do not normalize.
1244      *   This version is based on Unicode 4.0.
1245      * </p>
1246      *
1247      * <p>
1248      *   This feature has not yet been benchmarked or optimized.
1249      *   It may result in substantially slower code.
1250      * </p>
1251      *
1252      * <p>
1253      *   If all your data is in the first 256 code points of Unicode
1254      *   (i.e. the ISO-8859-1, Latin-1 character set), then it's
1255      *   already in normalization form C and normalizing won't change
1256      *   anything.
1257      * </p>
1258      *
1259      * @param normalize true if normalization is performed;
1260      *     false if it isn't
1261      */
setUnicodeNormalizationFormC(boolean normalize)1262     public void setUnicodeNormalizationFormC(boolean normalize) {
1263         escaper.setNFC(normalize);
1264     }
1265 
1266 
1267     /**
1268      * <p>
1269      *   Indicates whether serialization will
1270      *   perform Unicode normalization on all data using normalization
1271      *   form C (NFC). The default is false; do not normalize.
1272      * </p>
1273      *
1274      * @return true if this serializer performs Unicode
1275      *     normalization; false if it doesn't
1276      */
getUnicodeNormalizationFormC()1277     public boolean getUnicodeNormalizationFormC() {
1278         return escaper.getNFC();
1279     }
1280 
1281 
1282     /**
1283      * <p>
1284      *   Returns the current column number of the output stream. This
     *   method is useful for subclasses that implement their own pretty
1286      *   printing strategies by inserting white space and line breaks
1287      *   at appropriate points.
1288      * </p>
1289      *
1290      * <p>
1291      *   Columns are counted based on Unicode characters, not Java
1292      *   chars. A surrogate pair counts as one character in this
1293      *   context, not two. However, a character followed by a
1294      *   combining character (e.g. e followed by combining accent
1295      *   acute) counts as two characters. This latter choice
1296      *   (treating combining characters like regular characters)
1297      *   is under review, and may change in the future if it's not
1298      *   too big a performance hit.
1299      * </p>
1300      *
1301      * @return the current column number
1302      */
getColumnNumber()1303     protected final int getColumnNumber() {
1304         return escaper.getColumnNumber();
1305     }
1306 
1307 }