1 /* Copyright 2002-2006 Elliotte Rusty Harold 2 3 This library is free software; you can redistribute it and/or modify 4 it under the terms of version 2.1 of the GNU Lesser General Public 5 License as published by the Free Software Foundation. 6 7 This library is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU Lesser General Public License for more details. 11 12 You should have received a copy of the GNU Lesser General Public 13 License along with this library; if not, write to the 14 Free Software Foundation, Inc., 59 Temple Place, Suite 330, 15 Boston, MA 02111-1307 USA 16 17 You can contact Elliotte Rusty Harold by sending e-mail to 18 elharo@ibiblio.org. Please include the word "XOM" in the 19 subject line. The XOM home page is located at http://www.xom.nu/ 20 */ 21 22 package nu.xom; 23 24 import java.io.IOException; 25 import java.io.OutputStream; 26 import java.io.OutputStreamWriter; 27 import java.io.UnsupportedEncodingException; 28 import java.io.Writer; 29 import java.util.Locale; 30 import org.xml.sax.helpers.NamespaceSupport; 31 32 /** 33 * <p> 34 * Outputs a <code>Document</code> object in a specific encoding using 35 * various options for controlling white space, normalization, 36 * indenting, line breaking, and base URIs. However, in general these 37 * options do affect the document's infoset. In particular, if you set 38 * either the maximum line length or the indent size to a positive 39 * value, then the serializer will not respect input white space. It 40 * may trim leading and trailing space, condense runs of white 41 * space to a single space, convert carriage returns and linefeeds 42 * to spaces, add extra space where none was present before, 43 * and otherwise muck with the document's white space. 
44 * The defaults, however, preserve all significant white space 45 * including ignorable white space and boundary white space. 46 * </p> 47 * 48 * @author Elliotte Rusty Harold 49 * @version 1.2d1 50 * 51 */ 52 public class Serializer { 53 54 private TextWriter escaper; 55 private boolean preserveBaseURI = false; 56 // ???? reset when exception is thrown? 57 private NamespaceSupport namespaces = new NamespaceSupport(); 58 59 60 /** 61 * <p> 62 * Create a new serializer that uses the UTF-8 encoding. 63 * </p> 64 * 65 * @param out the output stream to write the document on 66 * 67 * @throws NullPointerException if <code>out</code> is null 68 */ Serializer(OutputStream out)69 public Serializer(OutputStream out) { 70 71 try { 72 this.setOutputStream(out, "UTF-8"); 73 } 74 catch (UnsupportedEncodingException ex) { 75 throw new RuntimeException( 76 "The VM is broken. It does not understand UTF-8."); 77 } 78 79 } 80 81 82 /** 83 * <p> 84 * Create a new serializer that uses the specified encoding. 85 * The encoding must be recognized by the Java virtual machine. If 86 * you attempt to use an encoding that the local Java virtual 87 * machine does not support, the constructor will throw an 88 * <code>UnsupportedEncodingException</code>. 89 * Currently the following encodings are recognized by XOM: 90 * </p> 91 * 92 * <ul> 93 * <li>UTF-8</li> 94 * <li>UTF-16</li> 95 * <li>UTF-16BE</li> 96 * <li>UTF-16LE</li> 97 * <li>ISO-10646-UCS-2</li> 98 * <li>ISO-8859-1</li> 99 * <li>ISO-8859-2</li> 100 * <li>ISO-8859-3</li> 101 * <li>ISO-8859-4</li> 102 * <li>ISO-8859-5</li> 103 * <li>ISO-8859-6</li> 104 * <li>ISO-8859-7</li> 105 * <li>ISO-8859-8</li> 106 * <li>ISO-8859-9</li> 107 * <li>ISO-8859-10</li> 108 * <li>ISO-8859-11 (a.k.a. TIS-620)</li> 109 * <li>ISO-8859-13</li> 110 * <li>ISO-8859-14</li> 111 * <li>ISO-8859-15</li> 112 * <li>ISO-8859-16</li> 113 * <li>IBM037 (a.k.a. 
CP037, EBCDIC-CP-US, EBCDIC-CP-CA, 114 * EBCDIC-CP-WA, EBCDIC-CP-NL, and CSIBM037)</li> 115 * <li>GB18030</li> 116 * </ul> 117 * 118 * <p> 119 * You can use encodings not in this list if the virtual 120 * machine supports them. However, they may be 121 * significantly slower than the encodings in this list. 122 * </p> 123 * 124 * <p> 125 * I've noticed Java has significant bugs in its handling of some 126 * of these encodings. In some cases such as 0x80 in Big5, XOM 127 * will escape a character that should not need to be escaped 128 * because Java can't output that character in the specified 129 * encoding, even though the output character set does contain it. 130 * :-( 131 * </p> 132 * 133 * @param out the output stream to write the document on 134 * @param encoding the character encoding for the serialization 135 136 * @throws NullPointerException if <code>out</code> 137 * or <code>encoding</code> is null 138 * @throws UnsupportedEncodingException if the VM does not 139 * support the requested encoding 140 * 141 */ Serializer(OutputStream out, String encoding)142 public Serializer(OutputStream out, String encoding) 143 throws UnsupportedEncodingException { 144 145 if (encoding == null) { 146 throw new NullPointerException("Null encoding"); 147 } 148 this.setOutputStream(out, encoding); 149 150 } 151 152 153 /** 154 * <p> 155 * Flushes the previous output stream and 156 * redirects further output to the new output stream. 
157 * </p> 158 * 159 * 160 * @param out the output stream to write the document on 161 162 * @throws NullPointerException if <code>out</code> is null 163 * @throws IOException if the previous output stream 164 * encounters an I/O error when flushed 165 * 166 */ setOutputStream(OutputStream out)167 public void setOutputStream(OutputStream out) 168 throws IOException { 169 170 // flush any data onto the old output stream 171 this.flush(); 172 int maxLength = getMaxLength(); 173 int indent = this.getIndent(); 174 String lineSeparator = getLineSeparator(); 175 boolean nfc = getUnicodeNormalizationFormC(); 176 String encoding = escaper.getEncoding(); 177 boolean lineSeparatorSet = escaper.lineSeparatorSet; 178 setOutputStream(out, encoding); 179 setIndent(indent); 180 setMaxLength(maxLength); 181 setUnicodeNormalizationFormC(nfc); 182 if (lineSeparatorSet) setLineSeparator(lineSeparator); 183 184 } 185 186 setOutputStream(OutputStream out, String encoding)187 private void setOutputStream(OutputStream out, String encoding) 188 throws UnsupportedEncodingException { 189 190 if (out == null) { 191 throw new NullPointerException("Null OutputStream"); 192 } 193 Writer writer; 194 String encodingUpperCase = encoding.toUpperCase(Locale.ENGLISH); 195 if (encodingUpperCase.equals("UTF-8")) { 196 writer = new OutputStreamWriter(out, "UTF-8"); 197 } 198 else if (encodingUpperCase.equals("UTF-16") 199 || encodingUpperCase.equals("ISO-10646-UCS-2")) { 200 // For compatibility with Java 1.2 and earlier 201 writer = new OutputStreamWriter(out, "UnicodeBig"); 202 } 203 // Java's Cp037 encoding is broken, so we have to 204 // provide our own. 
205 else if (encodingUpperCase.equals("IBM037") 206 || encodingUpperCase.equals("CP037") 207 || encodingUpperCase.equals("EBCDIC-CP-US") 208 || encodingUpperCase.equals("EBCDIC-CP-CA") 209 || encodingUpperCase.equals("EBCDIC-CP-WA") 210 || encodingUpperCase.equals("EBCDIC-CP-NL") 211 || encodingUpperCase.equals("CSIBM037")) { 212 writer = new EBCDICWriter(out); 213 } 214 else if (encodingUpperCase.equals("ISO-8859-11") 215 || encodingUpperCase.equals("TIS-620")) { 216 // Java doesn't recognize the name ISO-8859-11 and 217 // Java 1.3 and earlier don't recognize TIS-620 218 writer = new OutputStreamWriter(out, "TIS620"); 219 } 220 else writer = new OutputStreamWriter(out, encoding); 221 222 writer = new UnsynchronizedBufferedWriter(writer); 223 this.escaper = TextWriterFactory.getTextWriter(writer, encoding); 224 225 } 226 227 228 /** 229 * <p> 230 * Serializes a document onto the output 231 * stream using the current options. 232 * </p> 233 * 234 * @param doc the <code>Document</code> to serialize 235 * 236 * @throws IOException if the underlying output stream 237 * encounters an I/O error 238 * @throws NullPointerException if <code>doc</code> is null 239 * @throws UnavailableCharacterException if the document contains 240 * an unescapable character (e.g. in an element name) that is 241 * not available in the current encoding 242 */ write(Document doc)243 public void write(Document doc) throws IOException { 244 245 escaper.reset(); 246 namespaces.reset(); 247 namespaces.declarePrefix("", ""); 248 // The OutputStreamWriter automatically inserts 249 // the byte order mark if necessary. 
250 writeXMLDeclaration(); 251 int childCount = doc.getChildCount(); 252 for (int i = 0; i < childCount; i++) { 253 writeChild(doc.getChild(i)); 254 255 // Might want to remove this line break in a 256 // non-XML serializer where it's not guaranteed to be 257 // OK to add extra line breaks in the prolog 258 escaper.breakLine(); 259 } 260 escaper.flush(); 261 262 } 263 264 265 /** 266 * <p> 267 * Writes the XML declaration onto the output stream, 268 * followed by a line break. 269 * </p> 270 * 271 * @throws IOException if the underlying output stream 272 * encounters an I/O error 273 */ writeXMLDeclaration()274 protected void writeXMLDeclaration() throws IOException { 275 276 escaper.writeUncheckedMarkup("<?xml version=\"1.0\" encoding=\""); 277 escaper.writeUncheckedMarkup(escaper.getEncoding()); 278 escaper.writeUncheckedMarkup("\"?>"); 279 escaper.breakLine(); 280 281 } 282 283 284 /** 285 * <p> 286 * Serializes an element onto the output stream using the current 287 * options. The result is guaranteed to be well-formed. 288 * </p> 289 * 290 * <p> 291 * If the element is empty, this method invokes 292 * <code>writeEmptyElementTag</code>. If the element is not 293 * empty, then: 294 * </p> 295 * 296 * <ol> 297 * <li>It calls <code>writeStartTag</code>.</li> 298 * <li>It passes each of the element's children to 299 * <code>writeChild</code> in order.</li> 300 * <li>It calls <code>writeEndTag</code>.</li> 301 * </ol> 302 * 303 * <p> 304 * It may break lines or add white space if the serializer has 305 * been configured to indent or use a maximum line length. 
306 * </p> 307 * 308 * @param element the <code>Element</code> to serialize 309 * 310 * @throws IOException if the underlying output stream 311 * encounters an I/O error 312 * @throws UnavailableCharacterException if the element name 313 * contains a character that is not available in the 314 * current encoding 315 */ write(Element element)316 protected void write(Element element) throws IOException { 317 318 // workaround for case where only children are empty text nodes 319 boolean hasRealChildren = false; 320 int childCount = element.getChildCount(); 321 for (int i = 0; i < childCount; i++) { 322 Node child = element.getChild(i); 323 if (child.isText()) { 324 Text t = (Text) child; 325 if (t.isEmpty()) continue; 326 } 327 hasRealChildren = true; 328 break; 329 } 330 331 if (hasRealChildren) { 332 boolean wasPreservingWhiteSpace = escaper.isPreserveSpace(); 333 writeStartTag(element); 334 335 // children 336 for (int i = 0; i < childCount; i++) { 337 Node child = element.getChild(i); 338 // need to work around a very tricky case here where 339 // denormalized characters cross boundaries of 340 // consecutive text nodes 341 if (escaper.getNFC() && child.isText()) { 342 Text t = (Text) child; 343 while (i < childCount-1) { // not the last node 344 Node next = element.getChild(i+1); 345 if (next.isText()) { 346 t = new Text(t.getValue() + next.getValue()); 347 i++; 348 } 349 else break; 350 } 351 writeChild(t); 352 } 353 else { 354 writeChild(child); 355 } 356 } 357 writeEndTag(element); 358 359 // restore parent value 360 escaper.setPreserveSpace(wasPreservingWhiteSpace); 361 } 362 else { 363 writeEmptyElementTag(element); 364 } 365 366 } 367 368 hasNonTextChildren(Element element)369 private boolean hasNonTextChildren(Element element) { 370 371 int childCount = element.getChildCount(); 372 for (int i = 0; i < childCount; i++) { 373 if (! 
element.getChild(i).isText()) return true; 374 } 375 return false; 376 377 } 378 379 380 // writeEndTag should not normally throw UnavailableCharacterException 381 // because that would already have been thrown for the 382 // corresponding start-tag. 383 /** 384 * <p> 385 * Writes the end-tag for an element in the form 386 * <code></<i>name</i>></code>. 387 * </p> 388 * 389 * @param element the element whose end-tag is written 390 * 391 * @throws IOException if the underlying output stream 392 * encounters an I/O error 393 */ writeEndTag(Element element)394 protected void writeEndTag(Element element) throws IOException { 395 396 escaper.decrementIndent(); 397 if (escaper.getIndent() > 0 && !escaper.isPreserveSpace()) { 398 if (hasNonTextChildren(element)) { 399 escaper.breakLine(); 400 } 401 } 402 escaper.write('<'); 403 escaper.write('/'); 404 escaper.writeName(element.getQualifiedName()); 405 escaper.write('>'); 406 namespaces.popContext(); 407 408 } 409 410 411 /** 412 * 413 * <p> 414 * Writes the start-tag for the element including 415 * all its namespace declarations and attributes. 416 * </p> 417 * 418 * <p> 419 * The <code>writeAttributes</code> method is called to write 420 * all the non-namespace-declaration attributes. 421 * The <code>writeNamespaceDeclarations</code> method 422 * is called to write all the namespace declaration attributes. 
423 * </p> 424 * 425 * @param element the element whose start-tag is written 426 * 427 * @throws IOException if the underlying output stream 428 * encounters an I/O error 429 * @throws UnavailableCharacterException if the name of the element 430 * or the name of any of its attributes contains a character 431 * that is not available in the current encoding 432 */ writeStartTag(Element element)433 protected void writeStartTag(Element element) throws IOException { 434 435 writeTagBeginning(element); 436 escaper.write('>'); 437 escaper.incrementIndent(); 438 String xmlSpaceValue = element.getAttributeValue( 439 "space", "http://www.w3.org/XML/1998/namespace"); 440 if (xmlSpaceValue != null) { 441 if ("preserve".equals(xmlSpaceValue)){ 442 escaper.setPreserveSpace(true); 443 } 444 else if ("default".equals(xmlSpaceValue)){ 445 escaper.setPreserveSpace(false); 446 } 447 } 448 449 } 450 451 452 /** 453 * 454 * <p> 455 * Writes an empty-element tag for the element 456 * including all its namespace declarations and attributes. 457 * </p> 458 * 459 * <p> 460 * The <code>writeAttributes</code> method is called to write 461 * all the non-namespace-declaration attributes. 462 * The <code>writeNamespaceDeclarations</code> method 463 * is called to write all the namespace declaration attributes. 464 * </p> 465 * 466 * <p> 467 * If subclasses don't wish empty-element tags to be used, 468 * they can override this method to simply invoke 469 * <code>writeStartTag</code> followed by 470 * <code>writeEndTag</code>. 
471 * </p> 472 * 473 * @param element the element whose empty-element tag is written 474 * 475 * @throws IOException if the underlying output stream 476 * encounters an I/O error 477 * @throws UnavailableCharacterException if the name of the element or the name of 478 * any of its attributes contains a character that is not 479 * available in the current encoding 480 */ writeEmptyElementTag(Element element)481 protected void writeEmptyElementTag(Element element) 482 throws IOException { 483 writeTagBeginning(element); 484 escaper.write('/'); 485 escaper.write('>'); 486 namespaces.popContext(); 487 } 488 489 490 // This just extracts the commonality between writeStartTag 491 // and writeEmptyElementTag writeTagBeginning(Element element)492 private void writeTagBeginning(Element element) 493 throws IOException { 494 495 namespaces.pushContext(); 496 497 if (escaper.isIndenting() 498 && !escaper.isPreserveSpace() 499 && !escaper.justBroke()) { 500 escaper.breakLine(); 501 } 502 escaper.write('<'); 503 escaper.writeName(element.getQualifiedName()); 504 writeAttributes(element); 505 writeNamespaceDeclarations(element); 506 507 } 508 509 510 /** 511 * <p> 512 * Writes all the attributes of the specified 513 * element onto the output stream, one at a time, separated 514 * by white space. If preserveBaseURI is true, and it is 515 * necessary to add an <code>xml:base</code> attribute 516 * to the element in order to preserve the base URI, then 517 * that attribute is also written here. 518 * Each individual attribute is written by invoking 519 * <code>write(Attribute)</code>. 
520 * </p> 521 * 522 * @param element the <code>Element</code> whose attributes are 523 * written 524 * @throws IOException if the underlying output stream 525 * encounters an I/O error 526 * @throws UnavailableCharacterException if the name of any of 527 * the element's attributes contains a character that is not 528 * available in the current encoding 529 */ writeAttributes(Element element)530 protected void writeAttributes(Element element) 531 throws IOException { 532 533 // check to see if we need an xml:base attribute 534 if (preserveBaseURI) { 535 ParentNode parent = element.getParent(); 536 if (element.getAttribute("base", 537 "http://www.w3.org/XML/1998/namespace") == null) { 538 String baseValue = element.getBaseURI(); 539 if (parent == null 540 || parent.isDocument() 541 || !element.getBaseURI() 542 .equals(parent.getBaseURI())) { 543 544 escaper.write(' '); 545 Attribute baseAttribute = new Attribute( 546 "xml:base", 547 "http://www.w3.org/XML/1998/namespace", 548 baseValue); 549 write(baseAttribute); 550 } 551 } 552 } 553 554 int attributeCount = element.getAttributeCount(); 555 for (int i = 0; i < attributeCount; i++) { 556 Attribute attribute = element.getAttribute(i); 557 escaper.write(' '); 558 write(attribute); 559 } 560 } 561 562 563 /** 564 * <p> 565 * Writes all the namespace declaration 566 * attributes of the specified element onto the output stream, 567 * one at a time, separated by white space. Each individual 568 * declaration is written by invoking 569 * <code>writeNamespaceDeclaration</code>. 
570 * </p> 571 * 572 * @param element the <code>Element</code> whose namespace 573 * declarations are written 574 * @throws IOException if the underlying output stream 575 * encounters an I/O error 576 * @throws UnavailableCharacterException if any of the element's 577 * namespace prefixes contains a character that is not 578 * available in the current encoding 579 */ writeNamespaceDeclarations(Element element)580 protected void writeNamespaceDeclarations(Element element) 581 throws IOException { 582 583 String prefix = element.getNamespacePrefix(); 584 if (!("xml".equals(prefix))) { 585 writeNamespaceDeclarationIfNecessary(prefix, element.getNamespaceURI()); 586 } 587 588 // write attribute namespaces 589 int attCount = element.getAttributeCount(); 590 for (int i = 0; i < attCount; i++) { 591 Attribute att = element.getAttribute(i); 592 String attPrefix = att.getNamespacePrefix(); 593 if (attPrefix.length() != 0 && !("xml".equals(attPrefix))) { 594 writeNamespaceDeclarationIfNecessary(attPrefix, att.getNamespaceURI()); 595 } 596 } 597 598 // write additional namespaces 599 Namespaces namespaces = element.namespaces; 600 if (namespaces == null) return; 601 int namespaceCount = namespaces.size(); 602 for (int i = 0; i < namespaceCount; i++) { 603 String additionalPrefix = namespaces.getPrefix(i); 604 String uri = namespaces.getURI(additionalPrefix); 605 writeNamespaceDeclarationIfNecessary(additionalPrefix, uri); 606 } 607 608 } 609 610 writeNamespaceDeclarationIfNecessary(String prefix, String uri)611 private void writeNamespaceDeclarationIfNecessary(String prefix, String uri) 612 throws IOException { 613 614 String currentValue = namespaces.getURI(prefix); 615 // NamespaceSupport returns null for no namespace, not the 616 // empty string like XOM does 617 if (currentValue == null && "".equals(uri)) { 618 return; 619 } 620 else if (uri.equals(currentValue)) { 621 return; 622 } 623 624 escaper.write(' '); 625 writeNamespaceDeclaration(prefix, uri); 626 627 } 628 629 
630 /** 631 * <p> 632 * Writes a namespace declaration in the form 633 * <code>xmlns:<i>prefix</i>="<i>uri</i>"</code> or 634 * <code>xmlns="<i>uri</i>"</code>. It does not write 635 * the spaces on either side of the namespace declaration. 636 * These are written by <code>writeNamespaceDeclarations</code>. 637 * </p> 638 * 639 * @param prefix the namespace prefix; the empty string for the 640 * default namespace 641 * @param uri the namespace URI 642 * 643 * @throws IOException if the underlying output stream 644 * encounters an I/O error 645 * @throws UnavailableCharacterException if the namespace prefix contains a 646 * character that is not available in the current encoding 647 */ writeNamespaceDeclaration(String prefix, String uri)648 protected void writeNamespaceDeclaration(String prefix, String uri) 649 throws IOException { 650 651 namespaces.declarePrefix(prefix, uri); 652 if ("".equals(prefix)) { 653 escaper.writeUncheckedMarkup("xmlns"); 654 } 655 else { 656 escaper.writeUncheckedMarkup("xmlns:"); 657 escaper.writeName(prefix); 658 } 659 escaper.write('='); 660 escaper.write('"'); 661 escaper.writePCDATA(uri); 662 escaper.write('"'); 663 664 } 665 666 667 /** 668 * <p> 669 * Writes an attribute in the form 670 * <code><i>name</i>="<i>value</i>"</code>. 671 * Characters in the attribute value are escaped as necessary. 
672 * </p> 673 * 674 * @param attribute the <code>Attribute</code> to write 675 * 676 * @throws IOException if the underlying output stream 677 * encounters an I/O error 678 * @throws UnavailableCharacterException if the attribute name contains a character 679 * that is not available in the current encoding 680 * 681 */ write(Attribute attribute)682 protected void write(Attribute attribute) throws IOException { 683 escaper.writeName(attribute.getQualifiedName()); 684 escaper.write('='); 685 escaper.write('"'); 686 escaper.writeAttributeValue(attribute.getValue()); 687 escaper.write('"'); 688 } 689 690 691 /** 692 * <p> 693 * Writes a comment onto the output stream using the current 694 * options. Since character and entity references are not resolved 695 * in comments, comments can only be serialized when all 696 * characters they contain are available in the current 697 * encoding. 698 * </p> 699 * 700 * @param comment the <code>Comment</code> to serialize 701 * 702 * @throws IOException if the underlying output stream 703 * encounters an I/O error 704 * @throws UnavailableCharacterException if the comment contains a 705 * character that is not available in the current encoding 706 */ write(Comment comment)707 protected void write(Comment comment) throws IOException { 708 if (escaper.isIndenting()) escaper.breakLine(); 709 escaper.writeUncheckedMarkup("<!--"); 710 escaper.writeMarkup(comment.getValue()); 711 escaper.writeUncheckedMarkup("-->"); 712 } 713 714 715 /** 716 * <p> 717 * Writes a processing instruction 718 * onto the output stream using the current options. 719 * Since character and entity references are not resolved 720 * in processing instructions, processing instructions 721 * can only be serialized when all 722 * characters they contain are available in the current 723 * encoding. 
724 * </p> 725 * 726 * @param instruction the <code>ProcessingInstruction</code> 727 * to serialize 728 * 729 * @throws IOException if the underlying output stream 730 * encounters an I/O error 731 * @throws UnavailableCharacterException if the comment contains a 732 * character that is not available in the current encoding 733 */ write(ProcessingInstruction instruction)734 protected void write(ProcessingInstruction instruction) 735 throws IOException { 736 737 if (escaper.isIndenting()) escaper.breakLine(); 738 escaper.writeUncheckedMarkup("<?"); 739 escaper.writeName(instruction.getTarget()); 740 String value = instruction.getValue(); 741 // for canonical XML, only output a space after the target 742 // if there is a value 743 if (!"".equals(value)) { 744 escaper.write(' '); 745 escaper.writeMarkup(value); 746 } 747 escaper.writeUncheckedMarkup("?>"); 748 749 } 750 751 /** 752 * <p> 753 * Writes a <code>Text</code> object 754 * onto the output stream using the current options. 755 * Reserved characters such as <, > and " 756 * are escaped using the standard entity references 757 * such as <code>&lt;</code>, <code>&gt;</code>, 758 * and <code>&quot;</code>. 759 * </p> 760 * 761 * <p> 762 * Characters which cannot be encoded in the current character set 763 * (for example, Ω in ISO-8859-1) are encoded using 764 * character references. 765 * </p> 766 * 767 * @param text the <code>Text</code> to serialize 768 * 769 * @throws IOException if the underlying output stream 770 * encounters an I/O error 771 */ write(Text text)772 protected void write(Text text) throws IOException { 773 774 // XXX Is there a shortcut that takes advantage of the 775 // data being stored in UTF-8 here? perhaps even if only 776 // when serializing to UTF-8? 
777 String value = text.getValue(); 778 if (text.isCDATASection() 779 && value.indexOf("]]>") == -1) { 780 if (!(escaper instanceof UnicodeWriter)) { 781 int length = value.length(); 782 for (int i = 0; i < length; i++) { 783 if (escaper.needsEscaping(value.charAt(i))) { 784 // can't use CDATA section 785 escaper.writePCDATA(value); 786 return; 787 } 788 } 789 } 790 escaper.writeUncheckedMarkup("<![CDATA["); 791 escaper.writeMarkup(value); 792 escaper.writeUncheckedMarkup("]]>"); 793 } 794 // is this boundary whitespace we can ignore? 795 else if (isBoundaryWhitespace(text, value)) { 796 return; // without writing node 797 } 798 else { 799 escaper.writePCDATA(value); 800 } 801 802 } 803 804 isBoundaryWhitespace(Text text, String value)805 private boolean isBoundaryWhitespace(Text text, String value) { 806 807 if (getIndent() <= 0) return false; 808 809 ParentNode parent = text.getParent(); 810 if (parent == null) { 811 return "".equals(value.trim()); 812 } 813 814 // ???? cutting next line only breaks a few tests; and what it does 815 // break might be better off if the breakage is accepted as correct behavior 816 int childCount = parent.getChildCount(); 817 if (childCount == 1) return false; 818 if (! "".equals(value.trim())) return false; 819 820 // ???? This is a huge Hotspot. maybe 12% of serialization time 821 // when indenting. Is there any way to eliminate this? 822 // We only actually need to test a couple of positions, 0 and 823 // parent.getChildCount()-1 824 // Instead of getting position we could get those two elements and compare 825 // to the text. 
But you still need the previous and next 826 int position = parent.indexOf(text); 827 828 Node previous = null; 829 Node next = null; 830 831 if (position != 0) previous = parent.getChild(position-1); 832 if (position != childCount-1) { 833 next = parent.getChild(position+1); 834 } 835 if (previous == null || !previous.isText()) { 836 if (next == null || !next.isText()) { 837 return true; 838 } 839 } 840 841 return false; 842 843 } 844 845 846 /** 847 * <p> 848 * Writes a <code>DocType</code> object 849 * onto the output stream using the current options. 850 * </p> 851 * 852 * @param doctype the document type declaration to serialize 853 * 854 * @throws IOException if the underlying 855 * output stream encounters an I/O error 856 * @throws UnavailableCharacterException if the document type 857 * declaration contains a character that is not available 858 * in the current encoding 859 */ write(DocType doctype)860 protected void write(DocType doctype) throws IOException { 861 862 escaper.writeUncheckedMarkup("<!DOCTYPE "); 863 escaper.writeName(doctype.getRootElementName()); 864 if (doctype.getPublicID() != null) { 865 escaper.writeMarkup(" PUBLIC \"" + doctype.getPublicID() 866 + "\" \"" + doctype.getSystemID() + "\""); 867 } 868 else if (doctype.getSystemID() != null) { 869 escaper.writeMarkup( 870 " SYSTEM \"" + doctype.getSystemID() + "\""); 871 } 872 873 String internalDTDSubset = doctype.getInternalDTDSubset(); 874 if (!internalDTDSubset.equals("")) { 875 escaper.writeUncheckedMarkup(" ["); 876 escaper.breakLine(); 877 escaper.setInDocType(true); 878 escaper.writeMarkup(internalDTDSubset); 879 escaper.setInDocType(false); 880 escaper.write(']'); 881 } 882 883 escaper.write('>'); 884 885 } 886 887 888 /** 889 * <p> 890 * Writes a child node onto the output stream using the 891 * current options. It is invoked when walking the tree to 892 * serialize the entire document. 
It is not called, and indeed 893 * should not be called, for either the <code>Document</code> 894 * node or for attributes. 895 * </p> 896 * 897 * @param node the <code>Node</code> to serialize 898 * 899 * @throws IOException if the underlying output stream 900 * encounters an I/O error 901 * @throws XMLException if an <code>Attribute</code>, a 902 * <code>Document</code>, or <code>Namespace</code> 903 * is passed to this method 904 */ writeChild(Node node)905 protected void writeChild(Node node) throws IOException { 906 907 if (node.isElement()) { 908 write((Element) node); 909 } 910 else if (node.isText()) { 911 write((Text) node); 912 } 913 else if (node.isComment()) { 914 write((Comment) node); 915 } 916 else if (node.isProcessingInstruction()) { 917 write((ProcessingInstruction) node); 918 } 919 else if (node.isDocType()) { 920 write((DocType) node); 921 } 922 else { 923 throw new XMLException("Cannot write a " + 924 node.getClass().getName() + 925 " from the writeChild() method"); 926 } 927 928 } 929 930 931 /** <p> 932 * Writes a string onto the underlying output stream. 933 * Non-ASCII characters that are not available in the 934 * current character set are encoded with numeric character 935 * references. The three reserved characters <, >, and & 936 * are escaped using the standard entity references 937 * <code>&lt;</code>, <code>&gt;</code>, 938 * and <code>&amp;</code>. 939 * Double and single quotes are not escaped. 940 * </p> 941 * 942 * @param text the parsed character data to serialize 943 * 944 * @throws IOException if the underlying output stream 945 * encounters an I/O error 946 */ writeEscaped(String text)947 protected final void writeEscaped(String text) throws IOException { 948 escaper.writePCDATA(text); 949 } 950 951 /** <p> 952 * Writes a string onto the underlying output stream. 953 * Non-ASCII characters that are not available in the 954 * current character set are escaped using hexadecimal numeric 955 * character references. 
Carriage returns, line feeds, and tabs 956 * are also escaped using hexadecimal numeric character 957 * references in order to ensure their preservation on a round 958 * trip. The four reserved characters <, >, &, 959 * and " are escaped using the standard entity references 960 * <code>&lt;</code>, <code>&gt;</code>, 961 * <code>&amp;</code>, and <code>&quot;</code>. 962 * The single quote is not escaped. 963 * </p> 964 * 965 * @param value the attribute value to serialize 966 * 967 * @throws IOException if the underlying output stream 968 * encounters an I/O error 969 */ writeAttributeValue(String value)970 protected final void writeAttributeValue(String value) 971 throws IOException { 972 escaper.writeAttributeValue(value); 973 } 974 975 976 /** <p> 977 * Writes a string onto the underlying output stream. 978 * without escaping any characters. 979 * Non-ASCII characters that are not available in the 980 * current character set cause an <code>IOException</code>. 981 * </p> 982 * 983 * @param text the <code>String</code> to serialize 984 * 985 * @throws IOException if the underlying output stream 986 * encounters an I/O error or <code>text</code> contains 987 * characters not available in the current character set 988 */ writeRaw(String text)989 protected final void writeRaw(String text) throws IOException { 990 escaper.writeMarkup(text); 991 } 992 993 994 /** <p> 995 * Writes the current line break string 996 * onto the underlying output stream and indents 997 * as specified by the current level and the indent property. 998 * </p> 999 * 1000 * @throws IOException if the underlying output stream 1001 * encounters an I/O error 1002 */ breakLine()1003 protected final void breakLine() throws IOException { 1004 escaper.breakLine(); 1005 } 1006 1007 1008 /** 1009 * <p> 1010 * Flushes the data onto the output stream. 1011 * It is not enough to flush the output stream. 1012 * You must flush the serializer object itself because it 1013 * uses some internal buffering. 
1014 * The serializer will flush the underlying output stream. 1015 * </p> 1016 * 1017 * @throws IOException if the underlying 1018 * output stream encounters an I/O error 1019 */ flush()1020 public void flush() throws IOException { 1021 escaper.flush(); 1022 } 1023 1024 1025 /** 1026 * <p> 1027 * Returns the number of spaces this serializer indents. 1028 * </p> 1029 * 1030 * @return the number of spaces this serializer indents 1031 * each successive level beyond the previous one 1032 */ getIndent()1033 public int getIndent() { 1034 return escaper.getIndent(); 1035 } 1036 1037 1038 /** 1039 * <p> 1040 * Sets the number of additional spaces to add to each successive 1041 * level in the hierarchy. Use 0 for no extra indenting. The 1042 * maximum indentation is in limited to approximately half the 1043 * maximum line length. The serializer will not indent further 1044 * than that no matter how many levels deep the hierarchy is. 1045 * </p> 1046 * 1047 * <p> 1048 * When this variable is set to a value greater than 0, 1049 * the serializer does not preserve white space. Spaces, 1050 * tabs, carriage returns, and line feeds can all be 1051 * interchanged at the serializer's discretion, and additional 1052 * white space may be added before and after tags. 1053 * Carriage returns, line feeds, and tabs will not be 1054 * escaped with numeric character references. 1055 * </p> 1056 * 1057 * <p> 1058 * Inside elements with an <code>xml:space="preserve"</code> 1059 * attribute, white space is preserved and no indenting 1060 * takes place, regardless of the setting of the indent 1061 * property, unless, of course, an 1062 * <code>xml:space="default"</code> attribute overrides the 1063 * <code>xml:space="preserve"</code> attribute. 1064 * </p> 1065 * 1066 * <p> 1067 * The default value for indent is 0; that is, the default is 1068 * not to add or subtract any white space from the source 1069 * document. 
1070 * </p> 1071 * 1072 * @param indent the number of spaces to indent 1073 * each successive level of the hierarchy 1074 * 1075 * @throws IllegalArgumentException if indent is less than zero 1076 * 1077 */ setIndent(int indent)1078 public void setIndent(int indent) { 1079 if (indent < 0) { 1080 throw new IllegalArgumentException( 1081 "Indent cannot be negative" 1082 ); 1083 } 1084 escaper.setIndent(indent); 1085 } 1086 1087 1088 /** 1089 * <p> 1090 * Returns the string used as a line separator. 1091 * This is always <code>"\n"</code>, <code>"\r"</code>, 1092 * or <code>"\r\n"</code>. 1093 * </p> 1094 * 1095 * @return the line separator 1096 */ getLineSeparator()1097 public String getLineSeparator() { 1098 return escaper.getLineSeparator(); 1099 } 1100 1101 1102 /** 1103 * <p> 1104 * Sets the line separator. This can only be one of the 1105 * three strings <code>"\n"</code>, <code>"\r"</code>, 1106 * or <code>"\r\n"</code>. All other values are forbidden. 1107 * If this method is invoked, then 1108 * line separators in the character data will be changed to this 1109 * string. Line separators in attribute values will be changed 1110 * to the hexadecimal numeric character references corresponding 1111 * to this string. 1112 * </p> 1113 * 1114 * <p> 1115 * The default line separator is <code>"\r\n"</code>. However, 1116 * line separators in character data and attribute values are not 1117 * changed to this string, unless this method is called first. 1118 * </p> 1119 * 1120 * @param lineSeparator the line separator to set 1121 * 1122 * @throws IllegalArgumentException if you attempt to use any line 1123 * separator other than <code>"\n"</code>, <code>"\r"</code>, 1124 * or <code>"\r\n"</code>. 1125 * 1126 */ setLineSeparator(String lineSeparator)1127 public void setLineSeparator(String lineSeparator) { 1128 escaper.setLineSeparator(lineSeparator); 1129 } 1130 1131 1132 /** 1133 * <p> 1134 * Returns the preferred maximum line length. 
1135 * </p> 1136 * 1137 * @return the preferred maximum line length. 1138 */ getMaxLength()1139 public int getMaxLength() { 1140 return escaper.getMaxLength(); 1141 } 1142 1143 1144 /** 1145 * <p> 1146 * Sets the suggested maximum line length for this serializer. 1147 * Setting this to 0 indicates that no automatic wrapping is to be 1148 * performed. When a line approaches this length, the serializer 1149 * begins looking for opportunities to break the line. Generally 1150 * it will break on any ASCII white space character (tab, carriage 1151 * return, linefeed, and space). In some circumstances the 1152 * serializer may not be able to break the line before the maximum 1153 * length is reached. For instance, if an element name is longer 1154 * than the maximum line length the only way to correctly 1155 * serialize it is to exceed the maximum line length. In this case, 1156 * the serializer will exceed the maximum line length. 1157 * </p> 1158 * 1159 * <p> 1160 * The default value for maximum line length is 0, which is 1161 * interpreted as no maximum line length. 1162 * Setting this to a negative value just sets it to 0. 1163 * </p> 1164 * 1165 * <p> 1166 * When this variable is set to a value greater than 0, 1167 * the serializer does not preserve white space. Spaces, 1168 * tabs, carriage returns, and line feeds can all be 1169 * interchanged at the serializer's discretion. 1170 * Carriage returns, line feeds, and tabs will not be 1171 * escaped with numeric character references. 1172 * </p> 1173 * 1174 * <p> 1175 * Inside elements with an <code>xml:space="preserve"</code> 1176 * attribute, the maximum line length is not enforced, 1177 * regardless of the setting of the this property, unless, 1178 * of course, an <code>xml:space="default"</code> attribute 1179 * overrides the <code>xml:space="preserve"</code> attribute. 
1180 * </p> 1181 * 1182 * @param maxLength the preferred maximum line length 1183 */ setMaxLength(int maxLength)1184 public void setMaxLength(int maxLength) { 1185 escaper.setMaxLength(maxLength); 1186 } 1187 1188 1189 /** 1190 * <p> 1191 * Returns true if this serializer preserves the original 1192 * base URIs by inserting extra <code>xml:base</code> attributes. 1193 * </p> 1194 * 1195 * @return true if this <code>Serializer</code> inserts 1196 * extra <code>xml:base</code> attributes to attempt to 1197 * preserve base URI information from the document. 1198 */ getPreserveBaseURI()1199 public boolean getPreserveBaseURI() { 1200 return preserveBaseURI; 1201 } 1202 1203 1204 /** 1205 * <p> 1206 * Determines whether this serializer inserts 1207 * extra <code>xml:base</code> attributes to attempt to 1208 * preserve base URI information from the document. 1209 * The default is false, do not preserve base URI information. 1210 * <code>xml:base</code> attributes that have been explicitly 1211 * added to an element are always output. This property only 1212 * determines whether or not extra <code>xml:base</code> 1213 * attributes are added. 1214 * </p> 1215 * 1216 * @param preserve true if <code>xml:base</code> 1217 * attributes should be added as necessary 1218 * to preserve base URI information 1219 */ setPreserveBaseURI(boolean preserve)1220 public void setPreserveBaseURI(boolean preserve) { 1221 this.preserveBaseURI = preserve; 1222 } 1223 1224 1225 /** 1226 * <p> 1227 * Returns the name of the character encoding used by 1228 * this serializer. 1229 * </p> 1230 * 1231 * @return the encoding used for the output document 1232 */ getEncoding()1233 public String getEncoding() { 1234 return escaper.getEncoding(); 1235 } 1236 1237 1238 /** 1239 * <p> 1240 * If true, this property indicates serialization will 1241 * perform Unicode normalization on all data using normalization 1242 * form C (NFC). Performing Unicode normalization may change the 1243 * document's infoset. 
The default is false; do not normalize. 1244 * This version is based on Unicode 4.0. 1245 * </p> 1246 * 1247 * <p> 1248 * This feature has not yet been benchmarked or optimized. 1249 * It may result in substantially slower code. 1250 * </p> 1251 * 1252 * <p> 1253 * If all your data is in the first 256 code points of Unicode 1254 * (i.e. the ISO-8859-1, Latin-1 character set), then it's 1255 * already in normalization form C and normalizing won't change 1256 * anything. 1257 * </p> 1258 * 1259 * @param normalize true if normalization is performed; 1260 * false if it isn't 1261 */ setUnicodeNormalizationFormC(boolean normalize)1262 public void setUnicodeNormalizationFormC(boolean normalize) { 1263 escaper.setNFC(normalize); 1264 } 1265 1266 1267 /** 1268 * <p> 1269 * Indicates whether serialization will 1270 * perform Unicode normalization on all data using normalization 1271 * form C (NFC). The default is false; do not normalize. 1272 * </p> 1273 * 1274 * @return true if this serializer performs Unicode 1275 * normalization; false if it doesn't 1276 */ getUnicodeNormalizationFormC()1277 public boolean getUnicodeNormalizationFormC() { 1278 return escaper.getNFC(); 1279 } 1280 1281 1282 /** 1283 * <p> 1284 * Returns the current column number of the output stream. This 1285 * method useful for subclasses that implement their own pretty 1286 * printing strategies by inserting white space and line breaks 1287 * at appropriate points. 1288 * </p> 1289 * 1290 * <p> 1291 * Columns are counted based on Unicode characters, not Java 1292 * chars. A surrogate pair counts as one character in this 1293 * context, not two. However, a character followed by a 1294 * combining character (e.g. e followed by combining accent 1295 * acute) counts as two characters. This latter choice 1296 * (treating combining characters like regular characters) 1297 * is under review, and may change in the future if it's not 1298 * too big a performance hit. 
* </p>
     *
     * @return the current column number
     */
    protected final int getColumnNumber() {
        return this.escaper.getColumnNumber();
    }

}