1 /* Copyright 2002-2006 Elliotte Rusty Harold 2 3 This library is free software; you can redistribute it and/or modify 4 it under the terms of version 2.1 of the GNU Lesser General Public 5 License as published by the Free Software Foundation. 6 7 This library is distributed in the hope that it will be useful, 8 but WITHOUT ANY WARRANTY; without even the implied warranty of 9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 GNU Lesser General Public License for more details. 11 12 You should have received a copy of the GNU Lesser General Public 13 License along with this library; if not, write to the 14 Free Software Foundation, Inc., 59 Temple Place, Suite 330, 15 Boston, MA 02111-1307 USA 16 17 You can contact Elliotte Rusty Harold by sending e-mail to 18 elharo@ibiblio.org. Please include the word "XOM" in the 19 subject line. The XOM home page is located at http://www.xom.nu/ 20 */ 21 22 package nu.xom; 23 24 import java.io.IOException; 25 import java.io.Writer; 26 27 /** 28 * <p> 29 * This class is responsible for writing strings with the 30 * necessary escaping for their context. 31 * </p> 32 * 33 * @author Elliotte Rusty Harold 34 * @version 1.2d1 35 * 36 */ 37 abstract class TextWriter { 38 39 protected final Writer out; 40 protected final String encoding; 41 42 private String lineSeparator = "\r\n"; 43 // true if the user has requested a specific 44 // line separator 45 boolean lineSeparatorSet = false; 46 private boolean inDocType = false; 47 private int maxLength = 0; 48 private int indent = 0; 49 private String indentString = ""; 50 protected int column = 0; 51 // Is an xml:space="preserve" attribute in scope? 52 private boolean preserveSpace = false; 53 protected boolean normalize = false; 54 TextWriter(Writer out, String encoding)55 protected TextWriter(Writer out, String encoding) { 56 this.out = out; 57 this.encoding = encoding; 58 } 59 60 reset()61 void reset() { 62 column = 0; 63 fakeIndents = 0; 64 lastCharacterWasSpace = false; 65 skipFollowingLinefeed = false; 66 } 67 68 69 protected boolean lastCharacterWasSpace = false; 70 71 /** 72 * Indicates whether a linefeed is just half of a \r\n pair 73 * used for a line break. 74 */ 75 protected boolean skipFollowingLinefeed = false; 76 77 // Needed for memory between calls. 78 private char highSurrogate; 79 80 isHighSurrogate(int c)81 private boolean isHighSurrogate(int c) { 82 return c >= 0xD800 && c <= 0xDBFF; 83 } 84 85 isLowSurrogate(int c)86 private boolean isLowSurrogate(int c) { 87 return c >= 0xDC00 && c <= 0xDFFF; 88 } 89 90 writePCDATA(char c)91 final void writePCDATA(char c) throws IOException { 92 93 switch(c) { 94 case '\r': 95 if (!adjustingWhiteSpace() && !lineSeparatorSet) { 96 out.write("
"); 97 column += 6; 98 justBroke=false; 99 } 100 else { 101 breakLine(); 102 lastCharacterWasSpace = true; 103 } 104 skipFollowingLinefeed = true; 105 break; 106 case 14: // unreachable 107 case 15: // unreachable 108 case 16: // unreachable 109 case 17: // unreachable 110 case 18: // unreachable 111 case 19: // unreachable 112 case 20: // unreachable 113 case 21: // unreachable 114 case 22: // unreachable 115 case 23: // unreachable 116 case 24: // unreachable 117 case 25: // unreachable 118 case 26: // unreachable 119 case 27: // unreachable 120 case 28: // unreachable 121 case 29: // unreachable 122 case 30: // unreachable 123 case 31: // unreachable 124 throw new XMLException("Bad character snuck into document"); 125 case ' ': 126 write(c); 127 break; 128 case '!': 129 write(c); 130 break; 131 case '"': 132 write(c); 133 break; 134 case '#': 135 write(c); 136 break; 137 case '$': 138 write(c); 139 break; 140 case '%': 141 write(c); 142 break; 143 case '&': 144 out.write("&"); 145 column += 5; 146 lastCharacterWasSpace = false; 147 skipFollowingLinefeed = false; 148 justBroke = false; 149 break; 150 case '\'': 151 write(c); 152 break; 153 case '(': 154 write(c); 155 break; 156 case ')': 157 write(c); 158 break; 159 case '*': 160 write(c); 161 break; 162 case '+': 163 write(c); 164 break; 165 case ',': 166 write(c); 167 break; 168 case '-': 169 write(c); 170 break; 171 case '.': 172 write(c); 173 break; 174 case '/': 175 write(c); 176 break; 177 case '0': 178 write(c); 179 break; 180 case '1': 181 write(c); 182 break; 183 case '2': 184 write(c); 185 break; 186 case '3': 187 write(c); 188 break; 189 case '4': 190 write(c); 191 break; 192 case '5': 193 write(c); 194 break; 195 case '6': 196 write(c); 197 break; 198 case '7': 199 write(c); 200 break; 201 case '8': 202 write(c); 203 break; 204 case '9': 205 write(c); 206 break; 207 case ':': 208 write(c); 209 break; 210 case ';': 211 write(c); 212 break; 213 case '<': 214 out.write("<"); 215 column += 4; 216 lastCharacterWasSpace = false; 217 skipFollowingLinefeed = false; 218 justBroke = false; 219 break; 220 case '=': 221 write(c); 222 break; 223 case '>': 224 out.write(">"); 225 column += 4; 226 lastCharacterWasSpace = false; 227 skipFollowingLinefeed = false; 228 justBroke = false; 229 break; 230 default: 231 if (needsEscaping(c)) writeEscapedChar(c); 232 else write(c); 233 } 234 235 } 236 237 writeEscapedChar(char c)238 private void writeEscapedChar(char c) throws IOException { 239 240 if (isHighSurrogate(c)) { 241 //store and wait for low half 242 highSurrogate = c; 243 } 244 else if (isLowSurrogate(c)) { 245 // decode and write entity reference 246 // I am assuming here that nothing allows the 247 // text to be created with a malformed surrogate 248 // pair such as a low surrogate that is not immediately 249 // preceded by a high surrogate 250 int uchar = UnicodeUtil.combineSurrogatePair(highSurrogate, c); 251 String s = "&#x" + Integer.toHexString(uchar).toUpperCase() + ';'; 252 out.write(s); 253 column += s.length(); 254 lastCharacterWasSpace = false; 255 skipFollowingLinefeed = false; 256 justBroke = false; 257 } 258 else { 259 String s = "&#x" + Integer.toHexString(c).toUpperCase() + ';'; 260 out.write(s); 261 column += s.length(); 262 lastCharacterWasSpace = false; 263 skipFollowingLinefeed = false; 264 justBroke=false; 265 } 266 267 } 268 269 adjustingWhiteSpace()270 private boolean adjustingWhiteSpace() { 271 return maxLength > 0 || indent > 0; 272 } 273 274 275 // This is the same as writePCDATA except that it 276 // also needs to escape " as " and tab as "	". 277 // I'm not escaping the single quote because Serializer 278 // always uses double quotes to contain 279 // values. writeAttributeValue(char c)280 final void writeAttributeValue(char c) 281 throws IOException { 282 283 switch(c) { 284 // Handle white space that the parser might normalize 285 // on roundtrip. We only escape them if the serializer 286 // is not adjusting white space; that is indent is 0 287 // and maxLength is 0. 288 case '\t': 289 if (!adjustingWhiteSpace()) { 290 out.write("	"); 291 column += 6; 292 lastCharacterWasSpace = true; 293 skipFollowingLinefeed = false; 294 justBroke=false; 295 } 296 else { 297 write(' '); 298 } 299 break; 300 case '\n': 301 if (skipFollowingLinefeed) { 302 skipFollowingLinefeed = false; 303 return; 304 } 305 else if (adjustingWhiteSpace()) { 306 out.write(" "); 307 lastCharacterWasSpace = true; 308 justBroke=false; 309 } 310 else { 311 if (lineSeparatorSet) { 312 escapeBreakLine(); 313 } 314 else { 315 out.write("
"); 316 column += 6; 317 justBroke=false; 318 } 319 lastCharacterWasSpace = true; 320 } 321 break; 322 case 11: 323 // unreachable 324 case 12: 325 // unreachable 326 throw new XMLException("Bad character snuck into document"); 327 case '\r': 328 if (adjustingWhiteSpace()) { 329 out.write(" "); 330 lastCharacterWasSpace = true; 331 skipFollowingLinefeed = true; 332 justBroke=false; 333 } 334 else { 335 if (lineSeparatorSet) { 336 escapeBreakLine(); 337 skipFollowingLinefeed = true; 338 } 339 else { 340 out.write("
"); 341 column += 6; 342 justBroke=false; 343 } 344 } 345 break; 346 case 14: 347 // unreachable 348 case 15: 349 // unreachable 350 case 16: 351 // unreachable 352 case 17: 353 // unreachable 354 case 18: 355 // unreachable 356 case 19: 357 // unreachable 358 case 20: 359 // unreachable 360 case 21: 361 // unreachable 362 case 22: 363 // unreachable 364 case 23: 365 // unreachable 366 case 24: 367 // unreachable 368 case 25: 369 // unreachable 370 case 26: 371 // unreachable 372 case 27: 373 // unreachable 374 case 28: 375 // unreachable 376 case 29: 377 // unreachable 378 case 30: 379 // unreachable 380 case 31: 381 // unreachable 382 throw new XMLException("Bad character snuck into document"); 383 case ' ': 384 write(c); 385 break; 386 case '!': 387 write(c); 388 break; 389 case '"': 390 out.write("""); 391 column += 6; 392 lastCharacterWasSpace = false; 393 skipFollowingLinefeed = false; 394 justBroke=false; 395 break; 396 case '#': 397 write(c); 398 break; 399 case '$': 400 write(c); 401 break; 402 case '%': 403 write(c); 404 break; 405 case '&': 406 out.write("&"); 407 column += 5; 408 lastCharacterWasSpace = false; 409 skipFollowingLinefeed = false; 410 justBroke = false; 411 break; 412 case '\'': 413 write(c); 414 break; 415 case '(': 416 write(c); 417 break; 418 case ')': 419 write(c); 420 break; 421 case '*': 422 write(c); 423 break; 424 case '+': 425 write(c); 426 break; 427 case ',': 428 write(c); 429 break; 430 case '-': 431 write(c); 432 break; 433 case '.': 434 write(c); 435 break; 436 case '/': 437 write(c); 438 break; 439 case '0': 440 write(c); 441 break; 442 case '1': 443 write(c); 444 break; 445 case '2': 446 write(c); 447 break; 448 case '3': 449 write(c); 450 break; 451 case '4': 452 write(c); 453 break; 454 case '5': 455 write(c); 456 break; 457 case '6': 458 write(c); 459 break; 460 case '7': 461 write(c); 462 break; 463 case '8': 464 write(c); 465 break; 466 case '9': 467 write(c); 468 break; 469 case ':': 470 write(c); 471 break; 472 case ';': 473 write(c); 474 break; 475 case '<': 476 out.write("<"); 477 column += 4; 478 lastCharacterWasSpace = false; 479 skipFollowingLinefeed = false; 480 justBroke = false; 481 break; 482 case '=': 483 write(c); 484 break; 485 case '>': 486 out.write(">"); 487 column += 4; 488 lastCharacterWasSpace = false; 489 skipFollowingLinefeed = false; 490 justBroke = false; 491 break; 492 default: 493 if (needsEscaping(c)) writeEscapedChar(c); 494 else write(c); 495 } 496 497 } 498 499 500 // XXX We might be able to optimize this by using switch statements 501 // in the methods that call this to separate out the special cases. 502 // --\n, \t, space, etc.--and passing them to a different method 503 // thus avoiding the if tests here. See if this method shows up as 504 // a HotSpot in profiling. write(char c)505 void write(char c) throws IOException { 506 507 // Carriage returns are completely handled by 508 // writePCDATA and writeAttributeValue. They never 509 // enter this method. 510 if ((c == ' ' || c == '\n' || c == '\t')) { 511 if (needsBreak()) { 512 breakLine(); 513 skipFollowingLinefeed = false; 514 } 515 else if (preserveSpace || (indent <= 0 && maxLength <= 0)) { 516 // We're neither indenting nor wrapping 517 // so we need to preserve white space 518 if (c == ' ' || c == '\t') { 519 out.write(c); 520 skipFollowingLinefeed = false; 521 column++; 522 justBroke=false; 523 } 524 else { // (c == '\n') 525 if (!lineSeparatorSet || 526 !skipFollowingLinefeed) { 527 writeLineSeparator(c); 528 } 529 skipFollowingLinefeed = false; 530 column = 0; 531 } 532 } 533 else if (!lastCharacterWasSpace) { 534 out.write(' '); 535 column++; 536 skipFollowingLinefeed = false; 537 justBroke=false; 538 } 539 lastCharacterWasSpace = true; 540 } 541 else { 542 out.write(c); 543 // don't increment column for high surrogate, only low surrogate 544 if (c < 0xd800 || c > 0xDBFF) column++; 545 lastCharacterWasSpace = false; 546 skipFollowingLinefeed = false; 547 justBroke=false; 548 } 549 550 } 551 552 writeLineSeparator(char c)553 private void writeLineSeparator(char c) 554 throws IOException { 555 556 if (!inDocType && (!lineSeparatorSet || preserveSpace)) out.write(c); 557 else if (lineSeparator.equals("\r\n")) { 558 out.write("\r\n"); 559 } 560 else if (lineSeparator.equals("\n")) { 561 out.write('\n'); 562 } 563 else { // lineSeparator.equals("\r")) 564 out.write('\r'); 565 } 566 // Remember, there are only three possible line separators 567 568 } 569 570 needsBreak()571 private boolean needsBreak() { 572 573 if (maxLength <= 0 || preserveSpace) return false; 574 // Better algorithm needed: Should look ahead in the 575 // stream, see if there's a white space character 576 // between here and the maxLength, Then again, simple is good. 577 // Here we just assume there's probably space somewhere 578 // within the next ten characters 579 580 return column >= maxLength - 10; 581 582 } 583 584 585 protected boolean justBroke = false; 586 justBroke()587 boolean justBroke() { 588 return justBroke; 589 } 590 591 breakLine()592 final void breakLine() throws IOException { 593 594 out.write(lineSeparator); 595 out.write(indentString); 596 column = indentString.length(); 597 lastCharacterWasSpace = true; 598 justBroke = true; 599 600 } 601 602 escapeBreakLine()603 private final void escapeBreakLine() throws IOException { 604 605 if ("\n".equals(lineSeparator)) { 606 out.write("
"); 607 column += 6; 608 } 609 else if ("\r\n".equals(lineSeparator)) { 610 out.write("
"); 611 column += 12; 612 } 613 else { 614 out.write("
"); 615 column += 6; 616 } 617 lastCharacterWasSpace = true; 618 619 } 620 621 622 // Note that when this method is called directly, then 623 // normalization is not performed on c. Currently this is 624 // only called for ASCII characters like <, >, and the space, 625 // which should be OK writeMarkup(char c)626 final void writeMarkup(char c) throws IOException { 627 628 if (needsEscaping(c)) { 629 throw new UnavailableCharacterException(c, encoding); 630 } 631 write(c); 632 633 } 634 635 636 // XXX should we have a special package protected 637 // method to be used only for ASCII characters we know don't need escaping or 638 // normalization such as <, /, A-Z, etc.? 639 640 writePCDATA(String s)641 void writePCDATA(String s) throws IOException { 642 643 s = normalize(s); 644 int length = s.length(); 645 for (int i=0; i < length; i++) { 646 writePCDATA(s.charAt(i)); 647 } 648 649 } 650 651 writeAttributeValue(String s)652 void writeAttributeValue(String s) 653 throws IOException { 654 655 s = normalize(s); 656 int length = s.length(); 657 for (int i=0; i < length; i++) { 658 writeAttributeValue(s.charAt(i)); 659 } 660 661 } 662 663 writeMarkup(String s)664 void writeMarkup(String s) throws IOException { 665 666 s = normalize(s); 667 int length = s.length(); 668 for (int i=0; i < length; i++) { 669 writeMarkup(s.charAt(i)); 670 } 671 672 } 673 674 675 // This is for ASCII characters like < and = we know are 676 // available in all encodings and do not need to be normalized writeUncheckedMarkup(String s)677 void writeUncheckedMarkup(String s) throws IOException { 678 679 int length = s.length(); 680 for (int i=0; i < length; i++) { 681 write(s.charAt(i)); 682 } 683 684 } 685 686 normalize(String s)687 protected String normalize(String s) { 688 689 if (normalize) { 690 return UnicodeUtil.normalize(s); 691 } 692 return s; 693 694 } 695 696 697 isIndenting()698 boolean isIndenting() { 699 return indentString.length() > 0; 700 } 701 702 703 private int fakeIndents = 0; 704 705 private final static String _128_SPACES=" "; 706 private final static int _128 = 128; 707 incrementIndent()708 void incrementIndent() { 709 710 if (indent == 0) return; 711 712 String newIndent; 713 int length = indentString.length() + indent; 714 if (indentString.length() + indent < _128) { 715 newIndent = _128_SPACES.substring(0, length); 716 } 717 else { 718 StringBuffer sb = new StringBuffer(length); 719 sb.append(_128_SPACES); 720 for (int i = _128; i < length; i++) { 721 sb.append(' '); 722 } 723 newIndent = sb.toString(); 724 } 725 726 // limit maximum indent to half of maximum line length 727 if (maxLength > 0 && newIndent.length() > maxLength / 2) { 728 fakeIndents++; 729 } 730 else this.indentString = newIndent; 731 732 } 733 734 decrementIndent()735 void decrementIndent() { 736 737 if (indent == 0) return; 738 else if (fakeIndents > 0) fakeIndents--; 739 else { 740 indentString = indentString.substring( 741 0, indentString.length()-indent 742 ); 743 } 744 745 } 746 747 getEncoding()748 String getEncoding() { 749 return this.encoding; 750 } 751 752 753 /** 754 * <p> 755 * Returns the String used as a line separator. 756 * This is always "\n", "\r", or "\r\n". 757 * </p> 758 * 759 * @return the line separator 760 */ getLineSeparator()761 String getLineSeparator() { 762 return lineSeparator; 763 } 764 765 766 /** 767 * <p> 768 * Sets the lineSeparator. This 769 * can only be one of the three 770 * strings "\n", "\r", or "\r\n". 771 * All other values are forbidden. 772 * </p> 773 * 774 * @param lineSeparator the lineSeparator to set 775 * 776 * @throws IllegalArgumentException if you attempt to use 777 * any line separator other than "\n", "\r", or "\r\n". 778 * 779 */ setLineSeparator(String lineSeparator)780 void setLineSeparator(String lineSeparator) { 781 782 if (lineSeparator.equals("\n") 783 || lineSeparator.equals("\r") 784 || lineSeparator.equals("\r\n")) { 785 this.lineSeparator = lineSeparator; 786 this.lineSeparatorSet = true; 787 } 788 else { 789 throw new IllegalArgumentException( 790 "Illegal Line Separator"); 791 } 792 793 } 794 795 setInDocType(boolean inDocType)796 void setInDocType(boolean inDocType) { 797 this.inDocType = inDocType; 798 } 799 800 801 /** 802 * <p> 803 * Returns the number of spaces this serializer indents. 804 * </p> 805 * 806 * @return the number of spaces this serializer indents 807 */ getIndent()808 int getIndent() { 809 return indent; 810 } 811 812 813 /** 814 * <p> 815 * Returns the maximum line length. 816 * </p> 817 * 818 * @return the maximum line length. 819 */ getMaxLength()820 int getMaxLength() { 821 return maxLength; 822 } 823 824 /** 825 * <p> 826 * Sets the suggested maximum line length for this serializer. 827 * In some circumstances this may not be respected. 828 * </p> 829 * 830 * @param maxLength the maxLength to set 831 */ setMaxLength(int maxLength)832 void setMaxLength(int maxLength) { 833 if (maxLength < 0) maxLength = 0; 834 this.maxLength = maxLength; 835 } 836 837 838 /** 839 * <p> 840 * Sets the number of spaces to indent each successive level in the 841 * hierarchy. Use 0 for no extra indenting. 842 * </p> 843 * 844 * @param indent the indent to set 845 */ setIndent(int indent)846 void setIndent(int indent) { 847 this.indent = indent; 848 } 849 850 flush()851 void flush() throws IOException { 852 out.flush(); 853 } 854 855 needsEscaping(char c)856 abstract boolean needsEscaping(char c); 857 858 859 /** 860 * <p> 861 * Used to track the current status of xml:space. 862 * This is false by default, unless an xml:space="preserve" 863 * attribute is in-scope. When such an attribute is in-scope, 864 * white space is not adjusted even if indenting and/or 865 * a maximum line length has been requested. 866 * </p> 867 * 868 * 869 * @return true if an <code>xml:space="true"</code> attribute 870 * is in-scope 871 */ isPreserveSpace()872 boolean isPreserveSpace() { 873 return preserveSpace; 874 } 875 876 877 /** 878 * @param preserveSpace whether to preserve all white space 879 */ setPreserveSpace(boolean preserveSpace)880 void setPreserveSpace(boolean preserveSpace) { 881 this.preserveSpace = preserveSpace; 882 } 883 884 885 /** 886 * @return the current column number 887 */ getColumnNumber()888 int getColumnNumber() { 889 return this.column; 890 } 891 892 893 /** 894 * <p> 895 * If true, this property indicates serialization will 896 * perform Unicode normalization on all data using normalization 897 * form C (NFC). Performing Unicode normalization 898 * does change the document's infoset. 899 * The default is false; do not normalize. 900 * </p> 901 * 902 * <p> 903 * This feature has not yet been benchmarked or optimized. 904 * It may result in substantially slower code. 905 * </p> 906 * 907 * @param normalize true if normalization is performed; 908 * false if it isn't. 909 */ setNFC(boolean normalize)910 void setNFC(boolean normalize) { 911 this.normalize = normalize; 912 } 913 914 915 /** 916 * <p> 917 * If true, this property indicates serialization will 918 * perform Unicode normalization on all data using normalization 919 * form C (NFC). The default is false; do not normalize. 920 * </p> 921 * 922 * @return true if this serialization performs Unicode 923 * normalization; false if it doesn't. 924 */ getNFC()925 boolean getNFC() { 926 return this.normalize; 927 } 928 929 writeName(String name)930 void writeName(String name) throws IOException { 931 writeMarkup(name); 932 } 933 934 935 }