1 /* Copyright 2002-2006 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.IOException;
25 import java.io.Writer;
26 
27 /**
28  * <p>
29  *   This class is responsible for writing strings with the
30  *   necessary escaping for their context.
31  * </p>
32  *
33  * @author Elliotte Rusty Harold
34  * @version 1.2d1
35  *
36  */
37 abstract class TextWriter {
38 
    // Destination of all serialized output.
    protected final Writer out;
    // Name of the output encoding; reported by getEncoding() and passed
    // to UnavailableCharacterException by writeMarkup(char).
    protected final String encoding;

    // Line break written by breakLine(). Only setLineSeparator() changes
    // it, and that method accepts just "\n", "\r", and "\r\n".
    private String lineSeparator = "\r\n";
    // true if the user has requested a specific
    // line separator
            boolean lineSeparatorSet = false;
    // Set through setInDocType(). While true, writeLineSeparator()
    // always emits the configured line separator.
    private boolean inDocType = false;
    // Suggested maximum line length; needsBreak() never asks for a
    // break while this is 0.
    private int     maxLength = 0;
    // Number of spaces added per nesting level; incrementIndent() and
    // decrementIndent() do nothing while this is 0.
    private int     indent = 0;
    // The run of spaces breakLine() writes after each line separator.
    private String  indentString = "";
    // Running count of columns used on the current line; set back to
    // the indent width by breakLine() and to 0 by reset().
    protected int   column = 0;
    // Is an xml:space="preserve" attribute in scope?
    private boolean preserveSpace = false;
    // Whether strings are passed through UnicodeUtil.normalize()
    // before they are written; controlled by setNFC().
    protected boolean normalize = false;
54 
TextWriter(Writer out, String encoding)55     protected TextWriter(Writer out, String encoding) {
56         this.out = out;
57         this.encoding = encoding;
58     }
59 
60 
reset()61     void reset() {
62         column = 0;
63         fakeIndents = 0;
64         lastCharacterWasSpace = false;
65         skipFollowingLinefeed = false;
66     }
67 
68 
    // True once white space or a line break has been written; write(char)
    // consults it to collapse runs of white space while adjusting.
    protected boolean lastCharacterWasSpace = false;

    /**
     * Indicates whether a linefeed is just half of a \r\n pair
     * used for a line break.
     */
    protected boolean skipFollowingLinefeed = false;

    // Needed for memory between calls: writeEscapedChar() parks the
    // high half of a surrogate pair here until the low half arrives.
    private char highSurrogate;
79 
80 
isHighSurrogate(int c)81     private boolean isHighSurrogate(int c) {
82         return c >= 0xD800 && c <= 0xDBFF;
83     }
84 
85 
isLowSurrogate(int c)86     private boolean isLowSurrogate(int c) {
87         return c >= 0xDC00 && c <= 0xDFFF;
88     }
89 
90 
writePCDATA(char c)91     final void writePCDATA(char c) throws IOException {
92 
93         switch(c) {
94             case '\r':
95                 if (!adjustingWhiteSpace()  && !lineSeparatorSet) {
96                     out.write("&#x0D;");
97                     column += 6;
98                     justBroke=false;
99                 }
100                 else {
101                     breakLine();
102                     lastCharacterWasSpace = true;
103                 }
104                 skipFollowingLinefeed = true;
105                 break;
106             case 14: // unreachable
107             case 15: // unreachable
108             case 16: // unreachable
109             case 17: // unreachable
110             case 18: // unreachable
111             case 19: // unreachable
112             case 20: // unreachable
113             case 21: // unreachable
114             case 22: // unreachable
115             case 23: // unreachable
116             case 24: // unreachable
117             case 25: // unreachable
118             case 26: // unreachable
119             case 27: // unreachable
120             case 28: // unreachable
121             case 29: // unreachable
122             case 30: // unreachable
123             case 31: // unreachable
124                 throw new XMLException("Bad character snuck into document");
125             case ' ':
126                 write(c);
127                 break;
128             case '!':
129                 write(c);
130                 break;
131             case '"':
132                 write(c);
133                 break;
134             case '#':
135                 write(c);
136                 break;
137             case '$':
138                 write(c);
139                 break;
140             case '%':
141                 write(c);
142                 break;
143             case '&':
144                 out.write("&amp;");
145                 column += 5;
146                 lastCharacterWasSpace = false;
147                 skipFollowingLinefeed = false;
148                 justBroke = false;
149                 break;
150             case '\'':
151                 write(c);
152                 break;
153             case '(':
154                 write(c);
155                 break;
156             case ')':
157                 write(c);
158                 break;
159             case '*':
160                 write(c);
161                 break;
162             case '+':
163                 write(c);
164                 break;
165             case ',':
166                 write(c);
167                 break;
168             case '-':
169                 write(c);
170                 break;
171             case '.':
172                 write(c);
173                 break;
174             case '/':
175                 write(c);
176                 break;
177             case '0':
178                 write(c);
179                 break;
180             case '1':
181                 write(c);
182                 break;
183             case '2':
184                 write(c);
185                 break;
186             case '3':
187                 write(c);
188                 break;
189             case '4':
190                 write(c);
191                 break;
192             case '5':
193                 write(c);
194                 break;
195             case '6':
196                 write(c);
197                 break;
198             case '7':
199                 write(c);
200                 break;
201             case '8':
202                 write(c);
203                 break;
204             case '9':
205                 write(c);
206                 break;
207             case ':':
208                 write(c);
209                 break;
210             case ';':
211                 write(c);
212                 break;
213             case '<':
214                 out.write("&lt;");
215                 column += 4;
216                 lastCharacterWasSpace = false;
217                 skipFollowingLinefeed = false;
218                 justBroke = false;
219                 break;
220             case '=':
221                 write(c);
222                 break;
223             case '>':
224                 out.write("&gt;");
225                 column += 4;
226                 lastCharacterWasSpace = false;
227                 skipFollowingLinefeed = false;
228                 justBroke = false;
229                 break;
230             default:
231                 if (needsEscaping(c)) writeEscapedChar(c);
232                 else write(c);
233         }
234 
235     }
236 
237 
writeEscapedChar(char c)238     private void writeEscapedChar(char c) throws IOException {
239 
240         if (isHighSurrogate(c)) {
241             //store and wait for low half
242             highSurrogate = c;
243         }
244         else if (isLowSurrogate(c)) {
245             // decode and write entity reference
246             // I am assuming here that nothing allows the
247             // text to be created with a malformed surrogate
248             // pair such as a low surrogate that is not immediately
249             // preceded by a high surrogate
250             int uchar = UnicodeUtil.combineSurrogatePair(highSurrogate, c);
251             String s = "&#x" + Integer.toHexString(uchar).toUpperCase() + ';';
252             out.write(s);
253             column += s.length();
254             lastCharacterWasSpace = false;
255             skipFollowingLinefeed = false;
256             justBroke = false;
257         }
258         else {
259             String s = "&#x" + Integer.toHexString(c).toUpperCase() + ';';
260             out.write(s);
261             column += s.length();
262             lastCharacterWasSpace = false;
263             skipFollowingLinefeed = false;
264             justBroke=false;
265         }
266 
267     }
268 
269 
adjustingWhiteSpace()270     private boolean adjustingWhiteSpace() {
271         return maxLength > 0 || indent > 0;
272     }
273 
274 
275     // This is the same as writePCDATA except that it
276     // also needs to escape " as &quot; and tab as "&#x09;".
277     // I'm not escaping the single quote because Serializer
278     // always uses double quotes to contain
279     // values.
writeAttributeValue(char c)280     final void writeAttributeValue(char c)
281       throws IOException {
282 
283         switch(c) {
284             // Handle white space that the parser might normalize
285             // on roundtrip. We only escape them if the serializer
286             // is not adjusting white space; that is indent is 0
287             // and maxLength is 0.
288             case '\t':
289                 if (!adjustingWhiteSpace()) {
290                     out.write("&#x09;");
291                     column += 6;
292                     lastCharacterWasSpace = true;
293                     skipFollowingLinefeed = false;
294                     justBroke=false;
295                 }
296                 else {
297                     write(' ');
298                 }
299                 break;
300             case '\n':
301                 if (skipFollowingLinefeed) {
302                     skipFollowingLinefeed = false;
303                     return;
304                 }
305                 else if (adjustingWhiteSpace()) {
306                     out.write(" ");
307                     lastCharacterWasSpace = true;
308                     justBroke=false;
309                 }
310                 else {
311                     if (lineSeparatorSet) {
312                         escapeBreakLine();
313                     }
314                     else {
315                         out.write("&#x0A;");
316                         column += 6;
317                         justBroke=false;
318                     }
319                     lastCharacterWasSpace = true;
320                 }
321                 break;
322             case 11:
323                 // unreachable
324             case 12:
325                 // unreachable
326                 throw new XMLException("Bad character snuck into document");
327             case '\r':
328                 if (adjustingWhiteSpace()) {
329                     out.write(" ");
330                     lastCharacterWasSpace = true;
331                     skipFollowingLinefeed = true;
332                     justBroke=false;
333                 }
334                 else {
335                     if (lineSeparatorSet) {
336                         escapeBreakLine();
337                         skipFollowingLinefeed = true;
338                     }
339                     else {
340                         out.write("&#x0D;");
341                         column += 6;
342                         justBroke=false;
343                     }
344                 }
345                 break;
346             case 14:
347                 // unreachable
348             case 15:
349                 // unreachable
350             case 16:
351                 // unreachable
352             case 17:
353                 // unreachable
354             case 18:
355                 // unreachable
356             case 19:
357                 // unreachable
358             case 20:
359                 // unreachable
360             case 21:
361                 // unreachable
362             case 22:
363                 // unreachable
364             case 23:
365                 // unreachable
366             case 24:
367                 // unreachable
368             case 25:
369                 // unreachable
370             case 26:
371                 // unreachable
372             case 27:
373                 // unreachable
374             case 28:
375                 // unreachable
376             case 29:
377                 // unreachable
378             case 30:
379                 // unreachable
380             case 31:
381                 // unreachable
382                 throw new XMLException("Bad character snuck into document");
383             case ' ':
384                 write(c);
385                 break;
386             case '!':
387                 write(c);
388                 break;
389             case '"':
390                 out.write("&quot;");
391                 column += 6;
392                 lastCharacterWasSpace = false;
393                 skipFollowingLinefeed = false;
394                 justBroke=false;
395                 break;
396             case '#':
397                 write(c);
398                 break;
399             case '$':
400                 write(c);
401                 break;
402             case '%':
403                 write(c);
404                 break;
405             case '&':
406                 out.write("&amp;");
407                 column += 5;
408                 lastCharacterWasSpace = false;
409                 skipFollowingLinefeed = false;
410                 justBroke = false;
411                 break;
412             case '\'':
413                 write(c);
414                 break;
415             case '(':
416                 write(c);
417                 break;
418             case ')':
419                 write(c);
420                 break;
421             case '*':
422                 write(c);
423                 break;
424             case '+':
425                 write(c);
426                 break;
427             case ',':
428                 write(c);
429                 break;
430             case '-':
431                 write(c);
432                 break;
433             case '.':
434                 write(c);
435                 break;
436             case '/':
437                 write(c);
438                 break;
439             case '0':
440                 write(c);
441                 break;
442             case '1':
443                 write(c);
444                 break;
445             case '2':
446                 write(c);
447                 break;
448             case '3':
449                 write(c);
450                 break;
451             case '4':
452                 write(c);
453                 break;
454             case '5':
455                 write(c);
456                 break;
457             case '6':
458                 write(c);
459                 break;
460             case '7':
461                 write(c);
462                 break;
463             case '8':
464                 write(c);
465                 break;
466             case '9':
467                 write(c);
468                 break;
469             case ':':
470                 write(c);
471                 break;
472             case ';':
473                 write(c);
474                 break;
475             case '<':
476                 out.write("&lt;");
477                 column += 4;
478                 lastCharacterWasSpace = false;
479                 skipFollowingLinefeed = false;
480                 justBroke = false;
481                 break;
482             case '=':
483                 write(c);
484                 break;
485             case '>':
486                 out.write("&gt;");
487                 column += 4;
488                 lastCharacterWasSpace = false;
489                 skipFollowingLinefeed = false;
490                 justBroke = false;
491                 break;
492             default:
493                 if (needsEscaping(c)) writeEscapedChar(c);
494                 else write(c);
495         }
496 
497     }
498 
499 
500     // XXX We might be able to optimize this by using switch statements
501     // in the methods that call this to separate out the special cases.
502     // --\n, \t, space, etc.--and passing them to a different method
503     // thus avoiding the if tests here. See if this method shows up as
504     // a HotSpot in profiling.
write(char c)505     void write(char c) throws IOException {
506 
507       // Carriage returns are completely handled by
508       // writePCDATA and writeAttributeValue. They never
509       // enter this method.
510       if ((c == ' ' || c == '\n' || c == '\t')) {
511             if (needsBreak()) {
512                 breakLine();
513                 skipFollowingLinefeed = false;
514             }
515             else if (preserveSpace || (indent <= 0 && maxLength <= 0)) {
516                 // We're neither indenting nor wrapping
517                 // so we need to preserve white space
518                 if (c == ' ' ||  c == '\t') {
519                     out.write(c);
520                     skipFollowingLinefeed = false;
521                     column++;
522                     justBroke=false;
523                 }
524                 else { // (c == '\n')
525                     if (!lineSeparatorSet ||
526                         !skipFollowingLinefeed) {
527                         writeLineSeparator(c);
528                     }
529                     skipFollowingLinefeed = false;
530                     column = 0;
531                 }
532             }
533             else if (!lastCharacterWasSpace) {
534                 out.write(' ');
535                 column++;
536                 skipFollowingLinefeed = false;
537                 justBroke=false;
538             }
539             lastCharacterWasSpace = true;
540         }
541         else {
542             out.write(c);
543             // don't increment column for high surrogate, only low surrogate
544             if (c < 0xd800 || c > 0xDBFF) column++;
545             lastCharacterWasSpace = false;
546             skipFollowingLinefeed = false;
547             justBroke=false;
548         }
549 
550     }
551 
552 
writeLineSeparator(char c)553     private void writeLineSeparator(char c)
554       throws IOException {
555 
556         if (!inDocType && (!lineSeparatorSet || preserveSpace)) out.write(c);
557         else if (lineSeparator.equals("\r\n")) {
558             out.write("\r\n");
559         }
560         else if (lineSeparator.equals("\n")) {
561             out.write('\n');
562         }
563         else  { // lineSeparator.equals("\r"))
564             out.write('\r');
565         }
566         // Remember, there are only three possible line separators
567 
568     }
569 
570 
needsBreak()571     private boolean needsBreak() {
572 
573         if (maxLength <= 0 || preserveSpace) return false;
574         // Better algorithm needed: Should look ahead in the
575         // stream, see if there's a white space character
576         // between here and the maxLength, Then again, simple is good.
577         // Here we just assume there's probably space somewhere
578         // within the next ten characters
579 
580         return column >= maxLength - 10;
581 
582     }
583 
584 
585     protected boolean justBroke = false;
586 
justBroke()587     boolean justBroke() {
588         return justBroke;
589     }
590 
591 
breakLine()592     final void breakLine() throws IOException {
593 
594         out.write(lineSeparator);
595         out.write(indentString);
596         column = indentString.length();
597         lastCharacterWasSpace = true;
598         justBroke = true;
599 
600     }
601 
602 
escapeBreakLine()603     private final void escapeBreakLine() throws IOException {
604 
605         if ("\n".equals(lineSeparator)) {
606             out.write("&#x0A;");
607             column += 6;
608         }
609         else if ("\r\n".equals(lineSeparator)) {
610             out.write("&#x0D;&#x0A;");
611             column += 12;
612         }
613         else {
614             out.write("&#x0D;");
615             column += 6;
616         }
617         lastCharacterWasSpace = true;
618 
619     }
620 
621 
622     // Note that when this method is called directly, then
623     // normalization is not performed on c. Currently this is
624     // only called for ASCII characters like <, >, and the space,
625     // which should be OK
writeMarkup(char c)626     final void writeMarkup(char c) throws IOException {
627 
628         if (needsEscaping(c)) {
629             throw new UnavailableCharacterException(c, encoding);
630         }
631         write(c);
632 
633     }
634 
635 
636     // XXX should we have a special package protected
637     // method to be used only for ASCII characters we know don't need escaping or
638     // normalization such as <, /, A-Z, etc.?
639 
640 
writePCDATA(String s)641     void writePCDATA(String s) throws IOException {
642 
643         s = normalize(s);
644         int length = s.length();
645         for (int i=0; i < length; i++) {
646             writePCDATA(s.charAt(i));
647         }
648 
649     }
650 
651 
writeAttributeValue(String s)652     void writeAttributeValue(String s)
653       throws IOException {
654 
655         s = normalize(s);
656         int length = s.length();
657         for (int i=0; i < length; i++) {
658             writeAttributeValue(s.charAt(i));
659         }
660 
661     }
662 
663 
writeMarkup(String s)664     void writeMarkup(String s) throws IOException {
665 
666         s = normalize(s);
667         int length = s.length();
668         for (int i=0; i < length; i++) {
669             writeMarkup(s.charAt(i));
670         }
671 
672     }
673 
674 
675     // This is for ASCII characters like < and = we know are
676     // available in all encodings and do not need to be normalized
writeUncheckedMarkup(String s)677     void writeUncheckedMarkup(String s) throws IOException {
678 
679         int length = s.length();
680         for (int i=0; i < length; i++) {
681             write(s.charAt(i));
682         }
683 
684     }
685 
686 
normalize(String s)687      protected String normalize(String s) {
688 
689         if (normalize) {
690             return UnicodeUtil.normalize(s);
691         }
692         return s;
693 
694     }
695 
696 
697 
isIndenting()698    boolean isIndenting() {
699         return indentString.length() > 0;
700     }
701 
702 
    // Number of incrementIndent() calls that did not lengthen the indent
    // string because it would have exceeded half the maximum line
    // length; decrementIndent() undoes these before shortening it.
    private int fakeIndents = 0;

    // Prebuilt run of spaces from which incrementIndent() cuts indent
    // strings shorter than 128 characters.
    private final static String _128_SPACES="                                                                                                                                ";
    private final static int    _128 = 128;
707 
incrementIndent()708     void incrementIndent() {
709 
710         if (indent == 0) return;
711 
712         String newIndent;
713         int length = indentString.length() + indent;
714         if (indentString.length() + indent < _128) {
715             newIndent = _128_SPACES.substring(0, length);
716         }
717         else {
718             StringBuffer sb = new StringBuffer(length);
719             sb.append(_128_SPACES);
720             for (int i = _128; i < length; i++) {
721                 sb.append(' ');
722             }
723             newIndent = sb.toString();
724         }
725 
726         // limit maximum indent to half of maximum line length
727         if (maxLength > 0 && newIndent.length() > maxLength / 2) {
728             fakeIndents++;
729         }
730         else this.indentString = newIndent;
731 
732     }
733 
734 
decrementIndent()735     void decrementIndent() {
736 
737         if (indent == 0) return;
738         else if (fakeIndents > 0) fakeIndents--;
739         else {
740             indentString = indentString.substring(
741               0, indentString.length()-indent
742             );
743         }
744 
745     }
746 
747 
getEncoding()748     String getEncoding() {
749         return this.encoding;
750     }
751 
752 
753     /**
754      * <p>
755      * Returns the String used as a line separator.
756      * This is always "\n", "\r", or "\r\n".
757      * </p>
758      *
759      * @return the line separator
760      */
getLineSeparator()761     String getLineSeparator() {
762         return lineSeparator;
763     }
764 
765 
766     /**
767      * <p>
768      * Sets the lineSeparator. This
769      * can only be one of the three
770      * strings "\n", "\r", or "\r\n".
771      * All other values are forbidden.
772      * </p>
773      *
774      * @param lineSeparator the lineSeparator to set
775      *
776      * @throws IllegalArgumentException if you attempt to use
777      *      any line separator other than "\n", "\r", or "\r\n".
778      *
779      */
setLineSeparator(String lineSeparator)780     void setLineSeparator(String lineSeparator) {
781 
782         if (lineSeparator.equals("\n")
783           || lineSeparator.equals("\r")
784           || lineSeparator.equals("\r\n")) {
785             this.lineSeparator = lineSeparator;
786             this.lineSeparatorSet = true;
787         }
788         else {
789             throw new IllegalArgumentException(
790               "Illegal Line Separator");
791         }
792 
793     }
794 
795 
setInDocType(boolean inDocType)796     void setInDocType(boolean inDocType) {
797         this.inDocType = inDocType;
798     }
799 
800 
801     /**
802      * <p>
803      * Returns the number of spaces this serializer indents.
804      * </p>
805      *
806      * @return the number of spaces this serializer indents
807      */
getIndent()808     int getIndent() {
809         return indent;
810     }
811 
812 
813     /**
814      * <p>
815      * Returns the maximum line length.
816      * </p>
817      *
818      * @return the maximum line length.
819      */
getMaxLength()820     int getMaxLength() {
821         return maxLength;
822     }
823 
824     /**
825      * <p>
826      * Sets the suggested maximum line length for this serializer.
827      * In some circumstances this may not be respected.
828      * </p>
829      *
830      * @param maxLength the maxLength to set
831      */
setMaxLength(int maxLength)832     void setMaxLength(int maxLength) {
833         if (maxLength < 0) maxLength = 0;
834         this.maxLength = maxLength;
835     }
836 
837 
838    /**
839      * <p>
840      * Sets the number of spaces to indent each successive level in the
841      *  hierarchy. Use 0 for no extra indenting.
842      * </p>
843      *
844      * @param indent the indent to set
845      */
setIndent(int indent)846     void setIndent(int indent) {
847         this.indent = indent;
848     }
849 
850 
flush()851     void flush() throws IOException {
852         out.flush();
853     }
854 
855 
    /**
     * <p>
     * Reports whether a character must be escaped before it is
     * written. <code>writePCDATA</code> and
     * <code>writeAttributeValue</code> write such characters as
     * hexadecimal character references; <code>writeMarkup</code>
     * throws an <code>UnavailableCharacterException</code> for them
     * instead.
     * </p>
     *
     * @param c the character to test
     *
     * @return true if the character needs to be escaped
     */
    abstract boolean needsEscaping(char c);
857 
858 
859     /**
860      * <p>
861      *  Used to track the current status of xml:space.
862      *  This is false by default, unless an xml:space="preserve"
863      *  attribute is in-scope. When such an attribute is in-scope,
864      *  white space is not adjusted even if indenting and/or
865      *  a maximum line length has been requested.
866      * </p>
867      *
868      *
869      * @return true if an <code>xml:space="true"</code> attribute
870      *      is in-scope
871      */
isPreserveSpace()872     boolean isPreserveSpace() {
873         return preserveSpace;
874     }
875 
876 
877     /**
878      * @param preserveSpace whether to preserve all white space
879      */
setPreserveSpace(boolean preserveSpace)880     void setPreserveSpace(boolean preserveSpace) {
881         this.preserveSpace = preserveSpace;
882     }
883 
884 
885     /**
886      * @return the current column number
887      */
getColumnNumber()888     int getColumnNumber() {
889         return this.column;
890     }
891 
892 
893     /**
894      * <p>
895      *   If true, this property indicates serialization will
896      *   perform Unicode normalization on all data using normalization
897      *   form C (NFC). Performing Unicode normalization
898      *   does change the document's infoset.
899      *   The default is false; do not normalize.
900      * </p>
901      *
902      * <p>
903      *   This feature has not yet been benchmarked or optimized.
904      *   It may result in substantially slower code.
905      * </p>
906      *
907      * @param normalize true if normalization is performed;
908      *     false if it isn't.
909      */
setNFC(boolean normalize)910     void setNFC(boolean normalize) {
911         this.normalize = normalize;
912     }
913 
914 
915     /**
916      * <p>
917      *   If true, this property indicates serialization will
918      *   perform Unicode normalization on all data using normalization
919      *   form C (NFC). The default is false; do not normalize.
920      * </p>
921      *
922      * @return true if this serialization performs Unicode
923      *     normalization; false if it doesn't.
924      */
getNFC()925     boolean getNFC() {
926         return this.normalize;
927     }
928 
929 
writeName(String name)930     void writeName(String name) throws IOException {
931         writeMarkup(name);
932     }
933 
934 
935 }