1 /* Copyright 2002, 2003, 2005, 2006 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.IOException;
25 import java.io.Writer;
26 
27 /**
28  * @author Elliotte Rusty Harold
29  * @version 1.2d1
30  *
31  */
32 final class UnicodeWriter extends TextWriter {
33 
UnicodeWriter(Writer out, String encoding)34     UnicodeWriter(Writer out, String encoding) {
35         super(out, encoding);
36     }
37 
38     /**
39      * @see nu.xom.TextWriter#needsEscaping(char)
40      */
needsEscaping(char c)41     boolean needsEscaping(char c) {
42         return false;
43     }
44 
45 
writeMarkup(String s)46     void writeMarkup(String s) throws IOException {
47 
48          if (normalize) {
49              s = normalize(s);
50          }
51 
52          int unicodeStringLength = getUnicodeLengthForMarkup(s);
53          if (unicodeStringLength >= 0) {
54              out.write(s);
55              if (unicodeStringLength > 0) {
56                  column += unicodeStringLength;
57                  lastCharacterWasSpace = false;
58                  skipFollowingLinefeed = false;
59                  justBroke=false;
60              }
61          }
62          else { // write character by character
63              int length = s.length();
64              for (int i=0; i < length; i++) {
65                  writeMarkup(s.charAt(i));
66              }
67          }
68 
69     }
70 
71 
72     // Names don't contain white space
writeName(String name)73     void writeName(String name) throws IOException {
74 
75          if (normalize) {
76              name = normalize(name);
77          }
78 
79          int unicodeStringLength = getUnicodeLengthForName(name);
80          out.write(name);
81          column += unicodeStringLength;
82          lastCharacterWasSpace = false;
83          skipFollowingLinefeed = false;
84          justBroke=false;
85 
86     }
87 
88 
89     /*
90      * This is tricky. This method is doing two things:
91      *
92      * 1. It's counting the number of Unicode characters in s.
93      * 2. It's checking to see if this text contains anything
94      *    that might need to be escaped.
95      *
96      * If the latter it returns -1; otherwise it returns the number of characters.
97      */
getUnicodeLengthForMarkup(String s)98     private static int getUnicodeLengthForMarkup(String s) {
99 
100         int unicodeLength = 0;
101         int javaLength = s.length();
102         for (int i = 0; i < javaLength; i++) {
103             // Benchmarking shows using toCharArray to be a little slower than using charAt
104             char c = s.charAt(i);
105             if (c <= ' ') {
106                 // Really we're testing only for \t, \n, and space here.
107                 // However all other characters less than or equal to 32
108                 // can't appear in markup sections.
109                 // These characters cause an adjustment of
110                 // lastCharacterWasSpace, skipFollowingLinefeed, and justBroke
111                 // They may need to be escaped but only in doctype declarations.
112                 // Should these have their own writeDoctypeDeclaration method????
113                 // Also an issue with spaces and such in PIs, XML declaration, comments
114                 return -1;
115             }
116             // Count the low surrogates but skip the high surrogates
117             // so surrogate pairs aren't counted twice.
118             else if (c < 0xD800 || c > 0xDBFF) unicodeLength++;
119         }
120         return unicodeLength;
121 
122     }
123 
124 
getUnicodeLengthForName(String name)125     private static int getUnicodeLengthForName(String name) {
126 
127         int unicodeLength = 0;
128         int javaLength = name.length();
129         for (int i = 0; i < javaLength; i++) {
130             char c = name.charAt(i);
131             if (c < 0xD800 || c > 0xDBFF) unicodeLength++;
132         }
133         return unicodeLength;
134 
135     }
136 
137 
writeAttributeValue(String s)138     void writeAttributeValue(String s) throws IOException {
139 
140          if (normalize) {
141              s = normalize(s);
142          }
143          int unicodeStringLength = getUnicodeLengthForAttributeValue(s);
144          if (unicodeStringLength >= 0) {
145              out.write(s);
146              if (unicodeStringLength > 0) {
147                  column += unicodeStringLength;
148                  lastCharacterWasSpace = false;
149                  skipFollowingLinefeed = false;
150                  justBroke=false;
151              }
152          }
153          else {
154              int length = s.length();
155              for (int i=0; i < length; i++) {
156                  writeAttributeValue(s.charAt(i));
157              }
158          }
159 
160      }
161 
162 
163     // All three getUnicodeLengthForFOO methods are very similar.
164     // Could the code duplciation be eliminated efficiently somehow?
getUnicodeLengthForAttributeValue(String s)165     private static int getUnicodeLengthForAttributeValue(String s) {
166 
167         int unicodeLength = 0;
168         int javaLength = s.length();
169         for (int i = 0; i < javaLength; i++) {
170             char c = s.charAt(i);
171             switch (c) {
172                 case '\t': return -1;
173                 case '\n': return -1;
174                 case   11: // unreachable
175                 case   12: throw new XMLException("Bad character snuck into document");
176                 case '\r': return -1;
177                 case 14: // unreachable
178                 case 15: // unreachable
179                 case 16: // unreachable
180                 case 17: // unreachable
181                 case 18: // unreachable
182                 case 19: // unreachable
183                 case 20: // unreachable
184                 case 21: // unreachable
185                 case 22: // unreachable
186                 case 23: // unreachable
187                 case 24: // unreachable
188                 case 25: // unreachable
189                 case 26: // unreachable
190                 case 27: // unreachable
191                 case 28: // unreachable
192                 case 29: // unreachable
193                 case 30: // unreachable
194                 case 31: // unreachable
195                     throw new XMLException("Bad character snuck into document");
196                 case ' ':  return -1;
197                 case '!':
198                     unicodeLength++;
199                     break;
200                 case '"':
201                     return -1;
202                 case '#':
203                     unicodeLength++;
204                     break;
205                 case '$':
206                     unicodeLength++;
207                     break;
208                 case '%':
209                     unicodeLength++;
210                     break;
211                 case '&':
212                     return -1;
213                 case '\'':
214                     unicodeLength++;
215                     break;
216                 case '(':
217                     unicodeLength++;
218                     break;
219                 case ')':
220                     unicodeLength++;
221                     break;
222                 case '*':
223                     unicodeLength++;
224                     break;
225                 case '+':
226                     unicodeLength++;
227                     break;
228                 case ',':
229                     unicodeLength++;
230                     break;
231                 case '-':
232                     unicodeLength++;
233                     break;
234                 case '.':
235                     unicodeLength++;
236                     break;
237                 case '/':
238                     unicodeLength++;
239                     break;
240                 case '0':
241                     unicodeLength++;
242                     break;
243                 case '1':
244                     unicodeLength++;
245                     break;
246                 case '2':
247                     unicodeLength++;
248                     break;
249                 case '3':
250                     unicodeLength++;
251                     break;
252                 case '4':
253                     unicodeLength++;
254                     break;
255                 case '5':
256                     unicodeLength++;
257                     break;
258                 case '6':
259                     unicodeLength++;
260                     break;
261                 case '7':
262                     unicodeLength++;
263                     break;
264                 case '8':
265                     unicodeLength++;
266                     break;
267                 case '9':
268                     unicodeLength++;
269                     break;
270                 case ':':
271                     unicodeLength++;
272                     break;
273                 case ';':
274                     unicodeLength++;
275                     break;
276                 case '<':
277                     return -1;
278                 case '=':
279                     unicodeLength++;
280                     break;
281                 case '>':
282                     return -1;
283                 default:
284                     if (c < 0xd800 || c > 0xDBFF) unicodeLength++;
285             }
286         }
287         return unicodeLength;
288 
289      }
290 
291 
writePCDATA(String s)292      void writePCDATA(String s) throws IOException {
293 
294          if (normalize) {
295              s = normalize(s);
296          }
297 
298          int unicodeStringLength = getUnicodeLengthForPCDATA(s);
299          if (unicodeStringLength >= 0) {
300              out.write(s);
301              if (unicodeStringLength > 0) {
302                  column += unicodeStringLength;
303                  lastCharacterWasSpace = false;
304                  skipFollowingLinefeed = false;
305                  justBroke=false;
306              }
307          }
308          else {
309              int length = s.length();
310              for (int i=0; i < length; i++) {
311                  writePCDATA(s.charAt(i));
312              }
313          }
314 
315     }
316 
317 
getUnicodeLengthForPCDATA(String s)318     private static int getUnicodeLengthForPCDATA(String s) {
319 
320         int unicodeLength = 0;
321         int javaLength = s.length();
322         for (int i = 0; i < javaLength; i++) {
323             char c = s.charAt(i);
324             switch (c) {
325                 case '\t': return -1;
326                 case '\n': return -1;
327                 case   11: // unreachable
328                 case   12: throw new XMLException("Bad character snuck into document");
329                 case '\r': return -1;
330                 case 14: // unreachable
331                 case 15: // unreachable
332                 case 16: // unreachable
333                 case 17: // unreachable
334                 case 18: // unreachable
335                 case 19: // unreachable
336                 case 20: // unreachable
337                 case 21: // unreachable
338                 case 22: // unreachable
339                 case 23: // unreachable
340                 case 24: // unreachable
341                 case 25: // unreachable
342                 case 26: // unreachable
343                 case 27: // unreachable
344                 case 28: // unreachable
345                 case 29: // unreachable
346                 case 30: // unreachable
347                 case 31: // unreachable
348                     throw new XMLException("Bad character snuck into document");
349                 case ' ':  return -1;
350                 case '!':
351                     unicodeLength++;
352                     break;
353                 case '"':
354                     unicodeLength++;
355                     break;
356                 case '#':
357                     unicodeLength++;
358                     break;
359                 case '$':
360                     unicodeLength++;
361                     break;
362                 case '%':
363                     unicodeLength++;
364                     break;
365                 case '&':
366                     return -1;
367                 case '\'':
368                     unicodeLength++;
369                     break;
370                 case '(':
371                     unicodeLength++;
372                     break;
373                 case ')':
374                     unicodeLength++;
375                     break;
376                 case '*':
377                     unicodeLength++;
378                     break;
379                 case '+':
380                     unicodeLength++;
381                     break;
382                 case ',':
383                     unicodeLength++;
384                     break;
385                 case '-':
386                     unicodeLength++;
387                     break;
388                 case '.':
389                     unicodeLength++;
390                     break;
391                 case '/':
392                     unicodeLength++;
393                     break;
394                 case '0':
395                     unicodeLength++;
396                     break;
397                 case '1':
398                     unicodeLength++;
399                     break;
400                 case '2':
401                     unicodeLength++;
402                     break;
403                 case '3':
404                     unicodeLength++;
405                     break;
406                 case '4':
407                     unicodeLength++;
408                     break;
409                 case '5':
410                     unicodeLength++;
411                     break;
412                 case '6':
413                     unicodeLength++;
414                     break;
415                 case '7':
416                     unicodeLength++;
417                     break;
418                 case '8':
419                     unicodeLength++;
420                     break;
421                 case '9':
422                     unicodeLength++;
423                     break;
424                 case ':':
425                     unicodeLength++;
426                     break;
427                 case ';':
428                     unicodeLength++;
429                     break;
430                 case '<':
431                     return -1;
432                 case '=':
433                     unicodeLength++;
434                     break;
435                 case '>':
436                     return -1;
437                 default:
438                     if (c < 0xd800 || c > 0xDBFF) unicodeLength++;
439             }
440         }
441         return unicodeLength;
442 
443     }
444 
445 }
446