/* Copyright 2002, 2003, 2005, 2006 Elliotte Rusty Harold

   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307  USA

   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@ibiblio.org. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/

package nu.xom;

import java.io.IOException;
import java.io.Writer;

/**
 * A <code>TextWriter</code> whose string-level write methods first scan
 * the text. When the scan finds nothing that needs per-character
 * handling, the whole string is passed to the underlying writer in one
 * call; otherwise the string is written one <code>char</code> at a
 * time through the inherited single-character methods.
 *
 * @author Elliotte Rusty Harold
 * @version 1.2d1
 *
 */
final class UnicodeWriter extends TextWriter {

    UnicodeWriter(Writer out, String encoding) {
        super(out, encoding);
    }

    /**
     * Always returns <code>false</code>.
     *
     * @see nu.xom.TextWriter#needsEscaping(char)
     */
    boolean needsEscaping(char c) {
        return false;
    }


    /**
     * Writes markup text, in bulk when the scan allows it.
     *
     * @param s the markup to write
     *
     * @throws IOException if the underlying writer fails
     */
    void writeMarkup(String s) throws IOException {

        String text = normalize ? normalize(s) : s;
        int count = getUnicodeLengthForMarkup(text);

        if (count < 0) {
            // Something in here needs individual attention:
            // write character by character.
            int total = text.length();
            for (int pos = 0; pos < total; pos++) {
                writeMarkup(text.charAt(pos));
            }
            return;
        }

        out.write(text);
        // An empty string leaves the line-tracking state alone.
        if (count != 0) {
            column += count;
            lastCharacterWasSpace = false;
            skipFollowingLinefeed = false;
            justBroke = false;
        }

    }


    /**
     * Writes a name in a single call. Names don't contain white space,
     * so no per-character fallback is attempted here.
     *
     * @param name the name to write
     *
     * @throws IOException if the underlying writer fails
     */
    void writeName(String name) throws IOException {

        String text = normalize ? normalize(name) : name;
        // Measure before writing, exactly as the column bookkeeping expects.
        int count = getUnicodeLengthForName(text);

        out.write(text);
        column += count;
        lastCharacterWasSpace = false;
        skipFollowingLinefeed = false;
        justBroke = false;

    }


    /*
     * This is tricky. This method does two jobs at once:
     *
     * 1. It counts the number of Unicode characters in s.
     * 2. It checks whether the text contains anything
     *    that might need special handling.
     *
     * In the second case it returns -1; otherwise it returns
     * the number of characters.
     */
    private static int getUnicodeLengthForMarkup(String s) {

        int count = 0;
        int total = s.length();
        // Benchmarking showed toCharArray to be a little slower than charAt.
        for (int pos = 0; pos < total; pos++) {
            char c = s.charAt(pos);
            if (c <= ' ') {
                // Really this is only testing for \t, \n, and space.
                // However all other characters less than or equal to 32
                // can't appear in markup sections.
                // These characters cause an adjustment of
                // lastCharacterWasSpace, skipFollowingLinefeed, and justBroke.
                // They may need to be escaped but only in doctype declarations.
                // Should these have their own writeDoctypeDeclaration method????
                // Also an issue with spaces and such in PIs, the XML
                // declaration, and comments.
                return -1;
            }
            // Count the low surrogates but skip the high surrogates
            // so surrogate pairs aren't counted twice.
            boolean highSurrogate = c >= 0xD800 && c <= 0xDBFF;
            if (!highSurrogate) {
                count++;
            }
        }
        return count;

    }


    /*
     * Counts the Unicode characters in a name. A surrogate pair
     * counts once because high surrogates are skipped.
     */
    private static int getUnicodeLengthForName(String name) {

        int count = 0;
        int total = name.length();
        for (int pos = 0; pos < total; pos++) {
            char c = name.charAt(pos);
            boolean highSurrogate = c >= 0xD800 && c <= 0xDBFF;
            if (!highSurrogate) {
                count++;
            }
        }
        return count;

    }


    /**
     * Writes an attribute value, in bulk when the scan allows it.
     *
     * @param s the attribute value to write
     *
     * @throws IOException if the underlying writer fails
     */
    void writeAttributeValue(String s) throws IOException {

        String text = normalize ? normalize(s) : s;
        int count = getUnicodeLengthForAttributeValue(text);

        if (count < 0) {
            int total = text.length();
            for (int pos = 0; pos < total; pos++) {
                writeAttributeValue(text.charAt(pos));
            }
            return;
        }

        out.write(text);
        if (count != 0) {
            column += count;
            lastCharacterWasSpace = false;
            skipFollowingLinefeed = false;
            justBroke = false;
        }

    }


    // All three getUnicodeLengthForFOO methods are very similar.
    // Could the code duplication be eliminated efficiently somehow?

    /*
     * Returns the number of Unicode characters in an attribute value,
     * or -1 as soon as a tab, linefeed, carriage return, space,
     * double quote, ampersand, less-than sign, or greater-than sign
     * is seen. Throws an XMLException for characters 11, 12, and
     * 14 through 31.
     */
    private static int getUnicodeLengthForAttributeValue(String s) {

        int count = 0;
        int total = s.length();
        for (int pos = 0; pos < total; pos++) {
            char c = s.charAt(pos);

            if (c > '>') {
                // Nothing above '>' is special; just avoid counting
                // a surrogate pair twice.
                boolean highSurrogate = c >= 0xD800 && c <= 0xDBFF;
                if (!highSurrogate) {
                    count++;
                }
                continue;
            }

            switch (c) {
                case '\t':
                case '\n':
                case '\r':
                case ' ':
                case '"':
                case '&':
                case '<':
                case '>':
                    return -1;
                // Control characters noted as unreachable;
                // fail loudly if one shows up anyway.
                case 11:
                case 12:
                case 14:
                case 15:
                case 16:
                case 17:
                case 18:
                case 19:
                case 20:
                case 21:
                case 22:
                case 23:
                case 24:
                case 25:
                case 26:
                case 27:
                case 28:
                case 29:
                case 30:
                case 31:
                    throw new XMLException("Bad character snuck into document");
                default:
                    count++;
            }
        }
        return count;

    }


    /**
     * Writes character data, in bulk when the scan allows it.
     *
     * @param s the text to write
     *
     * @throws IOException if the underlying writer fails
     */
    void writePCDATA(String s) throws IOException {

        String text = normalize ? normalize(s) : s;
        int count = getUnicodeLengthForPCDATA(text);

        if (count < 0) {
            int total = text.length();
            for (int pos = 0; pos < total; pos++) {
                writePCDATA(text.charAt(pos));
            }
            return;
        }

        out.write(text);
        if (count != 0) {
            column += count;
            lastCharacterWasSpace = false;
            skipFollowingLinefeed = false;
            justBroke = false;
        }

    }


    /*
     * Returns the number of Unicode characters in character data,
     * or -1 as soon as a tab, linefeed, carriage return, space,
     * ampersand, less-than sign, or greater-than sign is seen.
     * Unlike attribute values, a double quote is counted as an
     * ordinary character. Throws an XMLException for characters
     * 11, 12, and 14 through 31.
     */
    private static int getUnicodeLengthForPCDATA(String s) {

        int count = 0;
        int total = s.length();
        for (int pos = 0; pos < total; pos++) {
            char c = s.charAt(pos);

            if (c > '>') {
                // Nothing above '>' is special; just avoid counting
                // a surrogate pair twice.
                boolean highSurrogate = c >= 0xD800 && c <= 0xDBFF;
                if (!highSurrogate) {
                    count++;
                }
                continue;
            }

            switch (c) {
                case '\t':
                case '\n':
                case '\r':
                case ' ':
                case '&':
                case '<':
                case '>':
                    return -1;
                // Control characters noted as unreachable;
                // fail loudly if one shows up anyway.
                case 11:
                case 12:
                case 14:
                case 15:
                case 16:
                case 17:
                case 18:
                case 19:
                case 20:
                case 21:
                case 22:
                case 23:
                case 24:
                case 25:
                case 26:
                case 27:
                case 28:
                case 29:
                case 30:
                case 31:
                    throw new XMLException("Bad character snuck into document");
                default:
                    count++;
            }
        }
        return count;

    }

}