/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.xml.internal.serializer;

import java.io.UnsupportedEncodingException;

/**
 * Holds information about a given encoding: the ISO name of the encoding
 * and the equivalent Java name.
 * <p>
 * An object of this type has two useful methods
 * <pre>
 * isInEncoding(char ch);
 * </pre>
 * which can be called if the character is not the high one in
 * a surrogate pair and:
 * <pre>
 * isInEncoding(char high, char low);
 * </pre>
 * which can be called if the two characters form a high/low surrogate pair.
 * <p>
 * The answers are kept in a binary search tree of private helper nodes.
 * Each node answers whether a character is in the encoding for a given
 * range of unicode values (<code>m_first</code> to <code>m_last</code>).
 * It handles a certain sub-range of those values explicitly
 * (<code>m_explFirst</code> to <code>m_explLast</code>).
 * If the unicode point is before that explicit range, that is, it is in the
 * range <code>m_first &lt;= value &lt; m_explFirst</code>, the node delegates
 * to another node, <code>m_before</code>, which is the root of the sub-tree
 * for that range. Likewise for values in the range
 * <code>m_explLast &lt; value &lt;= m_last</code>, but delegating to
 * <code>m_after</code>.
 * <p>
 * Actually figuring out if a code point is in the encoding is expensive. So the
 * purpose of this tree is to cache such determinations, and not to build the
 * entire tree of information at the start, but only build up as much of the
 * tree as is used during the transformation.
 * <p>
 * This Class is not a public API, and should only be used internally within
 * the serializer.
 *
 * @xsl.usage internal
 */
public final class EncodingInfo
{

    /**
     * The ISO encoding name.
     */
    final String name;

    /**
     * The name used by the Java convertor.
     */
    final String javaName;

    /**
     * A helper object that we can ask if a
     * single char, or a surrogate UTF-16 pair
     * of chars that form a single character,
     * is in this encoding. Created lazily on first use.
     */
    private InEncoding m_encoding;

    /**
     * This is not a public API. It returns true if the
     * char in question is in the encoding.
     * @param ch the char in question.
     * @return true if the char is considered to be in the encoding.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char ch) {
        return getInEncoding().isInEncoding(ch);
    }

    /**
     * This is not a public API. It returns true if the
     * character formed by the high/low pair is in the encoding.
     * @param high a char that is the high char of a high/low surrogate pair.
     * @param low a char that is the low char of a high/low surrogate pair.
     * @return true if the character formed by the pair is considered to be
     * in the encoding.
     * @xsl.usage internal
     */
    public boolean isInEncoding(char high, char low) {
        return getInEncoding().isInEncoding(high, low);
    }

    /**
     * Returns the helper that answers the "is in encoding" questions,
     * creating it on first use.
     */
    private InEncoding getInEncoding() {
        if (m_encoding == null) {
            // One could put alternate logic in here to instantiate
            // another object that implements the InEncoding interface.
            m_encoding = new EncodingImpl();
        }
        return m_encoding;
    }

    /**
     * Create an EncodingInfo object based on the ISO name and Java name.
     * If both parameters are null any character will be considered to
     * be in the encoding. This is useful for when the serializer is in
     * temporary output state, and has no associated encoding.
     *
     * @param name reference to the ISO name.
     * @param javaName reference to the Java encoding name.
     */
    public EncodingInfo(String name, String javaName)
    {
        this.name = name;
        this.javaName = javaName;
    }

    /**
     * A simple interface to isolate the implementation.
     * <p>
     * This interface is not a public API,
     * and should only be used internally within the serializer.
     * @xsl.usage internal
     */
    private interface InEncoding {
        /**
         * Returns true if the char is in the encoding
         */
        public boolean isInEncoding(char ch);

        /**
         * Returns true if the high/low surrogate pair forms
         * a character that is in the encoding.
         */
        public boolean isInEncoding(char high, char low);
    }

    /**
     * This class implements the InEncoding interface as a node of a lazily
     * built binary search tree that caches, per unicode value, whether the
     * value is in the encoding of the enclosing EncodingInfo.
     */
    private class EncodingImpl implements InEncoding {

        /**
         * Returns true if the char is in the encoding, consulting the
         * cache first and computing (then caching) the answer otherwise.
         */
        public boolean isInEncoding(char ch1) {
            // NOTE(review): Encodings.toCodePoint is defined outside this
            // file; it is assumed to be a pure char -> code point mapping.
            final int codePoint = Encodings.toCodePoint(ch1);
            final EncodingImpl node = nodeFor(codePoint);
            final int idx = codePoint - node.m_explFirst;

            if (!node.m_alreadyKnown[idx]) {
                // We don't know the answer, so find out,
                // which may be expensive, then cache the answer
                final boolean answer = inEncoding(ch1, node.m_encoding);
                node.m_alreadyKnown[idx] = true;
                node.m_isInEncoding[idx] = answer;
            }
            return node.m_isInEncoding[idx];
        }

        /**
         * Returns true if the character formed by the high/low surrogate
         * pair is in the encoding, consulting the cache first and
         * computing (then caching) the answer otherwise.
         */
        public boolean isInEncoding(char high, char low) {
            final int codePoint = Encodings.toCodePoint(high, low);
            final EncodingImpl node = nodeFor(codePoint);
            final int idx = codePoint - node.m_explFirst;

            if (!node.m_alreadyKnown[idx]) {
                // We don't know the answer, so find out,
                // which may be expensive, then cache the answer
                final boolean answer = inEncoding(high, low, node.m_encoding);
                node.m_alreadyKnown[idx] = true;
                node.m_isInEncoding[idx] = answer;
            }
            return node.m_isInEncoding[idx];
        }

        /**
         * Walks down the tree from this node to the node whose explicit
         * range contains the given unicode value, creating the missing
         * nodes on the way.
         *
         * @param codePoint the unicode value to locate.
         * @return the node that explicitly handles the value.
         */
        private EncodingImpl nodeFor(int codePoint) {
            EncodingImpl node = this;
            while (true) {
                if (codePoint < node.m_explFirst) {
                    // The unicode value is before the range that the
                    // node explicitly manages, so go to (or make) m_before.
                    if (node.m_before == null) {
                        node.m_before =
                            new EncodingImpl(
                                node.m_encoding,
                                node.m_first,
                                node.m_explFirst - 1,
                                codePoint);
                    }
                    node = node.m_before;
                } else if (node.m_explLast < codePoint) {
                    // The unicode value is after the range that the
                    // node explicitly manages, so go to (or make) m_after.
                    if (node.m_after == null) {
                        node.m_after =
                            new EncodingImpl(
                                node.m_encoding,
                                node.m_explLast + 1,
                                node.m_last,
                                codePoint);
                    }
                    node = node.m_after;
                } else {
                    // The unicode value is in the range the node
                    // explicitly handles.
                    return node;
                }
            }
        }

        /**
         * The encoding.
         */
        private final String m_encoding;

        /**
         * m_first through m_last is the range of unicode
         * values that this object will return an answer on.
         * It may delegate to a similar object with a different
         * range
         */
        private final int m_first;
        private final int m_last;

        /**
         * m_explFirst through m_explLast is the range of unicode
         * value that this object handles explicitly and does not
         * delegate to a similar object.
         */
        private final int m_explFirst;
        private final int m_explLast;

        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range before
         * the range explicitly handled by this object, and
         * to which this object may delegate.
         */
        private EncodingImpl m_before;

        /**
         * The object, of the same type as this one,
         * that handles unicode values in a range after
         * the range explicitly handled by this object, and
         * to which this object may delegate.
         */
        private EncodingImpl m_after;

        /**
         * The number of unicode values explicitly handled
         * by a single EncodingImpl object. This value is
         * tuneable, but is set to 128 because that covers the
         * entire low range of ASCII type chars within a single
         * object.
         */
        private static final int RANGE = 128;

        /**
         * A flag to record if we already know the answer
         * for the given unicode value.
         */
        private final boolean[] m_alreadyKnown = new boolean[RANGE];

        /**
         * A table holding the answer on whether the given unicode
         * value is in the encoding.
         */
        private final boolean[] m_isInEncoding = new boolean[RANGE];

        /**
         * Creates the root node of the tree for the enclosing
         * EncodingInfo's Java encoding name.
         */
        private EncodingImpl() {
            // This object will answer whether any unicode value
            // is in the encoding, it handles values 0 through Integer.MAX_VALUE
            this(javaName, 0, Integer.MAX_VALUE, (char) 0);
        }

        /**
         * Creates a node answering for the values first through last,
         * whose explicitly handled range is the RANGE-aligned block
         * that contains codePoint.
         *
         * @param encoding the Java name of the encoding, possibly null.
         * @param first the first unicode value this node answers for.
         * @param last the last unicode value this node answers for.
         * @param codePoint a unicode value that must fall in the
         * explicitly handled range of this node.
         */
        private EncodingImpl(String encoding, int first, int last, int codePoint) {
            // Set the range of unicode values that this object manages
            // either explicitly or implicitly.
            m_first = first;
            m_last = last;

            // Set the range of unicode values that this object
            // explicitly manages. Align the explicitly managed values
            // to RANGE so multiple EncodingImpl objects don't manage the same
            // values.
            m_explFirst = codePoint / RANGE * RANGE;
            m_explLast = m_explFirst + (RANGE - 1);

            m_encoding = encoding;

            if (javaName == null) {
                /* A little bit more than optimization.
                 *
                 * We will say that any character is in the encoding if
                 * we don't have an encoding.
                 * This is meaningful when the serializer is being used
                 * in temporary output state, where we are not writing to
                 * the final output tree.  It is when writing to the
                 * final output tree that we need to worry about the output
                 * encoding.
                 *
                 * This block used to be nested inside the
                 * "javaName != null" test and therefore never ran.
                 */
                for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = true;
                }
            } else if (0 <= m_explFirst && m_explFirst <= 127) {
                // Some optimization.
                // This particular EncodingImpl explicitly handles
                // characters in the low range.
                if ("UTF8".equals(javaName)
                    || "UTF-16".equals(javaName)
                    || "ASCII".equals(javaName)
                    || "US-ASCII".equals(javaName)
                    || "Unicode".equals(javaName)
                    || "UNICODE".equals(javaName)
                    || javaName.startsWith("ISO8859")) {

                    // Not only does this EncodingImpl object explicitly
                    // handle characters in the low range, it is
                    // also one that we know something about, without
                    // needing to call inEncoding(char ch, String encoding)
                    // for this low range.
                    //
                    // By initializing the table ahead of time
                    // for the values 1 through 126, we prevent the expensive
                    // inEncoding(char ch, String encoding)
                    // from being called, at least for these common
                    // encodings.
                    for (int unicode = 1; unicode < 127; unicode++) {
                        final int idx = unicode - m_explFirst;
                        if (0 <= idx && idx < RANGE) {
                            m_alreadyKnown[idx] = true;
                            m_isInEncoding[idx] = true;
                        }
                    }
                }
            }
        }
    }

    /**
     * This is heart of the code that determines if a given character
     * is in the given encoding. This method is probably expensive,
     * and the answer should be cached.
     * <p>
     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param ch the char in question, that is not a high char of
     * a high/low surrogate pair.
     * @param encoding the Java name of the encoding; if null every
     * character is considered to be in the encoding.
     * @return true if the char is considered to be in the encoding.
     *
     * @xsl.usage internal
     *
     */
    private static boolean inEncoding(char ch, String encoding) {
        return canEncode(String.valueOf(ch), ch, encoding);
    }

    /**
     * This is heart of the code that determines if a given high/low
     * surrogate pair forms a character that is in the given encoding.
     * This method is probably expensive, and the answer should be cached.
     * <p>
     * This method is not a public API,
     * and should only be used internally within the serializer.
     * @param high the high char of
     * a high/low surrogate pair.
     * @param low the low char of a high/low surrogate pair.
     * @param encoding the Java name of the encoding; if null every
     * character is considered to be in the encoding.
     * @return true if the character formed by the pair is considered to be
     * in the encoding.
     *
     * @xsl.usage internal
     *
     */
    private static boolean inEncoding(char high, char low, String encoding) {
        return canEncode(new String(new char[] { high, low }), high, encoding);
    }

    /**
     * Encodes the text with the named charset and inspects the bytes
     * produced to decide whether the character is in the encoding.
     *
     * @param text the single character, as one char or as a high/low
     * surrogate pair.
     * @param first the first (or only) char of text.
     * @param encoding the Java name of the encoding, possibly null.
     * @return true if the character is considered to be in the encoding.
     */
    private static boolean canEncode(String text, char first, String encoding) {
        // If for some reason the encoding is null, e.g.
        // for a temporary result tree, we should just
        // say that every character is in the encoding.
        if (encoding == null) {
            return true;
        }
        try {
            // Encode the String into a sequence of bytes
            // using the given, named charset.
            return inEncoding(first, text.getBytes(encoding));
        } catch (UnsupportedEncodingException e) {
            // The named charset is not supported, so nothing can be
            // said to be in it.
            return false;
        } catch (RuntimeException e) {
            // Best effort: a converter that fails on this input is
            // treated as not being able to encode the character.
            return false;
        }
    }

    /**
     * This method is the core of determining if character
     * is in the encoding. The method is not foolproof, because
     * s.getBytes(encoding) has specified behavior only if the
     * characters are in the specified encoding. However this
     * method tries its best.
     * @param ch the char that was converted using getBytes, or
     * the first char of a high/low pair that was converted.
     * @param data the bytes written out by the call to s.getBytes(encoding);
     * @return true if the character is in the encoding.
     */
    private static boolean inEncoding(char ch, byte[] data) {
        // If the string written out as data is not in the encoding,
        // the output is not specified according to the documentation
        // on the String.getBytes(encoding) method,
        // but we do our best here.
        if (data == null || data.length == 0) {
            return false;
        }
        if (data[0] == 0) {
            return false;
        }
        if (data[0] == '?' && ch != '?') {
            // A '?' that was not asked for is taken to be the
            // converter's substitution for an unmappable character.
            return false;
        }
        // A workaround for Japanese converters (EUC-JP, EUC_JP, SJIS) that
        // rejected a first byte of 0x21 and the char 0xA5 was disabled in
        // the original code and is intentionally left out here.

        // We don't know for sure, but it looks like it is in the encoding
        return true;
    }

}