1 /*
2  * reserved comment block
3  * DO NOT REMOVE OR ALTER!
4  */
5 /*
6  * Licensed to the Apache Software Foundation (ASF) under one or more
7  * contributor license agreements.  See the NOTICE file distributed with
8  * this work for additional information regarding copyright ownership.
9  * The ASF licenses this file to You under the Apache License, Version 2.0
10  * (the "License"); you may not use this file except in compliance with
11  * the License.  You may obtain a copy of the License at
12  *
13  *      http://www.apache.org/licenses/LICENSE-2.0
14  *
15  * Unless required by applicable law or agreed to in writing, software
16  * distributed under the License is distributed on an "AS IS" BASIS,
17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18  * See the License for the specific language governing permissions and
19  * limitations under the License.
20  */
21 
22 package com.sun.org.apache.xml.internal.serializer;
23 
24 import java.io.UnsupportedEncodingException;
25 
26 /**
 * Holds information about a given encoding: the Java name for the
 * encoding and the equivalent ISO name.
29  * <p>
30  * An object of this type has two useful methods
31  * <pre>
32  * isInEncoding(char ch);
33  * </pre>
34  * which can be called if the character is not the high one in
35  * a surrogate pair and:
36  * <pre>
37  * isInEncoding(char high, char low);
38  * </pre>
 * which can be called if the two characters form a high/low surrogate pair.
40  * <p>
41  * An EncodingInfo object is a node in a binary search tree. Such a node
42  * will answer if a character is in the encoding, and do so for a given
43  * range of unicode values (<code>m_first</code> to
44  * <code>m_last</code>). It will handle a certain range of values
45  * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
 * If the unicode point is before that explicit range, that is, it
 * is in the range {@code m_first <= value < m_explFirst}, then it will
 * delegate to another node of the same kind, <code>m_before</code>, which is
 * the root of the sub-tree for that lower range.  Likewise for values in the
 * range {@code m_explLast < value <= m_last}, but delegating to
 * <code>m_after</code>.
50  * <p>
51  * Actually figuring out if a code point is in the encoding is expensive. So the
52  * purpose of this tree is to cache such determinations, and not to build the
53  * entire tree of information at the start, but only build up as much of the
54  * tree as is used during the transformation.
55  * <p>
56  * This Class is not a public API, and should only be used internally within
57  * the serializer.
58  *
59  * @xsl.usage internal
60  */
61 public final class EncodingInfo extends Object
62 {
63 
64     /**
65      * The ISO encoding name.
66      */
67     final String name;
68 
69     /**
70      * The name used by the Java convertor.
71      */
72     final String javaName;
73 
74     /**
75      * A helper object that we can ask if a
76      * single char, or a surrogate UTF-16 pair
77      * of chars that form a single character,
78      * is in this encoding.
79      */
80     private InEncoding m_encoding;
81 
82     /**
83      * This is not a public API. It returns true if the
84      * char in question is in the encoding.
85      * @param ch the char in question.
86      * @xsl.usage internal
87      */
isInEncoding(char ch)88     public boolean isInEncoding(char ch) {
89         if (m_encoding == null) {
90             m_encoding = new EncodingImpl();
91 
92             // One could put alternate logic in here to
93             // instantiate another object that implements the
94             // InEncoding interface. For example if the JRE is 1.4 or up
95             // we could have an object that uses JRE 1.4 methods
96         }
97         return m_encoding.isInEncoding(ch);
98     }
99 
100     /**
101      * This is not a public API. It returns true if the
102      * character formed by the high/low pair is in the encoding.
103      * @param high a char that the a high char of a high/low surrogate pair.
104      * @param low a char that is the low char of a high/low surrogate pair.
105      * @xsl.usage internal
106      */
isInEncoding(char high, char low)107     public boolean isInEncoding(char high, char low) {
108         if (m_encoding == null) {
109             m_encoding = new EncodingImpl();
110 
111             // One could put alternate logic in here to
112             // instantiate another object that implements the
113             // InEncoding interface. For example if the JRE is 1.4 or up
114             // we could have an object that uses JRE 1.4 methods
115         }
116         return m_encoding.isInEncoding(high, low);
117     }
118 
119     /**
120      * Create an EncodingInfo object based on the ISO name and Java name.
121      * If both parameters are null any character will be considered to
122      * be in the encoding. This is useful for when the serializer is in
123      * temporary output state, and has no assciated encoding.
124      *
125      * @param name reference to the ISO name.
126      * @param javaName reference to the Java encoding name.
127      */
EncodingInfo(String name, String javaName)128     public EncodingInfo(String name, String javaName)
129     {
130 
131         this.name = name;
132         this.javaName = javaName;
133     }
134 
135 
136 
137     /**
138      * A simple interface to isolate the implementation.
139      * We could also use some new JRE 1.4 methods in another implementation
140      * provided we use reflection with them.
141      * <p>
142      * This interface is not a public API,
143      * and should only be used internally within the serializer.
144      * @xsl.usage internal
145      */
146     private interface InEncoding {
147         /**
148          * Returns true if the char is in the encoding
149          */
isInEncoding(char ch)150         public boolean isInEncoding(char ch);
151         /**
152          * Returns true if the high/low surrogate pair forms
153          * a character that is in the encoding.
154          */
isInEncoding(char high, char low)155         public boolean isInEncoding(char high, char low);
156     }
157 
158     /**
159      * This class implements the
160      */
161     private class EncodingImpl implements InEncoding {
162 
163 
164 
isInEncoding(char ch1)165         public boolean isInEncoding(char ch1) {
166             final boolean ret;
167             int codePoint = Encodings.toCodePoint(ch1);
168             if (codePoint < m_explFirst) {
169                 // The unicode value is before the range
170                 // that we explictly manage, so we delegate the answer.
171 
172                 // If we don't have an m_before object to delegate to, make one.
173                 if (m_before == null)
174                     m_before =
175                         new EncodingImpl(
176                             m_encoding,
177                             m_first,
178                             m_explFirst - 1,
179                             codePoint);
180                 ret = m_before.isInEncoding(ch1);
181             } else if (m_explLast < codePoint) {
182                 // The unicode value is after the range
183                 // that we explictly manage, so we delegate the answer.
184 
185                 // If we don't have an m_after object to delegate to, make one.
186                 if (m_after == null)
187                     m_after =
188                         new EncodingImpl(
189                             m_encoding,
190                             m_explLast + 1,
191                             m_last,
192                             codePoint);
193                 ret = m_after.isInEncoding(ch1);
194             } else {
195                 // The unicode value is in the range we explitly handle
196                 final int idx = codePoint - m_explFirst;
197 
198                 // If we already know the answer, just return it.
199                 if (m_alreadyKnown[idx])
200                     ret = m_isInEncoding[idx];
201                 else {
202                     // We don't know the answer, so find out,
203                     // which may be expensive, then cache the answer
204                     ret = inEncoding(ch1, m_encoding);
205                     m_alreadyKnown[idx] = true;
206                     m_isInEncoding[idx] = ret;
207                 }
208             }
209             return ret;
210         }
211 
isInEncoding(char high, char low)212         public boolean isInEncoding(char high, char low) {
213             final boolean ret;
214             int codePoint = Encodings.toCodePoint(high,low);
215             if (codePoint < m_explFirst) {
216                 // The unicode value is before the range
217                 // that we explictly manage, so we delegate the answer.
218 
219                 // If we don't have an m_before object to delegate to, make one.
220                 if (m_before == null)
221                     m_before =
222                         new EncodingImpl(
223                             m_encoding,
224                             m_first,
225                             m_explFirst - 1,
226                             codePoint);
227                 ret = m_before.isInEncoding(high,low);
228             } else if (m_explLast < codePoint) {
229                 // The unicode value is after the range
230                 // that we explictly manage, so we delegate the answer.
231 
232                 // If we don't have an m_after object to delegate to, make one.
233                 if (m_after == null)
234                     m_after =
235                         new EncodingImpl(
236                             m_encoding,
237                             m_explLast + 1,
238                             m_last,
239                             codePoint);
240                 ret = m_after.isInEncoding(high,low);
241             } else {
242                 // The unicode value is in the range we explitly handle
243                 final int idx = codePoint - m_explFirst;
244 
245                 // If we already know the answer, just return it.
246                 if (m_alreadyKnown[idx])
247                     ret = m_isInEncoding[idx];
248                 else {
249                     // We don't know the answer, so find out,
250                     // which may be expensive, then cache the answer
251                     ret = inEncoding(high, low, m_encoding);
252                     m_alreadyKnown[idx] = true;
253                     m_isInEncoding[idx] = ret;
254                 }
255             }
256             return ret;
257         }
258 
259         /**
260          * The encoding.
261          */
262         final private String m_encoding;
263         /**
264          * m_first through m_last is the range of unicode
265          * values that this object will return an answer on.
266          * It may delegate to a similar object with a different
267          * range
268          */
269         final private int m_first;
270 
271         /**
272          * m_explFirst through m_explLast is the range of unicode
273          * value that this object handles explicitly and does not
274          * delegate to a similar object.
275          */
276         final private int m_explFirst;
277         final private int m_explLast;
278         final private int m_last;
279 
280         /**
281          * The object, of the same type as this one,
282          * that handles unicode values in a range before
283          * the range explictly handled by this object, and
284          * to which this object may delegate.
285          */
286         private InEncoding m_before;
287         /**
288          * The object, of the same type as this one,
289          * that handles unicode values in a range after
290          * the range explictly handled by this object, and
291          * to which this object may delegate.
292          */
293         private InEncoding m_after;
294 
295         /**
296          * The number of unicode values explicitly handled
297          * by a single EncodingInfo object. This value is
298          * tuneable, but is set to 128 because that covers the
299          * entire low range of ASCII type chars within a single
300          * object.
301          */
302         private static final int RANGE = 128;
303 
304         /**
305          * A flag to record if we already know the answer
306          * for the given unicode value.
307          */
308         final private boolean m_alreadyKnown[] = new boolean[RANGE];
309         /**
310          * A table holding the answer on whether the given unicode
311          * value is in the encoding.
312          */
313         final private boolean m_isInEncoding[] = new boolean[RANGE];
314 
EncodingImpl()315         private EncodingImpl() {
316             // This object will answer whether any unicode value
317             // is in the encoding, it handles values 0 through Integer.MAX_VALUE
318             this(javaName, 0, Integer.MAX_VALUE, (char) 0);
319         }
320 
EncodingImpl(String encoding, int first, int last, int codePoint)321         private EncodingImpl(String encoding, int first, int last, int codePoint) {
322             // Set the range of unicode values that this object manages
323             // either explicitly or implicitly.
324             m_first = first;
325             m_last = last;
326 
327             // Set the range of unicode values that this object
328             // explicitly manages. Align the explicitly managed values
329             // to RANGE so multiple EncodingImpl objects dont manage the same
330             // values.
331             m_explFirst = codePoint / RANGE * RANGE;
332             m_explLast = m_explFirst + (RANGE-1);
333 
334             m_encoding = encoding;
335 
336             if (javaName != null)
337             {
338                 // Some optimization.
339                 if (0 <= m_explFirst && m_explFirst <= 127) {
340                     // This particular EncodingImpl explicitly handles
341                     // characters in the low range.
342                     if ("UTF8".equals(javaName)
343                         || "UTF-16".equals(javaName)
344                         || "ASCII".equals(javaName)
345                         || "US-ASCII".equals(javaName)
346                         || "Unicode".equals(javaName)
347                         || "UNICODE".equals(javaName)
348                         || javaName.startsWith("ISO8859")) {
349 
350                         // Not only does this EncodingImpl object explicitly
351                         // handle chracters in the low range, it is
352                         // also one that we know something about, without
353                         // needing to call inEncoding(char ch, String encoding)
354                         // for this low range
355                         //
356                         // By initializing the table ahead of time
357                         // for these low values, we prevent the expensive
358                         // inEncoding(char ch, String encoding)
359                         // from being called, at least for these common
360                         // encodings.
361                         for (int unicode = 1; unicode < 127; unicode++) {
362                             final int idx = unicode - m_explFirst;
363                             if (0 <= idx && idx < RANGE) {
364                                 m_alreadyKnown[idx] = true;
365                                 m_isInEncoding[idx] = true;
366                             }
367                         }
368                     }
369                 }
370 
371                 /* A little bit more than optimization.
372                  *
373                  * We will say that any character is in the encoding if
374                  * we don't have an encoding.
375                  * This is meaningful when the serializer is being used
376                  * in temporary output state, where we are not writing to
377                  * the final output tree.  It is when writing to the
378                  * final output tree that we need to worry about the output
379                  * encoding
380                  */
381                 if (javaName == null) {
382                     for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
383                         m_alreadyKnown[idx] = true;
384                         m_isInEncoding[idx] = true;
385                     }
386                 }
387             }
388         }
389     }
390 
391     /**
392      * This is heart of the code that determines if a given character
393      * is in the given encoding. This method is probably expensive,
394      * and the answer should be cached.
395      * <p>
396      * This method is not a public API,
397      * and should only be used internally within the serializer.
398      * @param ch the char in question, that is not a high char of
399      * a high/low surrogate pair.
400      * @param encoding the Java name of the enocding.
401      *
402      * @xsl.usage internal
403      *
404      */
inEncoding(char ch, String encoding)405     private static boolean inEncoding(char ch, String encoding) {
406         boolean isInEncoding;
407         try {
408             char cArray[] = new char[1];
409             cArray[0] = ch;
410             // Construct a String from the char
411             String s = new String(cArray);
412             // Encode the String into a sequence of bytes
413             // using the given, named charset.
414             byte[] bArray = s.getBytes(encoding);
415             isInEncoding = inEncoding(ch, bArray);
416 
417         } catch (Exception e) {
418             isInEncoding = false;
419 
420             // If for some reason the encoding is null, e.g.
421             // for a temporary result tree, we should just
422             // say that every character is in the encoding.
423             if (encoding == null)
424                 isInEncoding = true;
425         }
426         return isInEncoding;
427     }
428 
429     /**
430      * This is heart of the code that determines if a given high/low
431      * surrogate pair forms a character that is in the given encoding.
432      * This method is probably expensive, and the answer should be cached.
433      * <p>
434      * This method is not a public API,
435      * and should only be used internally within the serializer.
436      * @param high the high char of
437      * a high/low surrogate pair.
438      * @param low the low char of a high/low surrogate pair.
439      * @param encoding the Java name of the encoding.
440      *
441      * @xsl.usage internal
442      *
443      */
inEncoding(char high, char low, String encoding)444     private static boolean inEncoding(char high, char low, String encoding) {
445         boolean isInEncoding;
446         try {
447             char cArray[] = new char[2];
448             cArray[0] = high;
449             cArray[1] = low;
450             // Construct a String from the char
451             String s = new String(cArray);
452             // Encode the String into a sequence of bytes
453             // using the given, named charset.
454             byte[] bArray = s.getBytes(encoding);
455             isInEncoding = inEncoding(high,bArray);
456         } catch (Exception e) {
457             isInEncoding = false;
458         }
459 
460         return isInEncoding;
461     }
462 
463     /**
464      * This method is the core of determining if character
465      * is in the encoding. The method is not foolproof, because
466      * s.getBytes(encoding) has specified behavior only if the
467      * characters are in the specified encoding. However this
468      * method tries it's best.
469      * @param ch the char that was converted using getBytes, or
470      * the first char of a high/low pair that was converted.
471      * @param data the bytes written out by the call to s.getBytes(encoding);
472      * @return true if the character is in the encoding.
473      */
inEncoding(char ch, byte[] data)474     private static boolean inEncoding(char ch, byte[] data) {
475         final boolean isInEncoding;
476         // If the string written out as data is not in the encoding,
477         // the output is not specified according to the documentation
478         // on the String.getBytes(encoding) method,
479         // but we do our best here.
480         if (data==null || data.length == 0) {
481             isInEncoding = false;
482         }
483         else {
484             if (data[0] == 0)
485                 isInEncoding = false;
486             else if (data[0] == '?' && ch != '?')
487                 isInEncoding = false;
488             /*
489              * else if (isJapanese) {
490              *   // isJapanese is really
491              *   //   (    "EUC-JP".equals(javaName)
492              *   //    ||  "EUC_JP".equals(javaName)
493              *  //     ||  "SJIS".equals(javaName)   )
494              *
495              *   // Work around some bugs in JRE for Japanese
496              *   if(data[0] == 0x21)
497              *     isInEncoding = false;
498              *   else if (ch == 0xA5)
499              *     isInEncoding = false;
500              *   else
501              *     isInEncoding = true;
502              * }
503              */
504 
505             else {
506                 // We don't know for sure, but it looks like it is in the encoding
507                 isInEncoding = true;
508             }
509         }
510         return isInEncoding;
511     }
512 
513 }
514