1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /**
19  * $Id$
20  */
21 
22 // ---------------------------------------------------------------------------
23 //  Includes
24 // ---------------------------------------------------------------------------
25 #include <xercesc/util/TranscodingException.hpp>
26 #include <xercesc/util/XMLString.hpp>
27 #include <xercesc/util/XMLUniDefs.hpp>
28 #include <xercesc/util/XMLUTF8Transcoder.hpp>
29 
30 XERCES_CPP_NAMESPACE_BEGIN
31 
32 // ---------------------------------------------------------------------------
33 //  Local static data
34 //
35 //  gUTFBytes
36 //      A list of counts of trailing bytes for each initial byte in the input.
37 //
//  gUTFByteIndicator
//      For a UTF-8 sequence of n bytes, n >= 2, the first byte of the
//      sequence must contain n 1 bits followed by exactly one 0 bit, with
//      the rest of the byte containing arbitrary bits.  This array stores
//      the required bit pattern for validity checking.
//
//  gUTFByteIndicatorTest
//      A mask for each sequence length.  When it is bitwise AND'ed with the
//      observed first byte, a well-formed first byte yields the matching
//      gUTFByteIndicator value.
47 //
//  gUTFOffsets
//      A list of values to offset each result char type, according to how
//      many source bytes went into making it.
51 //
52 //  gFirstByteMark
53 //      A list of values to mask onto the first byte of an encoded sequence,
54 //      indexed by the number of bytes used to create the sequence.
55 // ---------------------------------------------------------------------------
56 static const XMLByte gUTFBytes[256] =
57 {
58         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
59     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
60     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
61     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
62     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
63     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
64     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
65     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
66     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
67     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
68     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
69     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
70     ,   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
71     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
72     ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
73     ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
74 };
75 
76 static const XMLByte gUTFByteIndicator[6] =
77 {
78     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
79 };
80 static const XMLByte gUTFByteIndicatorTest[6] =
81 {
82     0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
83 };
84 
85 static const XMLUInt32 gUTFOffsets[6] =
86 {
87     0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
88 };
89 
90 static const XMLByte gFirstByteMark[7] =
91 {
92     0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
93 };
94 
95 
96 
97 // ---------------------------------------------------------------------------
98 //  XMLUTF8Transcoder: Constructors and Destructor
99 // ---------------------------------------------------------------------------
XMLUTF8Transcoder(const XMLCh * const encodingName,const XMLSize_t blockSize,MemoryManager * const manager)100 XMLUTF8Transcoder::XMLUTF8Transcoder(const  XMLCh* const    encodingName
101                                     , const XMLSize_t       blockSize
102                                     , MemoryManager* const  manager)
103 :XMLTranscoder(encodingName, blockSize, manager)
104 {
105 }
106 
~XMLUTF8Transcoder()107 XMLUTF8Transcoder::~XMLUTF8Transcoder()
108 {
109 }
110 
111 
112 // ---------------------------------------------------------------------------
113 //  XMLUTF8Transcoder: Implementation of the transcoder API
114 // ---------------------------------------------------------------------------
115 XMLSize_t
transcodeFrom(const XMLByte * const srcData,const XMLSize_t srcCount,XMLCh * const toFill,const XMLSize_t maxChars,XMLSize_t & bytesEaten,unsigned char * const charSizes)116 XMLUTF8Transcoder::transcodeFrom(const  XMLByte* const          srcData
117                                 , const XMLSize_t               srcCount
118                                 ,       XMLCh* const            toFill
119                                 , const XMLSize_t               maxChars
120                                 ,       XMLSize_t&              bytesEaten
121                                 ,       unsigned char* const    charSizes)
122 {
123     // Watch for pathological scenario. Shouldn't happen, but...
124     if (!srcCount || !maxChars)
125         return 0;
126 
127     //
128     //  Get pointers to our start and end points of the input and output
129     //  buffers.
130     //
131     const XMLByte*  srcPtr = srcData;
132     const XMLByte*  srcEnd = srcPtr + srcCount;
133     XMLCh*          outPtr = toFill;
134     XMLCh*          outEnd = outPtr + maxChars;
135     unsigned char*  sizePtr = charSizes;
136 
137 
138 
139     //
140     //  We now loop until we either run out of input data, or room to store
141     //  output chars.
142     //
143     while ((srcPtr < srcEnd) && (outPtr < outEnd))
144     {
145         // Special-case ASCII, which is a leading byte value of <= 127
146         if (*srcPtr <= 127)
147         {
148             // Handle ASCII in groups instead of single character at a time.
149             const XMLByte* srcPtr_save = srcPtr;
150             const XMLSize_t chunkSize = (srcEnd-srcPtr)<(outEnd-outPtr)?(srcEnd-srcPtr):(outEnd-outPtr);
151             for(XMLSize_t i=0;i<chunkSize && *srcPtr <= 127;++i)
152                 *outPtr++ = XMLCh(*srcPtr++);
153             memset(sizePtr,1,srcPtr - srcPtr_save);
154             sizePtr += srcPtr - srcPtr_save;
155             if (srcPtr == srcEnd || outPtr == outEnd)
156                 break;
157         }
158 
159         // See how many trailing src bytes this sequence is going to require
160         const unsigned int trailingBytes = gUTFBytes[*srcPtr];
161 
162         //
163         //  If there are not enough source bytes to do this one, then we
164         //  are done. Note that we done >= here because we are implicitly
165         //  counting the 1 byte we get no matter what.
166         //
167         //  If we break out here, then there is nothing to undo since we
168         //  haven't updated any pointers yet.
169         //
170         if (srcPtr + trailingBytes >= srcEnd)
171             break;
172 
173         // Looks ok, so lets build up the value
174         // or at least let's try to do so--remembering that
175         // we cannot assume the encoding to be valid:
176 
177         // first, test first byte
178         if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {
179             char pos[2] = {(char)0x31, 0};
180             char len[2] = {(char)(trailingBytes+0x31), 0};
181             char byte[2] = {(char)*srcPtr,0};
182             ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
183         }
184 
185         /***
186          * http://www.unicode.org/reports/tr27/
187          *
188          * Table 3.1B. lists all of the byte sequences that are legal in UTF-8.
189          * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive)
190          * is legal in that position.
191          * Any byte value outside of the ranges listed is illegal.
192          * For example,
193          * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column.
194          * The byte sequence <E0 9F 80> is illegal since in the row
195          *    where E0 is legal as a first byte,
196          *    9F is not legal as a second byte.
197          * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches
198          * a byte range in a row of the table (the last row).
199          *
200          *
201          * Table 3.1B. Legal UTF-8 Byte Sequences
202          * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte
203          * =========================================================================
204          * U+0000..U+007F            00..7F
205          * -------------------------------------------------------------------------
206          * U+0080..U+07FF            C2..DF      80..BF
207          *
208          * -------------------------------------------------------------------------
209          * U+0800..U+0FFF            E0          A0..BF     80..BF
210          *                                       --
211          *
212          * U+1000..U+FFFF            E1..EF      80..BF     80..BF
213          *
214          * --------------------------------------------------------------------------
215          * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF
216          *                                       --
217          * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF
218          * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF
219          *                                           --
220          * ==========================================================================
221          *
222          *  Cases where a trailing byte range is not 80..BF are underlined in the table to
223          *  draw attention to them. These occur only in the second byte of a sequence.
224          *
225          ***/
226 
227         XMLUInt32 tmpVal = 0;
228 
229         switch(trailingBytes)
230         {
231             case 1 :
232                 // UTF-8:   [110y yyyy] [10xx xxxx]
233                 // Unicode: [0000 0yyy] [yyxx xxxx]
234                 //
235                 // 0xC0, 0xC1 has been filtered out
236                 checkTrailingBytes(*(srcPtr+1), 1, 1);
237 
238                 tmpVal = *srcPtr++;
239                 tmpVal <<= 6;
240                 tmpVal += *srcPtr++;
241 
242                 break;
243             case 2 :
244                 // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
245                 // Unicode: [zzzz yyyy] [yyxx xxxx]
246                 //
247                 if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0))
248                 {
249                     char byte0[2] = {(char)*srcPtr    ,0};
250                     char byte1[2] = {(char)*(srcPtr+1),0};
251 
252                     ThrowXMLwithMemMgr2(UTFDataFormatException
253                                       , XMLExcepts::UTF8_Invalid_3BytesSeq
254                                       , byte0
255                                       , byte1
256                                       , getMemoryManager());
257                 }
258 
259                 checkTrailingBytes(*(srcPtr+1), 2, 1);
260                 checkTrailingBytes(*(srcPtr+2), 2, 2);
261 
262                 //
263                 // D36 (a) UTF-8 is the Unicode Transformation Format that serializes
264                 //         a Unicode code point as a sequence of one to four bytes,
265                 //         as specified in Table 3.1, UTF-8 Bit Distribution.
266                 //     (b) An illegal UTF-8 code unit sequence is any byte sequence that
267                 //         does not match the patterns listed in Table 3.1B, Legal UTF-8
268                 //         Byte Sequences.
269                 //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence
270                 //         where the first three bytes correspond to a high surrogate,
271                 //         and the next three bytes correspond to a low surrogate.
272                 //         As a consequence of C12, these irregular UTF-8 sequences shall
273                 //         not be generated by a conformant process.
274                 //
275                 //irregular three bytes sequence
276                 // that is zzzzyy matches leading surrogate tag 110110 or
277                 //                       trailing surrogate tag 110111
278                 // *srcPtr=1110 1101
279                 // *(srcPtr+1)=1010 yyyy or
280                 // *(srcPtr+1)=1011 yyyy
281                 //
282                 // 0xED 1110 1101
283                 // 0xA0 1010 0000
284 
285                 if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
286                 {
287                     char byte0[2] = {(char)*srcPtr,    0};
288                     char byte1[2] = {(char)*(srcPtr+1),0};
289 
290                      ThrowXMLwithMemMgr2(UTFDataFormatException
291                               , XMLExcepts::UTF8_Irregular_3BytesSeq
292                               , byte0
293                               , byte1
294                               , getMemoryManager());
295                 }
296 
297                 tmpVal = *srcPtr++;
298                 tmpVal <<= 6;
299                 tmpVal += *srcPtr++;
300                 tmpVal <<= 6;
301                 tmpVal += *srcPtr++;
302 
303                 break;
304             case 3 :
305                 // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
306                 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
307                 //          [1101 11yy] [yyxx xxxx] (low surrogate)
308                 //          * uuuuu = wwww + 1
309                 //
310                 if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
311                     ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )
312                 {
313                     char byte0[2] = {(char)*srcPtr    ,0};
314                     char byte1[2] = {(char)*(srcPtr+1),0};
315 
316                     ThrowXMLwithMemMgr2(UTFDataFormatException
317                                       , XMLExcepts::UTF8_Invalid_4BytesSeq
318                                       , byte0
319                                       , byte1
320                                       , getMemoryManager());
321                 }
322 
323                 checkTrailingBytes(*(srcPtr+1), 3, 1);
324                 checkTrailingBytes(*(srcPtr+2), 3, 2);
325                 checkTrailingBytes(*(srcPtr+3), 3, 3);
326 
327                 tmpVal = *srcPtr++;
328                 tmpVal <<= 6;
329                 tmpVal += *srcPtr++;
330                 tmpVal <<= 6;
331                 tmpVal += *srcPtr++;
332                 tmpVal <<= 6;
333                 tmpVal += *srcPtr++;
334 
335                 break;
336             default: // trailingBytes > 3
337 
338                 /***
339                  * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows
340                  * for the use of five- and six-byte sequences to encode characters that
341                  * are outside the range of the Unicode character set; those five- and
342                  * six-byte sequences are illegal for the use of UTF-8 as a transformation
343                  * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired
344                  * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
345                  ***/
346                 char len[2]  = {(char)(trailingBytes+0x31), 0};
347                 char byte[2] = {(char)*srcPtr,0};
348 
349                 ThrowXMLwithMemMgr2(UTFDataFormatException
350                                   , XMLExcepts::UTF8_Exceeds_BytesLimit
351                                   , byte
352                                   , len
353                                   , getMemoryManager());
354 
355                 break;
356         }
357 
358 
359         // since trailingBytes comes from an array, this logic is redundant
360         //  default :
361         //      ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
362         //}
363         tmpVal -= gUTFOffsets[trailingBytes];
364 
365         //
366         //  If it will fit into a single char, then put it in. Otherwise
367         //  encode it as a surrogate pair. If its not valid, use the
368         //  replacement char.
369         //
370         if (!(tmpVal & 0xFFFF0000))
371         {
372             *sizePtr++ = trailingBytes + 1;
373             *outPtr++ = XMLCh(tmpVal);
374         }
375          else if (tmpVal > 0x10FFFF)
376         {
377             //
378             //  If we've gotten more than 32 chars so far, then just break
379             //  out for now and lets process those. When we come back in
380             //  here again, we'll get no chars and throw an exception. This
381             //  way, the error will have a line and col number closer to
382             //  the real problem area.
383             //
384             if ((outPtr - toFill) > 32)
385                 break;
386 
387             ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
388         }
389          else
390         {
391             //
392             //  If we have enough room to store the leading and trailing
393             //  chars, then lets do it. Else, pretend this one never
394             //  happened, and leave it for the next time.
395             //
396             if (outPtr + 1 >= outEnd)
397             {
398                 srcPtr -= (trailingBytes + 1);
399                 break;
400             }
401 
402             // Store the leading surrogate char
403             tmpVal -= 0x10000;
404             *sizePtr++ = trailingBytes + 1;
405             *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);
406 
407             //
408             //  And then the trailing char. This one accounts for no
409             //  bytes eaten from the source, so set the char size for this
410             //  one to be zero.
411             //
412             *sizePtr++ = 0;
413             *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
414         }
415     }
416 
417     // Update the bytes eaten
418     bytesEaten = srcPtr - srcData;
419 
420     // Return the characters read
421     return outPtr - toFill;
422 }
423 
424 
425 XMLSize_t
transcodeTo(const XMLCh * const srcData,const XMLSize_t srcCount,XMLByte * const toFill,const XMLSize_t maxBytes,XMLSize_t & charsEaten,const UnRepOpts options)426 XMLUTF8Transcoder::transcodeTo( const   XMLCh* const    srcData
427                                 , const XMLSize_t       srcCount
428                                 ,       XMLByte* const  toFill
429                                 , const XMLSize_t       maxBytes
430                                 ,       XMLSize_t&      charsEaten
431                                 , const UnRepOpts       options)
432 {
433     // Watch for pathological scenario. Shouldn't happen, but...
434     if (!srcCount || !maxBytes)
435         return 0;
436 
437     //
438     //  Get pointers to our start and end points of the input and output
439     //  buffers.
440     //
441     const XMLCh*    srcPtr = srcData;
442     const XMLCh*    srcEnd = srcPtr + srcCount;
443     XMLByte*        outPtr = toFill;
444     XMLByte*        outEnd = toFill + maxBytes;
445 
446     while (srcPtr < srcEnd)
447     {
448         //
449         //  Tentatively get the next char out. We have to get it into a
450         //  32 bit value, because it could be a surrogate pair.
451         //
452         XMLUInt32 curVal = *srcPtr;
453 
454         //
455         //  If its a leading surrogate, then lets see if we have the trailing
456         //  available. If not, then give up now and leave it for next time.
457         //
458         unsigned int srcUsed = 1;
459         if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
460         {
461             if (srcPtr + 1 >= srcEnd)
462                 break;
463 
464             // Create the composite surrogate pair
465             curVal = ((curVal - 0xD800) << 10)
466                     + ((*(srcPtr + 1) - 0xDC00) + 0x10000);
467 
468             // And indicate that we ate another one
469             srcUsed++;
470         }
471 
472         // Figure out how many bytes we need
473         unsigned int encodedBytes;
474         if (curVal < 0x80)
475             encodedBytes = 1;
476         else if (curVal < 0x800)
477             encodedBytes = 2;
478         else if (curVal < 0x10000)
479             encodedBytes = 3;
480         else if (curVal < 0x110000)
481             encodedBytes = 4;
482         else
483         {
484             // If the options say to throw, then throw
485             if (options == UnRep_Throw)
486             {
487                 XMLCh tmpBuf[17];
488                 XMLString::binToText(curVal, tmpBuf, 16, 16, getMemoryManager());
489                 ThrowXMLwithMemMgr2
490                 (
491                     TranscodingException
492                     , XMLExcepts::Trans_Unrepresentable
493                     , tmpBuf
494                     , getEncodingName()
495                     , getMemoryManager()
496                 );
497             }
498 
499             // Else, use the replacement character
500             *outPtr++ = chSpace;
501             srcPtr += srcUsed;
502             continue;
503         }
504 
505         //
506         //  If we cannot fully get this char into the output buffer,
507         //  then leave it for the next time.
508         //
509         if (outPtr + encodedBytes > outEnd)
510             break;
511 
512         // We can do it, so update the source index
513         srcPtr += srcUsed;
514 
515         //
516         //  And spit out the bytes. We spit them out in reverse order
517         //  here, so bump up the output pointer and work down as we go.
518         //
519         outPtr += encodedBytes;
520         switch(encodedBytes)
521         {
522             case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
523                      curVal >>= 6;
524             case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
525                      curVal >>= 6;
526             case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
527                      curVal >>= 6;
528             case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
529                      curVal >>= 6;
530             case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
531                      curVal >>= 6;
532             case 1 : *--outPtr = XMLByte
533                      (
534                         curVal | gFirstByteMark[encodedBytes]
535                      );
536         }
537 
538         // Add the encoded bytes back in again to indicate we've eaten them
539         outPtr += encodedBytes;
540     }
541 
542     // Fill in the chars we ate
543     charsEaten = (srcPtr - srcData);
544 
545     // And return the bytes we filled in
546     return (outPtr - toFill);
547 }
548 
549 
canTranscodeTo(const unsigned int toCheck)550 bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck)
551 {
552     // We can represent anything in the Unicode (with surrogates) range
553     return (toCheck <= 0x10FFFF);
554 }
555 
556 XERCES_CPP_NAMESPACE_END
557 
558