1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /**
19 * $Id$
20 */
21
22 // ---------------------------------------------------------------------------
23 // Includes
24 // ---------------------------------------------------------------------------
25 #include <xercesc/util/TranscodingException.hpp>
26 #include <xercesc/util/XMLString.hpp>
27 #include <xercesc/util/XMLUniDefs.hpp>
28 #include <xercesc/util/XMLUTF8Transcoder.hpp>
29
30 XERCES_CPP_NAMESPACE_BEGIN
31
32 // ---------------------------------------------------------------------------
33 // Local static data
34 //
35 // gUTFBytes
36 // A list of counts of trailing bytes for each initial byte in the input.
37 //
38 // gUTFByteIndicator
39 // For a UTF8 sequence of n bytes, n>=2, the first byte of the
40 // sequence must contain n 1's followed by precisely 1 0 with the
41 // rest of the byte containing arbitrary bits. This array stores
42 // the required bit pattern for validity checking.
43 // gUTFByteIndicatorTest
44 // When bitwise and'd with the observed value, if the observed
45 // value is correct then a result matching gUTFByteIndicator will
46 // be produced.
47 //
48 // gUTFOffsets
49 // A list of values to offset each result char type, according to how
//      many source bytes went into making it.
51 //
52 // gFirstByteMark
53 // A list of values to mask onto the first byte of an encoded sequence,
54 // indexed by the number of bytes used to create the sequence.
55 // ---------------------------------------------------------------------------
56 static const XMLByte gUTFBytes[256] =
57 {
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
59 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
60 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
61 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
62 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
63 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
64 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
65 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
66 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
67 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
68 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
69 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
70 , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
71 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
72 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
73 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
74 };
75
76 static const XMLByte gUTFByteIndicator[6] =
77 {
78 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
79 };
80 static const XMLByte gUTFByteIndicatorTest[6] =
81 {
82 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
83 };
84
85 static const XMLUInt32 gUTFOffsets[6] =
86 {
87 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
88 };
89
90 static const XMLByte gFirstByteMark[7] =
91 {
92 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
93 };
94
95
96
97 // ---------------------------------------------------------------------------
98 // XMLUTF8Transcoder: Constructors and Destructor
99 // ---------------------------------------------------------------------------
XMLUTF8Transcoder(const XMLCh * const encodingName,const XMLSize_t blockSize,MemoryManager * const manager)100 XMLUTF8Transcoder::XMLUTF8Transcoder(const XMLCh* const encodingName
101 , const XMLSize_t blockSize
102 , MemoryManager* const manager)
103 :XMLTranscoder(encodingName, blockSize, manager)
104 {
105 }
106
~XMLUTF8Transcoder()107 XMLUTF8Transcoder::~XMLUTF8Transcoder()
108 {
109 }
110
111
112 // ---------------------------------------------------------------------------
113 // XMLUTF8Transcoder: Implementation of the transcoder API
114 // ---------------------------------------------------------------------------
115 XMLSize_t
transcodeFrom(const XMLByte * const srcData,const XMLSize_t srcCount,XMLCh * const toFill,const XMLSize_t maxChars,XMLSize_t & bytesEaten,unsigned char * const charSizes)116 XMLUTF8Transcoder::transcodeFrom(const XMLByte* const srcData
117 , const XMLSize_t srcCount
118 , XMLCh* const toFill
119 , const XMLSize_t maxChars
120 , XMLSize_t& bytesEaten
121 , unsigned char* const charSizes)
122 {
123 // Watch for pathological scenario. Shouldn't happen, but...
124 if (!srcCount || !maxChars)
125 return 0;
126
127 //
128 // Get pointers to our start and end points of the input and output
129 // buffers.
130 //
131 const XMLByte* srcPtr = srcData;
132 const XMLByte* srcEnd = srcPtr + srcCount;
133 XMLCh* outPtr = toFill;
134 XMLCh* outEnd = outPtr + maxChars;
135 unsigned char* sizePtr = charSizes;
136
137
138
139 //
140 // We now loop until we either run out of input data, or room to store
141 // output chars.
142 //
143 while ((srcPtr < srcEnd) && (outPtr < outEnd))
144 {
145 // Special-case ASCII, which is a leading byte value of <= 127
146 if (*srcPtr <= 127)
147 {
148 // Handle ASCII in groups instead of single character at a time.
149 const XMLByte* srcPtr_save = srcPtr;
150 const XMLSize_t chunkSize = (srcEnd-srcPtr)<(outEnd-outPtr)?(srcEnd-srcPtr):(outEnd-outPtr);
151 for(XMLSize_t i=0;i<chunkSize && *srcPtr <= 127;++i)
152 *outPtr++ = XMLCh(*srcPtr++);
153 memset(sizePtr,1,srcPtr - srcPtr_save);
154 sizePtr += srcPtr - srcPtr_save;
155 if (srcPtr == srcEnd || outPtr == outEnd)
156 break;
157 }
158
159 // See how many trailing src bytes this sequence is going to require
160 const unsigned int trailingBytes = gUTFBytes[*srcPtr];
161
162 //
163 // If there are not enough source bytes to do this one, then we
164 // are done. Note that we done >= here because we are implicitly
165 // counting the 1 byte we get no matter what.
166 //
167 // If we break out here, then there is nothing to undo since we
168 // haven't updated any pointers yet.
169 //
170 if (srcPtr + trailingBytes >= srcEnd)
171 break;
172
173 // Looks ok, so lets build up the value
174 // or at least let's try to do so--remembering that
175 // we cannot assume the encoding to be valid:
176
177 // first, test first byte
178 if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {
179 char pos[2] = {(char)0x31, 0};
180 char len[2] = {(char)(trailingBytes+0x31), 0};
181 char byte[2] = {(char)*srcPtr,0};
182 ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
183 }
184
185 /***
186 * http://www.unicode.org/reports/tr27/
187 *
188 * Table 3.1B. lists all of the byte sequences that are legal in UTF-8.
189 * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive)
190 * is legal in that position.
191 * Any byte value outside of the ranges listed is illegal.
192 * For example,
193 * the byte sequence <C0 AF> is illegal since C0 is not legal in the 1st Byte column.
194 * The byte sequence <E0 9F 80> is illegal since in the row
195 * where E0 is legal as a first byte,
196 * 9F is not legal as a second byte.
197 * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches
198 * a byte range in a row of the table (the last row).
199 *
200 *
201 * Table 3.1B. Legal UTF-8 Byte Sequences
202 * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
203 * =========================================================================
204 * U+0000..U+007F 00..7F
205 * -------------------------------------------------------------------------
206 * U+0080..U+07FF C2..DF 80..BF
207 *
208 * -------------------------------------------------------------------------
209 * U+0800..U+0FFF E0 A0..BF 80..BF
210 * --
211 *
212 * U+1000..U+FFFF E1..EF 80..BF 80..BF
213 *
214 * --------------------------------------------------------------------------
215 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
216 * --
217 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
218 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
219 * --
220 * ==========================================================================
221 *
222 * Cases where a trailing byte range is not 80..BF are underlined in the table to
223 * draw attention to them. These occur only in the second byte of a sequence.
224 *
225 ***/
226
227 XMLUInt32 tmpVal = 0;
228
229 switch(trailingBytes)
230 {
231 case 1 :
232 // UTF-8: [110y yyyy] [10xx xxxx]
233 // Unicode: [0000 0yyy] [yyxx xxxx]
234 //
235 // 0xC0, 0xC1 has been filtered out
236 checkTrailingBytes(*(srcPtr+1), 1, 1);
237
238 tmpVal = *srcPtr++;
239 tmpVal <<= 6;
240 tmpVal += *srcPtr++;
241
242 break;
243 case 2 :
244 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
245 // Unicode: [zzzz yyyy] [yyxx xxxx]
246 //
247 if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0))
248 {
249 char byte0[2] = {(char)*srcPtr ,0};
250 char byte1[2] = {(char)*(srcPtr+1),0};
251
252 ThrowXMLwithMemMgr2(UTFDataFormatException
253 , XMLExcepts::UTF8_Invalid_3BytesSeq
254 , byte0
255 , byte1
256 , getMemoryManager());
257 }
258
259 checkTrailingBytes(*(srcPtr+1), 2, 1);
260 checkTrailingBytes(*(srcPtr+2), 2, 2);
261
262 //
263 // D36 (a) UTF-8 is the Unicode Transformation Format that serializes
264 // a Unicode code point as a sequence of one to four bytes,
265 // as specified in Table 3.1, UTF-8 Bit Distribution.
266 // (b) An illegal UTF-8 code unit sequence is any byte sequence that
267 // does not match the patterns listed in Table 3.1B, Legal UTF-8
268 // Byte Sequences.
269 // (c) An irregular UTF-8 code unit sequence is a six-byte sequence
270 // where the first three bytes correspond to a high surrogate,
271 // and the next three bytes correspond to a low surrogate.
272 // As a consequence of C12, these irregular UTF-8 sequences shall
273 // not be generated by a conformant process.
274 //
275 //irregular three bytes sequence
276 // that is zzzzyy matches leading surrogate tag 110110 or
277 // trailing surrogate tag 110111
278 // *srcPtr=1110 1101
279 // *(srcPtr+1)=1010 yyyy or
280 // *(srcPtr+1)=1011 yyyy
281 //
282 // 0xED 1110 1101
283 // 0xA0 1010 0000
284
285 if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
286 {
287 char byte0[2] = {(char)*srcPtr, 0};
288 char byte1[2] = {(char)*(srcPtr+1),0};
289
290 ThrowXMLwithMemMgr2(UTFDataFormatException
291 , XMLExcepts::UTF8_Irregular_3BytesSeq
292 , byte0
293 , byte1
294 , getMemoryManager());
295 }
296
297 tmpVal = *srcPtr++;
298 tmpVal <<= 6;
299 tmpVal += *srcPtr++;
300 tmpVal <<= 6;
301 tmpVal += *srcPtr++;
302
303 break;
304 case 3 :
305 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
306 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
307 // [1101 11yy] [yyxx xxxx] (low surrogate)
308 // * uuuuu = wwww + 1
309 //
310 if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
311 ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F)) )
312 {
313 char byte0[2] = {(char)*srcPtr ,0};
314 char byte1[2] = {(char)*(srcPtr+1),0};
315
316 ThrowXMLwithMemMgr2(UTFDataFormatException
317 , XMLExcepts::UTF8_Invalid_4BytesSeq
318 , byte0
319 , byte1
320 , getMemoryManager());
321 }
322
323 checkTrailingBytes(*(srcPtr+1), 3, 1);
324 checkTrailingBytes(*(srcPtr+2), 3, 2);
325 checkTrailingBytes(*(srcPtr+3), 3, 3);
326
327 tmpVal = *srcPtr++;
328 tmpVal <<= 6;
329 tmpVal += *srcPtr++;
330 tmpVal <<= 6;
331 tmpVal += *srcPtr++;
332 tmpVal <<= 6;
333 tmpVal += *srcPtr++;
334
335 break;
336 default: // trailingBytes > 3
337
338 /***
339 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows
340 * for the use of five- and six-byte sequences to encode characters that
341 * are outside the range of the Unicode character set; those five- and
342 * six-byte sequences are illegal for the use of UTF-8 as a transformation
343 * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired
344 * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
345 ***/
346 char len[2] = {(char)(trailingBytes+0x31), 0};
347 char byte[2] = {(char)*srcPtr,0};
348
349 ThrowXMLwithMemMgr2(UTFDataFormatException
350 , XMLExcepts::UTF8_Exceeds_BytesLimit
351 , byte
352 , len
353 , getMemoryManager());
354
355 break;
356 }
357
358
359 // since trailingBytes comes from an array, this logic is redundant
360 // default :
361 // ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
362 //}
363 tmpVal -= gUTFOffsets[trailingBytes];
364
365 //
366 // If it will fit into a single char, then put it in. Otherwise
367 // encode it as a surrogate pair. If its not valid, use the
368 // replacement char.
369 //
370 if (!(tmpVal & 0xFFFF0000))
371 {
372 *sizePtr++ = trailingBytes + 1;
373 *outPtr++ = XMLCh(tmpVal);
374 }
375 else if (tmpVal > 0x10FFFF)
376 {
377 //
378 // If we've gotten more than 32 chars so far, then just break
379 // out for now and lets process those. When we come back in
380 // here again, we'll get no chars and throw an exception. This
381 // way, the error will have a line and col number closer to
382 // the real problem area.
383 //
384 if ((outPtr - toFill) > 32)
385 break;
386
387 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
388 }
389 else
390 {
391 //
392 // If we have enough room to store the leading and trailing
393 // chars, then lets do it. Else, pretend this one never
394 // happened, and leave it for the next time.
395 //
396 if (outPtr + 1 >= outEnd)
397 {
398 srcPtr -= (trailingBytes + 1);
399 break;
400 }
401
402 // Store the leading surrogate char
403 tmpVal -= 0x10000;
404 *sizePtr++ = trailingBytes + 1;
405 *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);
406
407 //
408 // And then the trailing char. This one accounts for no
409 // bytes eaten from the source, so set the char size for this
410 // one to be zero.
411 //
412 *sizePtr++ = 0;
413 *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
414 }
415 }
416
417 // Update the bytes eaten
418 bytesEaten = srcPtr - srcData;
419
420 // Return the characters read
421 return outPtr - toFill;
422 }
423
424
425 XMLSize_t
transcodeTo(const XMLCh * const srcData,const XMLSize_t srcCount,XMLByte * const toFill,const XMLSize_t maxBytes,XMLSize_t & charsEaten,const UnRepOpts options)426 XMLUTF8Transcoder::transcodeTo( const XMLCh* const srcData
427 , const XMLSize_t srcCount
428 , XMLByte* const toFill
429 , const XMLSize_t maxBytes
430 , XMLSize_t& charsEaten
431 , const UnRepOpts options)
432 {
433 // Watch for pathological scenario. Shouldn't happen, but...
434 if (!srcCount || !maxBytes)
435 return 0;
436
437 //
438 // Get pointers to our start and end points of the input and output
439 // buffers.
440 //
441 const XMLCh* srcPtr = srcData;
442 const XMLCh* srcEnd = srcPtr + srcCount;
443 XMLByte* outPtr = toFill;
444 XMLByte* outEnd = toFill + maxBytes;
445
446 while (srcPtr < srcEnd)
447 {
448 //
449 // Tentatively get the next char out. We have to get it into a
450 // 32 bit value, because it could be a surrogate pair.
451 //
452 XMLUInt32 curVal = *srcPtr;
453
454 //
455 // If its a leading surrogate, then lets see if we have the trailing
456 // available. If not, then give up now and leave it for next time.
457 //
458 unsigned int srcUsed = 1;
459 if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
460 {
461 if (srcPtr + 1 >= srcEnd)
462 break;
463
464 // Create the composite surrogate pair
465 curVal = ((curVal - 0xD800) << 10)
466 + ((*(srcPtr + 1) - 0xDC00) + 0x10000);
467
468 // And indicate that we ate another one
469 srcUsed++;
470 }
471
472 // Figure out how many bytes we need
473 unsigned int encodedBytes;
474 if (curVal < 0x80)
475 encodedBytes = 1;
476 else if (curVal < 0x800)
477 encodedBytes = 2;
478 else if (curVal < 0x10000)
479 encodedBytes = 3;
480 else if (curVal < 0x110000)
481 encodedBytes = 4;
482 else
483 {
484 // If the options say to throw, then throw
485 if (options == UnRep_Throw)
486 {
487 XMLCh tmpBuf[17];
488 XMLString::binToText(curVal, tmpBuf, 16, 16, getMemoryManager());
489 ThrowXMLwithMemMgr2
490 (
491 TranscodingException
492 , XMLExcepts::Trans_Unrepresentable
493 , tmpBuf
494 , getEncodingName()
495 , getMemoryManager()
496 );
497 }
498
499 // Else, use the replacement character
500 *outPtr++ = chSpace;
501 srcPtr += srcUsed;
502 continue;
503 }
504
505 //
506 // If we cannot fully get this char into the output buffer,
507 // then leave it for the next time.
508 //
509 if (outPtr + encodedBytes > outEnd)
510 break;
511
512 // We can do it, so update the source index
513 srcPtr += srcUsed;
514
515 //
516 // And spit out the bytes. We spit them out in reverse order
517 // here, so bump up the output pointer and work down as we go.
518 //
519 outPtr += encodedBytes;
520 switch(encodedBytes)
521 {
522 case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
523 curVal >>= 6;
524 case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
525 curVal >>= 6;
526 case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
527 curVal >>= 6;
528 case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
529 curVal >>= 6;
530 case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
531 curVal >>= 6;
532 case 1 : *--outPtr = XMLByte
533 (
534 curVal | gFirstByteMark[encodedBytes]
535 );
536 }
537
538 // Add the encoded bytes back in again to indicate we've eaten them
539 outPtr += encodedBytes;
540 }
541
542 // Fill in the chars we ate
543 charsEaten = (srcPtr - srcData);
544
545 // And return the bytes we filled in
546 return (outPtr - toFill);
547 }
548
549
canTranscodeTo(const unsigned int toCheck)550 bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck)
551 {
552 // We can represent anything in the Unicode (with surrogates) range
553 return (toCheck <= 0x10FFFF);
554 }
555
556 XERCES_CPP_NAMESPACE_END
557
558