// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u32.c
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
*/
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22 
23 #include "unicode/ucnv.h"
24 #include "unicode/utf.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0x0000FFFF
/* Largest code point accepted by the UTF-32 converters in this file. */
#define MAXIMUM_UTF             0x0010FFFF
/* Number of bits the lead-surrogate payload is shifted by when pairing. */
#define HALF_SHIFT              10
/* First code point that needs a surrogate pair in UTF-16. */
#define HALF_BASE               0x0010000
/* Mask covering the HALF_SHIFT low bits of a surrogate. */
#define HALF_MASK               0x3FF
/* First lead (high) surrogate code unit. */
#define SURROGATE_HIGH_START    0xD800
/* First trail (low) surrogate code unit. */
#define SURROGATE_LOW_START     0xDC00

/*
 * -SURROGATE_LOW_START + HALF_BASE (== 9216).
 * Added to "((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail" to obtain
 * the supplementary code point in one step.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

enum {
    /* Value of converter->fromUnicodeStatus that requests a BOM before output. */
    UCNV_NEED_TO_WRITE_BOM=1
};
43 
44 /* UTF-32BE ----------------------------------------------------------------- */
45 U_CDECL_BEGIN
46 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
48                                 UErrorCode * err)
49 {
50     const unsigned char *mySource = (unsigned char *) args->source;
51     UChar *myTarget = args->target;
52     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
53     const UChar *targetLimit = args->targetLimit;
54     unsigned char *toUBytes = args->converter->toUBytes;
55     uint32_t ch, i;
56 
57     /* Restore state of current sequence */
58     if (args->converter->toULength > 0 && myTarget < targetLimit) {
59         i = args->converter->toULength;       /* restore # of bytes consumed */
60         args->converter->toULength = 0;
61 
62         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
63         args->converter->toUnicodeStatus = 0;
64         goto morebytes;
65     }
66 
67     while (mySource < sourceLimit && myTarget < targetLimit) {
68         i = 0;
69         ch = 0;
70 morebytes:
71         while (i < sizeof(uint32_t)) {
72             if (mySource < sourceLimit) {
73                 ch = (ch << 8) | (uint8_t)(*mySource);
74                 toUBytes[i++] = (char) *(mySource++);
75             }
76             else {
77                 /* stores a partially calculated target*/
78                 /* + 1 to make 0 a valid character */
79                 args->converter->toUnicodeStatus = ch + 1;
80                 args->converter->toULength = (int8_t) i;
81                 goto donefornow;
82             }
83         }
84 
85         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
86             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
87             if (ch <= MAXIMUM_UCS2)
88             {
89                 /* fits in 16 bits */
90                 *(myTarget++) = (UChar) ch;
91             }
92             else {
93                 /* write out the surrogates */
94                 *(myTarget++) = U16_LEAD(ch);
95                 ch = U16_TRAIL(ch);
96                 if (myTarget < targetLimit) {
97                     *(myTarget++) = (UChar)ch;
98                 }
99                 else {
100                     /* Put in overflow buffer (not handled here) */
101                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
102                     args->converter->UCharErrorBufferLength = 1;
103                     *err = U_BUFFER_OVERFLOW_ERROR;
104                     break;
105                 }
106             }
107         }
108         else {
109             args->converter->toULength = (int8_t)i;
110             *err = U_ILLEGAL_CHAR_FOUND;
111             break;
112         }
113     }
114 
115 donefornow:
116     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
117         /* End of target buffer */
118         *err = U_BUFFER_OVERFLOW_ERROR;
119     }
120 
121     args->target = myTarget;
122     args->source = (const char *) mySource;
123 }
124 
125 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
127                                              UErrorCode * err)
128 {
129     const unsigned char *mySource = (unsigned char *) args->source;
130     UChar *myTarget = args->target;
131     int32_t *myOffsets = args->offsets;
132     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
133     const UChar *targetLimit = args->targetLimit;
134     unsigned char *toUBytes = args->converter->toUBytes;
135     uint32_t ch, i;
136     int32_t offsetNum = 0;
137 
138     /* Restore state of current sequence */
139     if (args->converter->toULength > 0 && myTarget < targetLimit) {
140         i = args->converter->toULength;       /* restore # of bytes consumed */
141         args->converter->toULength = 0;
142 
143         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
144         args->converter->toUnicodeStatus = 0;
145         goto morebytes;
146     }
147 
148     while (mySource < sourceLimit && myTarget < targetLimit) {
149         i = 0;
150         ch = 0;
151 morebytes:
152         while (i < sizeof(uint32_t)) {
153             if (mySource < sourceLimit) {
154                 ch = (ch << 8) | (uint8_t)(*mySource);
155                 toUBytes[i++] = (char) *(mySource++);
156             }
157             else {
158                 /* stores a partially calculated target*/
159                 /* + 1 to make 0 a valid character */
160                 args->converter->toUnicodeStatus = ch + 1;
161                 args->converter->toULength = (int8_t) i;
162                 goto donefornow;
163             }
164         }
165 
166         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
167             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
168             if (ch <= MAXIMUM_UCS2) {
169                 /* fits in 16 bits */
170                 *(myTarget++) = (UChar) ch;
171                 *(myOffsets++) = offsetNum;
172             }
173             else {
174                 /* write out the surrogates */
175                 *(myTarget++) = U16_LEAD(ch);
176                 *myOffsets++ = offsetNum;
177                 ch = U16_TRAIL(ch);
178                 if (myTarget < targetLimit)
179                 {
180                     *(myTarget++) = (UChar)ch;
181                     *(myOffsets++) = offsetNum;
182                 }
183                 else {
184                     /* Put in overflow buffer (not handled here) */
185                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
186                     args->converter->UCharErrorBufferLength = 1;
187                     *err = U_BUFFER_OVERFLOW_ERROR;
188                     break;
189                 }
190             }
191         }
192         else {
193             args->converter->toULength = (int8_t)i;
194             *err = U_ILLEGAL_CHAR_FOUND;
195             break;
196         }
197         offsetNum += i;
198     }
199 
200 donefornow:
201     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
202     {
203         /* End of target buffer */
204         *err = U_BUFFER_OVERFLOW_ERROR;
205     }
206 
207     args->target = myTarget;
208     args->source = (const char *) mySource;
209     args->offsets = myOffsets;
210 }
211 
212 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,UErrorCode * err)213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
214                                   UErrorCode * err)
215 {
216     const UChar *mySource = args->source;
217     unsigned char *myTarget;
218     const UChar *sourceLimit = args->sourceLimit;
219     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
220     UChar32 ch, ch2;
221     unsigned int indexToWrite;
222     unsigned char temp[sizeof(uint32_t)];
223 
224     if(mySource >= sourceLimit) {
225         /* no input, nothing to do */
226         return;
227     }
228 
229     /* write the BOM if necessary */
230     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
231         static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
232         ucnv_fromUWriteBytes(args->converter,
233                              bom, 4,
234                              &args->target, args->targetLimit,
235                              &args->offsets, -1,
236                              err);
237         args->converter->fromUnicodeStatus=0;
238     }
239 
240     myTarget = (unsigned char *) args->target;
241     temp[0] = 0;
242 
243     if (args->converter->fromUChar32) {
244         ch = args->converter->fromUChar32;
245         args->converter->fromUChar32 = 0;
246         goto lowsurogate;
247     }
248 
249     while (mySource < sourceLimit && myTarget < targetLimit) {
250         ch = *(mySource++);
251 
252         if (U_IS_SURROGATE(ch)) {
253             if (U_IS_LEAD(ch)) {
254 lowsurogate:
255                 if (mySource < sourceLimit) {
256                     ch2 = *mySource;
257                     if (U_IS_TRAIL(ch2)) {
258                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
259                         mySource++;
260                     }
261                     else {
262                         /* this is an unmatched trail code unit (2nd surrogate) */
263                         /* callback(illegal) */
264                         args->converter->fromUChar32 = ch;
265                         *err = U_ILLEGAL_CHAR_FOUND;
266                         break;
267                     }
268                 }
269                 else {
270                     /* ran out of source */
271                     args->converter->fromUChar32 = ch;
272                     if (args->flush) {
273                         /* this is an unmatched trail code unit (2nd surrogate) */
274                         /* callback(illegal) */
275                         *err = U_ILLEGAL_CHAR_FOUND;
276                     }
277                     break;
278                 }
279             }
280             else {
281                 /* this is an unmatched trail code unit (2nd surrogate) */
282                 /* callback(illegal) */
283                 args->converter->fromUChar32 = ch;
284                 *err = U_ILLEGAL_CHAR_FOUND;
285                 break;
286             }
287         }
288 
289         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
290         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
291         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
292         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
293 
294         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
295             if (myTarget < targetLimit) {
296                 *(myTarget++) = temp[indexToWrite];
297             }
298             else {
299                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
300                 *err = U_BUFFER_OVERFLOW_ERROR;
301             }
302         }
303     }
304 
305     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
306         *err = U_BUFFER_OVERFLOW_ERROR;
307     }
308 
309     args->target = (char *) myTarget;
310     args->source = mySource;
311 }
312 
313 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
315                                                UErrorCode * err)
316 {
317     const UChar *mySource = args->source;
318     unsigned char *myTarget;
319     int32_t *myOffsets;
320     const UChar *sourceLimit = args->sourceLimit;
321     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
322     UChar32 ch, ch2;
323     int32_t offsetNum = 0;
324     unsigned int indexToWrite;
325     unsigned char temp[sizeof(uint32_t)];
326 
327     if(mySource >= sourceLimit) {
328         /* no input, nothing to do */
329         return;
330     }
331 
332     /* write the BOM if necessary */
333     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
334         static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
335         ucnv_fromUWriteBytes(args->converter,
336                              bom, 4,
337                              &args->target, args->targetLimit,
338                              &args->offsets, -1,
339                              err);
340         args->converter->fromUnicodeStatus=0;
341     }
342 
343     myTarget = (unsigned char *) args->target;
344     myOffsets = args->offsets;
345     temp[0] = 0;
346 
347     if (args->converter->fromUChar32) {
348         ch = args->converter->fromUChar32;
349         args->converter->fromUChar32 = 0;
350         goto lowsurogate;
351     }
352 
353     while (mySource < sourceLimit && myTarget < targetLimit) {
354         ch = *(mySource++);
355 
356         if (U_IS_SURROGATE(ch)) {
357             if (U_IS_LEAD(ch)) {
358 lowsurogate:
359                 if (mySource < sourceLimit) {
360                     ch2 = *mySource;
361                     if (U_IS_TRAIL(ch2)) {
362                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
363                         mySource++;
364                     }
365                     else {
366                         /* this is an unmatched trail code unit (2nd surrogate) */
367                         /* callback(illegal) */
368                         args->converter->fromUChar32 = ch;
369                         *err = U_ILLEGAL_CHAR_FOUND;
370                         break;
371                     }
372                 }
373                 else {
374                     /* ran out of source */
375                     args->converter->fromUChar32 = ch;
376                     if (args->flush) {
377                         /* this is an unmatched trail code unit (2nd surrogate) */
378                         /* callback(illegal) */
379                         *err = U_ILLEGAL_CHAR_FOUND;
380                     }
381                     break;
382                 }
383             }
384             else {
385                 /* this is an unmatched trail code unit (2nd surrogate) */
386                 /* callback(illegal) */
387                 args->converter->fromUChar32 = ch;
388                 *err = U_ILLEGAL_CHAR_FOUND;
389                 break;
390             }
391         }
392 
393         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
394         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
395         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
396         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
397 
398         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
399             if (myTarget < targetLimit) {
400                 *(myTarget++) = temp[indexToWrite];
401                 *(myOffsets++) = offsetNum;
402             }
403             else {
404                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
405                 *err = U_BUFFER_OVERFLOW_ERROR;
406             }
407         }
408         offsetNum = offsetNum + 1 + (temp[1] != 0);
409     }
410 
411     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
412         *err = U_BUFFER_OVERFLOW_ERROR;
413     }
414 
415     args->target = (char *) myTarget;
416     args->source = mySource;
417     args->offsets = myOffsets;
418 }
419 
420 static UChar32 U_CALLCONV
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs * args,UErrorCode * err)421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
422                                    UErrorCode* err)
423 {
424     const uint8_t *mySource;
425     UChar32 myUChar;
426     int32_t length;
427 
428     mySource = (const uint8_t *)args->source;
429     if (mySource >= (const uint8_t *)args->sourceLimit)
430     {
431         /* no input */
432         *err = U_INDEX_OUTOFBOUNDS_ERROR;
433         return 0xffff;
434     }
435 
436     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
437     if (length < 4)
438     {
439         /* got a partial character */
440         uprv_memcpy(args->converter->toUBytes, mySource, length);
441         args->converter->toULength = (int8_t)length;
442         args->source = (const char *)(mySource + length);
443         *err = U_TRUNCATED_CHAR_FOUND;
444         return 0xffff;
445     }
446 
447     /* Don't even try to do a direct cast because the value may be on an odd address. */
448     myUChar = ((UChar32)mySource[0] << 24)
449             | ((UChar32)mySource[1] << 16)
450             | ((UChar32)mySource[2] << 8)
451             | ((UChar32)mySource[3]);
452 
453     args->source = (const char *)(mySource + 4);
454     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
455         return myUChar;
456     }
457 
458     uprv_memcpy(args->converter->toUBytes, mySource, 4);
459     args->converter->toULength = 4;
460 
461     *err = U_ILLEGAL_CHAR_FOUND;
462     return 0xffff;
463 }
464 U_CDECL_END
465 static const UConverterImpl _UTF32BEImpl = {
466     UCNV_UTF32_BigEndian,
467 
468     NULL,
469     NULL,
470 
471     NULL,
472     NULL,
473     NULL,
474 
475     T_UConverter_toUnicode_UTF32_BE,
476     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
477     T_UConverter_fromUnicode_UTF32_BE,
478     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
479     T_UConverter_getNextUChar_UTF32_BE,
480 
481     NULL,
482     NULL,
483     NULL,
484     NULL,
485     ucnv_getNonSurrogateUnicodeSet,
486 
487     NULL,
488     NULL
489 };
490 
491 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
492 static const UConverterStaticData _UTF32BEStaticData = {
493     sizeof(UConverterStaticData),
494     "UTF-32BE",
495     1232,
496     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
497     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
498     0,
499     0,
500     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
501 };
502 
503 const UConverterSharedData _UTF32BEData =
504         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
505 
506 /* UTF-32LE ---------------------------------------------------------- */
507 U_CDECL_BEGIN
508 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
510                                 UErrorCode * err)
511 {
512     const unsigned char *mySource = (unsigned char *) args->source;
513     UChar *myTarget = args->target;
514     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
515     const UChar *targetLimit = args->targetLimit;
516     unsigned char *toUBytes = args->converter->toUBytes;
517     uint32_t ch, i;
518 
519     /* Restore state of current sequence */
520     if (args->converter->toULength > 0 && myTarget < targetLimit)
521     {
522         i = args->converter->toULength;       /* restore # of bytes consumed */
523         args->converter->toULength = 0;
524 
525         /* Stores the previously calculated ch from a previous call*/
526         ch = args->converter->toUnicodeStatus - 1;
527         args->converter->toUnicodeStatus = 0;
528         goto morebytes;
529     }
530 
531     while (mySource < sourceLimit && myTarget < targetLimit)
532     {
533         i = 0;
534         ch = 0;
535 morebytes:
536         while (i < sizeof(uint32_t))
537         {
538             if (mySource < sourceLimit)
539             {
540                 ch |= ((uint8_t)(*mySource)) << (i * 8);
541                 toUBytes[i++] = (char) *(mySource++);
542             }
543             else
544             {
545                 /* stores a partially calculated target*/
546                 /* + 1 to make 0 a valid character */
547                 args->converter->toUnicodeStatus = ch + 1;
548                 args->converter->toULength = (int8_t) i;
549                 goto donefornow;
550             }
551         }
552 
553         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
554             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
555             if (ch <= MAXIMUM_UCS2) {
556                 /* fits in 16 bits */
557                 *(myTarget++) = (UChar) ch;
558             }
559             else {
560                 /* write out the surrogates */
561                 *(myTarget++) = U16_LEAD(ch);
562                 ch = U16_TRAIL(ch);
563                 if (myTarget < targetLimit) {
564                     *(myTarget++) = (UChar)ch;
565                 }
566                 else {
567                     /* Put in overflow buffer (not handled here) */
568                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
569                     args->converter->UCharErrorBufferLength = 1;
570                     *err = U_BUFFER_OVERFLOW_ERROR;
571                     break;
572                 }
573             }
574         }
575         else {
576             args->converter->toULength = (int8_t)i;
577             *err = U_ILLEGAL_CHAR_FOUND;
578             break;
579         }
580     }
581 
582 donefornow:
583     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
584     {
585         /* End of target buffer */
586         *err = U_BUFFER_OVERFLOW_ERROR;
587     }
588 
589     args->target = myTarget;
590     args->source = (const char *) mySource;
591 }
592 
593 static void U_CALLCONV
T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
595                                              UErrorCode * err)
596 {
597     const unsigned char *mySource = (unsigned char *) args->source;
598     UChar *myTarget = args->target;
599     int32_t *myOffsets = args->offsets;
600     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
601     const UChar *targetLimit = args->targetLimit;
602     unsigned char *toUBytes = args->converter->toUBytes;
603     uint32_t ch, i;
604     int32_t offsetNum = 0;
605 
606     /* Restore state of current sequence */
607     if (args->converter->toULength > 0 && myTarget < targetLimit)
608     {
609         i = args->converter->toULength;       /* restore # of bytes consumed */
610         args->converter->toULength = 0;
611 
612         /* Stores the previously calculated ch from a previous call*/
613         ch = args->converter->toUnicodeStatus - 1;
614         args->converter->toUnicodeStatus = 0;
615         goto morebytes;
616     }
617 
618     while (mySource < sourceLimit && myTarget < targetLimit)
619     {
620         i = 0;
621         ch = 0;
622 morebytes:
623         while (i < sizeof(uint32_t))
624         {
625             if (mySource < sourceLimit)
626             {
627                 ch |= ((uint8_t)(*mySource)) << (i * 8);
628                 toUBytes[i++] = (char) *(mySource++);
629             }
630             else
631             {
632                 /* stores a partially calculated target*/
633                 /* + 1 to make 0 a valid character */
634                 args->converter->toUnicodeStatus = ch + 1;
635                 args->converter->toULength = (int8_t) i;
636                 goto donefornow;
637             }
638         }
639 
640         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
641         {
642             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
643             if (ch <= MAXIMUM_UCS2)
644             {
645                 /* fits in 16 bits */
646                 *(myTarget++) = (UChar) ch;
647                 *(myOffsets++) = offsetNum;
648             }
649             else {
650                 /* write out the surrogates */
651                 *(myTarget++) = U16_LEAD(ch);
652                 *(myOffsets++) = offsetNum;
653                 ch = U16_TRAIL(ch);
654                 if (myTarget < targetLimit)
655                 {
656                     *(myTarget++) = (UChar)ch;
657                     *(myOffsets++) = offsetNum;
658                 }
659                 else
660                 {
661                     /* Put in overflow buffer (not handled here) */
662                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
663                     args->converter->UCharErrorBufferLength = 1;
664                     *err = U_BUFFER_OVERFLOW_ERROR;
665                     break;
666                 }
667             }
668         }
669         else
670         {
671             args->converter->toULength = (int8_t)i;
672             *err = U_ILLEGAL_CHAR_FOUND;
673             break;
674         }
675         offsetNum += i;
676     }
677 
678 donefornow:
679     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
680     {
681         /* End of target buffer */
682         *err = U_BUFFER_OVERFLOW_ERROR;
683     }
684 
685     args->target = myTarget;
686     args->source = (const char *) mySource;
687     args->offsets = myOffsets;
688 }
689 
690 static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,UErrorCode * err)691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
692                                   UErrorCode * err)
693 {
694     const UChar *mySource = args->source;
695     unsigned char *myTarget;
696     const UChar *sourceLimit = args->sourceLimit;
697     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
698     UChar32 ch, ch2;
699     unsigned int indexToWrite;
700     unsigned char temp[sizeof(uint32_t)];
701 
702     if(mySource >= sourceLimit) {
703         /* no input, nothing to do */
704         return;
705     }
706 
707     /* write the BOM if necessary */
708     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
709         static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
710         ucnv_fromUWriteBytes(args->converter,
711                              bom, 4,
712                              &args->target, args->targetLimit,
713                              &args->offsets, -1,
714                              err);
715         args->converter->fromUnicodeStatus=0;
716     }
717 
718     myTarget = (unsigned char *) args->target;
719     temp[3] = 0;
720 
721     if (args->converter->fromUChar32)
722     {
723         ch = args->converter->fromUChar32;
724         args->converter->fromUChar32 = 0;
725         goto lowsurogate;
726     }
727 
728     while (mySource < sourceLimit && myTarget < targetLimit)
729     {
730         ch = *(mySource++);
731 
732         if (U16_IS_SURROGATE(ch)) {
733             if (U16_IS_LEAD(ch))
734             {
735 lowsurogate:
736                 if (mySource < sourceLimit)
737                 {
738                     ch2 = *mySource;
739                     if (U16_IS_TRAIL(ch2)) {
740                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
741                         mySource++;
742                     }
743                     else {
744                         /* this is an unmatched trail code unit (2nd surrogate) */
745                         /* callback(illegal) */
746                         args->converter->fromUChar32 = ch;
747                         *err = U_ILLEGAL_CHAR_FOUND;
748                         break;
749                     }
750                 }
751                 else {
752                     /* ran out of source */
753                     args->converter->fromUChar32 = ch;
754                     if (args->flush) {
755                         /* this is an unmatched trail code unit (2nd surrogate) */
756                         /* callback(illegal) */
757                         *err = U_ILLEGAL_CHAR_FOUND;
758                     }
759                     break;
760                 }
761             }
762             else {
763                 /* this is an unmatched trail code unit (2nd surrogate) */
764                 /* callback(illegal) */
765                 args->converter->fromUChar32 = ch;
766                 *err = U_ILLEGAL_CHAR_FOUND;
767                 break;
768             }
769         }
770 
771         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
772         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
773         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
774         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
775 
776         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
777         {
778             if (myTarget < targetLimit)
779             {
780                 *(myTarget++) = temp[indexToWrite];
781             }
782             else
783             {
784                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
785                 *err = U_BUFFER_OVERFLOW_ERROR;
786             }
787         }
788     }
789 
790     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
791     {
792         *err = U_BUFFER_OVERFLOW_ERROR;
793     }
794 
795     args->target = (char *) myTarget;
796     args->source = mySource;
797 }
798 
/*
 * Converts UTF-16 input to little-endian UTF-32 output and records, for every
 * output byte, the index of the source unit it was produced from.
 *
 * args: source/sourceLimit delimit the UTF-16 input, target/targetLimit the
 *       byte output, offsets receives one entry per byte written to target.
 *       All three pointers are advanced on return.
 * err:  set to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate and to
 *       U_BUFFER_OVERFLOW_ERROR when the output does not fit.
 *
 * A lead surrogate that cannot be completed is kept in
 * args->converter->fromUChar32 and picked up again at the start of the next call.
 */
static void U_CALLCONV
T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                               UErrorCode * err)
{
    const UChar *mySource = args->source;
    unsigned char *myTarget;
    int32_t *myOffsets;
    const UChar *sourceLimit = args->sourceLimit;
    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    UChar32 ch, ch2;
    unsigned int indexToWrite;
    unsigned char temp[sizeof(uint32_t)];
    /* index of the source unit that starts the code point currently being written */
    int32_t offsetNum = 0;

    if(mySource >= sourceLimit) {
        /* no input, nothing to do */
        return;
    }

    /* write the BOM if necessary */
    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
        /* U+FEFF in little-endian byte order */
        static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
        /* NOTE(review): -1 is presumably the source index recorded for the BOM bytes - confirm against ucnv_fromUWriteBytes */
        ucnv_fromUWriteBytes(args->converter,
                             bom, 4,
                             &args->target, args->targetLimit,
                             &args->offsets, -1,
                             err);
        args->converter->fromUnicodeStatus=0;
    }

    /* read target and offsets only now: the BOM write above may have advanced them */
    myTarget = (unsigned char *) args->target;
    myOffsets = args->offsets;
    /* the most significant byte (written last in little-endian order) is always 0 */
    temp[3] = 0;

    if (args->converter->fromUChar32)
    {
        /* resume with the lead surrogate that was saved by a previous call */
        ch = args->converter->fromUChar32;
        args->converter->fromUChar32 = 0;
        /*
         * NOTE(review): this jumps into the loop body without testing
         * myTarget < targetLimit, and offsetNum still starts at 0 although the
         * lead surrogate came from an earlier buffer - confirm this is intended.
         */
        goto lowsurogate;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (U16_IS_SURROGATE(ch)) {
            if (U16_IS_LEAD(ch))
            {
lowsurogate:
                if (mySource < sourceLimit)
                {
                    ch2 = *mySource;
                    if (U16_IS_TRAIL(ch2))
                    {
                        /* combine the surrogate pair into one supplementary code point */
                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
                        mySource++;
                    }
                    else {
                        /* this is an unmatched lead code unit (1st surrogate): no trail follows it */
                        /* callback(illegal) */
                        args->converter->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* ran out of source: keep the lead surrogate for the next call */
                    args->converter->fromUChar32 = ch;
                    if (args->flush) {
                        /* no more input will come: this is an unmatched lead code unit (1st surrogate) */
                        /* callback(illegal) */
                        *err = U_ILLEGAL_CHAR_FOUND;
                    }
                    break;
                }
            }
            else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                args->converter->fromUChar32 = ch;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }

        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */

        /* emit the four bytes, least significant first */
        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
        {
            if (myTarget < targetLimit)
            {
                *(myTarget++) = temp[indexToWrite];
                *(myOffsets++) = offsetNum;
            }
            else
            {
                /* target is full: park the remaining bytes in the converter's overflow buffer */
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* temp[2] != 0 means ch >= 0x10000, i.e. the code point came from two source units */
        offsetNum = offsetNum + 1 + (temp[2] != 0);
    }

    /* input is left over but the target is full and no other error was reported */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* report how far source, target and offsets were advanced */
    args->target = (char *) myTarget;
    args->source = mySource;
    args->offsets = myOffsets;
}
914 
915 static UChar32 U_CALLCONV
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs * args,UErrorCode * err)916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
917                                    UErrorCode* err)
918 {
919     const uint8_t *mySource;
920     UChar32 myUChar;
921     int32_t length;
922 
923     mySource = (const uint8_t *)args->source;
924     if (mySource >= (const uint8_t *)args->sourceLimit)
925     {
926         /* no input */
927         *err = U_INDEX_OUTOFBOUNDS_ERROR;
928         return 0xffff;
929     }
930 
931     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
932     if (length < 4)
933     {
934         /* got a partial character */
935         uprv_memcpy(args->converter->toUBytes, mySource, length);
936         args->converter->toULength = (int8_t)length;
937         args->source = (const char *)(mySource + length);
938         *err = U_TRUNCATED_CHAR_FOUND;
939         return 0xffff;
940     }
941 
942     /* Don't even try to do a direct cast because the value may be on an odd address. */
943     myUChar = ((UChar32)mySource[3] << 24)
944             | ((UChar32)mySource[2] << 16)
945             | ((UChar32)mySource[1] << 8)
946             | ((UChar32)mySource[0]);
947 
948     args->source = (const char *)(mySource + 4);
949     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
950         return myUChar;
951     }
952 
953     uprv_memcpy(args->converter->toUBytes, mySource, 4);
954     args->converter->toULength = 4;
955 
956     *err = U_ILLEGAL_CHAR_FOUND;
957     return 0xffff;
958 }
959 U_CDECL_END
/*
 * Function table of the UTF-32LE converter.
 * The initializer is positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is outside this file. NULL slots are
 * entries this converter does not supply.
 */
static const UConverterImpl _UTF32LEImpl = {
    UCNV_UTF32_LittleEndian,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* conversion functions: to Unicode (plain / with offsets), from Unicode (plain / with offsets), next code point */
    T_UConverter_toUnicode_UTF32_LE,
    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    T_UConverter_getNextUChar_UTF32_LE,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
985 
/* The 1234 CCSID refers to any version of Unicode with a little endian encoding of UTF-32 */
static const UConverterStaticData _UTF32LEStaticData = {
    sizeof(UConverterStaticData),
    "UTF-32LE",
    1234,
    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    /* presumably the substitution character U+FFFD in little-endian byte order and its length - confirm against UConverterStaticData */
    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
997 
998 
/* Shared data object for UTF-32LE, built from the static data and function table above. */
const UConverterSharedData _UTF32LEData =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1001 
1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1003 
1004 /*
1005  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1006  * accordingly.
1007  *
1008  * State values:
1009  * 0    initial state
1010  * 1    saw 00
1011  * 2    saw 00 00
1012  * 3    saw 00 00 FE
1013  * 4    -
1014  * 5    saw FF
1015  * 6    saw FF FE
1016  * 7    saw FF FE 00
1017  * 8    UTF-32BE mode
1018  * 9    UTF-32LE mode
1019  *
1020  * During detection: state&3==number of matching bytes so far.
1021  *
1022  * On output, emit U+FEFF as the first code point.
1023  */
1024 U_CDECL_BEGIN
1025 static void U_CALLCONV
_UTF32Reset(UConverter * cnv,UConverterResetChoice choice)1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1027     if(choice<=UCNV_RESET_TO_UNICODE) {
1028         /* reset toUnicode: state=0 */
1029         cnv->mode=0;
1030     }
1031     if(choice!=UCNV_RESET_TO_UNICODE) {
1032         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1033         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1034     }
1035 }
1036 
1037 static void U_CALLCONV
_UTF32Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1038 _UTF32Open(UConverter *cnv,
1039            UConverterLoadArgs *pArgs,
1040            UErrorCode *pErrorCode) {
1041     (void)pArgs;
1042     (void)pErrorCode;
1043     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1044 }
1045 
/*
 * Both BOM byte sequences back to back: bytes 0..3 are 00 00 FE FF (UTF-32BE),
 * bytes 4..7 are FF FE 00 00 (UTF-32LE).
 * The layout matches the detection states: utf32BOM[state] is the next byte
 * expected in states 1..3 and 5..7, and utf32BOM+(state&4) is the start of
 * the BOM that is currently being matched.
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfeu, (char)0xffu, (char)0xffu, (char)0xfeu, 0, 0 };
1047 
/*
 * toUnicode function of the BOM-detecting UTF-32 converter (with or without offsets).
 *
 * The detection state is kept in cnv->mode between calls. States 0..7 consume
 * up to four leading bytes and compare them with the two BOMs; once a BOM is
 * recognized, or the bytes turn out not to be a BOM, the state becomes 8
 * (UTF-32BE) or 9 (UTF-32LE) and all further input is passed to the
 * corresponding fixed-endianness function. Without a BOM the input is treated
 * as UTF-32BE and the bytes consumed during detection are converted as data.
 *
 * pArgs:      source is advanced past the consumed input; the delegate
 *             functions update the remaining fields.
 * pErrorCode: the loop stops as soon as this holds a failure code.
 */
static void U_CALLCONV
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    /* start of the caller's offsets array; used after the loop to patch the entries written by this call */
    int32_t *offsets=pArgs->offsets;

    int32_t state, offsetDelta;
    char b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* first byte of the stream: decide which BOM, if any, it could start */
            b=*source;
            if(b==0) {
                state=1; /* could be 00 00 FE FF */
            } else if(b==(char)0xffu) {
                state=5; /* could be FF FE 00 00 */
            } else {
                state=8; /* default to UTF-32BE */
                /* the byte is not consumed: case 8 converts it on the next iteration */
                continue;
            }
            ++source;
            break;
        case 1:
        case 2:
        case 3:
        case 5:
        case 6:
        case 7:
            /* in the middle of a possible BOM: utf32BOM[state] is the byte that must come next */
            if(*source==utf32BOM[state]) {
                ++state;
                ++source;
                if(state==4) {
                    state=8; /* detect UTF-32BE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                } else if(state==8) {
                    state=9; /* detect UTF-32LE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                }
            } else {
                /* switch to UTF-32BE and pass the previous bytes */
                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

                /* reset the source */
                source=pArgs->source;

                /* state&3 is the number of BOM bytes matched so far */
                if(count==(state&3)) {
                    /* simple: all in the same buffer, just reset source */
                } else {
                    UBool oldFlush=pArgs->flush;

                    /* some of the bytes are from a previous buffer, replay those first */
                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

                    /* no offsets: bytes from previous buffer, and not enough for output */
                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

                    /* restore real pointers; pArgs->source will be set in case 8/9 */
                    pArgs->sourceLimit=sourceLimit;
                    pArgs->flush=oldFlush;
                }
                state=8;
                /* re-enter the loop in state 8 without consuming the mismatching byte */
                continue;
            }
            break;
        case 8:
            /* call UTF-32BE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            /* continue from wherever the delegate stopped */
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-32LE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            /* continue from wherever the delegate stopped */
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets is expected to point just past the entries the delegate wrote during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            /* called with no remaining input, so only the end-of-stream (flush) handling of the delegate can apply */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            break;
        case 9:
            /* same as case 8, for the little-endian delegate */
            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            break;
        default:
            /* handle 0<state<8: call UTF-32BE with too-short input */
            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

            /* no offsets: not enough for output */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            /* put the caller's real pointers back after the replay */
            pArgs->source=source;
            pArgs->sourceLimit=sourceLimit;
            state=8;
            break;
        }
    }

    /* remember the detection result for the next call */
    cnv->mode=state;
}
1188 
1189 static UChar32 U_CALLCONV
_UTF32GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1190 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1191                    UErrorCode *pErrorCode) {
1192     switch(pArgs->converter->mode) {
1193     case 8:
1194         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1195     case 9:
1196         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1197     default:
1198         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1199     }
1200 }
1201 U_CDECL_END
/*
 * Function table of the BOM-detecting UTF-32 converter.
 * The initializer is positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is outside this file. NULL slots are
 * entries this converter does not supply.
 */
static const UConverterImpl _UTF32Impl = {
    UCNV_UTF32,

    NULL,
    NULL,

    _UTF32Open,
    NULL,
    _UTF32Reset,

    /* the same function is installed twice: it checks pArgs->offsets itself */
    _UTF32ToUnicodeWithOffsets,
    _UTF32ToUnicodeWithOffsets,
    /* output uses the byte order of the platform the library was built for */
#if U_IS_BIG_ENDIAN
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
#else
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
#endif
    _UTF32GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1232 
/* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",
    1236,
    UCNV_IBM, UCNV_UTF32, 4, 4,
    /* presumably the substitution character U+FFFD in platform byte order and its length - confirm against UConverterStaticData */
#if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
#else
    { 0xfd, 0xff, 0, 0 }, 4,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1249 
/* Shared data object for the BOM-detecting UTF-32 converter, built from the static data and function table above. */
const UConverterSharedData _UTF32Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1252 
1253 #endif
1254