1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2001-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustrtrns.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   9/10/2001    Ram    Creation.
17 ******************************************************************************
18 */
19 
20 /*******************************************************************************
21  *
22  * u_strTo* and u_strFrom* APIs
23  * WCS functions moved to ustr_wcs.c for better modularization
24  *
25  *******************************************************************************
26  */
27 
28 
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
34 #include "cstring.h"
35 #include "cmemory.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38 
39 U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)40 u_strFromUTF32WithSub(UChar *dest,
41                int32_t destCapacity,
42                int32_t *pDestLength,
43                const UChar32 *src,
44                int32_t srcLength,
45                UChar32 subchar, int32_t *pNumSubstitutions,
46                UErrorCode *pErrorCode) {
47     const UChar32 *srcLimit;
48     UChar32 ch;
49     UChar *destLimit;
50     UChar *pDest;
51     int32_t reqLength;
52     int32_t numSubstitutions;
53 
54     /* args check */
55     if(U_FAILURE(*pErrorCode)){
56         return NULL;
57     }
58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61     ) {
62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63         return NULL;
64     }
65 
66     if(pNumSubstitutions != NULL) {
67         *pNumSubstitutions = 0;
68     }
69 
70     pDest = dest;
71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72     reqLength = 0;
73     numSubstitutions = 0;
74 
75     if(srcLength < 0) {
76         /* simple loop for conversion of a NUL-terminated BMP string */
77         while((ch=*src) != 0 &&
78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79             ++src;
80             if(pDest < destLimit) {
81                 *pDest++ = (UChar)ch;
82             } else {
83                 ++reqLength;
84             }
85         }
86         srcLimit = src;
87         if(ch != 0) {
88             /* "complicated" case, find the end of the remaining string */
89             while(*++srcLimit != 0) {}
90         }
91     } else {
92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
93     }
94 
95     /* convert with length */
96     while(src < srcLimit) {
97         ch = *src++;
98         do {
99             /* usually "loops" once; twice only for writing subchar */
100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101                 if(pDest < destLimit) {
102                     *pDest++ = (UChar)ch;
103                 } else {
104                     ++reqLength;
105                 }
106                 break;
107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109                     *pDest++ = U16_LEAD(ch);
110                     *pDest++ = U16_TRAIL(ch);
111                 } else {
112                     reqLength += 2;
113                 }
114                 break;
115             } else if((ch = subchar) < 0) {
116                 /* surrogate code point, or not a Unicode code point at all */
117                 *pErrorCode = U_INVALID_CHAR_FOUND;
118                 return NULL;
119             } else {
120                 ++numSubstitutions;
121             }
122         } while(TRUE);
123     }
124 
125     reqLength += (int32_t)(pDest - dest);
126     if(pDestLength) {
127         *pDestLength = reqLength;
128     }
129     if(pNumSubstitutions != NULL) {
130         *pNumSubstitutions = numSubstitutions;
131     }
132 
133     /* Terminate the buffer */
134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135 
136     return dest;
137 }
138 
139 U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UErrorCode * pErrorCode)140 u_strFromUTF32(UChar *dest,
141                int32_t destCapacity,
142                int32_t *pDestLength,
143                const UChar32 *src,
144                int32_t srcLength,
145                UErrorCode *pErrorCode) {
146     return u_strFromUTF32WithSub(
147             dest, destCapacity, pDestLength,
148             src, srcLength,
149             U_SENTINEL, NULL,
150             pErrorCode);
151 }
152 
153 U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)154 u_strToUTF32WithSub(UChar32 *dest,
155              int32_t destCapacity,
156              int32_t *pDestLength,
157              const UChar *src,
158              int32_t srcLength,
159              UChar32 subchar, int32_t *pNumSubstitutions,
160              UErrorCode *pErrorCode) {
161     const UChar *srcLimit;
162     UChar32 ch;
163     UChar ch2;
164     UChar32 *destLimit;
165     UChar32 *pDest;
166     int32_t reqLength;
167     int32_t numSubstitutions;
168 
169     /* args check */
170     if(U_FAILURE(*pErrorCode)){
171         return NULL;
172     }
173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176     ) {
177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178         return NULL;
179     }
180 
181     if(pNumSubstitutions != NULL) {
182         *pNumSubstitutions = 0;
183     }
184 
185     pDest = dest;
186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187     reqLength = 0;
188     numSubstitutions = 0;
189 
190     if(srcLength < 0) {
191         /* simple loop for conversion of a NUL-terminated BMP string */
192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193             ++src;
194             if(pDest < destLimit) {
195                 *pDest++ = ch;
196             } else {
197                 ++reqLength;
198             }
199         }
200         srcLimit = src;
201         if(ch != 0) {
202             /* "complicated" case, find the end of the remaining string */
203             while(*++srcLimit != 0) {}
204         }
205     } else {
206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
207     }
208 
209     /* convert with length */
210     while(src < srcLimit) {
211         ch = *src++;
212         if(!U16_IS_SURROGATE(ch)) {
213             /* write or count ch below */
214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215             ++src;
216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217         } else if((ch = subchar) < 0) {
218             /* unpaired surrogate */
219             *pErrorCode = U_INVALID_CHAR_FOUND;
220             return NULL;
221         } else {
222             ++numSubstitutions;
223         }
224         if(pDest < destLimit) {
225             *pDest++ = ch;
226         } else {
227             ++reqLength;
228         }
229     }
230 
231     reqLength += (int32_t)(pDest - dest);
232     if(pDestLength) {
233         *pDestLength = reqLength;
234     }
235     if(pNumSubstitutions != NULL) {
236         *pNumSubstitutions = numSubstitutions;
237     }
238 
239     /* Terminate the buffer */
240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241 
242     return dest;
243 }
244 
245 U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)246 u_strToUTF32(UChar32 *dest,
247              int32_t destCapacity,
248              int32_t *pDestLength,
249              const UChar *src,
250              int32_t srcLength,
251              UErrorCode *pErrorCode) {
252     return u_strToUTF32WithSub(
253             dest, destCapacity, pDestLength,
254             src, srcLength,
255             U_SENTINEL, NULL,
256             pErrorCode);
257 }
258 
259 U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)260 u_strFromUTF8WithSub(UChar *dest,
261               int32_t destCapacity,
262               int32_t *pDestLength,
263               const char* src,
264               int32_t srcLength,
265               UChar32 subchar, int32_t *pNumSubstitutions,
266               UErrorCode *pErrorCode){
267     /* args check */
268     if(U_FAILURE(*pErrorCode)) {
269         return NULL;
270     }
271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
272         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
274     ) {
275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
276         return NULL;
277     }
278 
279     if(pNumSubstitutions!=NULL) {
280         *pNumSubstitutions=0;
281     }
282     UChar *pDest = dest;
283     UChar *pDestLimit = dest+destCapacity;
284     int32_t reqLength = 0;
285     int32_t numSubstitutions=0;
286 
287     /*
288      * Inline processing of UTF-8 byte sequences:
289      *
290      * Byte sequences for the most common characters are handled inline in
291      * the conversion loops. In order to reduce the path lengths for those
292      * characters, the tests are arranged in a kind of binary search.
293      * ASCII (<=0x7f) is checked first, followed by the dividing point
294      * between 2- and 3-byte sequences (0xe0).
295      * The 3-byte branch is tested first to speed up CJK text.
296      * The compiler should combine the subtractions for the two tests for 0xe0.
297      * Each branch then tests for the other end of its range.
298      */
299 
300     if(srcLength < 0){
301         /*
302          * Transform a NUL-terminated string.
303          * The code explicitly checks for NULs only in the lead byte position.
304          * A NUL byte in the trail byte position fails the trail byte range check anyway.
305          */
306         int32_t i;
307         UChar32 c;
308         for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
309             // modified copy of U8_NEXT()
310             ++i;
311             if(U8_IS_SINGLE(c)) {
312                 *pDest++=(UChar)c;
313             } else {
314                 uint8_t __t1, __t2;
315                 if( /* handle U+0800..U+FFFF inline */
316                         (0xe0<=(c) && (c)<0xf0) &&
317                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
318                         (__t2=src[(i)+1]-0x80)<=0x3f) {
319                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
320                     i+=2;
321                 } else if( /* handle U+0080..U+07FF inline */
322                         ((c)<0xe0 && (c)>=0xc2) &&
323                         (__t1=src[i]-0x80)<=0x3f) {
324                     *pDest++ = (((c)&0x1f)<<6)|__t1;
325                     ++(i);
326                 } else {
327                     /* function call for "complicated" and error cases */
328                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
329                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
330                         *pErrorCode = U_INVALID_CHAR_FOUND;
331                         return NULL;
332                     } else if(c<=0xFFFF) {
333                         *(pDest++)=(UChar)c;
334                     } else {
335                         *(pDest++)=U16_LEAD(c);
336                         if(pDest<pDestLimit) {
337                             *(pDest++)=U16_TRAIL(c);
338                         } else {
339                             reqLength++;
340                             break;
341                         }
342                     }
343                 }
344             }
345         }
346 
347         /* Pre-flight the rest of the string. */
348         while((c = (uint8_t)src[i]) != 0) {
349             // modified copy of U8_NEXT()
350             ++i;
351             if(U8_IS_SINGLE(c)) {
352                 ++reqLength;
353             } else {
354                 uint8_t __t1, __t2;
355                 if( /* handle U+0800..U+FFFF inline */
356                         (0xe0<=(c) && (c)<0xf0) &&
357                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
358                         (__t2=src[(i)+1]-0x80)<=0x3f) {
359                     ++reqLength;
360                     i+=2;
361                 } else if( /* handle U+0080..U+07FF inline */
362                         ((c)<0xe0 && (c)>=0xc2) &&
363                         (__t1=src[i]-0x80)<=0x3f) {
364                     ++reqLength;
365                     ++(i);
366                 } else {
367                     /* function call for "complicated" and error cases */
368                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
369                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
370                         *pErrorCode = U_INVALID_CHAR_FOUND;
371                         return NULL;
372                     }
373                     reqLength += U16_LENGTH(c);
374                 }
375             }
376         }
377     } else /* srcLength >= 0 */ {
378         /* Faster loop without ongoing checking for srcLength and pDestLimit. */
379         int32_t i = 0;
380         UChar32 c;
381         for(;;) {
382             /*
383              * Each iteration of the inner loop progresses by at most 3 UTF-8
384              * bytes and one UChar, for most characters.
385              * For supplementary code points (4 & 2), which are rare,
386              * there is an additional adjustment.
387              */
388             int32_t count = (int32_t)(pDestLimit - pDest);
389             int32_t count2 = (srcLength - i) / 3;
390             if(count > count2) {
391                 count = count2; /* min(remaining dest, remaining src/3) */
392             }
393             if(count < 3) {
394                 /*
395                  * Too much overhead if we get near the end of the string,
396                  * continue with the next loop.
397                  */
398                 break;
399             }
400 
401             do {
402                 // modified copy of U8_NEXT()
403                 c = (uint8_t)src[i++];
404                 if(U8_IS_SINGLE(c)) {
405                     *pDest++=(UChar)c;
406                 } else {
407                     uint8_t __t1, __t2;
408                     if( /* handle U+0800..U+FFFF inline */
409                             (0xe0<=(c) && (c)<0xf0) &&
410                             ((i)+1)<srcLength &&
411                             U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
412                             (__t2=src[(i)+1]-0x80)<=0x3f) {
413                         *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
414                         i+=2;
415                     } else if( /* handle U+0080..U+07FF inline */
416                             ((c)<0xe0 && (c)>=0xc2) &&
417                             ((i)!=srcLength) &&
418                             (__t1=src[i]-0x80)<=0x3f) {
419                         *pDest++ = (((c)&0x1f)<<6)|__t1;
420                         ++(i);
421                     } else {
422                         if(c >= 0xf0 || subchar > 0xffff) {
423                             // We may read up to four bytes and write up to two UChars,
424                             // which we didn't account for with computing count,
425                             // so we adjust it here.
426                             if(--count == 0) {
427                                 --i;  // back out byte c
428                                 break;
429                             }
430                         }
431 
432                         /* function call for "complicated" and error cases */
433                         (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
434                         if(c<0 && (++numSubstitutions, c = subchar) < 0) {
435                             *pErrorCode = U_INVALID_CHAR_FOUND;
436                             return NULL;
437                         } else if(c<=0xFFFF) {
438                             *(pDest++)=(UChar)c;
439                         } else {
440                             *(pDest++)=U16_LEAD(c);
441                             *(pDest++)=U16_TRAIL(c);
442                         }
443                     }
444                 }
445             } while(--count > 0);
446         }
447 
448         while(i < srcLength && (pDest < pDestLimit)) {
449             // modified copy of U8_NEXT()
450             c = (uint8_t)src[i++];
451             if(U8_IS_SINGLE(c)) {
452                 *pDest++=(UChar)c;
453             } else {
454                 uint8_t __t1, __t2;
455                 if( /* handle U+0800..U+FFFF inline */
456                         (0xe0<=(c) && (c)<0xf0) &&
457                         ((i)+1)<srcLength &&
458                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
459                         (__t2=src[(i)+1]-0x80)<=0x3f) {
460                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
461                     i+=2;
462                 } else if( /* handle U+0080..U+07FF inline */
463                         ((c)<0xe0 && (c)>=0xc2) &&
464                         ((i)!=srcLength) &&
465                         (__t1=src[i]-0x80)<=0x3f) {
466                     *pDest++ = (((c)&0x1f)<<6)|__t1;
467                     ++(i);
468                 } else {
469                     /* function call for "complicated" and error cases */
470                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
471                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
472                         *pErrorCode = U_INVALID_CHAR_FOUND;
473                         return NULL;
474                     } else if(c<=0xFFFF) {
475                         *(pDest++)=(UChar)c;
476                     } else {
477                         *(pDest++)=U16_LEAD(c);
478                         if(pDest<pDestLimit) {
479                             *(pDest++)=U16_TRAIL(c);
480                         } else {
481                             reqLength++;
482                             break;
483                         }
484                     }
485                 }
486             }
487         }
488 
489         /* Pre-flight the rest of the string. */
490         while(i < srcLength) {
491             // modified copy of U8_NEXT()
492             c = (uint8_t)src[i++];
493             if(U8_IS_SINGLE(c)) {
494                 ++reqLength;
495             } else {
496                 uint8_t __t1, __t2;
497                 if( /* handle U+0800..U+FFFF inline */
498                         (0xe0<=(c) && (c)<0xf0) &&
499                         ((i)+1)<srcLength &&
500                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
501                         (__t2=src[(i)+1]-0x80)<=0x3f) {
502                     ++reqLength;
503                     i+=2;
504                 } else if( /* handle U+0080..U+07FF inline */
505                         ((c)<0xe0 && (c)>=0xc2) &&
506                         ((i)!=srcLength) &&
507                         (__t1=src[i]-0x80)<=0x3f) {
508                     ++reqLength;
509                     ++(i);
510                 } else {
511                     /* function call for "complicated" and error cases */
512                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
513                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
514                         *pErrorCode = U_INVALID_CHAR_FOUND;
515                         return NULL;
516                     }
517                     reqLength += U16_LENGTH(c);
518                 }
519             }
520         }
521     }
522 
523     reqLength+=(int32_t)(pDest - dest);
524 
525     if(pNumSubstitutions!=NULL) {
526         *pNumSubstitutions=numSubstitutions;
527     }
528 
529     if(pDestLength){
530         *pDestLength = reqLength;
531     }
532 
533     /* Terminate the buffer */
534     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
535 
536     return dest;
537 }
538 
539 U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)540 u_strFromUTF8(UChar *dest,
541               int32_t destCapacity,
542               int32_t *pDestLength,
543               const char* src,
544               int32_t srcLength,
545               UErrorCode *pErrorCode){
546     return u_strFromUTF8WithSub(
547             dest, destCapacity, pDestLength,
548             src, srcLength,
549             U_SENTINEL, NULL,
550             pErrorCode);
551 }
552 
553 U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)554 u_strFromUTF8Lenient(UChar *dest,
555                      int32_t destCapacity,
556                      int32_t *pDestLength,
557                      const char *src,
558                      int32_t srcLength,
559                      UErrorCode *pErrorCode) {
560     UChar *pDest = dest;
561     UChar32 ch;
562     int32_t reqLength = 0;
563     uint8_t* pSrc = (uint8_t*) src;
564 
565     /* args check */
566     if(U_FAILURE(*pErrorCode)){
567         return NULL;
568     }
569 
570     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
571         (destCapacity<0) || (dest == NULL && destCapacity > 0)
572     ) {
573         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
574         return NULL;
575     }
576 
577     if(srcLength < 0) {
578         /* Transform a NUL-terminated string. */
579         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
580         uint8_t t1, t2, t3; /* trail bytes */
581 
582         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
583             if(ch < 0xc0) {
584                 /*
585                  * ASCII, or a trail byte in lead position which is treated like
586                  * a single-byte sequence for better character boundary
587                  * resynchronization after illegal sequences.
588                  */
589                 *pDest++=(UChar)ch;
590                 ++pSrc;
591                 continue;
592             } else if(ch < 0xe0) { /* U+0080..U+07FF */
593                 if((t1 = pSrc[1]) != 0) {
594                     /* 0x3080 = (0xc0 << 6) + 0x80 */
595                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
596                     pSrc += 2;
597                     continue;
598                 }
599             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
600                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
601                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
602                     /* 0x2080 = (0x80 << 6) + 0x80 */
603                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
604                     pSrc += 3;
605                     continue;
606                 }
607             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
608                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
609                     pSrc += 4;
610                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
611                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
612                     *(pDest++) = U16_LEAD(ch);
613                     if(pDest < pDestLimit) {
614                         *(pDest++) = U16_TRAIL(ch);
615                     } else {
616                         reqLength = 1;
617                         break;
618                     }
619                     continue;
620                 }
621             }
622 
623             /* truncated character at the end */
624             *pDest++ = 0xfffd;
625             while(*++pSrc != 0) {}
626             break;
627         }
628 
629         /* Pre-flight the rest of the string. */
630         while((ch = *pSrc) != 0) {
631             if(ch < 0xc0) {
632                 /*
633                  * ASCII, or a trail byte in lead position which is treated like
634                  * a single-byte sequence for better character boundary
635                  * resynchronization after illegal sequences.
636                  */
637                 ++reqLength;
638                 ++pSrc;
639                 continue;
640             } else if(ch < 0xe0) { /* U+0080..U+07FF */
641                 if(pSrc[1] != 0) {
642                     ++reqLength;
643                     pSrc += 2;
644                     continue;
645                 }
646             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
647                 if(pSrc[1] != 0 && pSrc[2] != 0) {
648                     ++reqLength;
649                     pSrc += 3;
650                     continue;
651                 }
652             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
653                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
654                     reqLength += 2;
655                     pSrc += 4;
656                     continue;
657                 }
658             }
659 
660             /* truncated character at the end */
661             ++reqLength;
662             break;
663         }
664     } else /* srcLength >= 0 */ {
665       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
666 
667         /*
668          * This function requires that if srcLength is given, then it must be
669          * destCapatity >= srcLength so that we need not check for
670          * destination buffer overflow in the loop.
671          */
672         if(destCapacity < srcLength) {
673             if(pDestLength != NULL) {
674                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
675             }
676             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
677             return NULL;
678         }
679 
680         if((pSrcLimit - pSrc) >= 4) {
681             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
682 
683             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
684             do {
685                 ch = *pSrc++;
686                 if(ch < 0xc0) {
687                     /*
688                      * ASCII, or a trail byte in lead position which is treated like
689                      * a single-byte sequence for better character boundary
690                      * resynchronization after illegal sequences.
691                      */
692                     *pDest++=(UChar)ch;
693                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
694                     /* 0x3080 = (0xc0 << 6) + 0x80 */
695                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
696                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
697                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
698                     /* 0x2080 = (0x80 << 6) + 0x80 */
699                     ch = (ch << 12) + (*pSrc++ << 6);
700                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
701                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
702                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
703                     ch = (ch << 18) + (*pSrc++ << 12);
704                     ch += *pSrc++ << 6;
705                     ch += *pSrc++ - 0x3c82080;
706                     *(pDest++) = U16_LEAD(ch);
707                     *(pDest++) = U16_TRAIL(ch);
708                 }
709             } while(pSrc < pSrcLimit);
710 
711             pSrcLimit += 3; /* restore original pSrcLimit */
712         }
713 
714         while(pSrc < pSrcLimit) {
715             ch = *pSrc++;
716             if(ch < 0xc0) {
717                 /*
718                  * ASCII, or a trail byte in lead position which is treated like
719                  * a single-byte sequence for better character boundary
720                  * resynchronization after illegal sequences.
721                  */
722                 *pDest++=(UChar)ch;
723                 continue;
724             } else if(ch < 0xe0) { /* U+0080..U+07FF */
725                 if(pSrc < pSrcLimit) {
726                     /* 0x3080 = (0xc0 << 6) + 0x80 */
727                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
728                     continue;
729                 }
730             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
731                 if((pSrcLimit - pSrc) >= 2) {
732                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
733                     /* 0x2080 = (0x80 << 6) + 0x80 */
734                     ch = (ch << 12) + (*pSrc++ << 6);
735                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
736                     pSrc += 3;
737                     continue;
738                 }
739             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
740                 if((pSrcLimit - pSrc) >= 3) {
741                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
742                     ch = (ch << 18) + (*pSrc++ << 12);
743                     ch += *pSrc++ << 6;
744                     ch += *pSrc++ - 0x3c82080;
745                     *(pDest++) = U16_LEAD(ch);
746                     *(pDest++) = U16_TRAIL(ch);
747                     pSrc += 4;
748                     continue;
749                 }
750             }
751 
752             /* truncated character at the end */
753             *pDest++ = 0xfffd;
754             break;
755         }
756     }
757 
758     reqLength+=(int32_t)(pDest - dest);
759 
760     if(pDestLength){
761         *pDestLength = reqLength;
762     }
763 
764     /* Terminate the buffer */
765     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
766 
767     return dest;
768 }
769 
770 static inline uint8_t *
_appendUTF8(uint8_t * pDest,UChar32 c)771 _appendUTF8(uint8_t *pDest, UChar32 c) {
772     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
773     if((c)<=0x7f) {
774         *pDest++=(uint8_t)c;
775     } else if(c<=0x7ff) {
776         *pDest++=(uint8_t)((c>>6)|0xc0);
777         *pDest++=(uint8_t)((c&0x3f)|0x80);
778     } else if(c<=0xffff) {
779         *pDest++=(uint8_t)((c>>12)|0xe0);
780         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
781         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
782     } else /* if((uint32_t)(c)<=0x10ffff) */ {
783         *pDest++=(uint8_t)(((c)>>18)|0xf0);
784         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
785         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
786         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
787     }
788     return pDest;
789 }
790 
791 
792 U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)793 u_strToUTF8WithSub(char *dest,
794             int32_t destCapacity,
795             int32_t *pDestLength,
796             const UChar *pSrc,
797             int32_t srcLength,
798             UChar32 subchar, int32_t *pNumSubstitutions,
799             UErrorCode *pErrorCode){
800     int32_t reqLength=0;
801     uint32_t ch=0,ch2=0;
802     uint8_t *pDest = (uint8_t *)dest;
803     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
804     int32_t numSubstitutions;
805 
806     /* args check */
807     if(U_FAILURE(*pErrorCode)){
808         return NULL;
809     }
810 
811     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
812         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
813         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
814     ) {
815         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
816         return NULL;
817     }
818 
819     if(pNumSubstitutions!=NULL) {
820         *pNumSubstitutions=0;
821     }
822     numSubstitutions=0;
823 
824     if(srcLength==-1) {
825         while((ch=*pSrc)!=0) {
826             ++pSrc;
827             if(ch <= 0x7f) {
828                 if(pDest<pDestLimit) {
829                     *pDest++ = (uint8_t)ch;
830                 } else {
831                     reqLength = 1;
832                     break;
833                 }
834             } else if(ch <= 0x7ff) {
835                 if((pDestLimit - pDest) >= 2) {
836                     *pDest++=(uint8_t)((ch>>6)|0xc0);
837                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
838                 } else {
839                     reqLength = 2;
840                     break;
841                 }
842             } else if(ch <= 0xd7ff || ch >= 0xe000) {
843                 if((pDestLimit - pDest) >= 3) {
844                     *pDest++=(uint8_t)((ch>>12)|0xe0);
845                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
846                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
847                 } else {
848                     reqLength = 3;
849                     break;
850                 }
851             } else /* ch is a surrogate */ {
852                 int32_t length;
853 
854                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
855                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
856                     ++pSrc;
857                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
858                 } else if(subchar>=0) {
859                     ch=subchar;
860                     ++numSubstitutions;
861                 } else {
862                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
863                     *pErrorCode = U_INVALID_CHAR_FOUND;
864                     return NULL;
865                 }
866 
867                 length = U8_LENGTH(ch);
868                 if((pDestLimit - pDest) >= length) {
869                     /* convert and append*/
870                     pDest=_appendUTF8(pDest, ch);
871                 } else {
872                     reqLength = length;
873                     break;
874                 }
875             }
876         }
877         while((ch=*pSrc++)!=0) {
878             if(ch<=0x7f) {
879                 ++reqLength;
880             } else if(ch<=0x7ff) {
881                 reqLength+=2;
882             } else if(!U16_IS_SURROGATE(ch)) {
883                 reqLength+=3;
884             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
885                 ++pSrc;
886                 reqLength+=4;
887             } else if(subchar>=0) {
888                 reqLength+=U8_LENGTH(subchar);
889                 ++numSubstitutions;
890             } else {
891                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
892                 *pErrorCode = U_INVALID_CHAR_FOUND;
893                 return NULL;
894             }
895         }
896     } else {
897         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
898         int32_t count;
899 
900         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
901         for(;;) {
902             /*
903              * Each iteration of the inner loop progresses by at most 3 UTF-8
904              * bytes and one UChar, for most characters.
905              * For supplementary code points (4 & 2), which are rare,
906              * there is an additional adjustment.
907              */
908             count = (int32_t)((pDestLimit - pDest) / 3);
909             srcLength = (int32_t)(pSrcLimit - pSrc);
910             if(count > srcLength) {
911                 count = srcLength; /* min(remaining dest/3, remaining src) */
912             }
913             if(count < 3) {
914                 /*
915                  * Too much overhead if we get near the end of the string,
916                  * continue with the next loop.
917                  */
918                 break;
919             }
920             do {
921                 ch=*pSrc++;
922                 if(ch <= 0x7f) {
923                     *pDest++ = (uint8_t)ch;
924                 } else if(ch <= 0x7ff) {
925                     *pDest++=(uint8_t)((ch>>6)|0xc0);
926                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
927                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
928                     *pDest++=(uint8_t)((ch>>12)|0xe0);
929                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
930                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
931                 } else /* ch is a surrogate */ {
932                     /*
933                      * We will read two UChars and probably output four bytes,
934                      * which we didn't account for with computing count,
935                      * so we adjust it here.
936                      */
937                     if(--count == 0) {
938                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
939                         break;  /* recompute count */
940                     }
941 
942                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
943                         ++pSrc;
944                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
945 
946                         /* writing 4 bytes per 2 UChars is ok */
947                         *pDest++=(uint8_t)((ch>>18)|0xf0);
948                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
949                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
950                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
951                     } else  {
952                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
953                         if(subchar>=0) {
954                             ch=subchar;
955                             ++numSubstitutions;
956                         } else {
957                             *pErrorCode = U_INVALID_CHAR_FOUND;
958                             return NULL;
959                         }
960 
961                         /* convert and append*/
962                         pDest=_appendUTF8(pDest, ch);
963                     }
964                 }
965             } while(--count > 0);
966         }
967 
968         while(pSrc<pSrcLimit) {
969             ch=*pSrc++;
970             if(ch <= 0x7f) {
971                 if(pDest<pDestLimit) {
972                     *pDest++ = (uint8_t)ch;
973                 } else {
974                     reqLength = 1;
975                     break;
976                 }
977             } else if(ch <= 0x7ff) {
978                 if((pDestLimit - pDest) >= 2) {
979                     *pDest++=(uint8_t)((ch>>6)|0xc0);
980                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
981                 } else {
982                     reqLength = 2;
983                     break;
984                 }
985             } else if(ch <= 0xd7ff || ch >= 0xe000) {
986                 if((pDestLimit - pDest) >= 3) {
987                     *pDest++=(uint8_t)((ch>>12)|0xe0);
988                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
989                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
990                 } else {
991                     reqLength = 3;
992                     break;
993                 }
994             } else /* ch is a surrogate */ {
995                 int32_t length;
996 
997                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
998                     ++pSrc;
999                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1000                 } else if(subchar>=0) {
1001                     ch=subchar;
1002                     ++numSubstitutions;
1003                 } else {
1004                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1005                     *pErrorCode = U_INVALID_CHAR_FOUND;
1006                     return NULL;
1007                 }
1008 
1009                 length = U8_LENGTH(ch);
1010                 if((pDestLimit - pDest) >= length) {
1011                     /* convert and append*/
1012                     pDest=_appendUTF8(pDest, ch);
1013                 } else {
1014                     reqLength = length;
1015                     break;
1016                 }
1017             }
1018         }
1019         while(pSrc<pSrcLimit) {
1020             ch=*pSrc++;
1021             if(ch<=0x7f) {
1022                 ++reqLength;
1023             } else if(ch<=0x7ff) {
1024                 reqLength+=2;
1025             } else if(!U16_IS_SURROGATE(ch)) {
1026                 reqLength+=3;
1027             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1028                 ++pSrc;
1029                 reqLength+=4;
1030             } else if(subchar>=0) {
1031                 reqLength+=U8_LENGTH(subchar);
1032                 ++numSubstitutions;
1033             } else {
1034                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1035                 *pErrorCode = U_INVALID_CHAR_FOUND;
1036                 return NULL;
1037             }
1038         }
1039     }
1040 
1041     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1042 
1043     if(pNumSubstitutions!=NULL) {
1044         *pNumSubstitutions=numSubstitutions;
1045     }
1046 
1047     if(pDestLength){
1048         *pDestLength = reqLength;
1049     }
1050 
1051     /* Terminate the buffer */
1052     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1053     return dest;
1054 }
1055 
1056 U_CAPI char* U_EXPORT2
u_strToUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UErrorCode * pErrorCode)1057 u_strToUTF8(char *dest,
1058             int32_t destCapacity,
1059             int32_t *pDestLength,
1060             const UChar *pSrc,
1061             int32_t srcLength,
1062             UErrorCode *pErrorCode){
1063     return u_strToUTF8WithSub(
1064             dest, destCapacity, pDestLength,
1065             pSrc, srcLength,
1066             U_SENTINEL, NULL,
1067             pErrorCode);
1068 }
1069 
1070 U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)1071 u_strFromJavaModifiedUTF8WithSub(
1072         UChar *dest,
1073         int32_t destCapacity,
1074         int32_t *pDestLength,
1075         const char *src,
1076         int32_t srcLength,
1077         UChar32 subchar, int32_t *pNumSubstitutions,
1078         UErrorCode *pErrorCode) {
1079     /* args check */
1080     if(U_FAILURE(*pErrorCode)) {
1081         return NULL;
1082     }
1083     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1084         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1085         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1086     ) {
1087         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1088         return NULL;
1089     }
1090 
1091     if(pNumSubstitutions!=NULL) {
1092         *pNumSubstitutions=0;
1093     }
1094     UChar *pDest = dest;
1095     UChar *pDestLimit = dest+destCapacity;
1096     int32_t reqLength = 0;
1097     int32_t numSubstitutions=0;
1098 
1099     if(srcLength < 0) {
1100         /*
1101          * Transform a NUL-terminated ASCII string.
1102          * Handle non-ASCII strings with slower code.
1103          */
1104         UChar32 c;
1105         while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
1106             *pDest++=(UChar)c;
1107             ++src;
1108         }
1109         if(c == 0) {
1110             reqLength=(int32_t)(pDest - dest);
1111             if(pDestLength) {
1112                 *pDestLength = reqLength;
1113             }
1114 
1115             /* Terminate the buffer */
1116             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1117             return dest;
1118         }
1119         srcLength = static_cast<int32_t>(uprv_strlen(src));
1120     }
1121 
1122     /* Faster loop without ongoing checking for srcLength and pDestLimit. */
1123     UChar32 ch;
1124     uint8_t t1, t2;
1125     int32_t i = 0;
1126     for(;;) {
1127         int32_t count = (int32_t)(pDestLimit - pDest);
1128         int32_t count2 = srcLength - i;
1129         if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
1130             /* fast ASCII loop */
1131             int32_t start = i;
1132             uint8_t b;
1133             while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
1134                 *pDest++=b;
1135                 ++i;
1136             }
1137             int32_t delta = i - start;
1138             count -= delta;
1139             count2 -= delta;
1140         }
1141         /*
1142          * Each iteration of the inner loop progresses by at most 3 UTF-8
1143          * bytes and one UChar.
1144          */
1145         if(subchar > 0xFFFF) {
1146             break;
1147         }
1148         count2 /= 3;
1149         if(count > count2) {
1150             count = count2; /* min(remaining dest, remaining src/3) */
1151         }
1152         if(count < 3) {
1153             /*
1154              * Too much overhead if we get near the end of the string,
1155              * continue with the next loop.
1156              */
1157             break;
1158         }
1159         do {
1160             ch = (uint8_t)src[i++];
1161             if(U8_IS_SINGLE(ch)) {
1162                 *pDest++=(UChar)ch;
1163             } else {
1164                 if(ch >= 0xe0) {
1165                     if( /* handle U+0000..U+FFFF inline */
1166                         ch <= 0xef &&
1167                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1168                         (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1169                     ) {
1170                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1171                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1172                         i += 2;
1173                         continue;
1174                     }
1175                 } else {
1176                     if( /* handle U+0000..U+07FF inline */
1177                         ch >= 0xc0 &&
1178                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1179                     ) {
1180                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1181                         ++i;
1182                         continue;
1183                     }
1184                 }
1185 
1186                 if(subchar < 0) {
1187                     *pErrorCode = U_INVALID_CHAR_FOUND;
1188                     return NULL;
1189                 } else if(subchar > 0xffff && --count == 0) {
1190                     /*
1191                      * We need to write two UChars, adjusted count for that,
1192                      * and ran out of space.
1193                      */
1194                     --i;  // back out byte ch
1195                     break;
1196                 } else {
1197                     /* function call for error cases */
1198                     utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1199                     ++numSubstitutions;
1200                     *(pDest++)=(UChar)subchar;
1201                 }
1202             }
1203         } while(--count > 0);
1204     }
1205 
1206     while(i < srcLength && (pDest < pDestLimit)) {
1207         ch = (uint8_t)src[i++];
1208         if(U8_IS_SINGLE(ch)){
1209             *pDest++=(UChar)ch;
1210         } else {
1211             if(ch >= 0xe0) {
1212                 if( /* handle U+0000..U+FFFF inline */
1213                     ch <= 0xef &&
1214                     (i+1) < srcLength &&
1215                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1216                     (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1217                 ) {
1218                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1219                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1220                     i += 2;
1221                     continue;
1222                 }
1223             } else {
1224                 if( /* handle U+0000..U+07FF inline */
1225                     ch >= 0xc0 &&
1226                     i < srcLength &&
1227                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1228                 ) {
1229                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1230                     ++i;
1231                     continue;
1232                 }
1233             }
1234 
1235             if(subchar < 0) {
1236                 *pErrorCode = U_INVALID_CHAR_FOUND;
1237                 return NULL;
1238             } else {
1239                 /* function call for error cases */
1240                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1241                 ++numSubstitutions;
1242                 if(subchar<=0xFFFF) {
1243                     *(pDest++)=(UChar)subchar;
1244                 } else {
1245                     *(pDest++)=U16_LEAD(subchar);
1246                     if(pDest<pDestLimit) {
1247                         *(pDest++)=U16_TRAIL(subchar);
1248                     } else {
1249                         reqLength++;
1250                         break;
1251                     }
1252                 }
1253             }
1254         }
1255     }
1256 
1257     /* Pre-flight the rest of the string. */
1258     while(i < srcLength) {
1259         ch = (uint8_t)src[i++];
1260         if(U8_IS_SINGLE(ch)) {
1261             reqLength++;
1262         } else {
1263             if(ch >= 0xe0) {
1264                 if( /* handle U+0000..U+FFFF inline */
1265                     ch <= 0xef &&
1266                     (i+1) < srcLength &&
1267                     (uint8_t)(src[i] - 0x80) <= 0x3f &&
1268                     (uint8_t)(src[i+1] - 0x80) <= 0x3f
1269                 ) {
1270                     reqLength++;
1271                     i += 2;
1272                     continue;
1273                 }
1274             } else {
1275                 if( /* handle U+0000..U+07FF inline */
1276                     ch >= 0xc0 &&
1277                     i < srcLength &&
1278                     (uint8_t)(src[i] - 0x80) <= 0x3f
1279                 ) {
1280                     reqLength++;
1281                     ++i;
1282                     continue;
1283                 }
1284             }
1285 
1286             if(subchar < 0) {
1287                 *pErrorCode = U_INVALID_CHAR_FOUND;
1288                 return NULL;
1289             } else {
1290                 /* function call for error cases */
1291                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1292                 ++numSubstitutions;
1293                 reqLength+=U16_LENGTH(ch);
1294             }
1295         }
1296     }
1297 
1298     if(pNumSubstitutions!=NULL) {
1299         *pNumSubstitutions=numSubstitutions;
1300     }
1301 
1302     reqLength+=(int32_t)(pDest - dest);
1303     if(pDestLength) {
1304         *pDestLength = reqLength;
1305     }
1306 
1307     /* Terminate the buffer */
1308     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309     return dest;
1310 }
1311 
1312 U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1313 u_strToJavaModifiedUTF8(
1314         char *dest,
1315         int32_t destCapacity,
1316         int32_t *pDestLength,
1317         const UChar *src,
1318         int32_t srcLength,
1319         UErrorCode *pErrorCode) {
1320     int32_t reqLength=0;
1321     uint32_t ch=0;
1322     uint8_t *pDest = (uint8_t *)dest;
1323     uint8_t *pDestLimit = pDest + destCapacity;
1324     const UChar *pSrcLimit;
1325     int32_t count;
1326 
1327     /* args check */
1328     if(U_FAILURE(*pErrorCode)){
1329         return NULL;
1330     }
1331     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1332         (dest==NULL && destCapacity!=0) || destCapacity<0
1333     ) {
1334         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1335         return NULL;
1336     }
1337 
1338     if(srcLength==-1) {
1339         /* Convert NUL-terminated ASCII, then find the string length. */
1340         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1341             *pDest++ = (uint8_t)ch;
1342             ++src;
1343         }
1344         if(ch == 0) {
1345             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1346             if(pDestLength) {
1347                 *pDestLength = reqLength;
1348             }
1349 
1350             /* Terminate the buffer */
1351             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1352             return dest;
1353         }
1354         srcLength = u_strlen(src);
1355     }
1356 
1357     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1358     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1359     for(;;) {
1360         count = (int32_t)(pDestLimit - pDest);
1361         srcLength = (int32_t)(pSrcLimit - src);
1362         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1363             /* fast ASCII loop */
1364             const UChar *prevSrc = src;
1365             int32_t delta;
1366             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1367                 *pDest++=(uint8_t)ch;
1368                 ++src;
1369             }
1370             delta = (int32_t)(src - prevSrc);
1371             count -= delta;
1372             srcLength -= delta;
1373         }
1374         /*
1375          * Each iteration of the inner loop progresses by at most 3 UTF-8
1376          * bytes and one UChar.
1377          */
1378         count /= 3;
1379         if(count > srcLength) {
1380             count = srcLength; /* min(remaining dest/3, remaining src) */
1381         }
1382         if(count < 3) {
1383             /*
1384              * Too much overhead if we get near the end of the string,
1385              * continue with the next loop.
1386              */
1387             break;
1388         }
1389         do {
1390             ch=*src++;
1391             if(ch <= 0x7f && ch != 0) {
1392                 *pDest++ = (uint8_t)ch;
1393             } else if(ch <= 0x7ff) {
1394                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1395                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1396             } else {
1397                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1398                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1399                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1400             }
1401         } while(--count > 0);
1402     }
1403 
1404     while(src<pSrcLimit) {
1405         ch=*src++;
1406         if(ch <= 0x7f && ch != 0) {
1407             if(pDest<pDestLimit) {
1408                 *pDest++ = (uint8_t)ch;
1409             } else {
1410                 reqLength = 1;
1411                 break;
1412             }
1413         } else if(ch <= 0x7ff) {
1414             if((pDestLimit - pDest) >= 2) {
1415                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1416                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1417             } else {
1418                 reqLength = 2;
1419                 break;
1420             }
1421         } else {
1422             if((pDestLimit - pDest) >= 3) {
1423                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1424                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1425                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1426             } else {
1427                 reqLength = 3;
1428                 break;
1429             }
1430         }
1431     }
1432     while(src<pSrcLimit) {
1433         ch=*src++;
1434         if(ch <= 0x7f && ch != 0) {
1435             ++reqLength;
1436         } else if(ch<=0x7ff) {
1437             reqLength+=2;
1438         } else {
1439             reqLength+=3;
1440         }
1441     }
1442 
1443     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1444     if(pDestLength){
1445         *pDestLength = reqLength;
1446     }
1447 
1448     /* Terminate the buffer */
1449     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1450     return dest;
1451 }
1452