1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------=*/
8 /*
9  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10  * Distributed under the Terms of Use in
11  * http://www.unicode.org/copyright.html.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining
14  * a copy of the Unicode data files and any associated documentation
15  * (the "Data Files") or Unicode software and any associated documentation
16  * (the "Software") to deal in the Data Files or Software
17  * without restriction, including without limitation the rights to use,
18  * copy, modify, merge, publish, distribute, and/or sell copies of
19  * the Data Files or Software, and to permit persons to whom the Data Files
20  * or Software are furnished to do so, provided that
21  * (a) this copyright and permission notice appear with all copies
22  * of the Data Files or Software,
23  * (b) this copyright and permission notice appear in associated
24  * documentation, and
25  * (c) there is clear notice in each modified Data File or in the Software
26  * as well as in the documentation associated with the Data File(s) or
27  * Software that the data or software has been modified.
28  *
29  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
39  *
40  * Except as contained in this notice, the name of a copyright holder
41  * shall not be used in advertising or otherwise to promote the sale,
42  * use or other dealings in these Data Files or Software without prior
43  * written authorization of the copyright holder.
44  */
45 
46 /* ---------------------------------------------------------------------
47 
48     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
49     Author: Mark E. Davis, 1994.
50     Rev History: Rick McGowan, fixes & updates May 2001.
51     Sept 2001: fixed const & error conditions per
52         mods suggested by S. Parent & A. Lillich.
53     June 2002: Tim Dodd added detection and handling of incomplete
54         source sequences, enhanced error detection, added casts
55         to eliminate compiler warnings.
56     July 2003: slight mods to back out aggressive FFFE detection.
57     Jan 2004: updated switches in from-UTF8 conversions.
58     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
59 
60     See the header file "ConvertUTF.h" for complete documentation.
61 
62 ------------------------------------------------------------------------ */
63 
64 #include "llvm/Support/ConvertUTF.h"
65 #ifdef CVTUTF_DEBUG
66 #include <stdio.h>
67 #endif
68 #include <assert.h>
69 
70 /*
71  * This code extensively uses fall-through switches.
72  * Keep the compiler from warning about that.
73  */
74 #if defined(__clang__) && defined(__has_warning)
75 # if __has_warning("-Wimplicit-fallthrough")
76 #  define ConvertUTF_DISABLE_WARNINGS \
77     _Pragma("clang diagnostic push")  \
78     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
79 #  define ConvertUTF_RESTORE_WARNINGS \
80     _Pragma("clang diagnostic pop")
81 # endif
82 #elif defined(__GNUC__) && __GNUC__ > 6
83 # define ConvertUTF_DISABLE_WARNINGS \
84    _Pragma("GCC diagnostic push")    \
85    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
86 # define ConvertUTF_RESTORE_WARNINGS \
87    _Pragma("GCC diagnostic pop")
88 #endif
89 #ifndef ConvertUTF_DISABLE_WARNINGS
90 # define ConvertUTF_DISABLE_WARNINGS
91 #endif
92 #ifndef ConvertUTF_RESTORE_WARNINGS
93 # define ConvertUTF_RESTORE_WARNINGS
94 #endif
95 
96 ConvertUTF_DISABLE_WARNINGS
97 
98 namespace llvm {
99 
100 static const int halfShift  = 10; /* used for shifting by 10 bits */
101 
102 static const UTF32 halfBase = 0x0010000UL;
103 static const UTF32 halfMask = 0x3FFUL;
104 
105 #define UNI_SUR_HIGH_START  (UTF32)0xD800
106 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
107 #define UNI_SUR_LOW_START   (UTF32)0xDC00
108 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
109 
110 /* --------------------------------------------------------------------- */
111 
/*
 * Number of trailing bytes expected after a given first byte of a UTF-8
 * sequence; index the table with the first byte's value.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes.  Those entries
 * (first bytes 0xF8-0xFF) are kept for anyone who wants to handle the longer
 * forms that earlier definitions of the encoding allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
129 
130 /*
131  * Magic values subtracted from a buffer value during UTF8 conversion.
132  * This table contains as many values as there might be trailing bytes
133  * in a UTF-8 sequence.
134  */
135 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
136                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
137 
138 /*
139  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
140  * into the first byte, depending on how many bytes follow.  There are
141  * as many entries in this table as there are UTF-8 sequence types.
142  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
143  * for *legal* UTF-8 will be 4 or fewer bytes total.
144  */
145 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
146 
147 /* --------------------------------------------------------------------- */
148 
149 /* The interface converts a whole buffer to avoid function-call overhead.
150  * Constants have been gathered. Loops & conditionals have been removed as
151  * much as possible for efficiency, in favor of drop-through switches.
152  * (See "Note A" at the bottom of the file for equivalent code.)
153  * If your compiler supports it, the "isLegalUTF8" call can be turned
154  * into an inline function.
155  */
156 
157 
158 /* --------------------------------------------------------------------- */
159 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)160 ConversionResult ConvertUTF32toUTF16 (
161         const UTF32** sourceStart, const UTF32* sourceEnd,
162         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
163     ConversionResult result = conversionOK;
164     const UTF32* source = *sourceStart;
165     UTF16* target = *targetStart;
166     while (source < sourceEnd) {
167         UTF32 ch;
168         if (target >= targetEnd) {
169             result = targetExhausted; break;
170         }
171         ch = *source++;
172         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
173             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
174             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
175                 if (flags == strictConversion) {
176                     --source; /* return to the illegal value itself */
177                     result = sourceIllegal;
178                     break;
179                 } else {
180                     *target++ = UNI_REPLACEMENT_CHAR;
181                 }
182             } else {
183                 *target++ = (UTF16)ch; /* normal case */
184             }
185         } else if (ch > UNI_MAX_LEGAL_UTF32) {
186             if (flags == strictConversion) {
187                 result = sourceIllegal;
188             } else {
189                 *target++ = UNI_REPLACEMENT_CHAR;
190             }
191         } else {
192             /* target is a character in range 0xFFFF - 0x10FFFF. */
193             if (target + 1 >= targetEnd) {
194                 --source; /* Back up source pointer! */
195                 result = targetExhausted; break;
196             }
197             ch -= halfBase;
198             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
199             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
200         }
201     }
202     *sourceStart = source;
203     *targetStart = target;
204     return result;
205 }
206 
207 /* --------------------------------------------------------------------- */
208 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)209 ConversionResult ConvertUTF16toUTF32 (
210         const UTF16** sourceStart, const UTF16* sourceEnd,
211         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
212     ConversionResult result = conversionOK;
213     const UTF16* source = *sourceStart;
214     UTF32* target = *targetStart;
215     UTF32 ch, ch2;
216     while (source < sourceEnd) {
217         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
218         ch = *source++;
219         /* If we have a surrogate pair, convert to UTF32 first. */
220         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
221             /* If the 16 bits following the high surrogate are in the source buffer... */
222             if (source < sourceEnd) {
223                 ch2 = *source;
224                 /* If it's a low surrogate, convert to UTF32. */
225                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
226                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
227                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
228                     ++source;
229                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
230                     --source; /* return to the illegal value itself */
231                     result = sourceIllegal;
232                     break;
233                 }
234             } else { /* We don't have the 16 bits following the high surrogate. */
235                 --source; /* return to the high surrogate */
236                 result = sourceExhausted;
237                 break;
238             }
239         } else if (flags == strictConversion) {
240             /* UTF-16 surrogate values are illegal in UTF-32 */
241             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
242                 --source; /* return to the illegal value itself */
243                 result = sourceIllegal;
244                 break;
245             }
246         }
247         if (target >= targetEnd) {
248             source = oldSource; /* Back up source pointer! */
249             result = targetExhausted; break;
250         }
251         *target++ = ch;
252     }
253     *sourceStart = source;
254     *targetStart = target;
255 #ifdef CVTUTF_DEBUG
256 if (result == sourceIllegal) {
257     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
258     fflush(stderr);
259 }
260 #endif
261     return result;
262 }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)263 ConversionResult ConvertUTF16toUTF8 (
264         const UTF16** sourceStart, const UTF16* sourceEnd,
265         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
266     ConversionResult result = conversionOK;
267     const UTF16* source = *sourceStart;
268     UTF8* target = *targetStart;
269     while (source < sourceEnd) {
270         UTF32 ch;
271         unsigned short bytesToWrite = 0;
272         const UTF32 byteMask = 0xBF;
273         const UTF32 byteMark = 0x80;
274         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
275         ch = *source++;
276         /* If we have a surrogate pair, convert to UTF32 first. */
277         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
278             /* If the 16 bits following the high surrogate are in the source buffer... */
279             if (source < sourceEnd) {
280                 UTF32 ch2 = *source;
281                 /* If it's a low surrogate, convert to UTF32. */
282                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
283                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
284                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
285                     ++source;
286                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
287                     --source; /* return to the illegal value itself */
288                     result = sourceIllegal;
289                     break;
290                 }
291             } else { /* We don't have the 16 bits following the high surrogate. */
292                 --source; /* return to the high surrogate */
293                 result = sourceExhausted;
294                 break;
295             }
296         } else if (flags == strictConversion) {
297             /* UTF-16 surrogate values are illegal in UTF-32 */
298             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
299                 --source; /* return to the illegal value itself */
300                 result = sourceIllegal;
301                 break;
302             }
303         }
304         /* Figure out how many bytes the result will require */
305         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
306         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
307         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
308         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
309         } else {                            bytesToWrite = 3;
310                                             ch = UNI_REPLACEMENT_CHAR;
311         }
312 
313         target += bytesToWrite;
314         if (target > targetEnd) {
315             source = oldSource; /* Back up source pointer! */
316             target -= bytesToWrite; result = targetExhausted; break;
317         }
318         switch (bytesToWrite) { /* note: everything falls through. */
319             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
320             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
321             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
322             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
323         }
324         target += bytesToWrite;
325     }
326     *sourceStart = source;
327     *targetStart = target;
328     return result;
329 }
330 
331 /* --------------------------------------------------------------------- */
332 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)333 ConversionResult ConvertUTF32toUTF8 (
334         const UTF32** sourceStart, const UTF32* sourceEnd,
335         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
336     ConversionResult result = conversionOK;
337     const UTF32* source = *sourceStart;
338     UTF8* target = *targetStart;
339     while (source < sourceEnd) {
340         UTF32 ch;
341         unsigned short bytesToWrite = 0;
342         const UTF32 byteMask = 0xBF;
343         const UTF32 byteMark = 0x80;
344         ch = *source++;
345         if (flags == strictConversion ) {
346             /* UTF-16 surrogate values are illegal in UTF-32 */
347             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
348                 --source; /* return to the illegal value itself */
349                 result = sourceIllegal;
350                 break;
351             }
352         }
353         /*
354          * Figure out how many bytes the result will require. Turn any
355          * illegally large UTF32 things (> Plane 17) into replacement chars.
356          */
357         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
358         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
359         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
360         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
361         } else {                            bytesToWrite = 3;
362                                             ch = UNI_REPLACEMENT_CHAR;
363                                             result = sourceIllegal;
364         }
365 
366         target += bytesToWrite;
367         if (target > targetEnd) {
368             --source; /* Back up source pointer! */
369             target -= bytesToWrite; result = targetExhausted; break;
370         }
371         switch (bytesToWrite) { /* note: everything falls through. */
372             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
373             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
374             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
375             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
376         }
377         target += bytesToWrite;
378     }
379     *sourceStart = source;
380     *targetStart = target;
381     return result;
382 }
383 
384 /* --------------------------------------------------------------------- */
385 
386 /*
387  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
388  * This must be called with the length pre-determined by the first byte.
389  * If not calling this from ConvertUTF8to*, then the length can be set by:
390  *  length = trailingBytesForUTF8[*source]+1;
391  * and the sequence is illegal right away if there aren't that many bytes
392  * available.
393  * If presented with a length > 4, this returns false.  The Unicode
394  * definition of UTF-8 goes up to 4-byte sequences.
395  */
396 
isLegalUTF8(const UTF8 * source,int length)397 static Boolean isLegalUTF8(const UTF8 *source, int length) {
398     UTF8 a;
399     const UTF8 *srcptr = source+length;
400     switch (length) {
401     default: return false;
402         /* Everything else falls through when "true"... */
403     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
404     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
405     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
406 
407         switch (*source) {
408             /* no fall-through in this inner switch */
409             case 0xE0: if (a < 0xA0) return false; break;
410             case 0xED: if (a > 0x9F) return false; break;
411             case 0xF0: if (a < 0x90) return false; break;
412             case 0xF4: if (a > 0x8F) return false; break;
413             default:   if (a < 0x80) return false;
414         }
415 
416     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
417     }
418     if (*source > 0xF4) return false;
419     return true;
420 }
421 
422 /* --------------------------------------------------------------------- */
423 
424 /*
425  * Exported function to return whether a UTF-8 sequence is legal or not.
426  * This is not used here; it's just exported.
427  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)428 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
429     int length = trailingBytesForUTF8[*source]+1;
430     if (length > sourceEnd - source) {
431         return false;
432     }
433     return isLegalUTF8(source, length);
434 }
435 
436 /*
437  * Exported function to return the size of the first utf-8 code unit sequence,
438  * Or 0 if the sequence is not valid;
439  */
getUTF8SequenceSize(const UTF8 * source,const UTF8 * sourceEnd)440 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
441   int length = trailingBytesForUTF8[*source] + 1;
442   return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443                                                                        : 0;
444 }
445 
446 /* --------------------------------------------------------------------- */
447 
448 static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)449 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
450                                           const UTF8 *sourceEnd) {
451   UTF8 b1, b2, b3;
452 
453   assert(!isLegalUTF8Sequence(source, sourceEnd));
454 
455   /*
456    * Unicode 6.3.0, D93b:
457    *
458    *   Maximal subpart of an ill-formed subsequence: The longest code unit
459    *   subsequence starting at an unconvertible offset that is either:
460    *   a. the initial subsequence of a well-formed code unit sequence, or
461    *   b. a subsequence of length one.
462    */
463 
464   if (source == sourceEnd)
465     return 0;
466 
467   /*
468    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
469    * Byte Sequences.
470    */
471 
472   b1 = *source;
473   ++source;
474   if (b1 >= 0xC2 && b1 <= 0xDF) {
475     /*
476      * First byte is valid, but we know that this code unit sequence is
477      * invalid, so the maximal subpart has to end after the first byte.
478      */
479     return 1;
480   }
481 
482   if (source == sourceEnd)
483     return 1;
484 
485   b2 = *source;
486   ++source;
487 
488   if (b1 == 0xE0) {
489     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
490   }
491   if (b1 >= 0xE1 && b1 <= 0xEC) {
492     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
493   }
494   if (b1 == 0xED) {
495     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
496   }
497   if (b1 >= 0xEE && b1 <= 0xEF) {
498     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
499   }
500   if (b1 == 0xF0) {
501     if (b2 >= 0x90 && b2 <= 0xBF) {
502       if (source == sourceEnd)
503         return 2;
504 
505       b3 = *source;
506       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
507     }
508     return 1;
509   }
510   if (b1 >= 0xF1 && b1 <= 0xF3) {
511     if (b2 >= 0x80 && b2 <= 0xBF) {
512       if (source == sourceEnd)
513         return 2;
514 
515       b3 = *source;
516       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
517     }
518     return 1;
519   }
520   if (b1 == 0xF4) {
521     if (b2 >= 0x80 && b2 <= 0x8F) {
522       if (source == sourceEnd)
523         return 2;
524 
525       b3 = *source;
526       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
527     }
528     return 1;
529   }
530 
531   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
532   /*
533    * There are no valid sequences that start with these bytes.  Maximal subpart
534    * is defined to have length 1 in these cases.
535    */
536   return 1;
537 }
538 
539 /* --------------------------------------------------------------------- */
540 
541 /*
542  * Exported function to return the total number of bytes in a codepoint
543  * represented in UTF-8, given the value of the first byte.
544  */
getNumBytesForUTF8(UTF8 first)545 unsigned getNumBytesForUTF8(UTF8 first) {
546   return trailingBytesForUTF8[first] + 1;
547 }
548 
549 /* --------------------------------------------------------------------- */
550 
551 /*
552  * Exported function to return whether a UTF-8 string is legal or not.
553  * This is not used here; it's just exported.
554  */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)555 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
556     while (*source != sourceEnd) {
557         int length = trailingBytesForUTF8[**source] + 1;
558         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
559             return false;
560         *source += length;
561     }
562     return true;
563 }
564 
565 /* --------------------------------------------------------------------- */
566 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)567 ConversionResult ConvertUTF8toUTF16 (
568         const UTF8** sourceStart, const UTF8* sourceEnd,
569         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
570     ConversionResult result = conversionOK;
571     const UTF8* source = *sourceStart;
572     UTF16* target = *targetStart;
573     while (source < sourceEnd) {
574         UTF32 ch = 0;
575         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
576         if (extraBytesToRead >= sourceEnd - source) {
577             result = sourceExhausted; break;
578         }
579         /* Do this check whether lenient or strict */
580         if (!isLegalUTF8(source, extraBytesToRead+1)) {
581             result = sourceIllegal;
582             break;
583         }
584         /*
585          * The cases all fall through. See "Note A" below.
586          */
587         switch (extraBytesToRead) {
588             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
589             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
590             case 3: ch += *source++; ch <<= 6;
591             case 2: ch += *source++; ch <<= 6;
592             case 1: ch += *source++; ch <<= 6;
593             case 0: ch += *source++;
594         }
595         ch -= offsetsFromUTF8[extraBytesToRead];
596 
597         if (target >= targetEnd) {
598             source -= (extraBytesToRead+1); /* Back up source pointer! */
599             result = targetExhausted; break;
600         }
601         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
602             /* UTF-16 surrogate values are illegal in UTF-32 */
603             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
604                 if (flags == strictConversion) {
605                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
606                     result = sourceIllegal;
607                     break;
608                 } else {
609                     *target++ = UNI_REPLACEMENT_CHAR;
610                 }
611             } else {
612                 *target++ = (UTF16)ch; /* normal case */
613             }
614         } else if (ch > UNI_MAX_UTF16) {
615             if (flags == strictConversion) {
616                 result = sourceIllegal;
617                 source -= (extraBytesToRead+1); /* return to the start */
618                 break; /* Bail out; shouldn't continue */
619             } else {
620                 *target++ = UNI_REPLACEMENT_CHAR;
621             }
622         } else {
623             /* target is a character in range 0xFFFF - 0x10FFFF. */
624             if (target + 1 >= targetEnd) {
625                 source -= (extraBytesToRead+1); /* Back up source pointer! */
626                 result = targetExhausted; break;
627             }
628             ch -= halfBase;
629             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
630             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
631         }
632     }
633     *sourceStart = source;
634     *targetStart = target;
635     return result;
636 }
637 
638 /* --------------------------------------------------------------------- */
639 
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)640 static ConversionResult ConvertUTF8toUTF32Impl(
641         const UTF8** sourceStart, const UTF8* sourceEnd,
642         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
643         Boolean InputIsPartial) {
644     ConversionResult result = conversionOK;
645     const UTF8* source = *sourceStart;
646     UTF32* target = *targetStart;
647     while (source < sourceEnd) {
648         UTF32 ch = 0;
649         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
650         if (extraBytesToRead >= sourceEnd - source) {
651             if (flags == strictConversion || InputIsPartial) {
652                 result = sourceExhausted;
653                 break;
654             } else {
655                 result = sourceIllegal;
656 
657                 /*
658                  * Replace the maximal subpart of ill-formed sequence with
659                  * replacement character.
660                  */
661                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
662                                                                     sourceEnd);
663                 *target++ = UNI_REPLACEMENT_CHAR;
664                 continue;
665             }
666         }
667         if (target >= targetEnd) {
668             result = targetExhausted; break;
669         }
670 
671         /* Do this check whether lenient or strict */
672         if (!isLegalUTF8(source, extraBytesToRead+1)) {
673             result = sourceIllegal;
674             if (flags == strictConversion) {
675                 /* Abort conversion. */
676                 break;
677             } else {
678                 /*
679                  * Replace the maximal subpart of ill-formed sequence with
680                  * replacement character.
681                  */
682                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
683                                                                     sourceEnd);
684                 *target++ = UNI_REPLACEMENT_CHAR;
685                 continue;
686             }
687         }
688         /*
689          * The cases all fall through. See "Note A" below.
690          */
691         switch (extraBytesToRead) {
692             case 5: ch += *source++; ch <<= 6;
693             case 4: ch += *source++; ch <<= 6;
694             case 3: ch += *source++; ch <<= 6;
695             case 2: ch += *source++; ch <<= 6;
696             case 1: ch += *source++; ch <<= 6;
697             case 0: ch += *source++;
698         }
699         ch -= offsetsFromUTF8[extraBytesToRead];
700 
701         if (ch <= UNI_MAX_LEGAL_UTF32) {
702             /*
703              * UTF-16 surrogate values are illegal in UTF-32, and anything
704              * over Plane 17 (> 0x10FFFF) is illegal.
705              */
706             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
707                 if (flags == strictConversion) {
708                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
709                     result = sourceIllegal;
710                     break;
711                 } else {
712                     *target++ = UNI_REPLACEMENT_CHAR;
713                 }
714             } else {
715                 *target++ = ch;
716             }
717         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
718             result = sourceIllegal;
719             *target++ = UNI_REPLACEMENT_CHAR;
720         }
721     }
722     *sourceStart = source;
723     *targetStart = target;
724     return result;
725 }
726 
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)727 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
728                                            const UTF8 *sourceEnd,
729                                            UTF32 **targetStart,
730                                            UTF32 *targetEnd,
731                                            ConversionFlags flags) {
732   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
733                                 flags, /*InputIsPartial=*/true);
734 }
735 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)736 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
737                                     const UTF8 *sourceEnd, UTF32 **targetStart,
738                                     UTF32 *targetEnd, ConversionFlags flags) {
739   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
740                                 flags, /*InputIsPartial=*/false);
741 }
742 
743 /* ---------------------------------------------------------------------
744 
745     Note A.
746     The fall-through switches in UTF-8 reading code save a
747     temp variable, some decrements & conditionals.  The switches
748     are equivalent to the following loop:
749         {
750             int tmpBytesToRead = extraBytesToRead+1;
751             do {
752                 ch += *source++;
753                 --tmpBytesToRead;
754                 if (tmpBytesToRead) ch <<= 6;
755             } while (tmpBytesToRead > 0);
756         }
757     In UTF-8 writing code, the switches on "bytesToWrite" are
758     similarly unrolled loops.
759 
760    --------------------------------------------------------------------- */
761 
762 } // namespace llvm
763 
764 ConvertUTF_RESTORE_WARNINGS
765