1 /*
2  * Copyright 2001-2004 Unicode, Inc.
3  *
4  * Disclaimer
5  *
6  * This source code is provided as is by Unicode, Inc. No claims are
7  * made as to fitness for any particular purpose. No warranties of any
8  * kind are expressed or implied. The recipient agrees to determine
9  * applicability of information provided. If this file has been
10  * purchased on magnetic or optical media from Unicode, Inc., the
11  * sole remedy for any claim will be exchange of defective media
12  * within 90 days of receipt.
13  *
14  * Limitations on Rights to Redistribute This Code
15  *
16  * Unicode, Inc. hereby grants the right to freely use the information
17  * supplied in this file in the creation of products supporting the
18  * Unicode Standard, and to make copies of this file in any form
19  * for internal or external distribution as long as this notice
20  * remains attached.
21  */
22 
23 /* ---------------------------------------------------------------------
24 
25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26     Author: Mark E. Davis, 1994.
27     Rev History: Rick McGowan, fixes & updates May 2001.
28     Sept 2001: fixed const & error conditions per
29 	mods suggested by S. Parent & A. Lillich.
30     June 2002: Tim Dodd added detection and handling of incomplete
31 	source sequences, enhanced error detection, added casts
32 	to eliminate compiler warnings.
33     July 2003: slight mods to back out aggressive FFFE detection.
34     Jan 2004: updated switches in from-UTF8 conversions.
35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36 
37     See the header file "ConvertUTF.h" for complete documentation.
38 
39 ------------------------------------------------------------------------ */
40 
41 
42 #include "ConvertUTF.h"
43 #ifdef CVTUTF_DEBUG
44 #include <stdio.h>
45 #endif
46 
47 static const int halfShift  = 10; /* used for shifting by 10 bits */
48 
49 static const UTF32 halfBase = 0x0010000UL;
50 static const UTF32 halfMask = 0x3FFUL;
51 
52 #define UNI_SUR_HIGH_START  (UTF32)0xD800
53 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
54 #define UNI_SUR_LOW_START   (UTF32)0xDC00
55 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
56 #define false	   0
57 #define true	    1
58 
59 /* --------------------------------------------------------------------- */
60 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)61 ConversionResult ConvertUTF32toUTF16 (
62 	const UTF32** sourceStart, const UTF32* sourceEnd,
63 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
64     ConversionResult result = conversionOK;
65     const UTF32* source = *sourceStart;
66     UTF16* target = *targetStart;
67     while (source < sourceEnd) {
68 	UTF32 ch;
69 	if (target >= targetEnd) {
70 	    result = targetExhausted; break;
71 	}
72 	ch = *source++;
73 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76 		if (flags == strictConversion) {
77 		    --source; /* return to the illegal value itself */
78 		    result = sourceIllegal;
79 		    break;
80 		} else {
81 		    *target++ = UNI_REPLACEMENT_CHAR;
82 		}
83 	    } else {
84 		*target++ = (UTF16)ch; /* normal case */
85 	    }
86 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
87 	    if (flags == strictConversion) {
88 		result = sourceIllegal;
89 	    } else {
90 		*target++ = UNI_REPLACEMENT_CHAR;
91 	    }
92 	} else {
93 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
94 	    if (target + 1 >= targetEnd) {
95 		--source; /* Back up source pointer! */
96 		result = targetExhausted; break;
97 	    }
98 	    ch -= halfBase;
99 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101 	}
102     }
103     *sourceStart = source;
104     *targetStart = target;
105     return result;
106 }
107 
108 /* --------------------------------------------------------------------- */
109 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)110 ConversionResult ConvertUTF16toUTF32 (
111 	const UTF16** sourceStart, const UTF16* sourceEnd,
112 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
113     ConversionResult result = conversionOK;
114     const UTF16* source = *sourceStart;
115     UTF32* target = *targetStart;
116     UTF32 ch, ch2;
117     while (source < sourceEnd) {
118 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
119 	ch = *source++;
120 	/* If we have a surrogate pair, convert to UTF32 first. */
121 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
122 	    /* If the 16 bits following the high surrogate are in the source buffer... */
123 	    if (source < sourceEnd) {
124 		ch2 = *source;
125 		/* If it's a low surrogate, convert to UTF32. */
126 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
127 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
128 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
129 		    ++source;
130 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
131 		    --source; /* return to the illegal value itself */
132 		    result = sourceIllegal;
133 		    break;
134 		}
135 	    } else { /* We don't have the 16 bits following the high surrogate. */
136 		--source; /* return to the high surrogate */
137 		result = sourceExhausted;
138 		break;
139 	    }
140 	} else if (flags == strictConversion) {
141 	    /* UTF-16 surrogate values are illegal in UTF-32 */
142 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
143 		--source; /* return to the illegal value itself */
144 		result = sourceIllegal;
145 		break;
146 	    }
147 	}
148 	if (target >= targetEnd) {
149 	    source = oldSource; /* Back up source pointer! */
150 	    result = targetExhausted; break;
151 	}
152 	*target++ = ch;
153     }
154     *sourceStart = source;
155     *targetStart = target;
156 #ifdef CVTUTF_DEBUG
157 if (result == sourceIllegal) {
158     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
159     fflush(stderr);
160 }
161 #endif
162     return result;
163 }
164 
165 /* --------------------------------------------------------------------- */
166 
/*
 * trailingBytesForUTF8[b] is the number of bytes expected to follow a
 * UTF-8 sequence whose first byte is b.
 * Entries of 4 and 5 (first bytes 0xF8..0xFF) describe 5- and 6-byte
 * forms.  *Legal* UTF-8 never has 4 or 5 trailing bytes; those entries
 * are kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
184 
185 /*
186  * Magic values subtracted from a buffer value during UTF8 conversion.
187  * This table contains as many values as there might be trailing bytes
188  * in a UTF-8 sequence.
189  */
190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
191 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
192 
193 /*
194  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
195  * into the first byte, depending on how many bytes follow.  There are
196  * as many entries in this table as there are UTF-8 sequence types.
197  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
198  * for *legal* UTF-8 will be 4 or fewer bytes total.
199  */
200 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
201 
202 /* --------------------------------------------------------------------- */
203 
204 /* The interface converts a whole buffer to avoid function-call overhead.
205  * Constants have been gathered. Loops & conditionals have been removed as
206  * much as possible for efficiency, in favor of drop-through switches.
207  * (See "Note A" at the bottom of the file for equivalent code.)
208  * If your compiler supports it, the "isLegalUTF8" call can be turned
209  * into an inline function.
210  */
211 
212 /* --------------------------------------------------------------------- */
213 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)214 ConversionResult ConvertUTF16toUTF8 (
215 	const UTF16** sourceStart, const UTF16* sourceEnd,
216 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
217     ConversionResult result = conversionOK;
218     const UTF16* source = *sourceStart;
219     UTF8* target = *targetStart;
220     while (source < sourceEnd) {
221 	UTF32 ch;
222 	unsigned short bytesToWrite = 0;
223 	const UTF32 byteMask = 0xBF;
224 	const UTF32 byteMark = 0x80;
225 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
226 	ch = *source++;
227 	/* If we have a surrogate pair, convert to UTF32 first. */
228 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
229 	    /* If the 16 bits following the high surrogate are in the source buffer... */
230 	    if (source < sourceEnd) {
231 		UTF32 ch2 = *source;
232 		/* If it's a low surrogate, convert to UTF32. */
233 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
234 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
235 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
236 		    ++source;
237 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
238 		    --source; /* return to the illegal value itself */
239 		    result = sourceIllegal;
240 		    break;
241 		}
242 	    } else { /* We don't have the 16 bits following the high surrogate. */
243 		--source; /* return to the high surrogate */
244 		result = sourceExhausted;
245 		break;
246 	    }
247 	} else if (flags == strictConversion) {
248 	    /* UTF-16 surrogate values are illegal in UTF-32 */
249 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
250 		--source; /* return to the illegal value itself */
251 		result = sourceIllegal;
252 		break;
253 	    }
254 	}
255 	/* Figure out how many bytes the result will require */
256 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
257 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
258 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
259 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
260 	} else {			    bytesToWrite = 3;
261 					    ch = UNI_REPLACEMENT_CHAR;
262 	}
263 
264 	target += bytesToWrite;
265 	if (target > targetEnd) {
266 	    source = oldSource; /* Back up source pointer! */
267 	    target -= bytesToWrite; result = targetExhausted; break;
268 	}
269 	switch (bytesToWrite) { /* note: everything falls through. */
270 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
271 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
272 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
273 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
274 	}
275 	target += bytesToWrite;
276     }
277     *sourceStart = source;
278     *targetStart = target;
279     return result;
280 }
281 
282 /* --------------------------------------------------------------------- */
283 
284 /*
285  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
286  * This must be called with the length pre-determined by the first byte.
287  * If not calling this from ConvertUTF8to*, then the length can be set by:
288  *  length = trailingBytesForUTF8[*source]+1;
289  * and the sequence is illegal right away if there aren't that many bytes
290  * available.
291  * If presented with a length > 4, this returns false.  The Unicode
292  * definition of UTF-8 goes up to 4-byte sequences.
293  */
294 
isLegalUTF8(const UTF8 * source,int length)295 static Boolean isLegalUTF8(const UTF8 *source, int length) {
296     UTF8 a;
297     const UTF8 *srcptr = source+length;
298     switch (length) {
299     default: return false;
300 	/* Everything else falls through when "true"... */
301     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
302     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
303     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
304 
305 	switch (*source) {
306 	    /* no fall-through in this inner switch */
307 	    case 0xE0: if (a < 0xA0) return false; break;
308 	    case 0xED: if (a > 0x9F) return false; break;
309 	    case 0xF0: if (a < 0x90) return false; break;
310 	    case 0xF4: if (a > 0x8F) return false; break;
311 	    default:   if (a < 0x80) return false;
312 	}
313 
314     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
315     }
316     if (*source > 0xF4) return false;
317     return true;
318 }
319 
320 /* --------------------------------------------------------------------- */
321 
322 /*
323  * Exported function to return whether a UTF-8 sequence is legal or not.
324  * This is not used here; it's just exported.
325  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)326 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
327     int length = trailingBytesForUTF8[*source]+1;
328     if (source+length > sourceEnd) {
329 	return false;
330     }
331     return isLegalUTF8(source, length);
332 }
333 
334 /* --------------------------------------------------------------------- */
335 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)336 ConversionResult ConvertUTF8toUTF16 (
337 	const UTF8** sourceStart, const UTF8* sourceEnd,
338 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
339     ConversionResult result = conversionOK;
340     const UTF8* source = *sourceStart;
341     UTF16* target = *targetStart;
342     while (source < sourceEnd) {
343 	UTF32 ch = 0;
344 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
345 	if (source + extraBytesToRead >= sourceEnd) {
346 	    result = sourceExhausted; break;
347 	}
348 	/* Do this check whether lenient or strict */
349 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
350 	    result = sourceIllegal;
351 	    break;
352 	}
353 	/*
354 	 * The cases all fall through. See "Note A" below.
355 	 */
356 	switch (extraBytesToRead) {
357 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
358 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
359 	    case 3: ch += *source++; ch <<= 6;
360 	    case 2: ch += *source++; ch <<= 6;
361 	    case 1: ch += *source++; ch <<= 6;
362 	    case 0: ch += *source++;
363 	}
364 	ch -= offsetsFromUTF8[extraBytesToRead];
365 
366 	if (target >= targetEnd) {
367 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
368 	    result = targetExhausted; break;
369 	}
370 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
371 	    /* UTF-16 surrogate values are illegal in UTF-32 */
372 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
373 		if (flags == strictConversion) {
374 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
375 		    result = sourceIllegal;
376 		    break;
377 		} else {
378 		    *target++ = UNI_REPLACEMENT_CHAR;
379 		}
380 	    } else {
381 		*target++ = (UTF16)ch; /* normal case */
382 	    }
383 	} else if (ch > UNI_MAX_UTF16) {
384 	    if (flags == strictConversion) {
385 		result = sourceIllegal;
386 		source -= (extraBytesToRead+1); /* return to the start */
387 		break; /* Bail out; shouldn't continue */
388 	    } else {
389 		*target++ = UNI_REPLACEMENT_CHAR;
390 	    }
391 	} else {
392 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
393 	    if (target + 1 >= targetEnd) {
394 		source -= (extraBytesToRead+1); /* Back up source pointer! */
395 		result = targetExhausted; break;
396 	    }
397 	    ch -= halfBase;
398 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
399 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
400 	}
401     }
402     *sourceStart = source;
403     *targetStart = target;
404     return result;
405 }
406 
407 /* --------------------------------------------------------------------- */
408 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)409 ConversionResult ConvertUTF32toUTF8 (
410 	const UTF32** sourceStart, const UTF32* sourceEnd,
411 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
412     ConversionResult result = conversionOK;
413     const UTF32* source = *sourceStart;
414     UTF8* target = *targetStart;
415     while (source < sourceEnd) {
416 	UTF32 ch;
417 	unsigned short bytesToWrite = 0;
418 	const UTF32 byteMask = 0xBF;
419 	const UTF32 byteMark = 0x80;
420 	ch = *source++;
421 	if (flags == strictConversion ) {
422 	    /* UTF-16 surrogate values are illegal in UTF-32 */
423 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
424 		--source; /* return to the illegal value itself */
425 		result = sourceIllegal;
426 		break;
427 	    }
428 	}
429 	/*
430 	 * Figure out how many bytes the result will require. Turn any
431 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
432 	 */
433 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
434 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
435 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
436 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
437 	} else {			    bytesToWrite = 3;
438 					    ch = UNI_REPLACEMENT_CHAR;
439 					    result = sourceIllegal;
440 	}
441 
442 	target += bytesToWrite;
443 	if (target > targetEnd) {
444 	    --source; /* Back up source pointer! */
445 	    target -= bytesToWrite; result = targetExhausted; break;
446 	}
447 	switch (bytesToWrite) { /* note: everything falls through. */
448 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
449 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
450 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
451 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
452 	}
453 	target += bytesToWrite;
454     }
455     *sourceStart = source;
456     *targetStart = target;
457     return result;
458 }
459 
460 /* --------------------------------------------------------------------- */
461 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)462 ConversionResult ConvertUTF8toUTF32 (
463 	const UTF8** sourceStart, const UTF8* sourceEnd,
464 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
465     ConversionResult result = conversionOK;
466     const UTF8* source = *sourceStart;
467     UTF32* target = *targetStart;
468     while (source < sourceEnd) {
469 	UTF32 ch = 0;
470 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
471 	if (source + extraBytesToRead >= sourceEnd) {
472 	    result = sourceExhausted; break;
473 	}
474 	/* Do this check whether lenient or strict */
475 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
476 	    result = sourceIllegal;
477 	    break;
478 	}
479 	/*
480 	 * The cases all fall through. See "Note A" below.
481 	 */
482 	switch (extraBytesToRead) {
483 	    case 5: ch += *source++; ch <<= 6;
484 	    case 4: ch += *source++; ch <<= 6;
485 	    case 3: ch += *source++; ch <<= 6;
486 	    case 2: ch += *source++; ch <<= 6;
487 	    case 1: ch += *source++; ch <<= 6;
488 	    case 0: ch += *source++;
489 	}
490 	ch -= offsetsFromUTF8[extraBytesToRead];
491 
492 	if (target >= targetEnd) {
493 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
494 	    result = targetExhausted; break;
495 	}
496 	if (ch <= UNI_MAX_LEGAL_UTF32) {
497 	    /*
498 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
499 	     * over Plane 17 (> 0x10FFFF) is illegal.
500 	     */
501 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
502 		if (flags == strictConversion) {
503 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
504 		    result = sourceIllegal;
505 		    break;
506 		} else {
507 		    *target++ = UNI_REPLACEMENT_CHAR;
508 		}
509 	    } else {
510 		*target++ = ch;
511 	    }
512 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
513 	    result = sourceIllegal;
514 	    *target++ = UNI_REPLACEMENT_CHAR;
515 	}
516     }
517     *sourceStart = source;
518     *targetStart = target;
519     return result;
520 }
521 
522 /* ---------------------------------------------------------------------
523 
524     Note A.
525     The fall-through switches in UTF-8 reading code save a
526     temp variable, some decrements & conditionals.  The switches
527     are equivalent to the following loop:
528 	{
529 	    int tmpBytesToRead = extraBytesToRead+1;
530 	    do {
531 		ch += *source++;
532 		--tmpBytesToRead;
533 		if (tmpBytesToRead) ch <<= 6;
534 	    } while (tmpBytesToRead > 0);
535 	}
536     In UTF-8 writing code, the switches on "bytesToWrite" are
537     similarly unrolled loops.
538 
539    --------------------------------------------------------------------- */
540