1 /*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
/* ---------------------------------------------------------------------
24
25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use TSK_UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37 See the header file "ConvertUTF.h" for complete documentation.
38
39 ------------------------------------------------------------------------ */
40
41 /** \file tsk_unicode.c
42 * A local copy of the Unicode conversion routines from unicode.org.
43 */
44
45 #include "tsk_base_i.h"
46
/* Some fundamental constants */
typedef unsigned long UTF32;    /* at least 32 bits */

/*
 * Code point limits.  Every expansion is wrapped in parentheses so that the
 * cast stays attached to its constant wherever the macro is used.
 */
#define TSK_UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)    /* written in place of an unconvertible value */
#define TSK_UNI_MAX_BMP ((UTF32)0x0000FFFF)     /* largest value that fits one UTF-16 unit */
#define TSK_UNI_MAX_UTF16 ((UTF32)0x0010FFFF)   /* largest value a surrogate pair can encode */
#define TSK_UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define TSK_UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)


static const int halfShift = 10;        /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;      /* added back after combining a surrogate pair */
static const UTF32 halfMask = 0x3FFUL;  /* selects the low 10 bits of a value */

/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
#define UNI_SUR_HIGH_START ((UTF32)0xD800)
#define UNI_SUR_HIGH_END ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START ((UTF32)0xDC00)
#define UNI_SUR_LOW_END ((UTF32)0xDFFF)
#define false 0
#define true 1
67
/* --------------------------------------------------------------------- */


/* --------------------------------------------------------------------- */
72
/*
 * Lookup table: index with the first byte of a UTF-8 sequence to obtain the
 * number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes.  Those entries
 * are kept for anyone who may want to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 - 0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 - 0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
98
tsk_stack_free(TSK_STACK * a_tsk_stack)99 /*
100 * Magic values subtracted from a buffer value during UTF8 conversion.
101 * This table contains as many values as there might be trailing bytes
102 * in a UTF-8 sequence.
103 */
104 static const UTF32 offsetsFromUTF8[6] =
105 { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
106 0x03C82080UL, 0xFA082080UL, 0x82082080UL
107 };
108
109
110 /*
111 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
112 * into the first byte, depending on how many bytes follow. There are
113 * as many entries in this table as there are UTF-8 sequence types.
114 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
115 * for *legal* UTF-8 will be 4 or fewer bytes total.
116 */
117 static const UTF8 firstByteMark[7] =
118 { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
119
120
121 /* --------------------------------------------------------------------- */
122
123 /* The interface converts a whole buffer to avoid function-call overhead.
124 * Constants have been gathered. Loops & conditionals have been removed as
125 * much as possible for efficiency, in favor of drop-through switches.
126 * (See "Note A" at the bottom of the file for equivalent code.)
127 * If your compiler supports it, the "isLegalUTF8" call can be turned
128 * into an inline function.
129 */
130
131 /* --------------------------------------------------------------------- */
132
133
134 /**
135 * \ingroup baselib
136 * Convert a UTF-16 string to UTF-8.
137 * @param endian Endian ordering flag of UTF-16 text
138 * @param sourceStart Pointer to pointer to start of UTF-16 string. Will be updated to last char processed.
139 * @param sourceEnd Pointer to one entry past end of UTF-16 string
140 * @param targetStart Pointer to pointer to place where UTF-8 string should be written. Will be updated to next place to write to.
141 * @param targetEnd Pointer to end of UTF-8 buffer
142 * @param flags Flags used during conversion
143 * @returns error code
144 */
145 TSKConversionResult
146 tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart,
147 const UTF16 * sourceEnd, UTF8 ** targetStart,
148 UTF8 * targetEnd, TSKConversionFlags flags)
149 {
150 TSKConversionResult result = TSKconversionOK;
151 const UTF16 *source = *sourceStart;
152 UTF8 *target = *targetStart;
153 while (source < sourceEnd) {
154 UTF32 ch;
155 unsigned short bytesToWrite = 0;
156 const UTF32 byteMask = 0xBF;
157 const UTF32 byteMark = 0x80;
158 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
159 ch = tsk_getu16(endian, (uint8_t *) source);
160 source++;
161
162 /* If we have a surrogate pair, convert to UTF32 first. */
163 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
164 /* If the 16 bits following the high surrogate are in the source buffer... */
165 if (source < sourceEnd) {
166 UTF32 ch2 = tsk_getu16(endian, (uint8_t *) source);
167 ++source;
168
169 /* If it's a low surrogate, convert to UTF32. */
170 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
171 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
172 + (ch2 - UNI_SUR_LOW_START) + halfBase;
173 }
174 else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
175 result = TSKsourceIllegal;
176 break;
177 }
178 // replace with another character
179 else {
180 ch = '^';
181 }
182 }
183 else { /* We don't have the 16 bits following the high surrogate. */
184 --source; /* return to the high surrogate */
185 result = TSKsourceExhausted;
186 break;
187 }
188 }
189 /* UTF-16 surrogate values are illegal in UTF-32 */
190 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
191 if (flags == TSKstrictConversion) {
192 --source; /* return to the illegal value itself */
193 result = TSKsourceIllegal;
194 break;
195 }
196 // replace with another character
197 else {
198 ch = '^';
199 }
200 }
201
202 /* Figure out how many bytes the result will require */
203 if (ch < (UTF32) 0x80) {
204 bytesToWrite = 1;
205 }
206 else if (ch < (UTF32) 0x800) {
207 bytesToWrite = 2;
208 }
209 else if (ch < (UTF32) 0x10000) {
210 bytesToWrite = 3;
211 }
212 else if (ch < (UTF32) 0x110000) {
213 bytesToWrite = 4;
214 }
215 else {
216 bytesToWrite = 3;
217 ch = TSK_UNI_REPLACEMENT_CHAR;
218 }
219
220 target += bytesToWrite;
221 if (target > targetEnd) {
222 source = oldSource; /* Back up source pointer! */
223 target -= bytesToWrite;
224 result = TSKtargetExhausted;
225 break;
226 }
227 switch (bytesToWrite) { /* note: everything falls through. */
228 case 4:
229 *--target = (UTF8) ((ch | byteMark) & byteMask);
230 ch >>= 6;
231 case 3:
232 *--target = (UTF8) ((ch | byteMark) & byteMask);
233 ch >>= 6;
234 case 2:
235 *--target = (UTF8) ((ch | byteMark) & byteMask);
236 ch >>= 6;
237 case 1:
238 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
239 }
240 target += bytesToWrite;
241 }
242 *sourceStart = source;
243 *targetStart = target;
244 return result;
245 }
246
247
248 /**
249 * \ingroup baselib
250 * Convert a UTF-16 string in local endian ordering to UTF-8.
251 * @param sourceStart Pointer to pointer to start of UTF-16 string. Will be updated to last char processed.
252 * @param sourceEnd Pointer to one entry past end of UTF-16 string
253 * @param targetStart Pointer to pointer to place where UTF-8 string should be written. Will be updated to next place to write to.
254 * @param targetEnd Pointer to end of UTF-8 buffer
255 * @param flags Flags used during conversion
256 * @returns error code
257 */
258 TSKConversionResult
259 tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart,
260 const UTF16 * sourceEnd, UTF8 ** targetStart,
261 UTF8 * targetEnd, TSKConversionFlags flags)
262 {
263 TSKConversionResult result = TSKconversionOK;
264 const UTF16 *source = *sourceStart;
265 UTF8 *target = *targetStart;
266 while (source < sourceEnd) {
267 UTF32 ch;
268 unsigned short bytesToWrite = 0;
269 const UTF32 byteMask = 0xBF;
270 const UTF32 byteMark = 0x80;
271 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
272 ch = *source++;
273
274 /* If we have a surrogate pair, convert to UTF32 first. */
275 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
276 /* If the 16 bits following the high surrogate are in the source buffer... */
277 if (source < sourceEnd) {
278 UTF32 ch2 = *source;
279 source++;
280 /* If it's a low surrogate, convert to UTF32. */
281 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
282 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
283 + (ch2 - UNI_SUR_LOW_START) + halfBase;
284 }
285 else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
286 result = TSKsourceIllegal;
287 break;
288 }
289 // replace with another character
290 else {
291 ch = '^';
292 }
293 }
294 else { /* We don't have the 16 bits following the high surrogate. */
295 --source; /* return to the high surrogate */
296 result = TSKsourceExhausted;
297 break;
298 }
299 }
300 /* UTF-16 surrogate values are illegal in UTF-32 */
301 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
302 if (flags == TSKstrictConversion) {
303 --source; /* return to the illegal value itself */
304 result = TSKsourceIllegal;
305 break;
306 }
307 // replace with another character
308 else {
309 ch = '^';
310 }
311 }
312
313 /* Figure out how many bytes the result will require */
314 if (ch < (UTF32) 0x80) {
315 bytesToWrite = 1;
316 }
317 else if (ch < (UTF32) 0x800) {
318 bytesToWrite = 2;
319 }
320 else if (ch < (UTF32) 0x10000) {
321 bytesToWrite = 3;
322 }
323 else if (ch < (UTF32) 0x110000) {
324 bytesToWrite = 4;
325 }
326 else {
327 bytesToWrite = 3;
328 ch = TSK_UNI_REPLACEMENT_CHAR;
329 }
330
331 target += bytesToWrite;
332 if (target > targetEnd) {
333 source = oldSource; /* Back up source pointer! */
334 target -= bytesToWrite;
335 result = TSKtargetExhausted;
336 break;
337 }
338 switch (bytesToWrite) { /* note: everything falls through. */
339 case 4:
340 *--target = (UTF8) ((ch | byteMark) & byteMask);
341 ch >>= 6;
342 case 3:
343 *--target = (UTF8) ((ch | byteMark) & byteMask);
344 ch >>= 6;
345 case 2:
346 *--target = (UTF8) ((ch | byteMark) & byteMask);
347 ch >>= 6;
348 case 1:
349 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
350 }
351 target += bytesToWrite;
352 }
353 *sourceStart = source;
354 *targetStart = target;
355 return result;
356 }
357
358 TSKConversionResult
359 tsk_UTF16WtoUTF8_lclorder(const wchar_t ** sourceStart,
360 const wchar_t * sourceEnd, UTF8 ** targetStart,
361 UTF8 * targetEnd, TSKConversionFlags flags)
362 {
363 TSKConversionResult result = TSKconversionOK;
364 const wchar_t *source = *sourceStart;
365 UTF8 *target = *targetStart;
366 while (source < sourceEnd) {
367 UTF32 ch;
368 unsigned short bytesToWrite = 0;
369 const UTF32 byteMask = 0xBF;
370 const UTF32 byteMark = 0x80;
371 const wchar_t *oldSource = source; /* In case we have to back up because of target overflow. */
372 ch = *source++;
373
374 /* If we have a surrogate pair, convert to UTF32 first. */
375 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
376 /* If the 16 bits following the high surrogate are in the source buffer... */
377 if (source < sourceEnd) {
378 UTF32 ch2 = *source;
379 source++;
380 /* If it's a low surrogate, convert to UTF32. */
381 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
382 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
383 + (ch2 - UNI_SUR_LOW_START) + halfBase;
384 }
385 else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
386 result = TSKsourceIllegal;
387 break;
388 }
389 // replace with another character
390 else {
391 ch = '^';
392 }
393 }
394 else { /* We don't have the 16 bits following the high surrogate. */
395 --source; /* return to the high surrogate */
396 result = TSKsourceExhausted;
397 break;
398 }
399 }
400 /* UTF-16 surrogate values are illegal in UTF-32 */
401 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
402 if (flags == TSKstrictConversion) {
403 --source; /* return to the illegal value itself */
404 result = TSKsourceIllegal;
405 break;
406 }
407 // replace with another character
408 else {
409 ch = '^';
410 }
411 }
412
413 /* Figure out how many bytes the result will require */
414 if (ch < (UTF32) 0x80) {
415 bytesToWrite = 1;
416 }
417 else if (ch < (UTF32) 0x800) {
418 bytesToWrite = 2;
419 }
420 else if (ch < (UTF32) 0x10000) {
421 bytesToWrite = 3;
422 }
423 else if (ch < (UTF32) 0x110000) {
424 bytesToWrite = 4;
425 }
426 else {
427 bytesToWrite = 3;
428 ch = TSK_UNI_REPLACEMENT_CHAR;
429 }
430
431 target += bytesToWrite;
432 if (target > targetEnd) {
433 source = oldSource; /* Back up source pointer! */
434 target -= bytesToWrite;
435 result = TSKtargetExhausted;
436 break;
437 }
438 switch (bytesToWrite) { /* note: everything falls through. */
439 case 4:
440 *--target = (UTF8) ((ch | byteMark) & byteMask);
441 ch >>= 6;
442 case 3:
443 *--target = (UTF8) ((ch | byteMark) & byteMask);
444 ch >>= 6;
445 case 2:
446 *--target = (UTF8) ((ch | byteMark) & byteMask);
447 ch >>= 6;
448 case 1:
449 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
450 }
451 target += bytesToWrite;
452 }
453 *sourceStart = source;
454 *targetStart = target;
455 return result;
456 }
457
458 /* --------------------------------------------------------------------- */
459
460 /*
461 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
462 * This must be called with the length pre-determined by the first byte.
463 * If not calling this from ConvertUTF8to*, then the length can be set by:
464 * length = trailingBytesForUTF8[*source]+1;
465 * and the sequence is illegal right away if there aren't that many bytes
466 * available.
467 * If presented with a length > 4, this returns false. The Unicode
468 * definition of UTF-8 goes up to 4-byte sequences.
469 */
470
471 static Boolean
472 isLegalUTF8(const UTF8 * source, int length)
473 {
474 UTF8 a;
475 const UTF8 *srcptr = source + length;
476 switch (length) {
477 default:
478 return false;
479 /* Everything else falls through when "true"... */
480 case 4:
481 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
482 return false;
483 case 3:
484 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
485 return false;
486 case 2:
487 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
488 return false;
489
490 switch (*source) {
491 /* no fall-through in this inner switch */
492 case 0xE0:
493 if (a < 0xA0)
494 return false;
495 break;
496 case 0xED:
497 if (a > 0x9F)
498 return false;
499 break;
500 case 0xF0:
501 if (a < 0x90)
502 return false;
503 break;
504 case 0xF4:
505 if (a > 0x8F)
506 return false;
507 break;
508 default:
509 if (a < 0x80)
510 return false;
511 }
512
513 case 1:
514 if (*source >= 0x80 && *source < 0xC2)
515 return false;
516 }
517 if (*source > 0xF4)
518 return false;
519 return true;
520 }
521
522 /* --------------------------------------------------------------------- */
523
524 /*
525 * Exported function to return whether a UTF-8 sequence is legal or not.
526 * This is not used here; it's just exported.
527 */
528 Boolean
529 tsk_isLegalUTF8Sequence(const UTF8 * source, const UTF8 * sourceEnd)
530 {
531 int length = trailingBytesForUTF8[*source] + 1;
532 if (source + length > sourceEnd) {
533 return false;
534 }
535 return isLegalUTF8(source, length);
536 }
537
538 /**
539 * Cleans up the passed in string to replace invalid
540 * UTF-8 values with the passed in character.
541 * @param source String to be cleaned up
542 * @param replacement Character to insert into source as needed.
543 */
544 void
545 tsk_cleanupUTF8(char *source, const char replacement)
546 {
547 size_t total_len = strlen(source);
548 size_t cur_idx = 0;
549
550 while (cur_idx < total_len) {
551 int length = trailingBytesForUTF8[(UTF8) source[cur_idx]] + 1;
552 if (cur_idx + length > total_len) {
553 while (cur_idx < total_len) {
554 source[cur_idx] = replacement;
555 cur_idx++;
556 }
557 break;
558 }
559 if (isLegalUTF8((UTF8 *) & source[cur_idx], length) == false) {
560 int i;
561 for (i = 0; i < length; i++) {
562 source[cur_idx + i] = replacement;
563 }
564 }
565 cur_idx += length;
566 }
567 }
568
569 /* --------------------------------------------------------------------- */
570
571
572
573 /**
574 * \ingroup baselib
575 * Convert a UTF-8 string to UTF-16 (in local endian ordering).
576 * @param sourceStart Pointer to pointer to start of UTF-8 string. Will be updated to last char processed.
577 * @param sourceEnd Pointer to one entry past end of UTF-8 string
578 * @param targetStart Pointer to pointer to place where UTF-16 string should be written. Will be updated to next place to write to.
579 * @param targetEnd Pointer to end of UTF-16 buffer
580 * @param flags Flags used during conversion
581 * @returns error code
582 */
583 TSKConversionResult
584 tsk_UTF8toUTF16(const UTF8 ** sourceStart,
585 const UTF8 * sourceEnd, UTF16 ** targetStart,
586 UTF16 * targetEnd, TSKConversionFlags flags)
587 {
588 TSKConversionResult result = TSKconversionOK;
589 const UTF8 *source = *sourceStart;
590 UTF16 *target = *targetStart;
591 while (source < sourceEnd) {
592 UTF32 ch = 0;
593 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
594 if (source + extraBytesToRead >= sourceEnd) {
595 result = TSKsourceExhausted;
596 break;
597 }
598 /* Do this check whether lenient or strict */
599 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
600 result = TSKsourceIllegal;
601 break;
602 }
603 /*
604 * The cases all fall through. See "Note A" below.
605 */
606 switch (extraBytesToRead) {
607 case 5:
608 ch += *source++;
609 ch <<= 6; /* remember, illegal UTF-8 */
610 case 4:
611 ch += *source++;
612 ch <<= 6; /* remember, illegal UTF-8 */
613 case 3:
614 ch += *source++;
615 ch <<= 6;
616 case 2:
617 ch += *source++;
618 ch <<= 6;
619 case 1:
620 ch += *source++;
621 ch <<= 6;
622 case 0:
623 ch += *source++;
624 }
625 ch -= offsetsFromUTF8[extraBytesToRead];
626
627 if (target >= targetEnd) {
628 source -= (extraBytesToRead + 1); /* Back up source pointer! */
629 result = TSKtargetExhausted;
630 break;
631 }
632 if (ch <= TSK_UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
633 /* UTF-16 surrogate values are illegal in UTF-32 */
634 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
635 if (flags == TSKstrictConversion) {
636 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
637 result = TSKsourceIllegal;
638 break;
639 }
640 else {
641 *target++ = TSK_UNI_REPLACEMENT_CHAR;
642 }
643 }
644 else {
645 *target++ = (UTF16) ch; /* normal case */
646 }
647 }
648 else if (ch > TSK_UNI_MAX_UTF16) {
649 if (flags == TSKstrictConversion) {
650 result = TSKsourceIllegal;
651 source -= (extraBytesToRead + 1); /* return to the start */
652 break; /* Bail out; shouldn't continue */
653 }
654 else {
655 *target++ = TSK_UNI_REPLACEMENT_CHAR;
656 }
657 }
658 else {
659 /* target is a character in range 0xFFFF - 0x10FFFF. */
660 if (target + 1 >= targetEnd) {
661 source -= (extraBytesToRead + 1); /* Back up source pointer! */
662 result = TSKtargetExhausted;
663 break;
664 }
665 ch -= halfBase;
666 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
667 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
668 }
669 }
670 *sourceStart = source;
671 *targetStart = target;
672 return result;
673 }
674
675