1 /* tag: Tom Lord Tue Dec 4 14:41:20 2001 (cvtutf.c)
2 */
3 /* ================================================================ */
4 /*
5 * Copyright 2001 Unicode, Inc.
6 *
7 * Disclaimer
8 *
9 * This source code is provided as is by Unicode, Inc. No claims are
10 * made as to fitness for any particular purpose. No warranties of any
11 * kind are expressed or implied. The recipient agrees to determine
12 * applicability of information provided. If this file has been
13 * purchased on magnetic or optical media from Unicode, Inc., the
14 * sole remedy for any claim will be exchange of defective media
15 * within 90 days of receipt.
16 *
17 * Limitations on Rights to Redistribute This Code
18 *
19 * Unicode, Inc. hereby grants the right to freely use the information
20 * supplied in this file in the creation of products supporting the
21 * Unicode Standard, and to make copies of this file in any form
22 * for internal or external distribution as long as this notice
23 * remains attached.
24 */
25
26 /* ---------------------------------------------------------------------
27
    Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
29 Author: Mark E. Davis, 1994.
30 Rev History: Rick McGowan, fixes & updates May 2001.
31 Sept 2001: fixed const & error conditions per
32 mods suggested by S. Parent & A. Lillich.
33
34 See the header file "ConvertUTF.h" for complete documentation.
35
36 ------------------------------------------------------------------------ */
37
38 #include "hackerlab/tests/uni-tests/cvtutf.h"
39
40 #ifdef CVTUTF_DEBUG
41 #include <stdio.h>
42 #endif
43
44 static const int halfShift = 10; /* used for shifting by 10 bits */
45
46 static const UTF32 halfBase = 0x0010000UL;
47 static const UTF32 halfMask = 0x3FFUL;
48
49 #define UNI_SUR_HIGH_START (UTF32)0xD800
50 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
51 #define UNI_SUR_LOW_START (UTF32)0xDC00
52 #define UNI_SUR_LOW_END (UTF32)0xDFFF
53 #define false 0
54 #define true 1
55
56 /* --------------------------------------------------------------------- */
57
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)58 ConversionResult ConvertUTF32toUTF16 (
59 const UTF32** sourceStart, const UTF32* sourceEnd,
60 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
61 ConversionResult result = conversionOK;
62 const UTF32* source = *sourceStart;
63 UTF16* target = *targetStart;
64 while (source < sourceEnd) {
65 UTF32 ch;
66 if (target >= targetEnd) {
67 result = targetExhausted; break;
68 }
69 ch = *source++;
70 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
71 if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
72 --source; /* return to the illegal value itself */
73 result = sourceIllegal;
74 break;
75 } else {
76 *target++ = ch; /* normal case */
77 }
78 } else if (ch > UNI_MAX_UTF16) {
79 if (flags == strictConversion) {
80 result = sourceIllegal;
81 } else {
82 *target++ = UNI_REPLACEMENT_CHAR;
83 }
84 } else {
85 /* target is a character in range 0xFFFF - 0x10FFFF. */
86 if (target + 1 >= targetEnd) {
87 --source; /* Back up source pointer! */
88 result = targetExhausted; break;
89 }
90 ch -= halfBase;
91 *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
92 *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
93 }
94 }
95 *sourceStart = source;
96 *targetStart = target;
97 return result;
98 }
99
100 /* --------------------------------------------------------------------- */
101
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)102 ConversionResult ConvertUTF16toUTF32 (
103 const UTF16** sourceStart, const UTF16* sourceEnd,
104 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
105 ConversionResult result = conversionOK;
106 const UTF16* source = *sourceStart;
107 UTF32* target = *targetStart;
108 UTF32 ch, ch2;
109 while (source < sourceEnd) {
110 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
111 ch = *source++;
112 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
113 ch2 = *source;
114 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
115 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
116 + (ch2 - UNI_SUR_LOW_START) + halfBase;
117 ++source;
118 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
119 --source; /* return to the illegal value itself */
120 result = sourceIllegal;
121 break;
122 }
123 } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
124 /* an unpaired low surrogate */
125 --source; /* return to the illegal value itself */
126 result = sourceIllegal;
127 break;
128 }
129 if (target >= targetEnd) {
130 source = oldSource; /* Back up source pointer! */
131 result = targetExhausted; break;
132 }
133 *target++ = ch;
134 }
135 *sourceStart = source;
136 *targetStart = target;
137 #ifdef CVTUTF_DEBUG
138 if (result == sourceIllegal) {
139 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
140 fflush(stderr);
141 }
142 #endif
143 return result;
144 }
145
146 /* --------------------------------------------------------------------- */
147
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes that are supposed to follow that byte.
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
 * 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
162
163 /*
164 * Magic values subtracted from a buffer value during UTF8 conversion.
165 * This table contains as many values as there might be trailing bytes
166 * in a UTF-8 sequence.
167 */
168 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
169 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
170
171 /*
172 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
173 * into the first byte, depending on how many bytes follow. There are
174 * as many entries in this table as there are UTF-8 sequence types.
175 * (I.e., one byte sequence, two byte... six byte sequence.)
176 */
177 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
178
179 /* --------------------------------------------------------------------- */
180
181 /* The interface converts a whole buffer to avoid function-call overhead.
182 * Constants have been gathered. Loops & conditionals have been removed as
183 * much as possible for efficiency, in favor of drop-through switches.
184 * (See "Note A" at the bottom of the file for equivalent code.)
185 * If your compiler supports it, the "isLegalUTF8" call can be turned
186 * into an inline function.
187 */
188
189 /* --------------------------------------------------------------------- */
190
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)191 ConversionResult ConvertUTF16toUTF8 (
192 const UTF16** sourceStart, const UTF16* sourceEnd,
193 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
194 ConversionResult result = conversionOK;
195 const UTF16* source = *sourceStart;
196 UTF8* target = *targetStart;
197 while (source < sourceEnd) {
198 UTF32 ch;
199 unsigned short bytesToWrite = 0;
200 const UTF32 byteMask = 0xBF;
201 const UTF32 byteMark = 0x80;
202 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
203 ch = *source++;
204 /* If we have a surrogate pair, convert to UTF32 first. */
205 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
206 UTF32 ch2 = *source;
207 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
208 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
209 + (ch2 - UNI_SUR_LOW_START) + halfBase;
210 ++source;
211 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
212 --source; /* return to the illegal value itself */
213 result = sourceIllegal;
214 break;
215 }
216 } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
217 --source; /* return to the illegal value itself */
218 result = sourceIllegal;
219 break;
220 }
221 /* Figure out how many bytes the result will require */
222 if (ch < (UTF32)0x80) { bytesToWrite = 1;
223 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
224 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
225 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
226 } else { bytesToWrite = 2;
227 ch = UNI_REPLACEMENT_CHAR;
228 }
229
230 target += bytesToWrite;
231 if (target > targetEnd) {
232 source = oldSource; /* Back up source pointer! */
233 target -= bytesToWrite; result = targetExhausted; break;
234 }
235 switch (bytesToWrite) { /* note: everything falls through. */
236 case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
237 case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
238 case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
239 case 1: *--target = ch | firstByteMark[bytesToWrite];
240 }
241 target += bytesToWrite;
242 }
243 *sourceStart = source;
244 *targetStart = target;
245 return result;
246 }
247
248 /* --------------------------------------------------------------------- */
249
250 /*
251 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
252 * This must be called with the length pre-determined by the first byte.
253 * If not calling this from ConvertUTF8to*, then the length can be set by:
254 * length = trailingBytesForUTF8[*source]+1;
255 * and the sequence is illegal right away if there aren't that many bytes
256 * available.
257 * If presented with a length > 4, this returns false. The Unicode
258 * definition of UTF-8 goes up to 4-byte sequences.
259 */
260
isLegalUTF8(const UTF8 * source,int length)261 static Boolean isLegalUTF8(const UTF8 *source, int length) {
262 UTF8 a;
263 const UTF8 *srcptr = source+length;
264 switch (length) {
265 default: return false;
266 /* Everything else falls through when "true"... */
267 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
268 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
269 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
270 switch (*source) {
271 /* no fall-through in this inner switch */
272 case 0xE0: if (a < 0xA0) return false; break;
273 case 0xF0: if (a < 0x90) return false; break;
274 case 0xF4: if (a > 0x8F) return false; break;
275 default: if (a < 0x80) return false;
276 }
277 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
278 if (*source > 0xF4) return false;
279 }
280 return true;
281 }
282
283 /* --------------------------------------------------------------------- */
284
285 /*
286 * Exported function to return whether a UTF-8 sequence is legal or not.
287 * This is not used here; it's just exported.
288 */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)289 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
290 int length = trailingBytesForUTF8[*source]+1;
291 if (source+length > sourceEnd) {
292 return false;
293 }
294 return isLegalUTF8(source, length);
295 }
296
297 /* --------------------------------------------------------------------- */
298
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)299 ConversionResult ConvertUTF8toUTF16 (
300 const UTF8** sourceStart, const UTF8* sourceEnd,
301 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
302 ConversionResult result = conversionOK;
303 const UTF8* source = *sourceStart;
304 UTF16* target = *targetStart;
305 while (source < sourceEnd) {
306 UTF32 ch = 0;
307 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
308 if (source + extraBytesToRead >= sourceEnd) {
309 result = sourceExhausted; break;
310 }
311 /* Do this check whether lenient or strict */
312 if (! isLegalUTF8(source, extraBytesToRead+1)) {
313 result = sourceIllegal;
314 break;
315 }
316 /*
317 * The cases all fall through. See "Note A" below.
318 */
319 switch (extraBytesToRead) {
320 case 3: ch += *source++; ch <<= 6;
321 case 2: ch += *source++; ch <<= 6;
322 case 1: ch += *source++; ch <<= 6;
323 case 0: ch += *source++;
324 }
325 ch -= offsetsFromUTF8[extraBytesToRead];
326
327 if (target >= targetEnd) {
328 source -= (extraBytesToRead+1); /* Back up source pointer! */
329 result = targetExhausted; break;
330 }
331 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
332 if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
333 source -= (extraBytesToRead+1); /* return to the illegal value itself */
334 result = sourceIllegal;
335 break;
336 } else {
337 *target++ = ch; /* normal case */
338 }
339 } else if (ch > UNI_MAX_UTF16) {
340 if (flags == strictConversion) {
341 result = sourceIllegal;
342 source -= (extraBytesToRead+1); /* return to the start */
343 break; /* Bail out; shouldn't continue */
344 } else {
345 *target++ = UNI_REPLACEMENT_CHAR;
346 }
347 } else {
348 /* target is a character in range 0xFFFF - 0x10FFFF. */
349 if (target + 1 >= targetEnd) {
350 source -= (extraBytesToRead+1); /* Back up source pointer! */
351 result = targetExhausted; break;
352 }
353 ch -= halfBase;
354 *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
355 *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
356 }
357 }
358 *sourceStart = source;
359 *targetStart = target;
360 return result;
361 }
362
363 /* --------------------------------------------------------------------- */
364
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)365 ConversionResult ConvertUTF32toUTF8 (
366 const UTF32** sourceStart, const UTF32* sourceEnd,
367 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
368 ConversionResult result = conversionOK;
369 const UTF32* source = *sourceStart;
370 UTF8* target = *targetStart;
371 while (source < sourceEnd) {
372 UTF32 ch;
373 unsigned short bytesToWrite = 0;
374 const UTF32 byteMask = 0xBF;
375 const UTF32 byteMark = 0x80;
376 ch = *source++;
377 /* surrogates of any stripe are not legal UTF32 characters */
378 if (flags == strictConversion ) {
379 if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
380 --source; /* return to the illegal value itself */
381 result = sourceIllegal;
382 break;
383 }
384 }
385 /* Figure out how many bytes the result will require */
386 if (ch < (UTF32)0x80) { bytesToWrite = 1;
387 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
388 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
389 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
390 } else { bytesToWrite = 2;
391 ch = UNI_REPLACEMENT_CHAR;
392 }
393
394 target += bytesToWrite;
395 if (target > targetEnd) {
396 --source; /* Back up source pointer! */
397 target -= bytesToWrite; result = targetExhausted; break;
398 }
399 switch (bytesToWrite) { /* note: everything falls through. */
400 case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
401 case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
402 case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
403 case 1: *--target = ch | firstByteMark[bytesToWrite];
404 }
405 target += bytesToWrite;
406 }
407 *sourceStart = source;
408 *targetStart = target;
409 return result;
410 }
411
412 /* --------------------------------------------------------------------- */
413
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)414 ConversionResult ConvertUTF8toUTF32 (
415 const UTF8** sourceStart, const UTF8* sourceEnd,
416 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
417 ConversionResult result = conversionOK;
418 const UTF8* source = *sourceStart;
419 UTF32* target = *targetStart;
420 while (source < sourceEnd) {
421 UTF32 ch = 0;
422 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
423 if (source + extraBytesToRead >= sourceEnd) {
424 result = sourceExhausted; break;
425 }
426 /* Do this check whether lenient or strict */
427 if (! isLegalUTF8(source, extraBytesToRead+1)) {
428 result = sourceIllegal;
429 break;
430 }
431 /*
432 * The cases all fall through. See "Note A" below.
433 */
434 switch (extraBytesToRead) {
435 case 3: ch += *source++; ch <<= 6;
436 case 2: ch += *source++; ch <<= 6;
437 case 1: ch += *source++; ch <<= 6;
438 case 0: ch += *source++;
439 }
440 ch -= offsetsFromUTF8[extraBytesToRead];
441
442 if (target >= targetEnd) {
443 source -= (extraBytesToRead+1); /* Back up the source pointer! */
444 result = targetExhausted; break;
445 }
446 if (ch <= UNI_MAX_UTF32) {
447 *target++ = ch;
448 } else { /* i.e., ch > UNI_MAX_UTF32 */
449 *target++ = UNI_REPLACEMENT_CHAR;
450 }
451 }
452 *sourceStart = source;
453 *targetStart = target;
454 return result;
455 }
456
457 /* ---------------------------------------------------------------------
458
459 Note A.
460 The fall-through switches in UTF-8 reading code save a
461 temp variable, some decrements & conditionals. The switches
462 are equivalent to the following loop:
463 {
464 int tmpBytesToRead = extraBytesToRead+1;
465 do {
466 ch += *source++;
467 --tmpBytesToRead;
468 if (tmpBytesToRead) ch <<= 6;
469 } while (tmpBytesToRead > 0);
470 }
471 In UTF-8 writing code, the switches on "bytesToWrite" are
472 similarly unrolled loops.
473
474 --------------------------------------------------------------------- */
475
476
477