1 /* tag: Tom Lord Tue Dec  4 14:41:20 2001 (cvtutf.c)
2  */
3 /* ================================================================ */
4 /*
5  * Copyright 2001 Unicode, Inc.
6  *
7  * Disclaimer
8  *
9  * This source code is provided as is by Unicode, Inc. No claims are
10  * made as to fitness for any particular purpose. No warranties of any
11  * kind are expressed or implied. The recipient agrees to determine
12  * applicability of information provided. If this file has been
13  * purchased on magnetic or optical media from Unicode, Inc., the
14  * sole remedy for any claim will be exchange of defective media
15  * within 90 days of receipt.
16  *
17  * Limitations on Rights to Redistribute This Code
18  *
19  * Unicode, Inc. hereby grants the right to freely use the information
20  * supplied in this file in the creation of products supporting the
21  * Unicode Standard, and to make copies of this file in any form
22  * for internal or external distribution as long as this notice
23  * remains attached.
24  */
25 
26 /* ---------------------------------------------------------------------
27 
28     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
29 	Author: Mark E. Davis, 1994.
30 	Rev History: Rick McGowan, fixes & updates May 2001.
31 	Sept 2001: fixed const & error conditions per
32 		mods suggested by S. Parent & A. Lillich.
33 
34     See the header file "ConvertUTF.h" for complete documentation.
35 
36 ------------------------------------------------------------------------ */
37 
38 #include "hackerlab/tests/uni-tests/cvtutf.h"
39 
40 #ifdef CVTUTF_DEBUG
41 #include <stdio.h>
42 #endif
43 
44 static const int halfShift	= 10; /* used for shifting by 10 bits */
45 
46 static const UTF32 halfBase	= 0x0010000UL;
47 static const UTF32 halfMask	= 0x3FFUL;
48 
49 #define UNI_SUR_HIGH_START	(UTF32)0xD800
50 #define UNI_SUR_HIGH_END	(UTF32)0xDBFF
51 #define UNI_SUR_LOW_START	(UTF32)0xDC00
52 #define UNI_SUR_LOW_END		(UTF32)0xDFFF
53 #define false			0
54 #define true			1
55 
56 /* --------------------------------------------------------------------- */
57 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)58 ConversionResult ConvertUTF32toUTF16 (
59 		const UTF32** sourceStart, const UTF32* sourceEnd,
60 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
61 	ConversionResult result = conversionOK;
62 	const UTF32* source = *sourceStart;
63 	UTF16* target = *targetStart;
64 	while (source < sourceEnd) {
65 		UTF32 ch;
66 		if (target >= targetEnd) {
67 			result = targetExhausted; break;
68 		}
69 		ch = *source++;
70 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
71 			if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
72 				--source; /* return to the illegal value itself */
73 				result = sourceIllegal;
74 				break;
75 			} else {
76 			    *target++ = ch;	/* normal case */
77 			}
78 		} else if (ch > UNI_MAX_UTF16) {
79 			if (flags == strictConversion) {
80 				result = sourceIllegal;
81 			} else {
82 				*target++ = UNI_REPLACEMENT_CHAR;
83 			}
84 		} else {
85 			/* target is a character in range 0xFFFF - 0x10FFFF. */
86 			if (target + 1 >= targetEnd) {
87 				--source; /* Back up source pointer! */
88 				result = targetExhausted; break;
89 			}
90 			ch -= halfBase;
91 			*target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
92 			*target++ = (ch & halfMask) + UNI_SUR_LOW_START;
93 		}
94 	}
95 	*sourceStart = source;
96 	*targetStart = target;
97 	return result;
98 }
99 
100 /* --------------------------------------------------------------------- */
101 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)102 ConversionResult ConvertUTF16toUTF32 (
103 		const UTF16** sourceStart, const UTF16* sourceEnd,
104 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
105 	ConversionResult result = conversionOK;
106 	const UTF16* source = *sourceStart;
107 	UTF32* target = *targetStart;
108 	UTF32 ch, ch2;
109 	while (source < sourceEnd) {
110 		const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
111 		ch = *source++;
112 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
113 			ch2 = *source;
114 			if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
115 				ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
116 					+ (ch2 - UNI_SUR_LOW_START) + halfBase;
117 				++source;
118 			} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
119 				--source; /* return to the illegal value itself */
120 				result = sourceIllegal;
121 				break;
122 			}
123 		} else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
124 			/* an unpaired low surrogate */
125 			--source; /* return to the illegal value itself */
126 			result = sourceIllegal;
127 			break;
128 		}
129 		if (target >= targetEnd) {
130 			source = oldSource; /* Back up source pointer! */
131 			result = targetExhausted; break;
132 		}
133 		*target++ = ch;
134 	}
135 	*sourceStart = source;
136 	*targetStart = target;
137 #ifdef CVTUTF_DEBUG
138 if (result == sourceIllegal) {
139     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
140     fflush(stderr);
141 }
142 #endif
143 	return result;
144 }
145 
146 /* --------------------------------------------------------------------- */
147 
/*
 * trailingBytesForUTF8[b] is the number of bytes expected to follow a
 * UTF-8 sequence whose first byte is b.  Sixteen entries per row; the
 * comment on each row gives the range of first-byte values it covers.
 */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x00 - 0x0F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x10 - 0x1F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x20 - 0x2F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x30 - 0x3F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x40 - 0x4F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x50 - 0x5F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x60 - 0x6F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x70 - 0x7F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x80 - 0x8F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x90 - 0x9F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xA0 - 0xAF */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xB0 - 0xBF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0xC0 - 0xCF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0xD0 - 0xDF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	/* 0xE0 - 0xEF */
	3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5		/* 0xF0 - 0xFF */
};
162 
163 /*
164  * Magic values subtracted from a buffer value during UTF8 conversion.
165  * This table contains as many values as there might be trailing bytes
166  * in a UTF-8 sequence.
167  */
168 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
169 					 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
170 
171 /*
172  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
173  * into the first byte, depending on how many bytes follow.  There are
174  * as many entries in this table as there are UTF-8 sequence types.
175  * (I.e., one byte sequence, two byte... six byte sequence.)
176  */
177 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
178 
179 /* --------------------------------------------------------------------- */
180 
181 /* The interface converts a whole buffer to avoid function-call overhead.
182  * Constants have been gathered. Loops & conditionals have been removed as
183  * much as possible for efficiency, in favor of drop-through switches.
184  * (See "Note A" at the bottom of the file for equivalent code.)
185  * If your compiler supports it, the "isLegalUTF8" call can be turned
186  * into an inline function.
187  */
188 
189 /* --------------------------------------------------------------------- */
190 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)191 ConversionResult ConvertUTF16toUTF8 (
192 		const UTF16** sourceStart, const UTF16* sourceEnd,
193 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
194 	ConversionResult result = conversionOK;
195 	const UTF16* source = *sourceStart;
196 	UTF8* target = *targetStart;
197 	while (source < sourceEnd) {
198 		UTF32 ch;
199 		unsigned short bytesToWrite = 0;
200 		const UTF32 byteMask = 0xBF;
201 		const UTF32 byteMark = 0x80;
202 		const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
203 		ch = *source++;
204 		/* If we have a surrogate pair, convert to UTF32 first. */
205 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
206 			UTF32 ch2 = *source;
207 			if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
208 				ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
209 					+ (ch2 - UNI_SUR_LOW_START) + halfBase;
210 				++source;
211 			} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
212 				--source; /* return to the illegal value itself */
213 				result = sourceIllegal;
214 				break;
215 			}
216 		} else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
217 			--source; /* return to the illegal value itself */
218 			result = sourceIllegal;
219 			break;
220 		}
221 		/* Figure out how many bytes the result will require */
222 		if (ch < (UTF32)0x80) {			bytesToWrite = 1;
223 		} else if (ch < (UTF32)0x800) {		bytesToWrite = 2;
224 		} else if (ch < (UTF32)0x10000) {	bytesToWrite = 3;
225 		} else if (ch < (UTF32)0x200000) {	bytesToWrite = 4;
226 		} else {				bytesToWrite = 2;
227 							ch = UNI_REPLACEMENT_CHAR;
228 		}
229 
230 		target += bytesToWrite;
231 		if (target > targetEnd) {
232 			source = oldSource; /* Back up source pointer! */
233 			target -= bytesToWrite; result = targetExhausted; break;
234 		}
235 		switch (bytesToWrite) {	/* note: everything falls through. */
236 			case 4:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
237 			case 3:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
238 			case 2:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
239 			case 1:	*--target =  ch | firstByteMark[bytesToWrite];
240 		}
241 		target += bytesToWrite;
242 	}
243 	*sourceStart = source;
244 	*targetStart = target;
245 	return result;
246 }
247 
248 /* --------------------------------------------------------------------- */
249 
250 /*
251  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
252  * This must be called with the length pre-determined by the first byte.
253  * If not calling this from ConvertUTF8to*, then the length can be set by:
254  *	length = trailingBytesForUTF8[*source]+1;
255  * and the sequence is illegal right away if there aren't that many bytes
256  * available.
257  * If presented with a length > 4, this returns false.  The Unicode
258  * definition of UTF-8 goes up to 4-byte sequences.
259  */
260 
isLegalUTF8(const UTF8 * source,int length)261 static Boolean isLegalUTF8(const UTF8 *source, int length) {
262 	UTF8 a;
263 	const UTF8 *srcptr = source+length;
264 	switch (length) {
265 	default: return false;
266 		/* Everything else falls through when "true"... */
267 	case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
268 	case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
269 	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
270 		switch (*source) {
271 		    /* no fall-through in this inner switch */
272 		    case 0xE0: if (a < 0xA0) return false; break;
273 		    case 0xF0: if (a < 0x90) return false; break;
274 		    case 0xF4: if (a > 0x8F) return false; break;
275 		    default:  if (a < 0x80) return false;
276 		}
277     	case 1: if (*source >= 0x80 && *source < 0xC2) return false;
278 		if (*source > 0xF4) return false;
279 	}
280 	return true;
281 }
282 
283 /* --------------------------------------------------------------------- */
284 
285 /*
286  * Exported function to return whether a UTF-8 sequence is legal or not.
287  * This is not used here; it's just exported.
288  */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)289 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
290 	int length = trailingBytesForUTF8[*source]+1;
291 	if (source+length > sourceEnd) {
292 	    return false;
293 	}
294 	return isLegalUTF8(source, length);
295 }
296 
297 /* --------------------------------------------------------------------- */
298 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)299 ConversionResult ConvertUTF8toUTF16 (
300 		const UTF8** sourceStart, const UTF8* sourceEnd,
301 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
302 	ConversionResult result = conversionOK;
303 	const UTF8* source = *sourceStart;
304 	UTF16* target = *targetStart;
305 	while (source < sourceEnd) {
306 		UTF32 ch = 0;
307 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
308 		if (source + extraBytesToRead >= sourceEnd) {
309 			result = sourceExhausted; break;
310 		}
311 		/* Do this check whether lenient or strict */
312 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
313 			result = sourceIllegal;
314 			break;
315 		}
316 		/*
317 		 * The cases all fall through. See "Note A" below.
318 		 */
319 		switch (extraBytesToRead) {
320 			case 3:	ch += *source++; ch <<= 6;
321 			case 2:	ch += *source++; ch <<= 6;
322 			case 1:	ch += *source++; ch <<= 6;
323 			case 0:	ch += *source++;
324 		}
325 		ch -= offsetsFromUTF8[extraBytesToRead];
326 
327 		if (target >= targetEnd) {
328 			source -= (extraBytesToRead+1);	/* Back up source pointer! */
329 			result = targetExhausted; break;
330 		}
331 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
332 			if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
333 				source -= (extraBytesToRead+1); /* return to the illegal value itself */
334 				result = sourceIllegal;
335 				break;
336 			} else {
337 			    *target++ = ch;	/* normal case */
338 			}
339 		} else if (ch > UNI_MAX_UTF16) {
340 			if (flags == strictConversion) {
341 				result = sourceIllegal;
342 				source -= (extraBytesToRead+1); /* return to the start */
343 				break; /* Bail out; shouldn't continue */
344 			} else {
345 				*target++ = UNI_REPLACEMENT_CHAR;
346 			}
347 		} else {
348 			/* target is a character in range 0xFFFF - 0x10FFFF. */
349 			if (target + 1 >= targetEnd) {
350 				source -= (extraBytesToRead+1);	/* Back up source pointer! */
351 				result = targetExhausted; break;
352 			}
353 			ch -= halfBase;
354 			*target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
355 			*target++ = (ch & halfMask) + UNI_SUR_LOW_START;
356 		}
357 	}
358 	*sourceStart = source;
359 	*targetStart = target;
360 	return result;
361 }
362 
363 /* --------------------------------------------------------------------- */
364 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)365 ConversionResult ConvertUTF32toUTF8 (
366 		const UTF32** sourceStart, const UTF32* sourceEnd,
367 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
368 	ConversionResult result = conversionOK;
369 	const UTF32* source = *sourceStart;
370 	UTF8* target = *targetStart;
371 	while (source < sourceEnd) {
372 		UTF32 ch;
373 		unsigned short bytesToWrite = 0;
374 		const UTF32 byteMask = 0xBF;
375 		const UTF32 byteMark = 0x80;
376 		ch = *source++;
377 		/* surrogates of any stripe are not legal UTF32 characters */
378 		if (flags == strictConversion ) {
379 			if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
380 				--source; /* return to the illegal value itself */
381 				result = sourceIllegal;
382 				break;
383 			}
384 		}
385 		/* Figure out how many bytes the result will require */
386 		if (ch < (UTF32)0x80) {			bytesToWrite = 1;
387 		} else if (ch < (UTF32)0x800) {		bytesToWrite = 2;
388 		} else if (ch < (UTF32)0x10000) {	bytesToWrite = 3;
389 		} else if (ch < (UTF32)0x200000) {	bytesToWrite = 4;
390 		} else {				bytesToWrite = 2;
391 							ch = UNI_REPLACEMENT_CHAR;
392 		}
393 
394 		target += bytesToWrite;
395 		if (target > targetEnd) {
396 			--source; /* Back up source pointer! */
397 			target -= bytesToWrite; result = targetExhausted; break;
398 		}
399 		switch (bytesToWrite) {	/* note: everything falls through. */
400 			case 4:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
401 			case 3:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
402 			case 2:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
403 			case 1:	*--target =  ch | firstByteMark[bytesToWrite];
404 		}
405 		target += bytesToWrite;
406 	}
407 	*sourceStart = source;
408 	*targetStart = target;
409 	return result;
410 }
411 
412 /* --------------------------------------------------------------------- */
413 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)414 ConversionResult ConvertUTF8toUTF32 (
415 		const UTF8** sourceStart, const UTF8* sourceEnd,
416 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
417 	ConversionResult result = conversionOK;
418 	const UTF8* source = *sourceStart;
419 	UTF32* target = *targetStart;
420 	while (source < sourceEnd) {
421 		UTF32 ch = 0;
422 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
423 		if (source + extraBytesToRead >= sourceEnd) {
424 			result = sourceExhausted; break;
425 		}
426 		/* Do this check whether lenient or strict */
427 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
428 			result = sourceIllegal;
429 			break;
430 		}
431 		/*
432 		 * The cases all fall through. See "Note A" below.
433 		 */
434 		switch (extraBytesToRead) {
435 			case 3:	ch += *source++; ch <<= 6;
436 			case 2:	ch += *source++; ch <<= 6;
437 			case 1:	ch += *source++; ch <<= 6;
438 			case 0:	ch += *source++;
439 		}
440 		ch -= offsetsFromUTF8[extraBytesToRead];
441 
442 		if (target >= targetEnd) {
443 			source -= (extraBytesToRead+1);	/* Back up the source pointer! */
444 			result = targetExhausted; break;
445 		}
446 		if (ch <= UNI_MAX_UTF32) {
447 			*target++ = ch;
448 		} else { /* i.e., ch > UNI_MAX_UTF32 */
449 			*target++ = UNI_REPLACEMENT_CHAR;
450 		}
451 	}
452 	*sourceStart = source;
453 	*targetStart = target;
454 	return result;
455 }
456 
457 /* ---------------------------------------------------------------------
458 
459 	Note A.
460 	The fall-through switches in UTF-8 reading code save a
461 	temp variable, some decrements & conditionals.  The switches
462 	are equivalent to the following loop:
463 		{
464 			int tmpBytesToRead = extraBytesToRead+1;
465 			do {
466 				ch += *source++;
467 				--tmpBytesToRead;
468 				if (tmpBytesToRead) ch <<= 6;
469 			} while (tmpBytesToRead > 0);
470 		}
471 	In UTF-8 writing code, the switches on "bytesToWrite" are
472 	similarly unrolled loops.
473 
474    --------------------------------------------------------------------- */
475 
476 
477