1 // =================================================================================================
2 // Copyright 2004 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 // =================================================================================================
8 
9 #include "public/include/XMP_Const.h"
10 
// Assertion and error hooks used by the conversion code in this file. They are defined ahead of
// the UnicodeConversions.hpp include; presumably that header relies on them too (TODO confirm).
//   UC_Assert - currently expands to nothing; it is meant to become XMP_Assert.
//   UC_Throw  - throws an XMP_Error built from the error id and the message text.
#define UC_Assert(cond)
#define UC_Throw(msg,id)	throw XMP_Error ( id, msg )
13 
14 #include "source/UnicodeConversions.hpp"
15 
16 #if SUNOS_SPARC || XMP_IOS_ARM
17 	#include "string.h"
18 #endif
19 
20 using namespace std;
21 
22 // =================================================================================================
23 
24 // *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8.
25 
26 CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0;
27 CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0;
28 
29 CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0;
30 CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0;
31 
32 UTF8_to_UTF16_Proc  UTF8_to_UTF16BE = 0;
33 UTF8_to_UTF16_Proc  UTF8_to_UTF16LE = 0;
34 UTF8_to_UTF32_Proc  UTF8_to_UTF32BE = 0;
35 UTF8_to_UTF32_Proc  UTF8_to_UTF32LE = 0;
36 
37 UTF16_to_UTF8_Proc  UTF16BE_to_UTF8 = 0;
38 UTF16_to_UTF8_Proc  UTF16LE_to_UTF8 = 0;
39 UTF32_to_UTF8_Proc  UTF32BE_to_UTF8 = 0;
40 UTF32_to_UTF8_Proc  UTF32LE_to_UTF8 = 0;
41 
42 UTF8_to_UTF16_Proc  UTF8_to_UTF16Native = 0;
43 UTF8_to_UTF32_Proc  UTF8_to_UTF32Native = 0;
44 UTF16_to_UTF8_Proc  UTF16Native_to_UTF8 = 0;
45 UTF32_to_UTF8_Proc  UTF32Native_to_UTF8 = 0;
46 
47 UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0;
48 UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0;
49 UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0;
50 UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0;
51 
52 UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0;
53 UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0;
54 UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0;
55 UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0;
56 
57 // -------------------------------------------------------------------------------------------------
58 
59 static size_t swap32to16Offset = 0;	// Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer.
60 
61 // -------------------------------------------------------------------------------------------------
62 
63 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
64 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
65 
66 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
67 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
68 
69 // -------------------------------------------------------------------------------------------------
70 
71 static void UTF8_to_UTF16Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
72 				               UTF16Unit *       utf16Out,  const size_t utf16Len,
73 				               size_t *          utf8Read,  size_t *     utf16Written );
74 
75 static void UTF8_to_UTF16Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
76 				               UTF16Unit *       utf16Out,  const size_t utf16Len,
77 				               size_t *          utf8Read,  size_t *     utf16Written );
78 
79 static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
80 				               UTF32Unit *       utf32Out,  const size_t utf32Len,
81 				               size_t *          utf8Read,  size_t *     utf32Written );
82 
83 static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
84 				               UTF32Unit *       utf32Out,  const size_t utf32Len,
85 				               size_t *          utf8Read,  size_t *     utf32Written );
86 
87 // -------------------------------------------------------------------------------------------------
88 
89 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
90 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
91 				               size_t *          utf16Read, size_t *     utf8Written );
92 
93 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
94 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
95 				               size_t *          utf16Read, size_t *     utf8Written );
96 
97 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
98 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
99 				               size_t *          utf32Read, size_t *     utf8Written );
100 
101 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
102 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
103 				               size_t *          utf32Read, size_t *     utf8Written );
104 
105 // -------------------------------------------------------------------------------------------------
106 
107 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
108 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
109 				                   size_t *          utf16Read, size_t *     utf32Written );
110 
111 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
112 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
113 				                   size_t *          utf16Read, size_t *     utf32Written );
114 
115 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
116 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
117 				                   size_t *          utf16Read, size_t *     utf32Written );
118 
119 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
120 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
121 				                   size_t *          utf16Read, size_t *     utf32Written );
122 
123 // -------------------------------------------------------------------------------------------------
124 
125 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
126 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
127 				                   size_t *          utf32Read, size_t *     utf16Written );
128 
129 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
130 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
131 				                   size_t *          utf32Read, size_t *     utf16Written );
132 
133 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
134 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
135 				                   size_t *          utf32Read, size_t *     utf16Written );
136 
137 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
138 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
139 				                   size_t *          utf32Read, size_t *     utf16Written );
140 
141 // =================================================================================================
142 
InitializeUnicodeConversions()143 void InitializeUnicodeConversions()
144 {
145 	UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) );
146 
147 	UTF16Unit u16  = 0x00FF;
148 	bool bigEndian = (*((UTF8Unit*)&u16) == 0);
149 
150 	UTF8_to_UTF16Native = UTF8_to_UTF16Nat;
151 	UTF8_to_UTF32Native = UTF8_to_UTF32Nat;
152 	UTF16Native_to_UTF8 = UTF16Nat_to_UTF8;
153 	UTF32Native_to_UTF8 = UTF32Nat_to_UTF8;
154 
155 	if ( bigEndian ) {
156 
157 		swap32to16Offset = 0;
158 
159 		CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat;
160 		CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp;
161 
162 		CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat;
163 		CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp;
164 
165 		UTF8_to_UTF16BE = UTF8_to_UTF16Nat;
166 		UTF8_to_UTF16LE = UTF8_to_UTF16Swp;
167 		UTF8_to_UTF32BE = UTF8_to_UTF32Nat;
168 		UTF8_to_UTF32LE = UTF8_to_UTF32Swp;
169 
170 		UTF16BE_to_UTF8 = UTF16Nat_to_UTF8;
171 		UTF16LE_to_UTF8 = UTF16Swp_to_UTF8;
172 		UTF32BE_to_UTF8 = UTF32Nat_to_UTF8;
173 		UTF32LE_to_UTF8 = UTF32Swp_to_UTF8;
174 
175 		UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat;
176 		UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp;
177 		UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat;
178 		UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp;
179 
180 		UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat;
181 		UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp;
182 		UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat;
183 		UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp;
184 
185 	} else {
186 
187 		swap32to16Offset = 1;	// ! Offset in UTF16 units!
188 
189 		CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp;
190 		CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat;
191 
192 		CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp;
193 		CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat;
194 
195 		UTF8_to_UTF16BE = UTF8_to_UTF16Swp;
196 		UTF8_to_UTF16LE = UTF8_to_UTF16Nat;
197 		UTF8_to_UTF32BE = UTF8_to_UTF32Swp;
198 		UTF8_to_UTF32LE = UTF8_to_UTF32Nat;
199 
200 		UTF16BE_to_UTF8 = UTF16Swp_to_UTF8;
201 		UTF16LE_to_UTF8 = UTF16Nat_to_UTF8;
202 		UTF32BE_to_UTF8 = UTF32Swp_to_UTF8;
203 		UTF32LE_to_UTF8 = UTF32Nat_to_UTF8;
204 
205 		UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp;
206 		UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat;
207 		UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp;
208 		UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat;
209 
210 		UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp;
211 		UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat;
212 		UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp;
213 		UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat;
214 
215 	}
216 
217 }	// InitializeUnicodeConversions
218 
219 // =================================================================================================
220 
// DefineAndGetValue ( type, inPtr ) declares a local variable named "inUnit" of the given type and
// loads it from the address inPtr. On the SUNOS_SPARC and XMP_IOS_ARM builds the load goes through
// memcpy (presumably because inPtr is not guaranteed to be aligned there - TODO confirm); other
// builds read through a cast pointer.
//
// The inPtr argument is parenthesized so that an expression such as "p + n" is evaluated before
// the cast instead of after it, and the cast keeps the const qualification of the source.
// ! The expansion ends with its own semicolon.
#if SUNOS_SPARC || XMP_IOS_ARM
	#define DefineAndGetValue(type,inPtr) type inUnit; memcpy ( &inUnit, (inPtr), sizeof(type) );
#else
	#define DefineAndGetValue(type,inPtr) type inUnit = *((const type *)(inPtr));
#endif
226 
UTF16InSwap(const void * inPtr)227 static inline UTF16Unit UTF16InSwap ( const void * inPtr )
228 {
229 	DefineAndGetValue ( UTF16Unit, inPtr );
230 	return (inUnit << 8) | (inUnit >> 8);
231 }
UTF32InSwap(const void * inPtr)232 static inline UTF32Unit UTF32InSwap ( const void * inPtr )
233 {
234 	DefineAndGetValue ( UTF32Unit, inPtr );
235 	return (inUnit << 24) | ((inUnit << 8) & 0x00FF0000) | ((inUnit >> 8) & 0x0000FF00) | (inUnit >> 24);
236 }
237 
UTF16OutSwap(UTF16Unit * outPtr,const UTF16Unit value)238 static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value )
239 {
240 	UTF16Unit outUnit = (value << 8) | (value >> 8);
241 	*outPtr = outUnit;
242 }
243 
UTF32OutSwap(UTF32Unit * outPtr,const UTF32Unit value)244 static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value )
245 {
246 	UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
247 	*outPtr = outUnit;
248 }
249 
250 // =================================================================================================
251 
SwapUTF16(const UTF16Unit * utf16In,UTF16Unit * utf16Out,const size_t utf16Len)252 void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len )
253 {
254 	for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i);
255 }
256 
SwapUTF32(const UTF32Unit * utf32In,UTF32Unit * utf32Out,const size_t utf32Len)257 void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) {
258 	for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i);
259 }
260 
261 // =================================================================================================
262 
ToUTF16(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str,bool bigEndian)263 extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian )
264 {
265 	UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE;
266 	if ( bigEndian ) Converter = UTF8_to_UTF16BE;
267 
268 	enum { kBufferSize = 8*1024 };
269 	UTF16Unit u16Buffer[kBufferSize];	// 16K bytes
270 	size_t readCount, writeCount;
271 
272 	utf16Str->erase();
273 	utf16Str->reserve ( 2*utf8Len );	// As good a guess as any.
274 
275 	while ( utf8Len > 0 ) {
276 		Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
277 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
278 		utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
279 		utf8In  += readCount;
280 		utf8Len -= readCount;
281 	}
282 
283 }	// ToUTF16
284 
285 // =================================================================================================
286 
ToUTF16Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str)287 extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str )
288 {
289 	enum { kBufferSize = 8*1024 };
290 	UTF16Unit u16Buffer[kBufferSize];	// 16K bytes
291 	size_t readCount, writeCount;
292 
293 	utf16Str->erase();
294 	utf16Str->reserve ( 2*utf8Len );	// As good a guess as any.
295 
296 	while ( utf8Len > 0 ) {
297 		UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
298 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
299 		utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
300 		utf8In  += readCount;
301 		utf8Len -= readCount;
302 	}
303 
304 }	// ToUTF16Native
305 
306 // =================================================================================================
307 
ToUTF32(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str,bool bigEndian)308 extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian )
309 {
310 	UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE;
311 	if ( bigEndian ) Converter = UTF8_to_UTF32BE;
312 
313 	enum { kBufferSize = 4*1024 };
314 	UTF32Unit u32Buffer[kBufferSize];	// 16K bytes
315 	size_t readCount, writeCount;
316 
317 	utf32Str->erase();
318 	utf32Str->reserve ( 4*utf8Len );	// As good a guess as any.
319 
320 	while ( utf8Len > 0 ) {
321 		Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
322 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
323 		utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
324 		utf8In  += readCount;
325 		utf8Len -= readCount;
326 	}
327 
328 }	// ToUTF32
329 
330 // =================================================================================================
331 
ToUTF32Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str)332 extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str )
333 {
334 	enum { kBufferSize = 4*1024 };
335 	UTF32Unit u32Buffer[kBufferSize];	// 16K bytes
336 	size_t readCount, writeCount;
337 
338 	utf32Str->erase();
339 	utf32Str->reserve ( 4*utf8Len );	// As good a guess as any.
340 
341 	while ( utf8Len > 0 ) {
342 		UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
343 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
344 		utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
345 		utf8In  += readCount;
346 		utf8Len -= readCount;
347 	}
348 
349 }	// ToUTF32Native
350 
351 // =================================================================================================
352 
FromUTF16(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str,bool bigEndian)353 extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian )
354 {
355 	UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8;
356 	if ( bigEndian ) Converter = UTF16BE_to_UTF8;
357 
358 	enum { kBufferSize = 16*1024 };
359 	UTF8Unit u8Buffer[kBufferSize];
360 	size_t readCount, writeCount;
361 
362 	utf8Str->erase();
363 	utf8Str->reserve ( 2*utf16Len );	// As good a guess as any.
364 
365 	while ( utf16Len > 0 ) {
366 		Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
367 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
368 		utf8Str->append ( (const char *)u8Buffer, writeCount );
369 		utf16In  += readCount;
370 		utf16Len -= readCount;
371 	}
372 
373 }	// FromUTF16
374 
375 // =================================================================================================
376 
FromUTF16Native(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str)377 extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str )
378 {
379 	enum { kBufferSize = 16*1024 };
380 	UTF8Unit u8Buffer[kBufferSize];
381 	size_t readCount, writeCount;
382 
383 	utf8Str->erase();
384 	utf8Str->reserve ( 2*utf16Len );	// As good a guess as any.
385 
386 	while ( utf16Len > 0 ) {
387 		UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
388 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
389 		utf8Str->append ( (const char *)u8Buffer, writeCount );
390 		utf16In  += readCount;
391 		utf16Len -= readCount;
392 	}
393 
394 }	// FromUTF16Native
395 
396 // =================================================================================================
397 
FromUTF32(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str,bool bigEndian)398 extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian )
399 {
400 	UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8;
401 	if ( bigEndian ) Converter = UTF32BE_to_UTF8;
402 
403 	enum { kBufferSize = 16*1024 };
404 	UTF8Unit u8Buffer[kBufferSize];
405 	size_t readCount, writeCount;
406 
407 	utf8Str->erase();
408 	utf8Str->reserve ( 2*utf32Len );	// As good a guess as any.
409 
410 	while ( utf32Len > 0 ) {
411 		Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
412 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
413 		utf8Str->append ( (const char *)u8Buffer, writeCount );
414 		utf32In  += readCount;
415 		utf32Len -= readCount;
416 	}
417 
418 }	// FromUTF32
419 
420 // =================================================================================================
421 
FromUTF32Native(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str)422 extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str )
423 {
424 	enum { kBufferSize = 16*1024 };
425 	UTF8Unit u8Buffer[kBufferSize];
426 	size_t readCount, writeCount;
427 
428 	utf8Str->erase();
429 	utf8Str->reserve ( 2*utf32Len );	// As good a guess as any.
430 
431 	while ( utf32Len > 0 ) {
432 		UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
433 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
434 		utf8Str->append ( (const char *)u8Buffer, writeCount );
435 		utf32In  += readCount;
436 		utf32Len -= readCount;
437 	}
438 
439 }	// FromUTF32Native
440 
441 // =================================================================================================
442 
CodePoint_to_UTF8_Multi(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)443 static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
444 {
445 	size_t unitCount = 0;
446 
447 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
448 	if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
449 
450 	// Compute the number of bytes using 6 data bits each. Then see if the highest order bits will
451 	// fit into the leading byte. Write the UTF-8 sequence if there is enough room.
452 
453 	UTF32Unit temp, mask;
454 	size_t bytesNeeded = 0;
455 	for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded;
456 
457 	temp = cpIn >> ((bytesNeeded-1)*6);	// The highest order data bits.
458 	mask = (0x80 >> bytesNeeded) - 1;	// Available data bits in the leading byte.
459 	if ( temp > mask ) ++bytesNeeded;
460 
461 	if ( bytesNeeded > utf8Len ) goto Done;	// Not enough room for the output.
462 	unitCount = bytesNeeded;
463 
464 	temp = cpIn;
465 	for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) {
466 		utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F );
467 		temp = temp >> 6;
468 	}
469 
470 	mask = ~((1 << (8-unitCount)) - 1);
471 	utf8Out[0] = UTF8Unit ( mask | temp );
472 
473 Done:
474 	*utf8Written = unitCount;
475 	return;
476 
477 }	// CodePoint_to_UTF8_Multi
478 
479 // =================================================================================================
480 
CodePoint_to_UTF8(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)481 void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
482 {
483 	size_t unitCount = 0;
484 
485 	UC_Assert ( (utf8Out != 0) && (utf8Written != 0) );
486 	if ( utf8Len == 0 ) goto Done;
487 	if ( cpIn > 0x7F ) goto MultiByte;	// ! Force linear execution path for ASCII.
488 
489 	unitCount = 1;
490 	*utf8Out = UTF8Unit(cpIn);
491 
492 Done:
493 	*utf8Written = unitCount;
494 	return;
495 
496 MultiByte:
497 	 CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written );
498 	 return;
499 
500 }	// CodePoint_to_UTF8
501 
502 // =================================================================================================
503 
CodePoint_from_UTF8_Multi(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)504 static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
505 {
506 	UTF8Unit  inUnit = *utf8In;
507 	size_t    unitCount = 0;
508 	UTF32Unit cp;	// ! Avoid gcc complaints about declarations after goto's.
509 	const UTF8Unit * utf8Pos;
510 
511 	// -------------------------------------------------------------------------------------
512 	// We've got a multibyte UTF-8 character. The first byte has the number of bytes and the
513 	// highest order data bits. The other bytes each add 6 more data bits.
514 
515 	#if 0	// This might be a more effcient way to count the bytes.
516 		static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
517 		size_t bytesNeeded = kByteCounts [ inUnit >> 4 ];
518 		if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) {
519 			UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
520 		}
521 	#endif
522 
523 	size_t bytesNeeded = 0;	// Count the leading 1 bits in the first byte.
524 	for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
525 		// *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
526 
527 	if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
528 	if ( bytesNeeded > utf8Len ) goto Done;	// Not enough input in this buffer.
529 	unitCount = bytesNeeded;
530 
531 	cp = inUnit & ((1 << (7-unitCount)) - 1);	// Isolate the initial data bits in the bottom of cp.
532 
533 	utf8Pos = utf8In + 1;	// We've absorbed the first byte.
534 	for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
535 		inUnit = *utf8Pos;
536 		if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam );
537 		cp = (cp << 6) | (inUnit & 0x3F);
538 	}
539 
540 	if ( cp >= 0xD800 ) {	// Skip the next comparisons most of the time.
541 		if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam );
542 		if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam );
543 	}
544 
545 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
546 
547 Done:
548 	*utf8Read = unitCount;
549 	return;
550 
551 }	// CodePoint_from_UTF8_Multi
552 
553 // =================================================================================================
554 
CodePoint_from_UTF8(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)555 void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
556 {
557 	UTF8Unit inUnit;	// ! Don't read until we know there is input.
558 	size_t unitCount = 0;
559 
560 	UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) );
561 	if ( utf8Len == 0 ) goto Done;
562 	inUnit = *utf8In;
563 	if ( inUnit >= 0x80 ) goto MultiByte;	// ! Force linear execution path for ASCII.
564 
565 	unitCount = 1;
566 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
567 
568 Done:
569 	*utf8Read = unitCount;
570 	return;
571 
572 MultiByte:
573 	CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read );
574 	return;
575 
576 }	// CodePoint_from_UTF8
577 
578 // =================================================================================================
579 
CodePoint_to_UTF16Nat_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)580 static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
581 {
582 	size_t    unitCount = 0;
583 	UTF32Unit temp;	// ! Avoid gcc complaints about declarations after goto's.
584 
585 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
586 	if ( utf16Len < 2 ) goto Done;	// Not enough room for the output.
587 
588 	unitCount = 2;
589 	temp = cpIn - 0x10000;
590 	utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 );
591 	utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF );
592 
593 Done:
594 	*utf16Written = unitCount;
595 	return;
596 
597 }	// CodePoint_to_UTF16Nat_Surrogate
598 
599 // =================================================================================================
600 
CodePoint_to_UTF16Nat(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)601 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
602 {
603 	size_t unitCount = 0;
604 
605 	UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
606 	if ( utf16Len == 0 ) goto Done;
607 	if ( cpIn >= 0xD800 ) goto CheckSurrogate;	// ! Force linear execution path for the BMP.
608 
609 InBMP:
610 	unitCount = 1;
611 	*utf16Out = UTF16Unit(cpIn);
612 
613 Done:
614 	*utf16Written = unitCount;
615 	return;
616 
617 CheckSurrogate:
618 	if ( cpIn > 0xFFFF ) goto SurrogatePair;
619 	if ( cpIn > 0xDFFF ) goto InBMP;
620 	UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
621 
622 SurrogatePair:
623 	CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
624 	return;
625 
626 }	// CodePoint_to_UTF16Nat
627 
628 // =================================================================================================
629 
CodePoint_from_UTF16Nat_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)630 static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
631 {
632 	UTF16Unit hiUnit = *utf16In;
633 	size_t    unitCount = 0;
634 	UTF16Unit loUnit;	// ! Avoid gcc complaints about declarations after goto's.
635 	UTF32Unit cp;
636 
637 	// ----------------------------------
638 	// We've got a UTF-16 surrogate pair.
639 
640 	if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
641 	if ( utf16Len < 2 ) goto Done;	// Not enough input in this buffer.
642 
643 	loUnit  = *(utf16In+1);
644 	if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
645 
646 	unitCount = 2;
647 	cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
648 
649 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
650 
651 Done:
652 	*utf16Read = unitCount;
653 	return;
654 
655 }	// CodePoint_from_UTF16Nat_Surrogate
656 
657 // =================================================================================================
658 
CodePoint_from_UTF16Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)659 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
660 {
661 	UTF16Unit inUnit;	// ! Don't read until we know there is input.
662 	size_t unitCount = 0;
663 
664 	UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
665 	if ( utf16Len == 0 ) goto Done;
666 	inUnit = *utf16In;
667 	if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair;	// ! Force linear execution path for the BMP.
668 
669 	unitCount = 1;
670 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
671 
672 Done:
673 	*utf16Read = unitCount;
674 	return;
675 
676 SurrogatePair:
677 	CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
678 	return;
679 
680 }	// CodePoint_from_UTF16Nat
681 
682 // =================================================================================================
683 
UTF8_to_UTF16Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)684 static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In,   const size_t utf8Len,
685 				               UTF16Unit *      utf16Out, const size_t utf16Len,
686 				               size_t *         utf8Read, size_t *     utf16Written )
687 {
688 	const UTF8Unit * utf8Pos  = utf8In;
689 	UTF16Unit *      utf16Pos = utf16Out;
690 
691 	size_t utf8Left  = utf8Len;
692 	size_t utf16Left = utf16Len;
693 
694 	UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
695 
696 	while ( (utf8Left > 0) && (utf16Left > 0) ) {
697 
698 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
699 		size_t i, limit = utf8Left;
700 		if ( limit > utf16Left ) limit = utf16Left;
701 		for ( i = 0; i < limit; ++i ) {
702 			UTF8Unit inUnit = *utf8Pos;
703 			if ( inUnit > 0x7F ) break;
704 			*utf16Pos = inUnit;
705 			++utf8Pos;
706 			++utf16Pos;
707 		}
708 		utf8Left  -= i;
709 		utf16Left -= i;
710 
711 		// Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
712 		while ( (utf8Left > 0) && (utf16Left > 0) ) {
713 			UTF32Unit cp;
714 			size_t len8, len16;
715 			UTF8Unit inUnit = *utf8Pos;
716 			if ( inUnit <= 0x7F ) break;
717 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
718 			if ( len8 == 0 ) goto Done;		// The input buffer ends in the middle of a character.
719 			if ( cp <= 0xFFFF ) {
720 				*utf16Pos = UTF16Unit(cp);
721 				len16 = 1;
722 			} else {
723 				CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
724 				if ( len16 == 0 ) goto Done;	// Not enough room in the output buffer.
725 			}
726 			utf8Left  -= len8;
727 			utf8Pos   += len8;
728 			utf16Left -= len16;
729 			utf16Pos  += len16;
730 		}
731 
732 	}
733 
734 Done:	// Set the output lengths.
735 	*utf8Read = utf8Len - utf8Left;
736 	*utf16Written = utf16Len - utf16Left;
737 
738 }	// UTF8_to_UTF16Nat
739 
740 // =================================================================================================
741 
UTF8_to_UTF32Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)742 static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,   const size_t utf8Len,
743 				               UTF32Unit *       utf32Out, const size_t utf32Len,
744 				               size_t *          utf8Read, size_t *     utf32Written )
745 {
746 	const UTF8Unit * utf8Pos  = utf8In;
747 	UTF32Unit *      utf32Pos = utf32Out;
748 
749 	size_t utf8Left  = utf8Len;
750 	size_t utf32Left = utf32Len;
751 
752 	UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
753 
754 	while ( (utf8Left > 0) && (utf32Left > 0) ) {
755 
756 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
757 		size_t i, limit = utf8Left;
758 		if ( limit > utf32Left ) limit = utf32Left;
759 		for ( i = 0; i < limit; ++i ) {
760 			UTF8Unit inUnit = *utf8Pos;
761 			if ( inUnit > 0x7F ) break;
762 			*utf32Pos = inUnit;
763 			++utf8Pos;
764 			++utf32Pos;
765 		}
766 		utf8Left -= i;
767 		utf32Left -= i;
768 
769 		// Do a run of non-ASCII, it copies variable input into 1 output unit.
770 		while ( (utf8Left > 0) && (utf32Left > 0) ) {
771 			size_t len;
772 			UTF8Unit inUnit = *utf8Pos;
773 			if ( inUnit <= 0x7F ) break;
774 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len );
775 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a character.
776 			utf8Left  -= len;
777 			utf8Pos   += len;
778 			utf32Left -= 1;
779 			utf32Pos  += 1;
780 		}
781 
782 	}
783 
784 Done:	// Set the output lengths.
785 	*utf8Read = utf8Len - utf8Left;
786 	*utf32Written = utf32Len - utf32Left;
787 
788 }	// UTF8_to_UTF32Nat
789 
790 // =================================================================================================
791 
UTF16Nat_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)792 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
793 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
794 				               size_t *          utf16Read, size_t *     utf8Written )
795 {
796 	const UTF16Unit * utf16Pos = utf16In;
797 	UTF8Unit *        utf8Pos  = utf8Out;
798 
799 	size_t utf16Left = utf16Len;
800 	size_t utf8Left  = utf8Len;
801 
802 	UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
803 
804 	while ( (utf16Left > 0) && (utf8Left > 0) ) {
805 
806 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
807 		size_t i, limit = utf16Left;
808 		if ( limit > utf8Left ) limit = utf8Left;
809 		for ( i = 0; i < limit; ++i ) {
810 			UTF16Unit inUnit = *utf16Pos;
811 			if ( inUnit > 0x7F ) break;
812 			*utf8Pos = UTF8Unit(inUnit);
813 			++utf16Pos;
814 			++utf8Pos;
815 		}
816 		utf16Left -= i;
817 		utf8Left  -= i;
818 
819 		// Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
820 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
821 			size_t len8;
822 			UTF16Unit inUnit = *utf16Pos;
823 			if ( inUnit <= 0x7F ) break;
824 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
825 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
826 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
827 			utf16Left -= 1;
828 			utf16Pos  += 1;
829 			utf8Left  -= len8;
830 			utf8Pos   += len8;
831 		}
832 
833 		// Do a run of surrogate pairs, it copies 2 input units into multiple output units.
834 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
835 			UTF32Unit cp;
836 			size_t len16, len8;
837 			UTF16Unit inUnit = *utf16Pos;
838 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
839 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
840 			if ( len16 == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
841 			UC_Assert ( len16 == 2 );
842 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
843 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
844 			utf16Left -= len16;
845 			utf16Pos  += len16;
846 			utf8Left  -= len8;
847 			utf8Pos   += len8;
848 		}
849 
850 	}
851 
852 Done:	// Set the output lengths.
853 	*utf16Read = utf16Len - utf16Left;
854 	*utf8Written = utf8Len - utf8Left;
855 
856 }	// UTF16Nat_to_UTF8
857 
858 // =================================================================================================
859 
UTF32Nat_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)860 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
861 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
862 				               size_t *          utf32Read, size_t *     utf8Written )
863 {
864 	const UTF32Unit * utf32Pos = utf32In;
865 	UTF8Unit *        utf8Pos  = utf8Out;
866 
867 	size_t utf32Left = utf32Len;
868 	size_t utf8Left  = utf8Len;
869 
870 	UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
871 
872 	while ( (utf32Left > 0) && (utf8Left > 0) ) {
873 
874 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
875 		size_t i, limit = utf32Left;
876 		if ( limit > utf8Left ) limit = utf8Left;
877 		for ( i = 0; i < limit; ++i ) {
878 			UTF32Unit inUnit = *utf32Pos;
879 			if ( inUnit > 0x7F ) break;
880 			*utf8Pos = UTF8Unit(inUnit);
881 			++utf32Pos;
882 			++utf8Pos;
883 		}
884 		utf32Left -= i;
885 		utf8Left  -= i;
886 
887 		// Do a run of non-ASCII, it copies 1 input unit into multiple output units.
888 		while ( (utf32Left > 0) && (utf8Left > 0) ) {
889 			size_t len;
890 			UTF32Unit inUnit = *utf32Pos;
891 			if ( inUnit <= 0x7F ) break;
892 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len );
893 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
894 			utf32Left -= 1;
895 			utf32Pos  += 1;
896 			utf8Left  -= len;
897 			utf8Pos   += len;
898 		}
899 
900 	}
901 
902 Done:	// Set the output lengths.
903 	*utf32Read = utf32Len - utf32Left;
904 	*utf8Written = utf8Len - utf8Left;
905 
906 }	// UTF32Nat_to_UTF8
907 
908 // =================================================================================================
909 
UTF16Nat_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)910 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
911 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
912 				                   size_t *          utf16Read, size_t *     utf32Written )
913 {
914 	const UTF16Unit * utf16Pos = utf16In;
915 	UTF32Unit *       utf32Pos = utf32Out;
916 
917 	size_t utf16Left = utf16Len;
918 	size_t utf32Left = utf32Len;
919 
920 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
921 
922 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
923 
924 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
925 		size_t i, limit = utf16Left;
926 		if ( limit > utf32Left ) limit = utf32Left;
927 		for ( i = 0; i < limit; ++i ) {
928 			UTF16Unit inUnit = *utf16Pos;
929 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
930 			*utf32Pos = inUnit;
931 			++utf16Pos;
932 			++utf32Pos;
933 		}
934 		utf16Left -= i;
935 		utf32Left -= i;
936 
937 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
938 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
939 			size_t len;
940 			UTF16Unit inUnit = *utf16Pos;
941 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
942 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
943 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
944 			UC_Assert ( len == 2 );
945 			utf16Left -= len;
946 			utf16Pos  += len;
947 			utf32Left -= 1;
948 			utf32Pos  += 1;
949 		}
950 
951 	}
952 
953 Done:	// Set the output lengths.
954 	*utf16Read = utf16Len - utf16Left;
955 	*utf32Written = utf32Len - utf32Left;
956 
957 }	// UTF16Nat_to_UTF32Nat
958 
959 // =================================================================================================
960 
UTF32Nat_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)961 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
962 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
963 				                   size_t *          utf32Read, size_t *     utf16Written )
964 {
965 	const UTF32Unit * utf32Pos = utf32In;
966 	UTF16Unit *       utf16Pos = utf16Out;
967 
968 	size_t utf32Left = utf32Len;
969 	size_t utf16Left = utf16Len;
970 
971 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
972 
973 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
974 
975 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
976 		size_t i, limit = utf32Left;
977 		if ( limit > utf16Left ) limit = utf16Left;
978 		for ( i = 0; i < limit; ++i ) {
979 			UTF32Unit inUnit = *utf32Pos;
980 			if ( inUnit > 0xFFFF ) break;
981 			*utf16Pos = UTF16Unit(inUnit);
982 			++utf32Pos;
983 			++utf16Pos;
984 		}
985 		utf32Left -= i;
986 		utf16Left -= i;
987 
988 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
989 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
990 			size_t len;
991 			UTF32Unit inUnit = *utf32Pos;
992 			if ( inUnit <= 0xFFFF ) break;
993 			CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
994 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
995 			UC_Assert ( len == 2 );
996 			utf32Left -= 1;
997 			utf32Pos  += 1;
998 			utf16Left -= 2;
999 			utf16Pos  += 2;
1000 		}
1001 
1002 	}
1003 
1004 Done:	// Set the output lengths.
1005 	*utf32Read = utf32Len - utf32Left;
1006 	*utf16Written = utf16Len - utf16Left;
1007 
1008 }	// UTF32Nat_to_UTF16Nat
1009 
1010 // =================================================================================================
1011 
CodePoint_to_UTF16Swp_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1012 static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1013 {
1014 	size_t unitCount = 0;
1015 	UTF32Unit temp;	// ! Avoid gcc complaints about declarations after goto's.
1016 
1017 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
1018 	if ( utf16Len < 2 ) goto Done;	// Not enough room for the output.
1019 
1020 	unitCount = 2;
1021 	temp = cpIn - 0x10000;
1022 	UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) );
1023 	UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) );
1024 
1025 Done:
1026 	*utf16Written = unitCount;
1027 	return;
1028 
1029 }	// CodePoint_to_UTF16Swp_Surrogate
1030 
1031 // =================================================================================================
1032 
CodePoint_to_UTF16Swp(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1033 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1034 {
1035 	size_t unitCount = 0;
1036 
1037 	UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
1038 	if ( utf16Len == 0 ) goto Done;
1039 	if ( cpIn >= 0xD800 ) goto CheckSurrogate;	// ! Force linear execution path for the BMP.
1040 
1041 InBMP:
1042 	unitCount = 1;
1043 	UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) );
1044 
1045 Done:
1046 	*utf16Written = unitCount;
1047 	return;
1048 
1049 CheckSurrogate:
1050 	if ( cpIn > 0xFFFF ) goto SurrogatePair;
1051 	if ( cpIn > 0xDFFF ) goto InBMP;
1052 	UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
1053 
1054 SurrogatePair:
1055 	CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
1056 	return;
1057 
1058 }	// CodePoint_to_UTF16Swp
1059 
1060 // =================================================================================================
1061 
CodePoint_from_UTF16Swp_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1062 static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1063 {
1064 	UTF16Unit hiUnit = UTF16InSwap(utf16In);
1065 	size_t unitCount = 0;
1066 	UTF16Unit loUnit;	// ! Avoid gcc complaints about declarations after goto's.
1067 	UTF32Unit cp;
1068 
1069 	// ----------------------------------
1070 	// We've got a UTF-16 surrogate pair.
1071 
1072 	if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
1073 	if ( utf16Len < 2 ) goto Done;	// Not enough input in this buffer.
1074 
1075 	loUnit  = UTF16InSwap(utf16In+1);
1076 	if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
1077 
1078 	unitCount = 2;
1079 	cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
1080 
1081 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
1082 
1083 Done:
1084 	*utf16Read = unitCount;
1085 	return;
1086 
1087 }	// CodePoint_from_UTF16Swp_Surrogate
1088 
1089 // =================================================================================================
1090 
CodePoint_from_UTF16Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1091 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1092 {
1093 	UTF16Unit inUnit;	// ! Don't read until we know there is input.
1094 	size_t unitCount = 0;
1095 
1096 	UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
1097 	if ( utf16Len == 0 ) goto Done;
1098 	inUnit = UTF16InSwap(utf16In);
1099 	if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair;	// ! Force linear execution path for the BMP.
1100 
1101 	unitCount = 1;
1102 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
1103 
1104 Done:
1105 	*utf16Read = unitCount;
1106 	return;
1107 
1108 SurrogatePair:
1109 	CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
1110 	return;
1111 
1112 }	// CodePoint_from_UTF16Swp
1113 
1114 // =================================================================================================
1115 
UTF8_to_UTF16Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)1116 static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In,   const size_t utf8Len,
1117 				               UTF16Unit *      utf16Out, const size_t utf16Len,
1118 				               size_t *         utf8Read, size_t *     utf16Written )
1119 {
1120 	const UTF8Unit * utf8Pos  = utf8In;
1121 	UTF16Unit *      utf16Pos = utf16Out;
1122 
1123 	size_t utf8Left  = utf8Len;
1124 	size_t utf16Left = utf16Len;
1125 
1126 	UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
1127 
1128 	while ( (utf8Left > 0) && (utf16Left > 0) ) {
1129 
1130 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1131 		size_t i, limit = utf8Left;
1132 		if ( limit > utf16Left ) limit = utf16Left;
1133 		for ( i = 0; i < limit; ++i ) {
1134 			UTF8Unit inUnit = *utf8Pos;
1135 			if ( inUnit > 0x7F ) break;
1136 			*utf16Pos = UTF16Unit(inUnit) << 8;	// Better than: UTF16OutSwap ( utf16Pos, inUnit );
1137 			++utf8Pos;
1138 			++utf16Pos;
1139 		}
1140 		utf8Left  -= i;
1141 		utf16Left -= i;
1142 
1143 		// Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
1144 		while ( (utf8Left > 0) && (utf16Left > 0) ) {
1145 			UTF32Unit cp;
1146 			size_t len8, len16;
1147 			UTF8Unit inUnit = *utf8Pos;
1148 			if ( inUnit <= 0x7F ) break;
1149 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
1150 			if ( len8 == 0 ) goto Done;		// The input buffer ends in the middle of a character.
1151 			if ( cp <= 0xFFFF ) {
1152 				UTF16OutSwap ( utf16Pos, UTF16Unit(cp) );
1153 				len16 = 1;
1154 			} else {
1155 				CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
1156 				if ( len16 == 0 ) goto Done;	// Not enough room in the output buffer.
1157 			}
1158 			utf8Left  -= len8;
1159 			utf8Pos   += len8;
1160 			utf16Left -= len16;
1161 			utf16Pos  += len16;
1162 		}
1163 
1164 	}
1165 
1166 Done:	// Set the output lengths.
1167 	*utf8Read = utf8Len - utf8Left;
1168 	*utf16Written = utf16Len - utf16Left;
1169 
1170 }	// UTF8_to_UTF16Swp
1171 
1172 // =================================================================================================
1173 
UTF8_to_UTF32Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)1174 static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,   const size_t utf8Len,
1175 				               UTF32Unit *       utf32Out, const size_t utf32Len,
1176 				               size_t *          utf8Read, size_t *     utf32Written )
1177 {
1178 	const UTF8Unit * utf8Pos  = utf8In;
1179 	UTF32Unit *      utf32Pos = utf32Out;
1180 
1181 	size_t utf8Left  = utf8Len;
1182 	size_t utf32Left = utf32Len;
1183 
1184 	UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
1185 
1186 	while ( (utf8Left > 0) && (utf32Left > 0) ) {
1187 
1188 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1189 		size_t i, limit = utf8Left;
1190 		if ( limit > utf32Left ) limit = utf32Left;
1191 		for ( i = 0; i < limit; ++i ) {
1192 			UTF8Unit inUnit = *utf8Pos;
1193 			if ( inUnit > 0x7F ) break;
1194 			*utf32Pos = UTF32Unit(inUnit) << 24;	// Better than: UTF32OutSwap ( utf32Pos, inUnit );
1195 			++utf8Pos;
1196 			++utf32Pos;
1197 		}
1198 		utf8Left -= i;
1199 		utf32Left -= i;
1200 
1201 		// Do a run of non-ASCII, it copies variable input into 1 output unit.
1202 		while ( (utf8Left > 0) && (utf32Left > 0) ) {
1203 			size_t len;
1204 			UTF32Unit cp;
1205 			UTF8Unit inUnit = *utf8Pos;
1206 			if ( inUnit <= 0x7F ) break;
1207 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len );
1208 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a character.
1209 			UTF32OutSwap ( utf32Pos, cp );
1210 			utf8Left  -= len;
1211 			utf8Pos   += len;
1212 			utf32Left -= 1;
1213 			utf32Pos  += 1;
1214 		}
1215 
1216 	}
1217 
1218 Done:	// Set the output lengths.
1219 	*utf8Read = utf8Len - utf8Left;
1220 	*utf32Written = utf32Len - utf32Left;
1221 
1222 }	// UTF8_to_UTF32Swp
1223 
1224 // =================================================================================================
1225 
UTF16Swp_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)1226 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
1227 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
1228 				               size_t *          utf16Read, size_t *     utf8Written )
1229 {
1230 	const UTF16Unit * utf16Pos = utf16In;
1231 	UTF8Unit *        utf8Pos  = utf8Out;
1232 
1233 	size_t utf16Left = utf16Len;
1234 	size_t utf8Left  = utf8Len;
1235 
1236 	UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
1237 
1238 	while ( (utf16Left > 0) && (utf8Left > 0) ) {
1239 
1240 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1241 		size_t i, limit = utf16Left;
1242 		if ( limit > utf8Left ) limit = utf8Left;
1243 		for ( i = 0; i < limit; ++i ) {
1244 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1245 			if ( inUnit > 0x7F ) break;
1246 			*utf8Pos = UTF8Unit(inUnit);
1247 			++utf16Pos;
1248 			++utf8Pos;
1249 		}
1250 		utf16Left -= i;
1251 		utf8Left  -= i;
1252 
1253 		// Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
1254 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
1255 			size_t len8;
1256 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1257 			if ( inUnit <= 0x7F ) break;
1258 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1259 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
1260 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
1261 			utf16Left -= 1;
1262 			utf16Pos  += 1;
1263 			utf8Left  -= len8;
1264 			utf8Pos   += len8;
1265 		}
1266 
1267 		// Do a run of surrogate pairs, it copies 2 input units into multiple output units.
1268 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
1269 			UTF32Unit cp;
1270 			size_t len16, len8;
1271 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1272 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1273 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
1274 			if ( len16 == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1275 			UC_Assert ( len16 == 2 );
1276 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
1277 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
1278 			utf16Left -= len16;
1279 			utf16Pos  += len16;
1280 			utf8Left  -= len8;
1281 			utf8Pos   += len8;
1282 		}
1283 
1284 	}
1285 
1286 Done:	// Set the output lengths.
1287 	*utf16Read = utf16Len - utf16Left;
1288 	*utf8Written = utf8Len - utf8Left;
1289 
1290 }	// UTF16Swp_to_UTF8
1291 
1292 // =================================================================================================
1293 
UTF32Swp_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)1294 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
1295 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
1296 				               size_t *          utf32Read, size_t *     utf8Written )
1297 {
1298 	const UTF32Unit * utf32Pos = utf32In;
1299 	UTF8Unit *        utf8Pos  = utf8Out;
1300 
1301 	size_t utf32Left = utf32Len;
1302 	size_t utf8Left  = utf8Len;
1303 
1304 	UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
1305 
1306 	while ( (utf32Left > 0) && (utf8Left > 0) ) {
1307 
1308 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1309 		size_t i, limit = utf32Left;
1310 		if ( limit > utf8Left ) limit = utf8Left;
1311 		for ( i = 0; i < limit; ++i ) {
1312 			UTF32Unit cp = UTF32InSwap(utf32Pos);
1313 			if ( cp > 0x7F ) break;
1314 			*utf8Pos = UTF8Unit(cp);
1315 			++utf32Pos;
1316 			++utf8Pos;
1317 		}
1318 		utf32Left -= i;
1319 		utf8Left  -= i;
1320 
1321 		// Do a run of non-ASCII, it copies 1 input unit into multiple output units.
1322 		while ( (utf32Left > 0) && (utf8Left > 0) ) {
1323 			size_t len;
1324 			UTF32Unit cp = UTF32InSwap(utf32Pos);
1325 			if ( cp <= 0x7F ) break;
1326 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len );
1327 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1328 			utf32Left -= 1;
1329 			utf32Pos  += 1;
1330 			utf8Left  -= len;
1331 			utf8Pos   += len;
1332 		}
1333 
1334 	}
1335 
1336 Done:	// Set the output lengths.
1337 	*utf32Read = utf32Len - utf32Left;
1338 	*utf8Written = utf8Len - utf8Left;
1339 
1340 }	// UTF32Swp_to_UTF8
1341 
1342 // =================================================================================================
1343 
UTF16Swp_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1344 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1345 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1346 				                   size_t *          utf16Read, size_t *     utf32Written )
1347 {
1348 	const UTF16Unit * utf16Pos = utf16In;
1349 	UTF32Unit *       utf32Pos = utf32Out;
1350 
1351 	size_t utf16Left = utf16Len;
1352 	size_t utf32Left = utf32Len;
1353 
1354 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1355 
1356 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1357 
1358 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1359 		size_t i, limit = utf16Left;
1360 		if ( limit > utf32Left ) limit = utf32Left;
1361 		for ( i = 0; i < limit; ++i ) {
1362 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1363 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1364 			*utf32Pos = UTF32Unit(*utf16Pos) << 16;	// Better than: UTF32OutSwap ( utf32Pos, inUnit );
1365 			++utf16Pos;
1366 			++utf32Pos;
1367 		}
1368 		utf16Left -= i;
1369 		utf32Left -= i;
1370 
1371 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1372 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1373 			size_t len;
1374 			UTF32Unit cp;
1375 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1376 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1377 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1378 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1379 			UTF32OutSwap ( utf32Pos, cp );
1380 			UC_Assert ( len == 2 );
1381 			utf16Left -= len;
1382 			utf16Pos  += len;
1383 			utf32Left -= 1;
1384 			utf32Pos  += 1;
1385 		}
1386 
1387 	}
1388 
1389 Done:	// Set the output lengths.
1390 	*utf16Read = utf16Len - utf16Left;
1391 	*utf32Written = utf32Len - utf32Left;
1392 
1393 }	// UTF16Swp_to_UTF32Swp
1394 
1395 // =================================================================================================
1396 
UTF32Swp_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1397 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1398 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1399 				                   size_t *          utf32Read, size_t *     utf16Written )
1400 {
1401 	const UTF32Unit * utf32Pos = utf32In;
1402 	UTF16Unit *       utf16Pos = utf16Out;
1403 
1404 	size_t utf32Left = utf32Len;
1405 	size_t utf16Left = utf16Len;
1406 
1407 	const size_t k32to16Offset = swap32to16Offset;	// ! Make sure compiler treats as an invariant.
1408 
1409 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1410 
1411 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1412 
1413 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1414 		size_t i, limit = utf32Left;
1415 		if ( limit > utf16Left ) limit = utf16Left;
1416 		for ( i = 0; i < limit; ++i ) {
1417 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1418 			if ( inUnit > 0xFFFF ) break;
1419 			*utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset);	// Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1420 			++utf32Pos;
1421 			++utf16Pos;
1422 		}
1423 		utf32Left -= i;
1424 		utf16Left -= i;
1425 
1426 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1427 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1428 			size_t len;
1429 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1430 			if ( inUnit <= 0xFFFF ) break;
1431 			CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1432 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1433 			UC_Assert ( len == 2 );
1434 			utf32Left -= 1;
1435 			utf32Pos  += 1;
1436 			utf16Left -= 2;
1437 			utf16Pos  += 2;
1438 		}
1439 
1440 	}
1441 
1442 Done:	// Set the output lengths.
1443 	*utf32Read = utf32Len - utf32Left;
1444 	*utf16Written = utf16Len - utf16Left;
1445 
1446 }	// UTF32Swp_to_UTF16Swp
1447 
1448 // =================================================================================================
1449 
UTF16Nat_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1450 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1451 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1452 				                   size_t *          utf16Read, size_t *     utf32Written )
1453 {
1454 	const UTF16Unit * utf16Pos = utf16In;
1455 	UTF32Unit *       utf32Pos = utf32Out;
1456 
1457 	size_t utf16Left = utf16Len;
1458 	size_t utf32Left = utf32Len;
1459 
1460 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1461 
1462 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1463 
1464 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1465 		size_t i, limit = utf16Left;
1466 		if ( limit > utf32Left ) limit = utf32Left;
1467 		for ( i = 0; i < limit; ++i ) {
1468 			UTF16Unit inUnit = *utf16Pos;
1469 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1470 			UTF32OutSwap ( utf32Pos, inUnit );
1471 			++utf16Pos;
1472 			++utf32Pos;
1473 		}
1474 		utf16Left -= i;
1475 		utf32Left -= i;
1476 
1477 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1478 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1479 			size_t len;
1480 			UTF32Unit cp;
1481 			UTF16Unit inUnit = *utf16Pos;
1482 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1483 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1484 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1485 			UC_Assert ( len == 2 );
1486 			UTF32OutSwap ( utf32Pos, cp );
1487 			utf16Left -= len;
1488 			utf16Pos  += len;
1489 			utf32Left -= 1;
1490 			utf32Pos  += 1;
1491 		}
1492 
1493 	}
1494 
1495 Done:	// Set the output lengths.
1496 	*utf16Read = utf16Len - utf16Left;
1497 	*utf32Written = utf32Len - utf32Left;
1498 
1499 }	// UTF16Nat_to_UTF32Swp
1500 
1501 // =================================================================================================
1502 
UTF16Swp_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1503 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
1504 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1505 				                   size_t *          utf16Read, size_t *     utf32Written )
1506 {
1507 	const UTF16Unit * utf16Pos = utf16In;
1508 	UTF32Unit *       utf32Pos = utf32Out;
1509 
1510 	size_t utf16Left = utf16Len;
1511 	size_t utf32Left = utf32Len;
1512 
1513 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1514 
1515 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1516 
1517 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1518 		size_t i, limit = utf16Left;
1519 		if ( limit > utf32Left ) limit = utf32Left;
1520 		for ( i = 0; i < limit; ++i ) {
1521 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1522 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1523 			*utf32Pos = inUnit;
1524 			++utf16Pos;
1525 			++utf32Pos;
1526 		}
1527 		utf16Left -= i;
1528 		utf32Left -= i;
1529 
1530 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1531 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1532 			size_t len;
1533 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1534 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1535 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
1536 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1537 			UC_Assert ( len == 2 );
1538 			utf16Left -= len;
1539 			utf16Pos  += len;
1540 			utf32Left -= 1;
1541 			utf32Pos  += 1;
1542 		}
1543 
1544 	}
1545 
1546 Done:	// Set the output lengths.
1547 	*utf16Read = utf16Len - utf16Left;
1548 	*utf32Written = utf32Len - utf32Left;
1549 
1550 }	// UTF16Swp_to_UTF32Nat
1551 
1552 // =================================================================================================
1553 
UTF32Nat_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1554 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1555 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1556 				                   size_t *          utf32Read, size_t *     utf16Written )
1557 {
1558 	const UTF32Unit * utf32Pos = utf32In;
1559 	UTF16Unit *       utf16Pos = utf16Out;
1560 
1561 	size_t utf32Left = utf32Len;
1562 	size_t utf16Left = utf16Len;
1563 
1564 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1565 
1566 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1567 
1568 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1569 		size_t i, limit = utf32Left;
1570 		if ( limit > utf16Left ) limit = utf16Left;
1571 		for ( i = 0; i < limit; ++i ) {
1572 			UTF32Unit inUnit = *utf32Pos;
1573 			if ( inUnit > 0xFFFF ) break;
1574 			UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1575 			++utf32Pos;
1576 			++utf16Pos;
1577 		}
1578 		utf32Left -= i;
1579 		utf16Left -= i;
1580 
1581 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1582 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1583 			size_t len;
1584 			UTF32Unit inUnit = *utf32Pos;
1585 			if ( inUnit <= 0xFFFF ) break;
1586 			CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1587 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1588 			UC_Assert ( len == 2 );
1589 			utf32Left -= 1;
1590 			utf32Pos  += 1;
1591 			utf16Left -= 2;
1592 			utf16Pos  += 2;
1593 		}
1594 
1595 	}
1596 
1597 Done:	// Set the output lengths.
1598 	*utf32Read = utf32Len - utf32Left;
1599 	*utf16Written = utf16Len - utf16Left;
1600 
1601 }	// UTF32Nat_to_UTF16Swp
1602 
1603 // =================================================================================================
1604 
UTF32Swp_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1605 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
1606 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1607 				                   size_t *          utf32Read, size_t *     utf16Written )
1608 {
1609 	const UTF32Unit * utf32Pos = utf32In;
1610 	UTF16Unit *       utf16Pos = utf16Out;
1611 
1612 	size_t utf32Left = utf32Len;
1613 	size_t utf16Left = utf16Len;
1614 
1615 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1616 
1617 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1618 
1619 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1620 		size_t i, limit = utf32Left;
1621 		if ( limit > utf16Left ) limit = utf16Left;
1622 		for ( i = 0; i < limit; ++i ) {
1623 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1624 			if ( inUnit > 0xFFFF ) break;
1625 			*utf16Pos = UTF16Unit(inUnit);
1626 			++utf32Pos;
1627 			++utf16Pos;
1628 		}
1629 		utf32Left -= i;
1630 		utf16Left -= i;
1631 
1632 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1633 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1634 			size_t len;
1635 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1636 			if ( inUnit <= 0xFFFF ) break;
1637 			CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1638 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1639 			UC_Assert ( len == 2 );
1640 			utf32Left -= 1;
1641 			utf32Pos  += 1;
1642 			utf16Left -= 2;
1643 			utf16Pos  += 2;
1644 		}
1645 
1646 	}
1647 
1648 Done:	// Set the output lengths.
1649 	*utf32Read = utf32Len - utf32Left;
1650 	*utf16Written = utf16Len - utf16Left;
1651 
1652 }	// UTF32Swp_to_UTF16Nat
1653 
1654 // =================================================================================================
1655