1 // =================================================================================================
2 // Copyright 2004-2007 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 // =================================================================================================
8 
9 #include "XMP_Const.h"
10 
// Assertion and error reporting hooks used throughout this file. A UnicodeTestBuild relies only on
// the standard library: UC_Assert is the standard assert macro and UC_Throw throws std::logic_error
// with the message, ignoring the error id. All other builds compile UC_Assert away and make UC_Throw
// throw an XMP_Error built from the id and the message.
#if UnicodeTestBuild
	#include <cassert>
	#include <stdexcept>
	#define UC_Assert assert
	#define UC_Throw(m,k) throw std::logic_error ( m )
#else
	#define UC_Assert(cond) /* Nothing for now, should be XMP_Assert. */
	#define UC_Throw(msg,id)  throw XMP_Error ( id, msg )
#endif
20 
21 #include "UnicodeConversions.hpp"
22 
23 using namespace std;
24 
25 // =================================================================================================
26 
27 // *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8.
28 
// Public conversion entry points, exposed as procedure pointers. Every one of them is null until
// InitializeUnicodeConversions has bound it to the native or the byte swapping implementation that
// matches the byte order of the host.

// Single code point <-> UTF-16 with an explicit byte order.
CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0;
CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0;

CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0;
CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0;

// UTF-8 -> UTF-16/UTF-32 with an explicit byte order.
UTF8_to_UTF16_Proc  UTF8_to_UTF16BE = 0;
UTF8_to_UTF16_Proc  UTF8_to_UTF16LE = 0;
UTF8_to_UTF32_Proc  UTF8_to_UTF32BE = 0;
UTF8_to_UTF32_Proc  UTF8_to_UTF32LE = 0;

// UTF-16/UTF-32 with an explicit byte order -> UTF-8.
UTF16_to_UTF8_Proc  UTF16BE_to_UTF8 = 0;
UTF16_to_UTF8_Proc  UTF16LE_to_UTF8 = 0;
UTF32_to_UTF8_Proc  UTF32BE_to_UTF8 = 0;
UTF32_to_UTF8_Proc  UTF32LE_to_UTF8 = 0;

// Conversions between UTF-8 and UTF-16/UTF-32 in the byte order of the host.
UTF8_to_UTF16_Proc  UTF8_to_UTF16Native = 0;
UTF8_to_UTF32_Proc  UTF8_to_UTF32Native = 0;
UTF16_to_UTF8_Proc  UTF16Native_to_UTF8 = 0;
UTF32_to_UTF8_Proc  UTF32Native_to_UTF8 = 0;

// UTF-16 -> UTF-32, one entry per combination of input and output byte order.
UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0;
UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0;
UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0;
UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0;

// UTF-32 -> UTF-16, one entry per combination of input and output byte order.
UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0;
UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0;
UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0;
UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0;
59 
60 // -------------------------------------------------------------------------------------------------
61 
// Set by InitializeUnicodeConversions: 0 on a big endian host, 1 (counted in UTF-16 units) on a
// little endian host.
static size_t swap32to16Offset = 0;	// Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer.
63 
64 // -------------------------------------------------------------------------------------------------
65 
// Forward declarations of the file-local converters. InitializeUnicodeConversions binds the "Nat"
// routines to the encodings that share the byte order of the host and the "Swp" routines to the
// encodings of the opposite byte order.

// Single code point <-> UTF-16.
static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );

static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-8. Each takes an input buffer and length, an output buffer and
// capacity, and two pointers that receive the input units read and the output units written.

static void UTF8_to_UTF16Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
				               UTF16Unit *       utf16Out,  const size_t utf16Len,
				               size_t *          utf8Read,  size_t *     utf16Written );

static void UTF8_to_UTF16Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
				               UTF16Unit *       utf16Out,  const size_t utf16Len,
				               size_t *          utf8Read,  size_t *     utf16Written );

static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,    const size_t utf8Len,
				               UTF32Unit *       utf32Out,  const size_t utf32Len,
				               size_t *          utf8Read,  size_t *     utf32Written );

static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,    const size_t utf8Len,
				               UTF32Unit *       utf32Out,  const size_t utf32Len,
				               size_t *          utf8Read,  size_t *     utf32Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions to UTF-8.

static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
				               UTF8Unit *        utf8Out,   const size_t utf8Len,
				               size_t *          utf16Read, size_t *     utf8Written );

static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
				               UTF8Unit *        utf8Out,   const size_t utf8Len,
				               size_t *          utf16Read, size_t *     utf8Written );

static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
				               UTF8Unit *        utf8Out,   const size_t utf8Len,
				               size_t *          utf32Read, size_t *     utf8Written );

static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
				               UTF8Unit *        utf8Out,   const size_t utf8Len,
				               size_t *          utf32Read, size_t *     utf8Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-16 to UTF-32.

static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
				                   size_t *          utf16Read, size_t *     utf32Written );

static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
				                   size_t *          utf16Read, size_t *     utf32Written );

static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
				                   size_t *          utf16Read, size_t *     utf32Written );

static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
				                   size_t *          utf16Read, size_t *     utf32Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-32 to UTF-16.

static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
				                   size_t *          utf32Read, size_t *     utf16Written );

static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
				                   size_t *          utf32Read, size_t *     utf16Written );

static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
				                   size_t *          utf32Read, size_t *     utf16Written );

static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
				                   size_t *          utf32Read, size_t *     utf16Written );
143 
144 // =================================================================================================
145 
InitializeUnicodeConversions()146 void InitializeUnicodeConversions()
147 {
148 	UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) );
149 
150 	UTF16Unit u16  = 0x00FF;
151 	bool bigEndian = (*((UTF8Unit*)&u16) == 0);
152 
153 	UTF8_to_UTF16Native = UTF8_to_UTF16Nat;
154 	UTF8_to_UTF32Native = UTF8_to_UTF32Nat;
155 	UTF16Native_to_UTF8 = UTF16Nat_to_UTF8;
156 	UTF32Native_to_UTF8 = UTF32Nat_to_UTF8;
157 
158 	if ( bigEndian ) {
159 
160 		swap32to16Offset = 0;
161 
162 		CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat;
163 		CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp;
164 
165 		CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat;
166 		CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp;
167 
168 		UTF8_to_UTF16BE = UTF8_to_UTF16Nat;
169 		UTF8_to_UTF16LE = UTF8_to_UTF16Swp;
170 		UTF8_to_UTF32BE = UTF8_to_UTF32Nat;
171 		UTF8_to_UTF32LE = UTF8_to_UTF32Swp;
172 
173 		UTF16BE_to_UTF8 = UTF16Nat_to_UTF8;
174 		UTF16LE_to_UTF8 = UTF16Swp_to_UTF8;
175 		UTF32BE_to_UTF8 = UTF32Nat_to_UTF8;
176 		UTF32LE_to_UTF8 = UTF32Swp_to_UTF8;
177 
178 		UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat;
179 		UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp;
180 		UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat;
181 		UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp;
182 
183 		UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat;
184 		UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp;
185 		UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat;
186 		UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp;
187 
188 	} else {
189 
190 		swap32to16Offset = 1;	// ! Offset in UTF16 units!
191 
192 		CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp;
193 		CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat;
194 
195 		CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp;
196 		CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat;
197 
198 		UTF8_to_UTF16BE = UTF8_to_UTF16Swp;
199 		UTF8_to_UTF16LE = UTF8_to_UTF16Nat;
200 		UTF8_to_UTF32BE = UTF8_to_UTF32Swp;
201 		UTF8_to_UTF32LE = UTF8_to_UTF32Nat;
202 
203 		UTF16BE_to_UTF8 = UTF16Swp_to_UTF8;
204 		UTF16LE_to_UTF8 = UTF16Nat_to_UTF8;
205 		UTF32BE_to_UTF8 = UTF32Swp_to_UTF8;
206 		UTF32LE_to_UTF8 = UTF32Nat_to_UTF8;
207 
208 		UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp;
209 		UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat;
210 		UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp;
211 		UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat;
212 
213 		UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp;
214 		UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat;
215 		UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp;
216 		UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat;
217 
218 	}
219 
220 }	// InitializeUnicodeConversions
221 
222 // =================================================================================================
223 
// Byte swapping load and store helpers. Metrowerks Mac builds map them onto the compiler intrinsics
// __lhbrx, __lwbrx, __sthbrx and __stwbrx (presumably the PowerPC byte-reversed load/store
// instructions - confirm against the CodeWarrior documentation). All other builds use the portable
// inline functions in the #else branch.
#if XMP_MacBuild && __MWERKS__

	#define UTF16InSwap(inPtr)	UTF16Unit ( __lhbrx ( (void*)(inPtr), 0 ) )
	#define UTF32InSwap(inPtr)	UTF32Unit ( __lwbrx ( (void*)(inPtr), 0 ) )

	#define UTF16OutSwap(outPtr,value)	__sthbrx ( value, (void*)(outPtr), 0 )
	#define UTF32OutSwap(outPtr,value)	__stwbrx ( value, (void*)(outPtr), 0 )

#else
233 
UTF16InSwap(const UTF16Unit * inPtr)234 	static inline UTF16Unit UTF16InSwap ( const UTF16Unit * inPtr )
235 	{
236 		UTF16Unit inUnit = *inPtr;
237 		return (inUnit << 8) | (inUnit >> 8);
238 	}
239 
UTF32InSwap(const UTF32Unit * inPtr)240 	static inline UTF32Unit UTF32InSwap ( const UTF32Unit * inPtr )
241 	{
242 		UTF32Unit inUnit = *inPtr;
243 		return (inUnit << 24) | ((inUnit << 8) & 0x00FF0000) | ((inUnit >> 8) & 0x0000FF00) | (inUnit >> 24);
244 	}
245 
UTF16OutSwap(UTF16Unit * outPtr,const UTF16Unit value)246 	static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value )
247 	{
248 		UTF16Unit outUnit = (value << 8) | (value >> 8);
249 		*outPtr = outUnit;
250 	}
251 
UTF32OutSwap(UTF32Unit * outPtr,const UTF32Unit value)252 	static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value )
253 	{
254 		UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
255 		*outPtr = outUnit;
256 	}
257 
258 #endif
259 
260 // =================================================================================================
261 
SwapUTF16(const UTF16Unit * utf16In,UTF16Unit * utf16Out,const size_t utf16Len)262 void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len )
263 {
264 	for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i);
265 }
266 
SwapUTF32(const UTF32Unit * utf32In,UTF32Unit * utf32Out,const size_t utf32Len)267 void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) {
268 	for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i);
269 }
270 
271 // =================================================================================================
272 
ToUTF16(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str,bool bigEndian)273 extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian )
274 {
275 	UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE;
276 	if ( bigEndian ) Converter = UTF8_to_UTF16BE;
277 
278 	enum { kBufferSize = 8*1024 };
279 	UTF16Unit u16Buffer[kBufferSize];	// 16K bytes
280 	size_t readCount, writeCount;
281 
282 	utf16Str->erase();
283 	utf16Str->reserve ( 2*utf8Len );	// As good a guess as any.
284 
285 	while ( utf8Len > 0 ) {
286 		Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
287 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
288 		utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
289 		utf8In  += readCount;
290 		utf8Len -= readCount;
291 	}
292 
293 }	// ToUTF16
294 
295 // =================================================================================================
296 
ToUTF16Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str)297 extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str )
298 {
299 	enum { kBufferSize = 8*1024 };
300 	UTF16Unit u16Buffer[kBufferSize];	// 16K bytes
301 	size_t readCount, writeCount;
302 
303 	utf16Str->erase();
304 	utf16Str->reserve ( 2*utf8Len );	// As good a guess as any.
305 
306 	while ( utf8Len > 0 ) {
307 		UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
308 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
309 		utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
310 		utf8In  += readCount;
311 		utf8Len -= readCount;
312 	}
313 
314 }	// ToUTF16Native
315 
316 // =================================================================================================
317 
ToUTF32(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str,bool bigEndian)318 extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian )
319 {
320 	UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE;
321 	if ( bigEndian ) Converter = UTF8_to_UTF32BE;
322 
323 	enum { kBufferSize = 4*1024 };
324 	UTF32Unit u32Buffer[kBufferSize];	// 16K bytes
325 	size_t readCount, writeCount;
326 
327 	utf32Str->erase();
328 	utf32Str->reserve ( 4*utf8Len );	// As good a guess as any.
329 
330 	while ( utf8Len > 0 ) {
331 		Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
332 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
333 		utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
334 		utf8In  += readCount;
335 		utf8Len -= readCount;
336 	}
337 
338 }	// ToUTF32
339 
340 // =================================================================================================
341 
ToUTF32Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str)342 extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str )
343 {
344 	enum { kBufferSize = 4*1024 };
345 	UTF32Unit u32Buffer[kBufferSize];	// 16K bytes
346 	size_t readCount, writeCount;
347 
348 	utf32Str->erase();
349 	utf32Str->reserve ( 4*utf8Len );	// As good a guess as any.
350 
351 	while ( utf8Len > 0 ) {
352 		UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
353 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
354 		utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
355 		utf8In  += readCount;
356 		utf8Len -= readCount;
357 	}
358 
359 }	// ToUTF32Native
360 
361 // =================================================================================================
362 
FromUTF16(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str,bool bigEndian)363 extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian )
364 {
365 	UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8;
366 	if ( bigEndian ) Converter = UTF16BE_to_UTF8;
367 
368 	enum { kBufferSize = 16*1024 };
369 	UTF8Unit u8Buffer[kBufferSize];
370 	size_t readCount, writeCount;
371 
372 	utf8Str->erase();
373 	utf8Str->reserve ( 2*utf16Len );	// As good a guess as any.
374 
375 	while ( utf16Len > 0 ) {
376 		Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
377 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
378 		utf8Str->append ( (const char *)u8Buffer, writeCount );
379 		utf16In  += readCount;
380 		utf16Len -= readCount;
381 	}
382 
383 }	// FromUTF16
384 
385 // =================================================================================================
386 
FromUTF16Native(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str)387 extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str )
388 {
389 	enum { kBufferSize = 16*1024 };
390 	UTF8Unit u8Buffer[kBufferSize];
391 	size_t readCount, writeCount;
392 
393 	utf8Str->erase();
394 	utf8Str->reserve ( 2*utf16Len );	// As good a guess as any.
395 
396 	while ( utf16Len > 0 ) {
397 		UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
398 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
399 		utf8Str->append ( (const char *)u8Buffer, writeCount );
400 		utf16In  += readCount;
401 		utf16Len -= readCount;
402 	}
403 
404 }	// FromUTF16Native
405 
406 // =================================================================================================
407 
FromUTF32(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str,bool bigEndian)408 extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian )
409 {
410 	UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8;
411 	if ( bigEndian ) Converter = UTF32BE_to_UTF8;
412 
413 	enum { kBufferSize = 16*1024 };
414 	UTF8Unit u8Buffer[kBufferSize];
415 	size_t readCount, writeCount;
416 
417 	utf8Str->erase();
418 	utf8Str->reserve ( 2*utf32Len );	// As good a guess as any.
419 
420 	while ( utf32Len > 0 ) {
421 		Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
422 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
423 		utf8Str->append ( (const char *)u8Buffer, writeCount );
424 		utf32In  += readCount;
425 		utf32Len -= readCount;
426 	}
427 
428 }	// FromUTF32
429 
430 // =================================================================================================
431 
FromUTF32Native(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str)432 extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str )
433 {
434 	enum { kBufferSize = 16*1024 };
435 	UTF8Unit u8Buffer[kBufferSize];
436 	size_t readCount, writeCount;
437 
438 	utf8Str->erase();
439 	utf8Str->reserve ( 2*utf32Len );	// As good a guess as any.
440 
441 	while ( utf32Len > 0 ) {
442 		UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
443 		if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
444 		utf8Str->append ( (const char *)u8Buffer, writeCount );
445 		utf32In  += readCount;
446 		utf32Len -= readCount;
447 	}
448 
449 }	// FromUTF32Native
450 
451 // =================================================================================================
452 
CodePoint_to_UTF8_Multi(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)453 static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
454 {
455 	size_t unitCount = 0;
456 
457 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
458 	if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
459 
460 	// Compute the number of bytes using 6 data bits each. Then see if the highest order bits will
461 	// fit into the leading byte. Write the UTF-8 sequence if there is enough room.
462 
463 	UTF32Unit temp, mask;
464 	size_t bytesNeeded = 0;
465 	for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded;
466 
467 	temp = cpIn >> ((bytesNeeded-1)*6);	// The highest order data bits.
468 	mask = (0x80 >> bytesNeeded) - 1;	// Available data bits in the leading byte.
469 	if ( temp > mask ) ++bytesNeeded;
470 
471 	if ( bytesNeeded > utf8Len ) goto Done;	// Not enough room for the output.
472 	unitCount = bytesNeeded;
473 
474 	temp = cpIn;
475 	for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) {
476 		utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F );
477 		temp = temp >> 6;
478 	}
479 
480 	mask = ~((1 << (8-unitCount)) - 1);
481 	utf8Out[0] = UTF8Unit ( mask | temp );
482 
483 Done:
484 	*utf8Written = unitCount;
485 	return;
486 
487 }	// CodePoint_to_UTF8_Multi
488 
489 // =================================================================================================
490 
CodePoint_to_UTF8(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)491 void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
492 {
493 	size_t unitCount = 0;
494 
495 	UC_Assert ( (utf8Out != 0) && (utf8Written != 0) );
496 	if ( utf8Len == 0 ) goto Done;
497 	if ( cpIn > 0x7F ) goto MultiByte;	// ! Force linear execution path for ASCII.
498 
499 	if ( utf8Len == 0 ) goto Done;
500 	unitCount = 1;
501 	*utf8Out = UTF8Unit(cpIn);
502 
503 Done:
504 	*utf8Written = unitCount;
505 	return;
506 
507 MultiByte:
508 	 CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written );
509 	 return;
510 
511 }	// CodePoint_to_UTF8
512 
513 // =================================================================================================
514 
CodePoint_from_UTF8_Multi(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)515 static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
516 {
517 	UTF8Unit  inUnit = *utf8In;
518 	size_t    unitCount = 0;
519 	UTF32Unit cp;	// ! Avoid gcc complaints about declarations after goto's.
520 	const UTF8Unit * utf8Pos;
521 
522 	// -------------------------------------------------------------------------------------
523 	// We've got a multibyte UTF-8 character. The first byte has the number of bytes and the
524 	// highest order data bits. The other bytes each add 6 more data bits.
525 
526 	#if 0	// This might be a more effcient way to count the bytes.
527 		static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
528 		size_t bytesNeeded = kByteCounts [ inUnit >> 4 ];
529 		if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) {
530 			UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
531 		}
532 	#endif
533 
534 	size_t bytesNeeded = 0;	// Count the leading 1 bits in the first byte.
535 	for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
536 		// *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
537 
538 	if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
539 	if ( bytesNeeded > utf8Len ) goto Done;	// Not enough input in this buffer.
540 	unitCount = bytesNeeded;
541 
542 	cp = inUnit & ((1 << (7-unitCount)) - 1);	// Isolate the initial data bits in the bottom of cp.
543 
544 	utf8Pos = utf8In + 1;	// We've absorbed the first byte.
545 	for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
546 		inUnit = *utf8Pos;
547 		if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam );
548 		cp = (cp << 6) | (inUnit & 0x3F);
549 	}
550 
551 	if ( cp >= 0xD800 ) {	// Skip the next comparisons most of the time.
552 		if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam );
553 		if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam );
554 	}
555 
556 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
557 
558 Done:
559 	*utf8Read = unitCount;
560 	return;
561 
562 }	// CodePoint_from_UTF8_Multi
563 
564 // =================================================================================================
565 
CodePoint_from_UTF8(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)566 void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
567 {
568 	UTF8Unit inUnit;	// ! Don't read until we know there is input.
569 	size_t unitCount = 0;
570 
571 	UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) );
572 	if ( utf8Len == 0 ) goto Done;
573 	inUnit = *utf8In;
574 	if ( inUnit >= 0x80 ) goto MultiByte;	// ! Force linear execution path for ASCII.
575 
576 	unitCount = 1;
577 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
578 
579 Done:
580 	*utf8Read = unitCount;
581 	return;
582 
583 MultiByte:
584 	CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read );
585 	return;
586 
587 }	// CodePoint_from_UTF8
588 
589 // =================================================================================================
590 
CodePoint_to_UTF16Nat_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)591 static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
592 {
593 	size_t    unitCount = 0;
594 	UTF32Unit temp;	// ! Avoid gcc complaints about declarations after goto's.
595 
596 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
597 	if ( utf16Len < 2 ) goto Done;	// Not enough room for the output.
598 
599 	unitCount = 2;
600 	temp = cpIn - 0x10000;
601 	utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 );
602 	utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF );
603 
604 Done:
605 	*utf16Written = unitCount;
606 	return;
607 
608 }	// CodePoint_to_UTF16Nat_Surrogate
609 
610 // =================================================================================================
611 
CodePoint_to_UTF16Nat(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)612 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
613 {
614 	size_t unitCount = 0;
615 
616 	UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
617 	if ( utf16Len == 0 ) goto Done;
618 	if ( cpIn >= 0xD800 ) goto CheckSurrogate;	// ! Force linear execution path for the BMP.
619 
620 InBMP:
621 	unitCount = 1;
622 	*utf16Out = UTF16Unit(cpIn);
623 
624 Done:
625 	*utf16Written = unitCount;
626 	return;
627 
628 CheckSurrogate:
629 	if ( cpIn > 0xFFFF ) goto SurrogatePair;
630 	if ( cpIn > 0xDFFF ) goto InBMP;
631 	UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
632 
633 SurrogatePair:
634 	CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
635 	return;
636 
637 }	// CodePoint_to_UTF16Nat
638 
639 // =================================================================================================
640 
CodePoint_from_UTF16Nat_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)641 static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
642 {
643 	UTF16Unit hiUnit = *utf16In;
644 	size_t    unitCount = 0;
645 	UTF16Unit loUnit;	// ! Avoid gcc complaints about declarations after goto's.
646 	UTF32Unit cp;
647 
648 	// ----------------------------------
649 	// We've got a UTF-16 surrogate pair.
650 
651 	if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
652 	if ( utf16Len < 2 ) goto Done;	// Not enough input in this buffer.
653 
654 	loUnit  = *(utf16In+1);
655 	if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
656 
657 	unitCount = 2;
658 	cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
659 
660 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
661 
662 Done:
663 	*utf16Read = unitCount;
664 	return;
665 
666 }	// CodePoint_from_UTF16Nat_Surrogate
667 
668 // =================================================================================================
669 
CodePoint_from_UTF16Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)670 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
671 {
672 	UTF16Unit inUnit;	// ! Don't read until we know there is input.
673 	size_t unitCount = 0;
674 
675 	UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
676 	if ( utf16Len == 0 ) goto Done;
677 	inUnit = *utf16In;
678 	if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair;	// ! Force linear execution path for the BMP.
679 
680 	unitCount = 1;
681 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
682 
683 Done:
684 	*utf16Read = unitCount;
685 	return;
686 
687 SurrogatePair:
688 	CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
689 	return;
690 
691 }	// CodePoint_from_UTF16Nat
692 
693 // =================================================================================================
694 
UTF8_to_UTF16Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)695 static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In,   const size_t utf8Len,
696 				               UTF16Unit *      utf16Out, const size_t utf16Len,
697 				               size_t *         utf8Read, size_t *     utf16Written )
698 {
699 	const UTF8Unit * utf8Pos  = utf8In;
700 	UTF16Unit *      utf16Pos = utf16Out;
701 
702 	size_t utf8Left  = utf8Len;
703 	size_t utf16Left = utf16Len;
704 
705 	UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
706 
707 	while ( (utf8Left > 0) && (utf16Left > 0) ) {
708 
709 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
710 		size_t i, limit = utf8Left;
711 		if ( limit > utf16Left ) limit = utf16Left;
712 		for ( i = 0; i < limit; ++i ) {
713 			UTF8Unit inUnit = *utf8Pos;
714 			if ( inUnit > 0x7F ) break;
715 			*utf16Pos = inUnit;
716 			++utf8Pos;
717 			++utf16Pos;
718 		}
719 		utf8Left  -= i;
720 		utf16Left -= i;
721 
722 		// Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
723 		while ( (utf8Left > 0) && (utf16Left > 0) ) {
724 			UTF32Unit cp;
725 			size_t len8, len16;
726 			UTF8Unit inUnit = *utf8Pos;
727 			if ( inUnit <= 0x7F ) break;
728 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
729 			if ( len8 == 0 ) goto Done;		// The input buffer ends in the middle of a character.
730 			if ( cp <= 0xFFFF ) {
731 				*utf16Pos = UTF16Unit(cp);
732 				len16 = 1;
733 			} else {
734 				CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
735 				if ( len16 == 0 ) goto Done;	// Not enough room in the output buffer.
736 			}
737 			utf8Left  -= len8;
738 			utf8Pos   += len8;
739 			utf16Left -= len16;
740 			utf16Pos  += len16;
741 		}
742 
743 	}
744 
745 Done:	// Set the output lengths.
746 	*utf8Read = utf8Len - utf8Left;
747 	*utf16Written = utf16Len - utf16Left;
748 
749 }	// UTF8_to_UTF16Nat
750 
751 // =================================================================================================
752 
UTF8_to_UTF32Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)753 static void UTF8_to_UTF32Nat ( const UTF8Unit *  utf8In,   const size_t utf8Len,
754 				               UTF32Unit *       utf32Out, const size_t utf32Len,
755 				               size_t *          utf8Read, size_t *     utf32Written )
756 {
757 	const UTF8Unit * utf8Pos  = utf8In;
758 	UTF32Unit *      utf32Pos = utf32Out;
759 
760 	size_t utf8Left  = utf8Len;
761 	size_t utf32Left = utf32Len;
762 
763 	UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
764 
765 	while ( (utf8Left > 0) && (utf32Left > 0) ) {
766 
767 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
768 		size_t i, limit = utf8Left;
769 		if ( limit > utf32Left ) limit = utf32Left;
770 		for ( i = 0; i < limit; ++i ) {
771 			UTF8Unit inUnit = *utf8Pos;
772 			if ( inUnit > 0x7F ) break;
773 			*utf32Pos = inUnit;
774 			++utf8Pos;
775 			++utf32Pos;
776 		}
777 		utf8Left -= i;
778 		utf32Left -= i;
779 
780 		// Do a run of non-ASCII, it copies variable input into 1 output unit.
781 		while ( (utf8Left > 0) && (utf32Left > 0) ) {
782 			size_t len;
783 			UTF8Unit inUnit = *utf8Pos;
784 			if ( inUnit <= 0x7F ) break;
785 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len );
786 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a character.
787 			utf8Left  -= len;
788 			utf8Pos   += len;
789 			utf32Left -= 1;
790 			utf32Pos  += 1;
791 		}
792 
793 	}
794 
795 Done:	// Set the output lengths.
796 	*utf8Read = utf8Len - utf8Left;
797 	*utf32Written = utf32Len - utf32Left;
798 
799 }	// UTF8_to_UTF32Nat
800 
801 // =================================================================================================
802 
UTF16Nat_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)803 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
804 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
805 				               size_t *          utf16Read, size_t *     utf8Written )
806 {
807 	const UTF16Unit * utf16Pos = utf16In;
808 	UTF8Unit *        utf8Pos  = utf8Out;
809 
810 	size_t utf16Left = utf16Len;
811 	size_t utf8Left  = utf8Len;
812 
813 	UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
814 
815 	while ( (utf16Left > 0) && (utf8Left > 0) ) {
816 
817 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
818 		size_t i, limit = utf16Left;
819 		if ( limit > utf8Left ) limit = utf8Left;
820 		for ( i = 0; i < limit; ++i ) {
821 			UTF16Unit inUnit = *utf16Pos;
822 			if ( inUnit > 0x7F ) break;
823 			*utf8Pos = UTF8Unit(inUnit);
824 			++utf16Pos;
825 			++utf8Pos;
826 		}
827 		utf16Left -= i;
828 		utf8Left  -= i;
829 
830 		// Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
831 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
832 			size_t len8;
833 			UTF16Unit inUnit = *utf16Pos;
834 			if ( inUnit <= 0x7F ) break;
835 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
836 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
837 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
838 			utf16Left -= 1;
839 			utf16Pos  += 1;
840 			utf8Left  -= len8;
841 			utf8Pos   += len8;
842 		}
843 
844 		// Do a run of surrogate pairs, it copies 2 input units into multiple output units.
845 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
846 			UTF32Unit cp;
847 			size_t len16, len8;
848 			UTF16Unit inUnit = *utf16Pos;
849 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
850 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
851 			if ( len16 == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
852 			UC_Assert ( len16 == 2 );
853 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
854 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
855 			utf16Left -= len16;
856 			utf16Pos  += len16;
857 			utf8Left  -= len8;
858 			utf8Pos   += len8;
859 		}
860 
861 	}
862 
863 Done:	// Set the output lengths.
864 	*utf16Read = utf16Len - utf16Left;
865 	*utf8Written = utf8Len - utf8Left;
866 
867 }	// UTF16Nat_to_UTF8
868 
869 // =================================================================================================
870 
UTF32Nat_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)871 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
872 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
873 				               size_t *          utf32Read, size_t *     utf8Written )
874 {
875 	const UTF32Unit * utf32Pos = utf32In;
876 	UTF8Unit *        utf8Pos  = utf8Out;
877 
878 	size_t utf32Left = utf32Len;
879 	size_t utf8Left  = utf8Len;
880 
881 	UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
882 
883 	while ( (utf32Left > 0) && (utf8Left > 0) ) {
884 
885 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
886 		size_t i, limit = utf32Left;
887 		if ( limit > utf8Left ) limit = utf8Left;
888 		for ( i = 0; i < limit; ++i ) {
889 			UTF32Unit inUnit = *utf32Pos;
890 			if ( inUnit > 0x7F ) break;
891 			*utf8Pos = UTF8Unit(inUnit);
892 			++utf32Pos;
893 			++utf8Pos;
894 		}
895 		utf32Left -= i;
896 		utf8Left  -= i;
897 
898 		// Do a run of non-ASCII, it copies 1 input unit into multiple output units.
899 		while ( (utf32Left > 0) && (utf8Left > 0) ) {
900 			size_t len;
901 			UTF32Unit inUnit = *utf32Pos;
902 			if ( inUnit <= 0x7F ) break;
903 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len );
904 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
905 			utf32Left -= 1;
906 			utf32Pos  += 1;
907 			utf8Left  -= len;
908 			utf8Pos   += len;
909 		}
910 
911 	}
912 
913 Done:	// Set the output lengths.
914 	*utf32Read = utf32Len - utf32Left;
915 	*utf8Written = utf8Len - utf8Left;
916 
917 }	// UTF32Nat_to_UTF8
918 
919 // =================================================================================================
920 
UTF16Nat_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)921 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
922 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
923 				                   size_t *          utf16Read, size_t *     utf32Written )
924 {
925 	const UTF16Unit * utf16Pos = utf16In;
926 	UTF32Unit *       utf32Pos = utf32Out;
927 
928 	size_t utf16Left = utf16Len;
929 	size_t utf32Left = utf32Len;
930 
931 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
932 
933 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
934 
935 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
936 		size_t i, limit = utf16Left;
937 		if ( limit > utf32Left ) limit = utf32Left;
938 		for ( i = 0; i < limit; ++i ) {
939 			UTF16Unit inUnit = *utf16Pos;
940 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
941 			*utf32Pos = inUnit;
942 			++utf16Pos;
943 			++utf32Pos;
944 		}
945 		utf16Left -= i;
946 		utf32Left -= i;
947 
948 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
949 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
950 			size_t len;
951 			UTF16Unit inUnit = *utf16Pos;
952 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
953 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
954 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
955 			UC_Assert ( len == 2 );
956 			utf16Left -= len;
957 			utf16Pos  += len;
958 			utf32Left -= 1;
959 			utf32Pos  += 1;
960 		}
961 
962 	}
963 
964 Done:	// Set the output lengths.
965 	*utf16Read = utf16Len - utf16Left;
966 	*utf32Written = utf32Len - utf32Left;
967 
968 }	// UTF16Nat_to_UTF32Nat
969 
970 // =================================================================================================
971 
UTF32Nat_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)972 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
973 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
974 				                   size_t *          utf32Read, size_t *     utf16Written )
975 {
976 	const UTF32Unit * utf32Pos = utf32In;
977 	UTF16Unit *       utf16Pos = utf16Out;
978 
979 	size_t utf32Left = utf32Len;
980 	size_t utf16Left = utf16Len;
981 
982 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
983 
984 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
985 
986 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
987 		size_t i, limit = utf32Left;
988 		if ( limit > utf16Left ) limit = utf16Left;
989 		for ( i = 0; i < limit; ++i ) {
990 			UTF32Unit inUnit = *utf32Pos;
991 			if ( inUnit > 0xFFFF ) break;
992 			*utf16Pos = UTF16Unit(inUnit);
993 			++utf32Pos;
994 			++utf16Pos;
995 		}
996 		utf32Left -= i;
997 		utf16Left -= i;
998 
999 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1000 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1001 			size_t len;
1002 			UTF32Unit inUnit = *utf32Pos;
1003 			if ( inUnit <= 0xFFFF ) break;
1004 			CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1005 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1006 			UC_Assert ( len == 2 );
1007 			utf32Left -= 1;
1008 			utf32Pos  += 1;
1009 			utf16Left -= 2;
1010 			utf16Pos  += 2;
1011 		}
1012 
1013 	}
1014 
1015 Done:	// Set the output lengths.
1016 	*utf32Read = utf32Len - utf32Left;
1017 	*utf16Written = utf16Len - utf16Left;
1018 
1019 }	// UTF32Nat_to_UTF16Nat
1020 
1021 // =================================================================================================
1022 
CodePoint_to_UTF16Swp_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1023 static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1024 {
1025 	size_t unitCount = 0;
1026 	UTF32Unit temp;	// ! Avoid gcc complaints about declarations after goto's.
1027 
1028 	if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
1029 	if ( utf16Len < 2 ) goto Done;	// Not enough room for the output.
1030 
1031 	unitCount = 2;
1032 	temp = cpIn - 0x10000;
1033 	UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) );
1034 	UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) );
1035 
1036 Done:
1037 	*utf16Written = unitCount;
1038 	return;
1039 
1040 }	// CodePoint_to_UTF16Swp_Surrogate
1041 
1042 // =================================================================================================
1043 
CodePoint_to_UTF16Swp(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1044 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1045 {
1046 	size_t unitCount = 0;
1047 
1048 	UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
1049 	if ( utf16Len == 0 ) goto Done;
1050 	if ( cpIn >= 0xD800 ) goto CheckSurrogate;	// ! Force linear execution path for the BMP.
1051 
1052 InBMP:
1053 	unitCount = 1;
1054 	UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) );
1055 
1056 Done:
1057 	*utf16Written = unitCount;
1058 	return;
1059 
1060 CheckSurrogate:
1061 	if ( cpIn > 0xFFFF ) goto SurrogatePair;
1062 	if ( cpIn > 0xDFFF ) goto InBMP;
1063 	UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
1064 
1065 SurrogatePair:
1066 	CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
1067 	return;
1068 
1069 }	// CodePoint_to_UTF16Swp
1070 
1071 // =================================================================================================
1072 
CodePoint_from_UTF16Swp_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1073 static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1074 {
1075 	UTF16Unit hiUnit = UTF16InSwap(utf16In);
1076 	size_t unitCount = 0;
1077 	UTF16Unit loUnit;	// ! Avoid gcc complaints about declarations after goto's.
1078 	UTF32Unit cp;
1079 
1080 	// ----------------------------------
1081 	// We've got a UTF-16 surrogate pair.
1082 
1083 	if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
1084 	if ( utf16Len < 2 ) goto Done;	// Not enough input in this buffer.
1085 
1086 	loUnit  = UTF16InSwap(utf16In+1);
1087 	if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
1088 
1089 	unitCount = 2;
1090 	cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
1091 
1092 	*cpOut = cp;	// ! Don't put after Done, don't write if no input.
1093 
1094 Done:
1095 	*utf16Read = unitCount;
1096 	return;
1097 
1098 }	// CodePoint_from_UTF16Swp_Surrogate
1099 
1100 // =================================================================================================
1101 
CodePoint_from_UTF16Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1102 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1103 {
1104 	UTF16Unit inUnit;	// ! Don't read until we know there is input.
1105 	size_t unitCount = 0;
1106 
1107 	UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
1108 	if ( utf16Len == 0 ) goto Done;
1109 	inUnit = UTF16InSwap(utf16In);
1110 	if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair;	// ! Force linear execution path for the BMP.
1111 
1112 	unitCount = 1;
1113 	*cpOut = inUnit;	// ! Don't put after Done, don't write if no input.
1114 
1115 Done:
1116 	*utf16Read = unitCount;
1117 	return;
1118 
1119 SurrogatePair:
1120 	CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
1121 	return;
1122 
1123 }	// CodePoint_from_UTF16Swp
1124 
1125 // =================================================================================================
1126 
UTF8_to_UTF16Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)1127 static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In,   const size_t utf8Len,
1128 				               UTF16Unit *      utf16Out, const size_t utf16Len,
1129 				               size_t *         utf8Read, size_t *     utf16Written )
1130 {
1131 	const UTF8Unit * utf8Pos  = utf8In;
1132 	UTF16Unit *      utf16Pos = utf16Out;
1133 
1134 	size_t utf8Left  = utf8Len;
1135 	size_t utf16Left = utf16Len;
1136 
1137 	UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
1138 
1139 	while ( (utf8Left > 0) && (utf16Left > 0) ) {
1140 
1141 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1142 		size_t i, limit = utf8Left;
1143 		if ( limit > utf16Left ) limit = utf16Left;
1144 		for ( i = 0; i < limit; ++i ) {
1145 			UTF8Unit inUnit = *utf8Pos;
1146 			if ( inUnit > 0x7F ) break;
1147 			*utf16Pos = UTF16Unit(inUnit) << 8;	// Better than: UTF16OutSwap ( utf16Pos, inUnit );
1148 			++utf8Pos;
1149 			++utf16Pos;
1150 		}
1151 		utf8Left  -= i;
1152 		utf16Left -= i;
1153 
1154 		// Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
1155 		while ( (utf8Left > 0) && (utf16Left > 0) ) {
1156 			UTF32Unit cp;
1157 			size_t len8, len16;
1158 			UTF8Unit inUnit = *utf8Pos;
1159 			if ( inUnit <= 0x7F ) break;
1160 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
1161 			if ( len8 == 0 ) goto Done;		// The input buffer ends in the middle of a character.
1162 			if ( cp <= 0xFFFF ) {
1163 				UTF16OutSwap ( utf16Pos, UTF16Unit(cp) );
1164 				len16 = 1;
1165 			} else {
1166 				CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
1167 				if ( len16 == 0 ) goto Done;	// Not enough room in the output buffer.
1168 			}
1169 			utf8Left  -= len8;
1170 			utf8Pos   += len8;
1171 			utf16Left -= len16;
1172 			utf16Pos  += len16;
1173 		}
1174 
1175 	}
1176 
1177 Done:	// Set the output lengths.
1178 	*utf8Read = utf8Len - utf8Left;
1179 	*utf16Written = utf16Len - utf16Left;
1180 
1181 }	// UTF8_to_UTF16Swp
1182 
1183 // =================================================================================================
1184 
UTF8_to_UTF32Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)1185 static void UTF8_to_UTF32Swp ( const UTF8Unit *  utf8In,   const size_t utf8Len,
1186 				               UTF32Unit *       utf32Out, const size_t utf32Len,
1187 				               size_t *          utf8Read, size_t *     utf32Written )
1188 {
1189 	const UTF8Unit * utf8Pos  = utf8In;
1190 	UTF32Unit *      utf32Pos = utf32Out;
1191 
1192 	size_t utf8Left  = utf8Len;
1193 	size_t utf32Left = utf32Len;
1194 
1195 	UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
1196 
1197 	while ( (utf8Left > 0) && (utf32Left > 0) ) {
1198 
1199 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1200 		size_t i, limit = utf8Left;
1201 		if ( limit > utf32Left ) limit = utf32Left;
1202 		for ( i = 0; i < limit; ++i ) {
1203 			UTF8Unit inUnit = *utf8Pos;
1204 			if ( inUnit > 0x7F ) break;
1205 			*utf32Pos = UTF32Unit(inUnit) << 24;	// Better than: UTF32OutSwap ( utf32Pos, inUnit );
1206 			++utf8Pos;
1207 			++utf32Pos;
1208 		}
1209 		utf8Left -= i;
1210 		utf32Left -= i;
1211 
1212 		// Do a run of non-ASCII, it copies variable input into 1 output unit.
1213 		while ( (utf8Left > 0) && (utf32Left > 0) ) {
1214 			size_t len;
1215 			UTF32Unit cp;
1216 			UTF8Unit inUnit = *utf8Pos;
1217 			if ( inUnit <= 0x7F ) break;
1218 			CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len );
1219 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a character.
1220 			UTF32OutSwap ( utf32Pos, cp );
1221 			utf8Left  -= len;
1222 			utf8Pos   += len;
1223 			utf32Left -= 1;
1224 			utf32Pos  += 1;
1225 		}
1226 
1227 	}
1228 
1229 Done:	// Set the output lengths.
1230 	*utf8Read = utf8Len - utf8Left;
1231 	*utf32Written = utf32Len - utf32Left;
1232 
1233 }	// UTF8_to_UTF32Swp
1234 
1235 // =================================================================================================
1236 
UTF16Swp_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)1237 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In,   const size_t utf16Len,
1238 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
1239 				               size_t *          utf16Read, size_t *     utf8Written )
1240 {
1241 	const UTF16Unit * utf16Pos = utf16In;
1242 	UTF8Unit *        utf8Pos  = utf8Out;
1243 
1244 	size_t utf16Left = utf16Len;
1245 	size_t utf8Left  = utf8Len;
1246 
1247 	UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
1248 
1249 	while ( (utf16Left > 0) && (utf8Left > 0) ) {
1250 
1251 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1252 		size_t i, limit = utf16Left;
1253 		if ( limit > utf8Left ) limit = utf8Left;
1254 		for ( i = 0; i < limit; ++i ) {
1255 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1256 			if ( inUnit > 0x7F ) break;
1257 			*utf8Pos = UTF8Unit(inUnit);
1258 			++utf16Pos;
1259 			++utf8Pos;
1260 		}
1261 		utf16Left -= i;
1262 		utf8Left  -= i;
1263 
1264 		// Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
1265 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
1266 			size_t len8;
1267 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1268 			if ( inUnit <= 0x7F ) break;
1269 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1270 			CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
1271 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
1272 			utf16Left -= 1;
1273 			utf16Pos  += 1;
1274 			utf8Left  -= len8;
1275 			utf8Pos   += len8;
1276 		}
1277 
1278 		// Do a run of surrogate pairs, it copies 2 input units into multiple output units.
1279 		while ( (utf16Left > 0) && (utf8Left > 0) ) {
1280 			UTF32Unit cp;
1281 			size_t len16, len8;
1282 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1283 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1284 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
1285 			if ( len16 == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1286 			UC_Assert ( len16 == 2 );
1287 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
1288 			if ( len8 == 0 ) goto Done;		// Not enough room in the output buffer.
1289 			utf16Left -= len16;
1290 			utf16Pos  += len16;
1291 			utf8Left  -= len8;
1292 			utf8Pos   += len8;
1293 		}
1294 
1295 	}
1296 
1297 Done:	// Set the output lengths.
1298 	*utf16Read = utf16Len - utf16Left;
1299 	*utf8Written = utf8Len - utf8Left;
1300 
1301 }	// UTF16Swp_to_UTF8
1302 
1303 // =================================================================================================
1304 
UTF32Swp_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)1305 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In,   const size_t utf32Len,
1306 				               UTF8Unit *        utf8Out,   const size_t utf8Len,
1307 				               size_t *          utf32Read, size_t *     utf8Written )
1308 {
1309 	const UTF32Unit * utf32Pos = utf32In;
1310 	UTF8Unit *        utf8Pos  = utf8Out;
1311 
1312 	size_t utf32Left = utf32Len;
1313 	size_t utf8Left  = utf8Len;
1314 
1315 	UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
1316 
1317 	while ( (utf32Left > 0) && (utf8Left > 0) ) {
1318 
1319 		// Do a run of ASCII, it copies 1 input unit into 1 output unit.
1320 		size_t i, limit = utf32Left;
1321 		if ( limit > utf8Left ) limit = utf8Left;
1322 		for ( i = 0; i < limit; ++i ) {
1323 			UTF32Unit cp = UTF32InSwap(utf32Pos);
1324 			if ( cp > 0x7F ) break;
1325 			*utf8Pos = UTF8Unit(cp);
1326 			++utf32Pos;
1327 			++utf8Pos;
1328 		}
1329 		utf32Left -= i;
1330 		utf8Left  -= i;
1331 
1332 		// Do a run of non-ASCII, it copies 1 input unit into multiple output units.
1333 		while ( (utf32Left > 0) && (utf8Left > 0) ) {
1334 			size_t len;
1335 			UTF32Unit cp = UTF32InSwap(utf32Pos);
1336 			if ( cp <= 0x7F ) break;
1337 			CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len );
1338 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1339 			utf32Left -= 1;
1340 			utf32Pos  += 1;
1341 			utf8Left  -= len;
1342 			utf8Pos   += len;
1343 		}
1344 
1345 	}
1346 
1347 Done:	// Set the output lengths.
1348 	*utf32Read = utf32Len - utf32Left;
1349 	*utf8Written = utf8Len - utf8Left;
1350 
1351 }	// UTF32Swp_to_UTF8
1352 
1353 // =================================================================================================
1354 
UTF16Swp_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1355 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1356 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1357 				                   size_t *          utf16Read, size_t *     utf32Written )
1358 {
1359 	const UTF16Unit * utf16Pos = utf16In;
1360 	UTF32Unit *       utf32Pos = utf32Out;
1361 
1362 	size_t utf16Left = utf16Len;
1363 	size_t utf32Left = utf32Len;
1364 
1365 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1366 
1367 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1368 
1369 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1370 		size_t i, limit = utf16Left;
1371 		if ( limit > utf32Left ) limit = utf32Left;
1372 		for ( i = 0; i < limit; ++i ) {
1373 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1374 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1375 			*utf32Pos = UTF32Unit(*utf16Pos) << 16;	// Better than: UTF32OutSwap ( utf32Pos, inUnit );
1376 			++utf16Pos;
1377 			++utf32Pos;
1378 		}
1379 		utf16Left -= i;
1380 		utf32Left -= i;
1381 
1382 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1383 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1384 			size_t len;
1385 			UTF32Unit cp;
1386 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1387 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1388 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1389 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1390 			UTF32OutSwap ( utf32Pos, cp );
1391 			UC_Assert ( len == 2 );
1392 			utf16Left -= len;
1393 			utf16Pos  += len;
1394 			utf32Left -= 1;
1395 			utf32Pos  += 1;
1396 		}
1397 
1398 	}
1399 
1400 Done:	// Set the output lengths.
1401 	*utf16Read = utf16Len - utf16Left;
1402 	*utf32Written = utf32Len - utf32Left;
1403 
1404 }	// UTF16Swp_to_UTF32Swp
1405 
1406 // =================================================================================================
1407 
UTF32Swp_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1408 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1409 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1410 				                   size_t *          utf32Read, size_t *     utf16Written )
1411 {
1412 	const UTF32Unit * utf32Pos = utf32In;
1413 	UTF16Unit *       utf16Pos = utf16Out;
1414 
1415 	size_t utf32Left = utf32Len;
1416 	size_t utf16Left = utf16Len;
1417 
1418 	const size_t k32to16Offset = swap32to16Offset;	// ! Make sure compiler treats as an invariant.
1419 
1420 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1421 
1422 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1423 
1424 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1425 		size_t i, limit = utf32Left;
1426 		if ( limit > utf16Left ) limit = utf16Left;
1427 		for ( i = 0; i < limit; ++i ) {
1428 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1429 			if ( inUnit > 0xFFFF ) break;
1430 			*utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset);	// Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1431 			++utf32Pos;
1432 			++utf16Pos;
1433 		}
1434 		utf32Left -= i;
1435 		utf16Left -= i;
1436 
1437 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1438 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1439 			size_t len;
1440 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1441 			if ( inUnit <= 0xFFFF ) break;
1442 			CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1443 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1444 			UC_Assert ( len == 2 );
1445 			utf32Left -= 1;
1446 			utf32Pos  += 1;
1447 			utf16Left -= 2;
1448 			utf16Pos  += 2;
1449 		}
1450 
1451 	}
1452 
1453 Done:	// Set the output lengths.
1454 	*utf32Read = utf32Len - utf32Left;
1455 	*utf16Written = utf16Len - utf16Left;
1456 
1457 }	// UTF32Swp_to_UTF16Swp
1458 
1459 // =================================================================================================
1460 
UTF16Nat_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1461 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In,   const size_t utf16Len,
1462 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1463 				                   size_t *          utf16Read, size_t *     utf32Written )
1464 {
1465 	const UTF16Unit * utf16Pos = utf16In;
1466 	UTF32Unit *       utf32Pos = utf32Out;
1467 
1468 	size_t utf16Left = utf16Len;
1469 	size_t utf32Left = utf32Len;
1470 
1471 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1472 
1473 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1474 
1475 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1476 		size_t i, limit = utf16Left;
1477 		if ( limit > utf32Left ) limit = utf32Left;
1478 		for ( i = 0; i < limit; ++i ) {
1479 			UTF16Unit inUnit = *utf16Pos;
1480 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1481 			UTF32OutSwap ( utf32Pos, inUnit );
1482 			++utf16Pos;
1483 			++utf32Pos;
1484 		}
1485 		utf16Left -= i;
1486 		utf32Left -= i;
1487 
1488 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1489 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1490 			size_t len;
1491 			UTF32Unit cp;
1492 			UTF16Unit inUnit = *utf16Pos;
1493 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1494 			CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1495 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1496 			UC_Assert ( len == 2 );
1497 			UTF32OutSwap ( utf32Pos, cp );
1498 			utf16Left -= len;
1499 			utf16Pos  += len;
1500 			utf32Left -= 1;
1501 			utf32Pos  += 1;
1502 		}
1503 
1504 	}
1505 
1506 Done:	// Set the output lengths.
1507 	*utf16Read = utf16Len - utf16Left;
1508 	*utf32Written = utf32Len - utf32Left;
1509 
1510 }	// UTF16Nat_to_UTF32Swp
1511 
1512 // =================================================================================================
1513 
UTF16Swp_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1514 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In,   const size_t utf16Len,
1515 				                   UTF32Unit *       utf32Out,  const size_t utf32Len,
1516 				                   size_t *          utf16Read, size_t *     utf32Written )
1517 {
1518 	const UTF16Unit * utf16Pos = utf16In;
1519 	UTF32Unit *       utf32Pos = utf32Out;
1520 
1521 	size_t utf16Left = utf16Len;
1522 	size_t utf32Left = utf32Len;
1523 
1524 	UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1525 
1526 	while ( (utf16Left > 0) && (utf32Left > 0) ) {
1527 
1528 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1529 		size_t i, limit = utf16Left;
1530 		if ( limit > utf32Left ) limit = utf32Left;
1531 		for ( i = 0; i < limit; ++i ) {
1532 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1533 			if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1534 			*utf32Pos = inUnit;
1535 			++utf16Pos;
1536 			++utf32Pos;
1537 		}
1538 		utf16Left -= i;
1539 		utf32Left -= i;
1540 
1541 		// Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1542 		while ( (utf16Left > 0) && (utf32Left > 0) ) {
1543 			size_t len;
1544 			UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1545 			if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1546 			CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
1547 			if ( len == 0 ) goto Done;	// The input buffer ends in the middle of a surrogate pair.
1548 			UC_Assert ( len == 2 );
1549 			utf16Left -= len;
1550 			utf16Pos  += len;
1551 			utf32Left -= 1;
1552 			utf32Pos  += 1;
1553 		}
1554 
1555 	}
1556 
1557 Done:	// Set the output lengths.
1558 	*utf16Read = utf16Len - utf16Left;
1559 	*utf32Written = utf32Len - utf32Left;
1560 
1561 }	// UTF16Swp_to_UTF32Nat
1562 
1563 // =================================================================================================
1564 
UTF32Nat_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1565 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In,   const size_t utf32Len,
1566 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1567 				                   size_t *          utf32Read, size_t *     utf16Written )
1568 {
1569 	const UTF32Unit * utf32Pos = utf32In;
1570 	UTF16Unit *       utf16Pos = utf16Out;
1571 
1572 	size_t utf32Left = utf32Len;
1573 	size_t utf16Left = utf16Len;
1574 
1575 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1576 
1577 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1578 
1579 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1580 		size_t i, limit = utf32Left;
1581 		if ( limit > utf16Left ) limit = utf16Left;
1582 		for ( i = 0; i < limit; ++i ) {
1583 			UTF32Unit inUnit = *utf32Pos;
1584 			if ( inUnit > 0xFFFF ) break;
1585 			UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1586 			++utf32Pos;
1587 			++utf16Pos;
1588 		}
1589 		utf32Left -= i;
1590 		utf16Left -= i;
1591 
1592 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1593 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1594 			size_t len;
1595 			UTF32Unit inUnit = *utf32Pos;
1596 			if ( inUnit <= 0xFFFF ) break;
1597 			CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1598 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1599 			UC_Assert ( len == 2 );
1600 			utf32Left -= 1;
1601 			utf32Pos  += 1;
1602 			utf16Left -= 2;
1603 			utf16Pos  += 2;
1604 		}
1605 
1606 	}
1607 
1608 Done:	// Set the output lengths.
1609 	*utf32Read = utf32Len - utf32Left;
1610 	*utf16Written = utf16Len - utf16Left;
1611 
1612 }	// UTF32Nat_to_UTF16Swp
1613 
1614 // =================================================================================================
1615 
UTF32Swp_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1616 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In,   const size_t utf32Len,
1617 				                   UTF16Unit *       utf16Out,  const size_t utf16Len,
1618 				                   size_t *          utf32Read, size_t *     utf16Written )
1619 {
1620 	const UTF32Unit * utf32Pos = utf32In;
1621 	UTF16Unit *       utf16Pos = utf16Out;
1622 
1623 	size_t utf32Left = utf32Len;
1624 	size_t utf16Left = utf16Len;
1625 
1626 	UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1627 
1628 	while ( (utf32Left > 0) && (utf16Left > 0) ) {
1629 
1630 		// Do a run of BMP, it copies 1 input unit into 1 output unit.
1631 		size_t i, limit = utf32Left;
1632 		if ( limit > utf16Left ) limit = utf16Left;
1633 		for ( i = 0; i < limit; ++i ) {
1634 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1635 			if ( inUnit > 0xFFFF ) break;
1636 			*utf16Pos = UTF16Unit(inUnit);
1637 			++utf32Pos;
1638 			++utf16Pos;
1639 		}
1640 		utf32Left -= i;
1641 		utf16Left -= i;
1642 
1643 		// Do a run of non-BMP, it copies 1 input unit into 2 output units.
1644 		while ( (utf32Left > 0) && (utf16Left > 0) ) {
1645 			size_t len;
1646 			UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1647 			if ( inUnit <= 0xFFFF ) break;
1648 			CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1649 			if ( len == 0 ) goto Done;	// Not enough room in the output buffer.
1650 			UC_Assert ( len == 2 );
1651 			utf32Left -= 1;
1652 			utf32Pos  += 1;
1653 			utf16Left -= 2;
1654 			utf16Pos  += 2;
1655 		}
1656 
1657 	}
1658 
1659 Done:	// Set the output lengths.
1660 	*utf32Read = utf32Len - utf32Left;
1661 	*utf16Written = utf16Len - utf16Left;
1662 
1663 }	// UTF32Swp_to_UTF16Nat
1664 
1665 // =================================================================================================
1666