1 // =================================================================================================
2 // Copyright 2004-2007 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 // =================================================================================================
8
9 #include "XMP_Const.h"
10
// Assertion and error-reporting hooks used by the conversion routines in this file.
// In a UnicodeTestBuild they map to the standard assert() and a std::logic_error carrying only the
// message (the error id argument is dropped). Otherwise UC_Assert expands to nothing and UC_Throw
// throws an XMP_Error built from the id and the message.
11 #if UnicodeTestBuild
12 #include <cassert>
13 #include <stdexcept>
14 #define UC_Assert assert
15 #define UC_Throw(m,k) throw std::logic_error ( m )
16 #else
17 #define UC_Assert(cond) /* Nothing for now, should be XMP_Assert. */
18 #define UC_Throw(msg,id) throw XMP_Error ( id, msg )
19 #endif
20
21 #include "UnicodeConversions.hpp"
22
23 using namespace std;
24
25 // =================================================================================================
26
27 // *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8.
28
// Conversion entry points, exposed as function pointers. Every pointer starts out null and is
// assigned by InitializeUnicodeConversions(), which picks the native or the byte-swapping
// implementation for each one according to the host byte order.

// Single code point to/from UTF-16 with an explicit byte order.
29 CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0;
30 CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0;
31
32 CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0;
33 CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0;
34
// UTF-8 to UTF-16/UTF-32 with an explicit byte order.
35 UTF8_to_UTF16_Proc UTF8_to_UTF16BE = 0;
36 UTF8_to_UTF16_Proc UTF8_to_UTF16LE = 0;
37 UTF8_to_UTF32_Proc UTF8_to_UTF32BE = 0;
38 UTF8_to_UTF32_Proc UTF8_to_UTF32LE = 0;
39
// UTF-16/UTF-32 with an explicit byte order to UTF-8.
40 UTF16_to_UTF8_Proc UTF16BE_to_UTF8 = 0;
41 UTF16_to_UTF8_Proc UTF16LE_to_UTF8 = 0;
42 UTF32_to_UTF8_Proc UTF32BE_to_UTF8 = 0;
43 UTF32_to_UTF8_Proc UTF32LE_to_UTF8 = 0;
44
// Host byte order variants; these are always bound to the non-swapping implementations.
45 UTF8_to_UTF16_Proc UTF8_to_UTF16Native = 0;
46 UTF8_to_UTF32_Proc UTF8_to_UTF32Native = 0;
47 UTF16_to_UTF8_Proc UTF16Native_to_UTF8 = 0;
48 UTF32_to_UTF8_Proc UTF32Native_to_UTF8 = 0;
49
// UTF-16 to UTF-32, all four combinations of input and output byte order.
50 UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0;
51 UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0;
52 UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0;
53 UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0;
54
// UTF-32 to UTF-16, all four combinations of input and output byte order.
55 UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0;
56 UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0;
57 UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0;
58 UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0;
59
60 // -------------------------------------------------------------------------------------------------
61
62 static size_t swap32to16Offset = 0; // Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer.
63
64 // -------------------------------------------------------------------------------------------------
65
66 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
67 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
68
69 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
70 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
71
72 // -------------------------------------------------------------------------------------------------
73
74 static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
75 UTF16Unit * utf16Out, const size_t utf16Len,
76 size_t * utf8Read, size_t * utf16Written );
77
78 static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
79 UTF16Unit * utf16Out, const size_t utf16Len,
80 size_t * utf8Read, size_t * utf16Written );
81
82 static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
83 UTF32Unit * utf32Out, const size_t utf32Len,
84 size_t * utf8Read, size_t * utf32Written );
85
86 static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
87 UTF32Unit * utf32Out, const size_t utf32Len,
88 size_t * utf8Read, size_t * utf32Written );
89
90 // -------------------------------------------------------------------------------------------------
91
92 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
93 UTF8Unit * utf8Out, const size_t utf8Len,
94 size_t * utf16Read, size_t * utf8Written );
95
96 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
97 UTF8Unit * utf8Out, const size_t utf8Len,
98 size_t * utf16Read, size_t * utf8Written );
99
100 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
101 UTF8Unit * utf8Out, const size_t utf8Len,
102 size_t * utf32Read, size_t * utf8Written );
103
104 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
105 UTF8Unit * utf8Out, const size_t utf8Len,
106 size_t * utf32Read, size_t * utf8Written );
107
108 // -------------------------------------------------------------------------------------------------
109
110 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
111 UTF32Unit * utf32Out, const size_t utf32Len,
112 size_t * utf16Read, size_t * utf32Written );
113
114 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
115 UTF32Unit * utf32Out, const size_t utf32Len,
116 size_t * utf16Read, size_t * utf32Written );
117
118 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
119 UTF32Unit * utf32Out, const size_t utf32Len,
120 size_t * utf16Read, size_t * utf32Written );
121
122 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
123 UTF32Unit * utf32Out, const size_t utf32Len,
124 size_t * utf16Read, size_t * utf32Written );
125
126 // -------------------------------------------------------------------------------------------------
127
128 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
129 UTF16Unit * utf16Out, const size_t utf16Len,
130 size_t * utf32Read, size_t * utf16Written );
131
132 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
133 UTF16Unit * utf16Out, const size_t utf16Len,
134 size_t * utf32Read, size_t * utf16Written );
135
136 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
137 UTF16Unit * utf16Out, const size_t utf16Len,
138 size_t * utf32Read, size_t * utf16Written );
139
140 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
141 UTF16Unit * utf16Out, const size_t utf16Len,
142 size_t * utf32Read, size_t * utf16Written );
143
144 // =================================================================================================
145
InitializeUnicodeConversions()146 void InitializeUnicodeConversions()
147 {
148 UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) );
149
150 UTF16Unit u16 = 0x00FF;
151 bool bigEndian = (*((UTF8Unit*)&u16) == 0);
152
153 UTF8_to_UTF16Native = UTF8_to_UTF16Nat;
154 UTF8_to_UTF32Native = UTF8_to_UTF32Nat;
155 UTF16Native_to_UTF8 = UTF16Nat_to_UTF8;
156 UTF32Native_to_UTF8 = UTF32Nat_to_UTF8;
157
158 if ( bigEndian ) {
159
160 swap32to16Offset = 0;
161
162 CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat;
163 CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp;
164
165 CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat;
166 CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp;
167
168 UTF8_to_UTF16BE = UTF8_to_UTF16Nat;
169 UTF8_to_UTF16LE = UTF8_to_UTF16Swp;
170 UTF8_to_UTF32BE = UTF8_to_UTF32Nat;
171 UTF8_to_UTF32LE = UTF8_to_UTF32Swp;
172
173 UTF16BE_to_UTF8 = UTF16Nat_to_UTF8;
174 UTF16LE_to_UTF8 = UTF16Swp_to_UTF8;
175 UTF32BE_to_UTF8 = UTF32Nat_to_UTF8;
176 UTF32LE_to_UTF8 = UTF32Swp_to_UTF8;
177
178 UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat;
179 UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp;
180 UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat;
181 UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp;
182
183 UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat;
184 UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp;
185 UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat;
186 UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp;
187
188 } else {
189
190 swap32to16Offset = 1; // ! Offset in UTF16 units!
191
192 CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp;
193 CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat;
194
195 CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp;
196 CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat;
197
198 UTF8_to_UTF16BE = UTF8_to_UTF16Swp;
199 UTF8_to_UTF16LE = UTF8_to_UTF16Nat;
200 UTF8_to_UTF32BE = UTF8_to_UTF32Swp;
201 UTF8_to_UTF32LE = UTF8_to_UTF32Nat;
202
203 UTF16BE_to_UTF8 = UTF16Swp_to_UTF8;
204 UTF16LE_to_UTF8 = UTF16Nat_to_UTF8;
205 UTF32BE_to_UTF8 = UTF32Swp_to_UTF8;
206 UTF32LE_to_UTF8 = UTF32Nat_to_UTF8;
207
208 UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp;
209 UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat;
210 UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp;
211 UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat;
212
213 UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp;
214 UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat;
215 UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp;
216 UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat;
217
218 }
219
220 } // InitializeUnicodeConversions
221
222 // =================================================================================================
223
// Byte-swapping load/store helpers for Metrowerks builds on the Mac. These macros take the place of
// the portable inline functions of the same names defined in the #else branch below.
// NOTE(review): __lhbrx/__lwbrx/__sthbrx/__stwbrx are presumably the compiler's byte-reversed
// 16/32-bit load and store intrinsics - confirm against the Metrowerks documentation.
224 #if XMP_MacBuild && __MWERKS__
225
226 #define UTF16InSwap(inPtr) UTF16Unit ( __lhbrx ( (void*)(inPtr), 0 ) )
227 #define UTF32InSwap(inPtr) UTF32Unit ( __lwbrx ( (void*)(inPtr), 0 ) )
228
229 #define UTF16OutSwap(outPtr,value) __sthbrx ( value, (void*)(outPtr), 0 )
230 #define UTF32OutSwap(outPtr,value) __stwbrx ( value, (void*)(outPtr), 0 )
231
232 #else
233
// Reads one UTF-16 unit through inPtr and returns it with its two bytes exchanged.
static inline UTF16Unit UTF16InSwap ( const UTF16Unit * inPtr )
{
	const UTF16Unit stored = *inPtr;
	const UTF16Unit upperByte = UTF16Unit ( stored >> 8 );
	const UTF16Unit lowerByte = UTF16Unit ( stored & 0xFF );
	return UTF16Unit ( (lowerByte << 8) | upperByte );
}
239
// Reads one UTF-32 unit through inPtr and returns it with its four bytes in reverse order.
static inline UTF32Unit UTF32InSwap ( const UTF32Unit * inPtr )
{
	const UTF32Unit stored = *inPtr;

	const UTF32Unit byte0 = stored & 0xFF;			// Lowest order byte, moves to the top.
	const UTF32Unit byte1 = (stored >> 8) & 0xFF;
	const UTF32Unit byte2 = (stored >> 16) & 0xFF;
	const UTF32Unit byte3 = stored >> 24;			// Highest order byte, moves to the bottom.

	return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
245
UTF16OutSwap(UTF16Unit * outPtr,const UTF16Unit value)246 static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value )
247 {
248 UTF16Unit outUnit = (value << 8) | (value >> 8);
249 *outPtr = outUnit;
250 }
251
UTF32OutSwap(UTF32Unit * outPtr,const UTF32Unit value)252 static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value )
253 {
254 UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
255 *outPtr = outUnit;
256 }
257
258 #endif
259
260 // =================================================================================================
261
SwapUTF16(const UTF16Unit * utf16In,UTF16Unit * utf16Out,const size_t utf16Len)262 void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len )
263 {
264 for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i);
265 }
266
SwapUTF32(const UTF32Unit * utf32In,UTF32Unit * utf32Out,const size_t utf32Len)267 void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) {
268 for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i);
269 }
270
271 // =================================================================================================
272
ToUTF16(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str,bool bigEndian)273 extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian )
274 {
275 UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE;
276 if ( bigEndian ) Converter = UTF8_to_UTF16BE;
277
278 enum { kBufferSize = 8*1024 };
279 UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
280 size_t readCount, writeCount;
281
282 utf16Str->erase();
283 utf16Str->reserve ( 2*utf8Len ); // As good a guess as any.
284
285 while ( utf8Len > 0 ) {
286 Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
287 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
288 utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
289 utf8In += readCount;
290 utf8Len -= readCount;
291 }
292
293 } // ToUTF16
294
295 // =================================================================================================
296
ToUTF16Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str)297 extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str )
298 {
299 enum { kBufferSize = 8*1024 };
300 UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
301 size_t readCount, writeCount;
302
303 utf16Str->erase();
304 utf16Str->reserve ( 2*utf8Len ); // As good a guess as any.
305
306 while ( utf8Len > 0 ) {
307 UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
308 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
309 utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
310 utf8In += readCount;
311 utf8Len -= readCount;
312 }
313
314 } // ToUTF16Native
315
316 // =================================================================================================
317
ToUTF32(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str,bool bigEndian)318 extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian )
319 {
320 UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE;
321 if ( bigEndian ) Converter = UTF8_to_UTF32BE;
322
323 enum { kBufferSize = 4*1024 };
324 UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
325 size_t readCount, writeCount;
326
327 utf32Str->erase();
328 utf32Str->reserve ( 4*utf8Len ); // As good a guess as any.
329
330 while ( utf8Len > 0 ) {
331 Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
332 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
333 utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
334 utf8In += readCount;
335 utf8Len -= readCount;
336 }
337
338 } // ToUTF32
339
340 // =================================================================================================
341
ToUTF32Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str)342 extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str )
343 {
344 enum { kBufferSize = 4*1024 };
345 UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
346 size_t readCount, writeCount;
347
348 utf32Str->erase();
349 utf32Str->reserve ( 4*utf8Len ); // As good a guess as any.
350
351 while ( utf8Len > 0 ) {
352 UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
353 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
354 utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
355 utf8In += readCount;
356 utf8Len -= readCount;
357 }
358
359 } // ToUTF32Native
360
361 // =================================================================================================
362
FromUTF16(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str,bool bigEndian)363 extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian )
364 {
365 UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8;
366 if ( bigEndian ) Converter = UTF16BE_to_UTF8;
367
368 enum { kBufferSize = 16*1024 };
369 UTF8Unit u8Buffer[kBufferSize];
370 size_t readCount, writeCount;
371
372 utf8Str->erase();
373 utf8Str->reserve ( 2*utf16Len ); // As good a guess as any.
374
375 while ( utf16Len > 0 ) {
376 Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
377 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
378 utf8Str->append ( (const char *)u8Buffer, writeCount );
379 utf16In += readCount;
380 utf16Len -= readCount;
381 }
382
383 } // FromUTF16
384
385 // =================================================================================================
386
FromUTF16Native(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str)387 extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str )
388 {
389 enum { kBufferSize = 16*1024 };
390 UTF8Unit u8Buffer[kBufferSize];
391 size_t readCount, writeCount;
392
393 utf8Str->erase();
394 utf8Str->reserve ( 2*utf16Len ); // As good a guess as any.
395
396 while ( utf16Len > 0 ) {
397 UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
398 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
399 utf8Str->append ( (const char *)u8Buffer, writeCount );
400 utf16In += readCount;
401 utf16Len -= readCount;
402 }
403
404 } // FromUTF16Native
405
406 // =================================================================================================
407
FromUTF32(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str,bool bigEndian)408 extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian )
409 {
410 UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8;
411 if ( bigEndian ) Converter = UTF32BE_to_UTF8;
412
413 enum { kBufferSize = 16*1024 };
414 UTF8Unit u8Buffer[kBufferSize];
415 size_t readCount, writeCount;
416
417 utf8Str->erase();
418 utf8Str->reserve ( 2*utf32Len ); // As good a guess as any.
419
420 while ( utf32Len > 0 ) {
421 Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
422 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
423 utf8Str->append ( (const char *)u8Buffer, writeCount );
424 utf32In += readCount;
425 utf32Len -= readCount;
426 }
427
428 } // FromUTF32
429
430 // =================================================================================================
431
FromUTF32Native(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str)432 extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str )
433 {
434 enum { kBufferSize = 16*1024 };
435 UTF8Unit u8Buffer[kBufferSize];
436 size_t readCount, writeCount;
437
438 utf8Str->erase();
439 utf8Str->reserve ( 2*utf32Len ); // As good a guess as any.
440
441 while ( utf32Len > 0 ) {
442 UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
443 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadXML );
444 utf8Str->append ( (const char *)u8Buffer, writeCount );
445 utf32In += readCount;
446 utf32Len -= readCount;
447 }
448
449 } // FromUTF32Native
450
451 // =================================================================================================
452
CodePoint_to_UTF8_Multi(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)453 static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
454 {
455 size_t unitCount = 0;
456
457 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
458 if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
459
460 // Compute the number of bytes using 6 data bits each. Then see if the highest order bits will
461 // fit into the leading byte. Write the UTF-8 sequence if there is enough room.
462
463 UTF32Unit temp, mask;
464 size_t bytesNeeded = 0;
465 for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded;
466
467 temp = cpIn >> ((bytesNeeded-1)*6); // The highest order data bits.
468 mask = (0x80 >> bytesNeeded) - 1; // Available data bits in the leading byte.
469 if ( temp > mask ) ++bytesNeeded;
470
471 if ( bytesNeeded > utf8Len ) goto Done; // Not enough room for the output.
472 unitCount = bytesNeeded;
473
474 temp = cpIn;
475 for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) {
476 utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F );
477 temp = temp >> 6;
478 }
479
480 mask = ~((1 << (8-unitCount)) - 1);
481 utf8Out[0] = UTF8Unit ( mask | temp );
482
483 Done:
484 *utf8Written = unitCount;
485 return;
486
487 } // CodePoint_to_UTF8_Multi
488
489 // =================================================================================================
490
CodePoint_to_UTF8(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)491 void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
492 {
493 size_t unitCount = 0;
494
495 UC_Assert ( (utf8Out != 0) && (utf8Written != 0) );
496 if ( utf8Len == 0 ) goto Done;
497 if ( cpIn > 0x7F ) goto MultiByte; // ! Force linear execution path for ASCII.
498
499 if ( utf8Len == 0 ) goto Done;
500 unitCount = 1;
501 *utf8Out = UTF8Unit(cpIn);
502
503 Done:
504 *utf8Written = unitCount;
505 return;
506
507 MultiByte:
508 CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written );
509 return;
510
511 } // CodePoint_to_UTF8
512
513 // =================================================================================================
514
CodePoint_from_UTF8_Multi(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)515 static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
516 {
517 UTF8Unit inUnit = *utf8In;
518 size_t unitCount = 0;
519 UTF32Unit cp; // ! Avoid gcc complaints about declarations after goto's.
520 const UTF8Unit * utf8Pos;
521
522 // -------------------------------------------------------------------------------------
523 // We've got a multibyte UTF-8 character. The first byte has the number of bytes and the
524 // highest order data bits. The other bytes each add 6 more data bits.
525
526 #if 0 // This might be a more effcient way to count the bytes.
527 static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
528 size_t bytesNeeded = kByteCounts [ inUnit >> 4 ];
529 if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) {
530 UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
531 }
532 #endif
533
534 size_t bytesNeeded = 0; // Count the leading 1 bits in the first byte.
535 for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
536 // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
537
538 if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
539 if ( bytesNeeded > utf8Len ) goto Done; // Not enough input in this buffer.
540 unitCount = bytesNeeded;
541
542 cp = inUnit & ((1 << (7-unitCount)) - 1); // Isolate the initial data bits in the bottom of cp.
543
544 utf8Pos = utf8In + 1; // We've absorbed the first byte.
545 for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
546 inUnit = *utf8Pos;
547 if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam );
548 cp = (cp << 6) | (inUnit & 0x3F);
549 }
550
551 if ( cp >= 0xD800 ) { // Skip the next comparisons most of the time.
552 if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam );
553 if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam );
554 }
555
556 *cpOut = cp; // ! Don't put after Done, don't write if no input.
557
558 Done:
559 *utf8Read = unitCount;
560 return;
561
562 } // CodePoint_from_UTF8_Multi
563
564 // =================================================================================================
565
CodePoint_from_UTF8(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)566 void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
567 {
568 UTF8Unit inUnit; // ! Don't read until we know there is input.
569 size_t unitCount = 0;
570
571 UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) );
572 if ( utf8Len == 0 ) goto Done;
573 inUnit = *utf8In;
574 if ( inUnit >= 0x80 ) goto MultiByte; // ! Force linear execution path for ASCII.
575
576 unitCount = 1;
577 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
578
579 Done:
580 *utf8Read = unitCount;
581 return;
582
583 MultiByte:
584 CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read );
585 return;
586
587 } // CodePoint_from_UTF8
588
589 // =================================================================================================
590
CodePoint_to_UTF16Nat_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)591 static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
592 {
593 size_t unitCount = 0;
594 UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
595
596 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
597 if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
598
599 unitCount = 2;
600 temp = cpIn - 0x10000;
601 utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 );
602 utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF );
603
604 Done:
605 *utf16Written = unitCount;
606 return;
607
608 } // CodePoint_to_UTF16Nat_Surrogate
609
610 // =================================================================================================
611
CodePoint_to_UTF16Nat(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)612 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
613 {
614 size_t unitCount = 0;
615
616 UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
617 if ( utf16Len == 0 ) goto Done;
618 if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
619
620 InBMP:
621 unitCount = 1;
622 *utf16Out = UTF16Unit(cpIn);
623
624 Done:
625 *utf16Written = unitCount;
626 return;
627
628 CheckSurrogate:
629 if ( cpIn > 0xFFFF ) goto SurrogatePair;
630 if ( cpIn > 0xDFFF ) goto InBMP;
631 UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
632
633 SurrogatePair:
634 CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
635 return;
636
637 } // CodePoint_to_UTF16Nat
638
639 // =================================================================================================
640
CodePoint_from_UTF16Nat_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)641 static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
642 {
643 UTF16Unit hiUnit = *utf16In;
644 size_t unitCount = 0;
645 UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
646 UTF32Unit cp;
647
648 // ----------------------------------
649 // We've got a UTF-16 surrogate pair.
650
651 if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
652 if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
653
654 loUnit = *(utf16In+1);
655 if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
656
657 unitCount = 2;
658 cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
659
660 *cpOut = cp; // ! Don't put after Done, don't write if no input.
661
662 Done:
663 *utf16Read = unitCount;
664 return;
665
666 } // CodePoint_from_UTF16Nat_Surrogate
667
668 // =================================================================================================
669
CodePoint_from_UTF16Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)670 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
671 {
672 UTF16Unit inUnit; // ! Don't read until we know there is input.
673 size_t unitCount = 0;
674
675 UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
676 if ( utf16Len == 0 ) goto Done;
677 inUnit = *utf16In;
678 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
679
680 unitCount = 1;
681 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
682
683 Done:
684 *utf16Read = unitCount;
685 return;
686
687 SurrogatePair:
688 CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
689 return;
690
691 } // CodePoint_from_UTF16Nat
692
693 // =================================================================================================
694
UTF8_to_UTF16Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)695 static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
696 UTF16Unit * utf16Out, const size_t utf16Len,
697 size_t * utf8Read, size_t * utf16Written )
698 {
699 const UTF8Unit * utf8Pos = utf8In;
700 UTF16Unit * utf16Pos = utf16Out;
701
702 size_t utf8Left = utf8Len;
703 size_t utf16Left = utf16Len;
704
705 UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
706
707 while ( (utf8Left > 0) && (utf16Left > 0) ) {
708
709 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
710 size_t i, limit = utf8Left;
711 if ( limit > utf16Left ) limit = utf16Left;
712 for ( i = 0; i < limit; ++i ) {
713 UTF8Unit inUnit = *utf8Pos;
714 if ( inUnit > 0x7F ) break;
715 *utf16Pos = inUnit;
716 ++utf8Pos;
717 ++utf16Pos;
718 }
719 utf8Left -= i;
720 utf16Left -= i;
721
722 // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
723 while ( (utf8Left > 0) && (utf16Left > 0) ) {
724 UTF32Unit cp;
725 size_t len8, len16;
726 UTF8Unit inUnit = *utf8Pos;
727 if ( inUnit <= 0x7F ) break;
728 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
729 if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character.
730 if ( cp <= 0xFFFF ) {
731 *utf16Pos = UTF16Unit(cp);
732 len16 = 1;
733 } else {
734 CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
735 if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
736 }
737 utf8Left -= len8;
738 utf8Pos += len8;
739 utf16Left -= len16;
740 utf16Pos += len16;
741 }
742
743 }
744
745 Done: // Set the output lengths.
746 *utf8Read = utf8Len - utf8Left;
747 *utf16Written = utf16Len - utf16Left;
748
749 } // UTF8_to_UTF16Nat
750
751 // =================================================================================================
752
UTF8_to_UTF32Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)753 static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
754 UTF32Unit * utf32Out, const size_t utf32Len,
755 size_t * utf8Read, size_t * utf32Written )
756 {
757 const UTF8Unit * utf8Pos = utf8In;
758 UTF32Unit * utf32Pos = utf32Out;
759
760 size_t utf8Left = utf8Len;
761 size_t utf32Left = utf32Len;
762
763 UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
764
765 while ( (utf8Left > 0) && (utf32Left > 0) ) {
766
767 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
768 size_t i, limit = utf8Left;
769 if ( limit > utf32Left ) limit = utf32Left;
770 for ( i = 0; i < limit; ++i ) {
771 UTF8Unit inUnit = *utf8Pos;
772 if ( inUnit > 0x7F ) break;
773 *utf32Pos = inUnit;
774 ++utf8Pos;
775 ++utf32Pos;
776 }
777 utf8Left -= i;
778 utf32Left -= i;
779
780 // Do a run of non-ASCII, it copies variable input into 1 output unit.
781 while ( (utf8Left > 0) && (utf32Left > 0) ) {
782 size_t len;
783 UTF8Unit inUnit = *utf8Pos;
784 if ( inUnit <= 0x7F ) break;
785 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len );
786 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
787 utf8Left -= len;
788 utf8Pos += len;
789 utf32Left -= 1;
790 utf32Pos += 1;
791 }
792
793 }
794
795 Done: // Set the output lengths.
796 *utf8Read = utf8Len - utf8Left;
797 *utf32Written = utf32Len - utf32Left;
798
799 } // UTF8_to_UTF32Nat
800
801 // =================================================================================================
802
UTF16Nat_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)803 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
804 UTF8Unit * utf8Out, const size_t utf8Len,
805 size_t * utf16Read, size_t * utf8Written )
806 {
807 const UTF16Unit * utf16Pos = utf16In;
808 UTF8Unit * utf8Pos = utf8Out;
809
810 size_t utf16Left = utf16Len;
811 size_t utf8Left = utf8Len;
812
813 UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
814
815 while ( (utf16Left > 0) && (utf8Left > 0) ) {
816
817 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
818 size_t i, limit = utf16Left;
819 if ( limit > utf8Left ) limit = utf8Left;
820 for ( i = 0; i < limit; ++i ) {
821 UTF16Unit inUnit = *utf16Pos;
822 if ( inUnit > 0x7F ) break;
823 *utf8Pos = UTF8Unit(inUnit);
824 ++utf16Pos;
825 ++utf8Pos;
826 }
827 utf16Left -= i;
828 utf8Left -= i;
829
830 // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
831 while ( (utf16Left > 0) && (utf8Left > 0) ) {
832 size_t len8;
833 UTF16Unit inUnit = *utf16Pos;
834 if ( inUnit <= 0x7F ) break;
835 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
836 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
837 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
838 utf16Left -= 1;
839 utf16Pos += 1;
840 utf8Left -= len8;
841 utf8Pos += len8;
842 }
843
844 // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
845 while ( (utf16Left > 0) && (utf8Left > 0) ) {
846 UTF32Unit cp;
847 size_t len16, len8;
848 UTF16Unit inUnit = *utf16Pos;
849 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
850 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
851 if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
852 UC_Assert ( len16 == 2 );
853 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
854 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
855 utf16Left -= len16;
856 utf16Pos += len16;
857 utf8Left -= len8;
858 utf8Pos += len8;
859 }
860
861 }
862
863 Done: // Set the output lengths.
864 *utf16Read = utf16Len - utf16Left;
865 *utf8Written = utf8Len - utf8Left;
866
867 } // UTF16Nat_to_UTF8
868
869 // =================================================================================================
870
UTF32Nat_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)871 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
872 UTF8Unit * utf8Out, const size_t utf8Len,
873 size_t * utf32Read, size_t * utf8Written )
874 {
875 const UTF32Unit * utf32Pos = utf32In;
876 UTF8Unit * utf8Pos = utf8Out;
877
878 size_t utf32Left = utf32Len;
879 size_t utf8Left = utf8Len;
880
881 UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
882
883 while ( (utf32Left > 0) && (utf8Left > 0) ) {
884
885 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
886 size_t i, limit = utf32Left;
887 if ( limit > utf8Left ) limit = utf8Left;
888 for ( i = 0; i < limit; ++i ) {
889 UTF32Unit inUnit = *utf32Pos;
890 if ( inUnit > 0x7F ) break;
891 *utf8Pos = UTF8Unit(inUnit);
892 ++utf32Pos;
893 ++utf8Pos;
894 }
895 utf32Left -= i;
896 utf8Left -= i;
897
898 // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
899 while ( (utf32Left > 0) && (utf8Left > 0) ) {
900 size_t len;
901 UTF32Unit inUnit = *utf32Pos;
902 if ( inUnit <= 0x7F ) break;
903 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len );
904 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
905 utf32Left -= 1;
906 utf32Pos += 1;
907 utf8Left -= len;
908 utf8Pos += len;
909 }
910
911 }
912
913 Done: // Set the output lengths.
914 *utf32Read = utf32Len - utf32Left;
915 *utf8Written = utf8Len - utf8Left;
916
917 } // UTF32Nat_to_UTF8
918
919 // =================================================================================================
920
UTF16Nat_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)921 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
922 UTF32Unit * utf32Out, const size_t utf32Len,
923 size_t * utf16Read, size_t * utf32Written )
924 {
925 const UTF16Unit * utf16Pos = utf16In;
926 UTF32Unit * utf32Pos = utf32Out;
927
928 size_t utf16Left = utf16Len;
929 size_t utf32Left = utf32Len;
930
931 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
932
933 while ( (utf16Left > 0) && (utf32Left > 0) ) {
934
935 // Do a run of BMP, it copies 1 input unit into 1 output unit.
936 size_t i, limit = utf16Left;
937 if ( limit > utf32Left ) limit = utf32Left;
938 for ( i = 0; i < limit; ++i ) {
939 UTF16Unit inUnit = *utf16Pos;
940 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
941 *utf32Pos = inUnit;
942 ++utf16Pos;
943 ++utf32Pos;
944 }
945 utf16Left -= i;
946 utf32Left -= i;
947
948 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
949 while ( (utf16Left > 0) && (utf32Left > 0) ) {
950 size_t len;
951 UTF16Unit inUnit = *utf16Pos;
952 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
953 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
954 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
955 UC_Assert ( len == 2 );
956 utf16Left -= len;
957 utf16Pos += len;
958 utf32Left -= 1;
959 utf32Pos += 1;
960 }
961
962 }
963
964 Done: // Set the output lengths.
965 *utf16Read = utf16Len - utf16Left;
966 *utf32Written = utf32Len - utf32Left;
967
968 } // UTF16Nat_to_UTF32Nat
969
970 // =================================================================================================
971
UTF32Nat_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)972 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
973 UTF16Unit * utf16Out, const size_t utf16Len,
974 size_t * utf32Read, size_t * utf16Written )
975 {
976 const UTF32Unit * utf32Pos = utf32In;
977 UTF16Unit * utf16Pos = utf16Out;
978
979 size_t utf32Left = utf32Len;
980 size_t utf16Left = utf16Len;
981
982 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
983
984 while ( (utf32Left > 0) && (utf16Left > 0) ) {
985
986 // Do a run of BMP, it copies 1 input unit into 1 output unit.
987 size_t i, limit = utf32Left;
988 if ( limit > utf16Left ) limit = utf16Left;
989 for ( i = 0; i < limit; ++i ) {
990 UTF32Unit inUnit = *utf32Pos;
991 if ( inUnit > 0xFFFF ) break;
992 *utf16Pos = UTF16Unit(inUnit);
993 ++utf32Pos;
994 ++utf16Pos;
995 }
996 utf32Left -= i;
997 utf16Left -= i;
998
999 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1000 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1001 size_t len;
1002 UTF32Unit inUnit = *utf32Pos;
1003 if ( inUnit <= 0xFFFF ) break;
1004 CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1005 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1006 UC_Assert ( len == 2 );
1007 utf32Left -= 1;
1008 utf32Pos += 1;
1009 utf16Left -= 2;
1010 utf16Pos += 2;
1011 }
1012
1013 }
1014
1015 Done: // Set the output lengths.
1016 *utf32Read = utf32Len - utf32Left;
1017 *utf16Written = utf16Len - utf16Left;
1018
1019 } // UTF32Nat_to_UTF16Nat
1020
1021 // =================================================================================================
1022
CodePoint_to_UTF16Swp_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1023 static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1024 {
1025 size_t unitCount = 0;
1026 UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
1027
1028 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
1029 if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
1030
1031 unitCount = 2;
1032 temp = cpIn - 0x10000;
1033 UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) );
1034 UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) );
1035
1036 Done:
1037 *utf16Written = unitCount;
1038 return;
1039
1040 } // CodePoint_to_UTF16Swp_Surrogate
1041
1042 // =================================================================================================
1043
CodePoint_to_UTF16Swp(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1044 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1045 {
1046 size_t unitCount = 0;
1047
1048 UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
1049 if ( utf16Len == 0 ) goto Done;
1050 if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
1051
1052 InBMP:
1053 unitCount = 1;
1054 UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) );
1055
1056 Done:
1057 *utf16Written = unitCount;
1058 return;
1059
1060 CheckSurrogate:
1061 if ( cpIn > 0xFFFF ) goto SurrogatePair;
1062 if ( cpIn > 0xDFFF ) goto InBMP;
1063 UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
1064
1065 SurrogatePair:
1066 CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
1067 return;
1068
1069 } // CodePoint_to_UTF16Swp
1070
1071 // =================================================================================================
1072
CodePoint_from_UTF16Swp_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1073 static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1074 {
1075 UTF16Unit hiUnit = UTF16InSwap(utf16In);
1076 size_t unitCount = 0;
1077 UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
1078 UTF32Unit cp;
1079
1080 // ----------------------------------
1081 // We've got a UTF-16 surrogate pair.
1082
1083 if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
1084 if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
1085
1086 loUnit = UTF16InSwap(utf16In+1);
1087 if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
1088
1089 unitCount = 2;
1090 cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
1091
1092 *cpOut = cp; // ! Don't put after Done, don't write if no input.
1093
1094 Done:
1095 *utf16Read = unitCount;
1096 return;
1097
1098 } // CodePoint_from_UTF16Swp_Surrogate
1099
1100 // =================================================================================================
1101
CodePoint_from_UTF16Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1102 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1103 {
1104 UTF16Unit inUnit; // ! Don't read until we know there is input.
1105 size_t unitCount = 0;
1106
1107 UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
1108 if ( utf16Len == 0 ) goto Done;
1109 inUnit = UTF16InSwap(utf16In);
1110 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
1111
1112 unitCount = 1;
1113 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
1114
1115 Done:
1116 *utf16Read = unitCount;
1117 return;
1118
1119 SurrogatePair:
1120 CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
1121 return;
1122
1123 } // CodePoint_from_UTF16Swp
1124
1125 // =================================================================================================
1126
UTF8_to_UTF16Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)1127 static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
1128 UTF16Unit * utf16Out, const size_t utf16Len,
1129 size_t * utf8Read, size_t * utf16Written )
1130 {
1131 const UTF8Unit * utf8Pos = utf8In;
1132 UTF16Unit * utf16Pos = utf16Out;
1133
1134 size_t utf8Left = utf8Len;
1135 size_t utf16Left = utf16Len;
1136
1137 UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
1138
1139 while ( (utf8Left > 0) && (utf16Left > 0) ) {
1140
1141 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1142 size_t i, limit = utf8Left;
1143 if ( limit > utf16Left ) limit = utf16Left;
1144 for ( i = 0; i < limit; ++i ) {
1145 UTF8Unit inUnit = *utf8Pos;
1146 if ( inUnit > 0x7F ) break;
1147 *utf16Pos = UTF16Unit(inUnit) << 8; // Better than: UTF16OutSwap ( utf16Pos, inUnit );
1148 ++utf8Pos;
1149 ++utf16Pos;
1150 }
1151 utf8Left -= i;
1152 utf16Left -= i;
1153
1154 // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
1155 while ( (utf8Left > 0) && (utf16Left > 0) ) {
1156 UTF32Unit cp;
1157 size_t len8, len16;
1158 UTF8Unit inUnit = *utf8Pos;
1159 if ( inUnit <= 0x7F ) break;
1160 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
1161 if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character.
1162 if ( cp <= 0xFFFF ) {
1163 UTF16OutSwap ( utf16Pos, UTF16Unit(cp) );
1164 len16 = 1;
1165 } else {
1166 CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
1167 if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
1168 }
1169 utf8Left -= len8;
1170 utf8Pos += len8;
1171 utf16Left -= len16;
1172 utf16Pos += len16;
1173 }
1174
1175 }
1176
1177 Done: // Set the output lengths.
1178 *utf8Read = utf8Len - utf8Left;
1179 *utf16Written = utf16Len - utf16Left;
1180
1181 } // UTF8_to_UTF16Swp
1182
1183 // =================================================================================================
1184
UTF8_to_UTF32Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)1185 static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
1186 UTF32Unit * utf32Out, const size_t utf32Len,
1187 size_t * utf8Read, size_t * utf32Written )
1188 {
1189 const UTF8Unit * utf8Pos = utf8In;
1190 UTF32Unit * utf32Pos = utf32Out;
1191
1192 size_t utf8Left = utf8Len;
1193 size_t utf32Left = utf32Len;
1194
1195 UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
1196
1197 while ( (utf8Left > 0) && (utf32Left > 0) ) {
1198
1199 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1200 size_t i, limit = utf8Left;
1201 if ( limit > utf32Left ) limit = utf32Left;
1202 for ( i = 0; i < limit; ++i ) {
1203 UTF8Unit inUnit = *utf8Pos;
1204 if ( inUnit > 0x7F ) break;
1205 *utf32Pos = UTF32Unit(inUnit) << 24; // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1206 ++utf8Pos;
1207 ++utf32Pos;
1208 }
1209 utf8Left -= i;
1210 utf32Left -= i;
1211
1212 // Do a run of non-ASCII, it copies variable input into 1 output unit.
1213 while ( (utf8Left > 0) && (utf32Left > 0) ) {
1214 size_t len;
1215 UTF32Unit cp;
1216 UTF8Unit inUnit = *utf8Pos;
1217 if ( inUnit <= 0x7F ) break;
1218 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len );
1219 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
1220 UTF32OutSwap ( utf32Pos, cp );
1221 utf8Left -= len;
1222 utf8Pos += len;
1223 utf32Left -= 1;
1224 utf32Pos += 1;
1225 }
1226
1227 }
1228
1229 Done: // Set the output lengths.
1230 *utf8Read = utf8Len - utf8Left;
1231 *utf32Written = utf32Len - utf32Left;
1232
1233 } // UTF8_to_UTF32Swp
1234
1235 // =================================================================================================
1236
UTF16Swp_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)1237 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
1238 UTF8Unit * utf8Out, const size_t utf8Len,
1239 size_t * utf16Read, size_t * utf8Written )
1240 {
1241 const UTF16Unit * utf16Pos = utf16In;
1242 UTF8Unit * utf8Pos = utf8Out;
1243
1244 size_t utf16Left = utf16Len;
1245 size_t utf8Left = utf8Len;
1246
1247 UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
1248
1249 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1250
1251 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1252 size_t i, limit = utf16Left;
1253 if ( limit > utf8Left ) limit = utf8Left;
1254 for ( i = 0; i < limit; ++i ) {
1255 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1256 if ( inUnit > 0x7F ) break;
1257 *utf8Pos = UTF8Unit(inUnit);
1258 ++utf16Pos;
1259 ++utf8Pos;
1260 }
1261 utf16Left -= i;
1262 utf8Left -= i;
1263
1264 // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
1265 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1266 size_t len8;
1267 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1268 if ( inUnit <= 0x7F ) break;
1269 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1270 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
1271 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
1272 utf16Left -= 1;
1273 utf16Pos += 1;
1274 utf8Left -= len8;
1275 utf8Pos += len8;
1276 }
1277
1278 // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
1279 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1280 UTF32Unit cp;
1281 size_t len16, len8;
1282 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1283 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1284 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
1285 if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1286 UC_Assert ( len16 == 2 );
1287 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
1288 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
1289 utf16Left -= len16;
1290 utf16Pos += len16;
1291 utf8Left -= len8;
1292 utf8Pos += len8;
1293 }
1294
1295 }
1296
1297 Done: // Set the output lengths.
1298 *utf16Read = utf16Len - utf16Left;
1299 *utf8Written = utf8Len - utf8Left;
1300
1301 } // UTF16Swp_to_UTF8
1302
1303 // =================================================================================================
1304
UTF32Swp_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)1305 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
1306 UTF8Unit * utf8Out, const size_t utf8Len,
1307 size_t * utf32Read, size_t * utf8Written )
1308 {
1309 const UTF32Unit * utf32Pos = utf32In;
1310 UTF8Unit * utf8Pos = utf8Out;
1311
1312 size_t utf32Left = utf32Len;
1313 size_t utf8Left = utf8Len;
1314
1315 UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
1316
1317 while ( (utf32Left > 0) && (utf8Left > 0) ) {
1318
1319 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1320 size_t i, limit = utf32Left;
1321 if ( limit > utf8Left ) limit = utf8Left;
1322 for ( i = 0; i < limit; ++i ) {
1323 UTF32Unit cp = UTF32InSwap(utf32Pos);
1324 if ( cp > 0x7F ) break;
1325 *utf8Pos = UTF8Unit(cp);
1326 ++utf32Pos;
1327 ++utf8Pos;
1328 }
1329 utf32Left -= i;
1330 utf8Left -= i;
1331
1332 // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
1333 while ( (utf32Left > 0) && (utf8Left > 0) ) {
1334 size_t len;
1335 UTF32Unit cp = UTF32InSwap(utf32Pos);
1336 if ( cp <= 0x7F ) break;
1337 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len );
1338 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1339 utf32Left -= 1;
1340 utf32Pos += 1;
1341 utf8Left -= len;
1342 utf8Pos += len;
1343 }
1344
1345 }
1346
1347 Done: // Set the output lengths.
1348 *utf32Read = utf32Len - utf32Left;
1349 *utf8Written = utf8Len - utf8Left;
1350
1351 } // UTF32Swp_to_UTF8
1352
1353 // =================================================================================================
1354
UTF16Swp_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1355 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
1356 UTF32Unit * utf32Out, const size_t utf32Len,
1357 size_t * utf16Read, size_t * utf32Written )
1358 {
1359 const UTF16Unit * utf16Pos = utf16In;
1360 UTF32Unit * utf32Pos = utf32Out;
1361
1362 size_t utf16Left = utf16Len;
1363 size_t utf32Left = utf32Len;
1364
1365 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1366
1367 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1368
1369 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1370 size_t i, limit = utf16Left;
1371 if ( limit > utf32Left ) limit = utf32Left;
1372 for ( i = 0; i < limit; ++i ) {
1373 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1374 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1375 *utf32Pos = UTF32Unit(*utf16Pos) << 16; // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1376 ++utf16Pos;
1377 ++utf32Pos;
1378 }
1379 utf16Left -= i;
1380 utf32Left -= i;
1381
1382 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1383 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1384 size_t len;
1385 UTF32Unit cp;
1386 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1387 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1388 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1389 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1390 UTF32OutSwap ( utf32Pos, cp );
1391 UC_Assert ( len == 2 );
1392 utf16Left -= len;
1393 utf16Pos += len;
1394 utf32Left -= 1;
1395 utf32Pos += 1;
1396 }
1397
1398 }
1399
1400 Done: // Set the output lengths.
1401 *utf16Read = utf16Len - utf16Left;
1402 *utf32Written = utf32Len - utf32Left;
1403
1404 } // UTF16Swp_to_UTF32Swp
1405
1406 // =================================================================================================
1407
UTF32Swp_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1408 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
1409 UTF16Unit * utf16Out, const size_t utf16Len,
1410 size_t * utf32Read, size_t * utf16Written )
1411 {
1412 const UTF32Unit * utf32Pos = utf32In;
1413 UTF16Unit * utf16Pos = utf16Out;
1414
1415 size_t utf32Left = utf32Len;
1416 size_t utf16Left = utf16Len;
1417
1418 const size_t k32to16Offset = swap32to16Offset; // ! Make sure compiler treats as an invariant.
1419
1420 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1421
1422 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1423
1424 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1425 size_t i, limit = utf32Left;
1426 if ( limit > utf16Left ) limit = utf16Left;
1427 for ( i = 0; i < limit; ++i ) {
1428 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1429 if ( inUnit > 0xFFFF ) break;
1430 *utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset); // Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1431 ++utf32Pos;
1432 ++utf16Pos;
1433 }
1434 utf32Left -= i;
1435 utf16Left -= i;
1436
1437 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1438 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1439 size_t len;
1440 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1441 if ( inUnit <= 0xFFFF ) break;
1442 CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1443 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1444 UC_Assert ( len == 2 );
1445 utf32Left -= 1;
1446 utf32Pos += 1;
1447 utf16Left -= 2;
1448 utf16Pos += 2;
1449 }
1450
1451 }
1452
1453 Done: // Set the output lengths.
1454 *utf32Read = utf32Len - utf32Left;
1455 *utf16Written = utf16Len - utf16Left;
1456
1457 } // UTF32Swp_to_UTF16Swp
1458
1459 // =================================================================================================
1460
UTF16Nat_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1461 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
1462 UTF32Unit * utf32Out, const size_t utf32Len,
1463 size_t * utf16Read, size_t * utf32Written )
1464 {
1465 const UTF16Unit * utf16Pos = utf16In;
1466 UTF32Unit * utf32Pos = utf32Out;
1467
1468 size_t utf16Left = utf16Len;
1469 size_t utf32Left = utf32Len;
1470
1471 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1472
1473 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1474
1475 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1476 size_t i, limit = utf16Left;
1477 if ( limit > utf32Left ) limit = utf32Left;
1478 for ( i = 0; i < limit; ++i ) {
1479 UTF16Unit inUnit = *utf16Pos;
1480 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1481 UTF32OutSwap ( utf32Pos, inUnit );
1482 ++utf16Pos;
1483 ++utf32Pos;
1484 }
1485 utf16Left -= i;
1486 utf32Left -= i;
1487
1488 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1489 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1490 size_t len;
1491 UTF32Unit cp;
1492 UTF16Unit inUnit = *utf16Pos;
1493 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1494 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1495 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1496 UC_Assert ( len == 2 );
1497 UTF32OutSwap ( utf32Pos, cp );
1498 utf16Left -= len;
1499 utf16Pos += len;
1500 utf32Left -= 1;
1501 utf32Pos += 1;
1502 }
1503
1504 }
1505
1506 Done: // Set the output lengths.
1507 *utf16Read = utf16Len - utf16Left;
1508 *utf32Written = utf32Len - utf32Left;
1509
1510 } // UTF16Nat_to_UTF32Swp
1511
1512 // =================================================================================================
1513
UTF16Swp_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1514 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
1515 UTF32Unit * utf32Out, const size_t utf32Len,
1516 size_t * utf16Read, size_t * utf32Written )
1517 {
1518 const UTF16Unit * utf16Pos = utf16In;
1519 UTF32Unit * utf32Pos = utf32Out;
1520
1521 size_t utf16Left = utf16Len;
1522 size_t utf32Left = utf32Len;
1523
1524 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1525
1526 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1527
1528 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1529 size_t i, limit = utf16Left;
1530 if ( limit > utf32Left ) limit = utf32Left;
1531 for ( i = 0; i < limit; ++i ) {
1532 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1533 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1534 *utf32Pos = inUnit;
1535 ++utf16Pos;
1536 ++utf32Pos;
1537 }
1538 utf16Left -= i;
1539 utf32Left -= i;
1540
1541 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1542 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1543 size_t len;
1544 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1545 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1546 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
1547 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1548 UC_Assert ( len == 2 );
1549 utf16Left -= len;
1550 utf16Pos += len;
1551 utf32Left -= 1;
1552 utf32Pos += 1;
1553 }
1554
1555 }
1556
1557 Done: // Set the output lengths.
1558 *utf16Read = utf16Len - utf16Left;
1559 *utf32Written = utf32Len - utf32Left;
1560
1561 } // UTF16Swp_to_UTF32Nat
1562
1563 // =================================================================================================
1564
UTF32Nat_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1565 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
1566 UTF16Unit * utf16Out, const size_t utf16Len,
1567 size_t * utf32Read, size_t * utf16Written )
1568 {
1569 const UTF32Unit * utf32Pos = utf32In;
1570 UTF16Unit * utf16Pos = utf16Out;
1571
1572 size_t utf32Left = utf32Len;
1573 size_t utf16Left = utf16Len;
1574
1575 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1576
1577 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1578
1579 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1580 size_t i, limit = utf32Left;
1581 if ( limit > utf16Left ) limit = utf16Left;
1582 for ( i = 0; i < limit; ++i ) {
1583 UTF32Unit inUnit = *utf32Pos;
1584 if ( inUnit > 0xFFFF ) break;
1585 UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1586 ++utf32Pos;
1587 ++utf16Pos;
1588 }
1589 utf32Left -= i;
1590 utf16Left -= i;
1591
1592 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1593 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1594 size_t len;
1595 UTF32Unit inUnit = *utf32Pos;
1596 if ( inUnit <= 0xFFFF ) break;
1597 CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1598 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1599 UC_Assert ( len == 2 );
1600 utf32Left -= 1;
1601 utf32Pos += 1;
1602 utf16Left -= 2;
1603 utf16Pos += 2;
1604 }
1605
1606 }
1607
1608 Done: // Set the output lengths.
1609 *utf32Read = utf32Len - utf32Left;
1610 *utf16Written = utf16Len - utf16Left;
1611
1612 } // UTF32Nat_to_UTF16Swp
1613
1614 // =================================================================================================
1615
UTF32Swp_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1616 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
1617 UTF16Unit * utf16Out, const size_t utf16Len,
1618 size_t * utf32Read, size_t * utf16Written )
1619 {
1620 const UTF32Unit * utf32Pos = utf32In;
1621 UTF16Unit * utf16Pos = utf16Out;
1622
1623 size_t utf32Left = utf32Len;
1624 size_t utf16Left = utf16Len;
1625
1626 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1627
1628 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1629
1630 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1631 size_t i, limit = utf32Left;
1632 if ( limit > utf16Left ) limit = utf16Left;
1633 for ( i = 0; i < limit; ++i ) {
1634 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1635 if ( inUnit > 0xFFFF ) break;
1636 *utf16Pos = UTF16Unit(inUnit);
1637 ++utf32Pos;
1638 ++utf16Pos;
1639 }
1640 utf32Left -= i;
1641 utf16Left -= i;
1642
1643 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1644 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1645 size_t len;
1646 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1647 if ( inUnit <= 0xFFFF ) break;
1648 CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1649 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1650 UC_Assert ( len == 2 );
1651 utf32Left -= 1;
1652 utf32Pos += 1;
1653 utf16Left -= 2;
1654 utf16Pos += 2;
1655 }
1656
1657 }
1658
1659 Done: // Set the output lengths.
1660 *utf32Read = utf32Len - utf32Left;
1661 *utf16Written = utf16Len - utf16Left;
1662
1663 } // UTF32Swp_to_UTF16Nat
1664
1665 // =================================================================================================
1666