1 // =================================================================================================
2 // Copyright 2004 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 // =================================================================================================
8
9 #include "public/include/XMP_Const.h"
10
// UC_Assert currently expands to nothing, so the asserted condition is never evaluated.
#define UC_Assert(cond) /* Nothing for now, should be XMP_Assert. */
// UC_Throw raises an XMP_Error constructed from the error id and the message text.
#define UC_Throw(msg,id) throw XMP_Error ( id, msg )
13
14 #include "source/UnicodeConversions.hpp"
15
16 #if SUNOS_SPARC || XMP_IOS_ARM
17 #include "string.h"
18 #endif
19
20 using namespace std;
21
22 // =================================================================================================
23
24 // *** Look into using asm inlines, e.g. count-leading bits for multi-byte UTF-8.
25
// Conversion entry points, exposed as function pointers. Every pointer starts out null and is
// bound to a native or byte-swapping implementation by InitializeUnicodeConversions.

// One code point to UTF-16, big endian and little endian output.
CodePoint_to_UTF16_Proc CodePoint_to_UTF16BE = 0;
CodePoint_to_UTF16_Proc CodePoint_to_UTF16LE = 0;

// One code point from UTF-16, big endian and little endian input.
CodePoint_from_UTF16_Proc CodePoint_from_UTF16BE = 0;
CodePoint_from_UTF16_Proc CodePoint_from_UTF16LE = 0;

// UTF-8 to UTF-16 / UTF-32 with an explicit output byte order.
UTF8_to_UTF16_Proc UTF8_to_UTF16BE = 0;
UTF8_to_UTF16_Proc UTF8_to_UTF16LE = 0;
UTF8_to_UTF32_Proc UTF8_to_UTF32BE = 0;
UTF8_to_UTF32_Proc UTF8_to_UTF32LE = 0;

// UTF-16 / UTF-32 with an explicit input byte order to UTF-8.
UTF16_to_UTF8_Proc UTF16BE_to_UTF8 = 0;
UTF16_to_UTF8_Proc UTF16LE_to_UTF8 = 0;
UTF32_to_UTF8_Proc UTF32BE_to_UTF8 = 0;
UTF32_to_UTF8_Proc UTF32LE_to_UTF8 = 0;

// Conversions between UTF-8 and the host byte order; always bound to the "Nat" implementations.
UTF8_to_UTF16_Proc UTF8_to_UTF16Native = 0;
UTF8_to_UTF32_Proc UTF8_to_UTF32Native = 0;
UTF16_to_UTF8_Proc UTF16Native_to_UTF8 = 0;
UTF32_to_UTF8_Proc UTF32Native_to_UTF8 = 0;

// UTF-16 to UTF-32, all four combinations of input and output byte order.
UTF16_to_UTF32_Proc UTF16BE_to_UTF32BE = 0;
UTF16_to_UTF32_Proc UTF16BE_to_UTF32LE = 0;
UTF16_to_UTF32_Proc UTF16LE_to_UTF32BE = 0;
UTF16_to_UTF32_Proc UTF16LE_to_UTF32LE = 0;

// UTF-32 to UTF-16, all four combinations of input and output byte order.
UTF32_to_UTF16_Proc UTF32BE_to_UTF16BE = 0;
UTF32_to_UTF16_Proc UTF32BE_to_UTF16LE = 0;
UTF32_to_UTF16_Proc UTF32LE_to_UTF16BE = 0;
UTF32_to_UTF16_Proc UTF32LE_to_UTF16LE = 0;
56
57 // -------------------------------------------------------------------------------------------------
58
// Set by InitializeUnicodeConversions: 0 on a big endian host, 1 (counted in UTF16 units) on a
// little endian host.
static size_t swap32to16Offset = 0; // Offset to "convert" a swapped UTF32 pointer into a swapped UTF16 pointer.
60
61 // -------------------------------------------------------------------------------------------------
62
// Forward declarations of the file-local converters, needed because InitializeUnicodeConversions
// takes their addresses before they are defined. In the names, "Nat" is the variant that
// InitializeUnicodeConversions binds to the host's own byte order and "Swp" is the variant it
// binds to the opposite byte order.

// Single code point to and from UTF-16.
static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );
static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written );

static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );
static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-8.

static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
                               UTF16Unit * utf16Out, const size_t utf16Len,
                               size_t * utf8Read, size_t * utf16Written );

static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
                               UTF16Unit * utf16Out, const size_t utf16Len,
                               size_t * utf8Read, size_t * utf16Written );

static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
                               UTF32Unit * utf32Out, const size_t utf32Len,
                               size_t * utf8Read, size_t * utf32Written );

static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
                               UTF32Unit * utf32Out, const size_t utf32Len,
                               size_t * utf8Read, size_t * utf32Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions to UTF-8.

static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
                               UTF8Unit * utf8Out, const size_t utf8Len,
                               size_t * utf16Read, size_t * utf8Written );

static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
                               UTF8Unit * utf8Out, const size_t utf8Len,
                               size_t * utf16Read, size_t * utf8Written );

static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
                               UTF8Unit * utf8Out, const size_t utf8Len,
                               size_t * utf32Read, size_t * utf8Written );

static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
                               UTF8Unit * utf8Out, const size_t utf8Len,
                               size_t * utf32Read, size_t * utf8Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-16 to UTF-32.

static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
                                   UTF32Unit * utf32Out, const size_t utf32Len,
                                   size_t * utf16Read, size_t * utf32Written );

static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
                                   UTF32Unit * utf32Out, const size_t utf32Len,
                                   size_t * utf16Read, size_t * utf32Written );

static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
                                   UTF32Unit * utf32Out, const size_t utf32Len,
                                   size_t * utf16Read, size_t * utf32Written );

static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
                                   UTF32Unit * utf32Out, const size_t utf32Len,
                                   size_t * utf16Read, size_t * utf32Written );

// -------------------------------------------------------------------------------------------------
// Buffer conversions from UTF-32 to UTF-16.

static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
                                   UTF16Unit * utf16Out, const size_t utf16Len,
                                   size_t * utf32Read, size_t * utf16Written );

static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
                                   UTF16Unit * utf16Out, const size_t utf16Len,
                                   size_t * utf32Read, size_t * utf16Written );

static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
                                   UTF16Unit * utf16Out, const size_t utf16Len,
                                   size_t * utf32Read, size_t * utf16Written );

static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
                                   UTF16Unit * utf16Out, const size_t utf16Len,
                                   size_t * utf32Read, size_t * utf16Written );
140
141 // =================================================================================================
142
InitializeUnicodeConversions()143 void InitializeUnicodeConversions()
144 {
145 UC_Assert ( (sizeof(UTF8Unit) == 1) && (sizeof(UTF16Unit) == 2) && (sizeof(UTF32Unit) == 4) );
146
147 UTF16Unit u16 = 0x00FF;
148 bool bigEndian = (*((UTF8Unit*)&u16) == 0);
149
150 UTF8_to_UTF16Native = UTF8_to_UTF16Nat;
151 UTF8_to_UTF32Native = UTF8_to_UTF32Nat;
152 UTF16Native_to_UTF8 = UTF16Nat_to_UTF8;
153 UTF32Native_to_UTF8 = UTF32Nat_to_UTF8;
154
155 if ( bigEndian ) {
156
157 swap32to16Offset = 0;
158
159 CodePoint_to_UTF16BE = CodePoint_to_UTF16Nat;
160 CodePoint_to_UTF16LE = CodePoint_to_UTF16Swp;
161
162 CodePoint_from_UTF16BE = CodePoint_from_UTF16Nat;
163 CodePoint_from_UTF16LE = CodePoint_from_UTF16Swp;
164
165 UTF8_to_UTF16BE = UTF8_to_UTF16Nat;
166 UTF8_to_UTF16LE = UTF8_to_UTF16Swp;
167 UTF8_to_UTF32BE = UTF8_to_UTF32Nat;
168 UTF8_to_UTF32LE = UTF8_to_UTF32Swp;
169
170 UTF16BE_to_UTF8 = UTF16Nat_to_UTF8;
171 UTF16LE_to_UTF8 = UTF16Swp_to_UTF8;
172 UTF32BE_to_UTF8 = UTF32Nat_to_UTF8;
173 UTF32LE_to_UTF8 = UTF32Swp_to_UTF8;
174
175 UTF16BE_to_UTF32BE = UTF16Nat_to_UTF32Nat;
176 UTF16BE_to_UTF32LE = UTF16Nat_to_UTF32Swp;
177 UTF16LE_to_UTF32BE = UTF16Swp_to_UTF32Nat;
178 UTF16LE_to_UTF32LE = UTF16Swp_to_UTF32Swp;
179
180 UTF32BE_to_UTF16BE = UTF32Nat_to_UTF16Nat;
181 UTF32BE_to_UTF16LE = UTF32Nat_to_UTF16Swp;
182 UTF32LE_to_UTF16BE = UTF32Swp_to_UTF16Nat;
183 UTF32LE_to_UTF16LE = UTF32Swp_to_UTF16Swp;
184
185 } else {
186
187 swap32to16Offset = 1; // ! Offset in UTF16 units!
188
189 CodePoint_to_UTF16BE = CodePoint_to_UTF16Swp;
190 CodePoint_to_UTF16LE = CodePoint_to_UTF16Nat;
191
192 CodePoint_from_UTF16BE = CodePoint_from_UTF16Swp;
193 CodePoint_from_UTF16LE = CodePoint_from_UTF16Nat;
194
195 UTF8_to_UTF16BE = UTF8_to_UTF16Swp;
196 UTF8_to_UTF16LE = UTF8_to_UTF16Nat;
197 UTF8_to_UTF32BE = UTF8_to_UTF32Swp;
198 UTF8_to_UTF32LE = UTF8_to_UTF32Nat;
199
200 UTF16BE_to_UTF8 = UTF16Swp_to_UTF8;
201 UTF16LE_to_UTF8 = UTF16Nat_to_UTF8;
202 UTF32BE_to_UTF8 = UTF32Swp_to_UTF8;
203 UTF32LE_to_UTF8 = UTF32Nat_to_UTF8;
204
205 UTF16BE_to_UTF32BE = UTF16Swp_to_UTF32Swp;
206 UTF16BE_to_UTF32LE = UTF16Swp_to_UTF32Nat;
207 UTF16LE_to_UTF32BE = UTF16Nat_to_UTF32Swp;
208 UTF16LE_to_UTF32LE = UTF16Nat_to_UTF32Nat;
209
210 UTF32BE_to_UTF16BE = UTF32Swp_to_UTF16Swp;
211 UTF32BE_to_UTF16LE = UTF32Swp_to_UTF16Nat;
212 UTF32LE_to_UTF16BE = UTF32Nat_to_UTF16Swp;
213 UTF32LE_to_UTF16LE = UTF32Nat_to_UTF16Nat;
214
215 }
216
217 } // InitializeUnicodeConversions
218
219 // =================================================================================================
220
// DefineAndGetValue declares a local variable named inUnit of the given type and loads it from
// inPtr. The SUNOS_SPARC / XMP_IOS_ARM builds copy the bytes with memcpy, all other builds
// dereference the pointer directly after a cast.
// NOTE(review): presumably the memcpy form exists because those targets fault on unaligned
// loads - confirm.
#if SUNOS_SPARC || XMP_IOS_ARM
#define DefineAndGetValue(type,inPtr) type inUnit; memcpy ( &inUnit, inPtr, sizeof(type) );
#else
#define DefineAndGetValue(type,inPtr) type inUnit = *((type *)inPtr);
#endif
226
UTF16InSwap(const void * inPtr)227 static inline UTF16Unit UTF16InSwap ( const void * inPtr )
228 {
229 DefineAndGetValue ( UTF16Unit, inPtr );
230 return (inUnit << 8) | (inUnit >> 8);
231 }
UTF32InSwap(const void * inPtr)232 static inline UTF32Unit UTF32InSwap ( const void * inPtr )
233 {
234 DefineAndGetValue ( UTF32Unit, inPtr );
235 return (inUnit << 24) | ((inUnit << 8) & 0x00FF0000) | ((inUnit >> 8) & 0x0000FF00) | (inUnit >> 24);
236 }
237
UTF16OutSwap(UTF16Unit * outPtr,const UTF16Unit value)238 static inline void UTF16OutSwap ( UTF16Unit * outPtr, const UTF16Unit value )
239 {
240 UTF16Unit outUnit = (value << 8) | (value >> 8);
241 *outPtr = outUnit;
242 }
243
UTF32OutSwap(UTF32Unit * outPtr,const UTF32Unit value)244 static inline void UTF32OutSwap ( UTF32Unit * outPtr, const UTF32Unit value )
245 {
246 UTF32Unit outUnit = (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
247 *outPtr = outUnit;
248 }
249
250 // =================================================================================================
251
SwapUTF16(const UTF16Unit * utf16In,UTF16Unit * utf16Out,const size_t utf16Len)252 void SwapUTF16 ( const UTF16Unit * utf16In, UTF16Unit * utf16Out, const size_t utf16Len )
253 {
254 for ( size_t i = 0; i < utf16Len; ++i ) utf16Out[i] = UTF16InSwap(utf16In+i);
255 }
256
SwapUTF32(const UTF32Unit * utf32In,UTF32Unit * utf32Out,const size_t utf32Len)257 void SwapUTF32 ( const UTF32Unit * utf32In, UTF32Unit * utf32Out, const size_t utf32Len ) {
258 for ( size_t i = 0; i < utf32Len; ++i ) utf32Out[i] = UTF32InSwap(utf32In+i);
259 }
260
261 // =================================================================================================
262
ToUTF16(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str,bool bigEndian)263 extern void ToUTF16 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str, bool bigEndian )
264 {
265 UTF8_to_UTF16_Proc Converter = UTF8_to_UTF16LE;
266 if ( bigEndian ) Converter = UTF8_to_UTF16BE;
267
268 enum { kBufferSize = 8*1024 };
269 UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
270 size_t readCount, writeCount;
271
272 utf16Str->erase();
273 utf16Str->reserve ( 2*utf8Len ); // As good a guess as any.
274
275 while ( utf8Len > 0 ) {
276 Converter ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
277 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
278 utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
279 utf8In += readCount;
280 utf8Len -= readCount;
281 }
282
283 } // ToUTF16
284
285 // =================================================================================================
286
ToUTF16Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf16Str)287 extern void ToUTF16Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf16Str )
288 {
289 enum { kBufferSize = 8*1024 };
290 UTF16Unit u16Buffer[kBufferSize]; // 16K bytes
291 size_t readCount, writeCount;
292
293 utf16Str->erase();
294 utf16Str->reserve ( 2*utf8Len ); // As good a guess as any.
295
296 while ( utf8Len > 0 ) {
297 UTF8_to_UTF16Nat ( utf8In, utf8Len, u16Buffer, kBufferSize, &readCount, &writeCount );
298 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
299 utf16Str->append ( (const char *)u16Buffer, writeCount*2 );
300 utf8In += readCount;
301 utf8Len -= readCount;
302 }
303
304 } // ToUTF16Native
305
306 // =================================================================================================
307
ToUTF32(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str,bool bigEndian)308 extern void ToUTF32 ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str, bool bigEndian )
309 {
310 UTF8_to_UTF32_Proc Converter = UTF8_to_UTF32LE;
311 if ( bigEndian ) Converter = UTF8_to_UTF32BE;
312
313 enum { kBufferSize = 4*1024 };
314 UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
315 size_t readCount, writeCount;
316
317 utf32Str->erase();
318 utf32Str->reserve ( 4*utf8Len ); // As good a guess as any.
319
320 while ( utf8Len > 0 ) {
321 Converter ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
322 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
323 utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
324 utf8In += readCount;
325 utf8Len -= readCount;
326 }
327
328 } // ToUTF32
329
330 // =================================================================================================
331
ToUTF32Native(const UTF8Unit * utf8In,size_t utf8Len,std::string * utf32Str)332 extern void ToUTF32Native ( const UTF8Unit * utf8In, size_t utf8Len, std::string * utf32Str )
333 {
334 enum { kBufferSize = 4*1024 };
335 UTF32Unit u32Buffer[kBufferSize]; // 16K bytes
336 size_t readCount, writeCount;
337
338 utf32Str->erase();
339 utf32Str->reserve ( 4*utf8Len ); // As good a guess as any.
340
341 while ( utf8Len > 0 ) {
342 UTF8_to_UTF32Nat ( utf8In, utf8Len, u32Buffer, kBufferSize, &readCount, &writeCount );
343 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
344 utf32Str->append ( (const char *)u32Buffer, writeCount*4 );
345 utf8In += readCount;
346 utf8Len -= readCount;
347 }
348
349 } // ToUTF32Native
350
351 // =================================================================================================
352
FromUTF16(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str,bool bigEndian)353 extern void FromUTF16 ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str, bool bigEndian )
354 {
355 UTF16_to_UTF8_Proc Converter = UTF16LE_to_UTF8;
356 if ( bigEndian ) Converter = UTF16BE_to_UTF8;
357
358 enum { kBufferSize = 16*1024 };
359 UTF8Unit u8Buffer[kBufferSize];
360 size_t readCount, writeCount;
361
362 utf8Str->erase();
363 utf8Str->reserve ( 2*utf16Len ); // As good a guess as any.
364
365 while ( utf16Len > 0 ) {
366 Converter ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
367 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
368 utf8Str->append ( (const char *)u8Buffer, writeCount );
369 utf16In += readCount;
370 utf16Len -= readCount;
371 }
372
373 } // FromUTF16
374
375 // =================================================================================================
376
FromUTF16Native(const UTF16Unit * utf16In,size_t utf16Len,std::string * utf8Str)377 extern void FromUTF16Native ( const UTF16Unit * utf16In, size_t utf16Len, std::string * utf8Str )
378 {
379 enum { kBufferSize = 16*1024 };
380 UTF8Unit u8Buffer[kBufferSize];
381 size_t readCount, writeCount;
382
383 utf8Str->erase();
384 utf8Str->reserve ( 2*utf16Len ); // As good a guess as any.
385
386 while ( utf16Len > 0 ) {
387 UTF16Nat_to_UTF8 ( utf16In, utf16Len, u8Buffer, kBufferSize, &readCount, &writeCount );
388 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
389 utf8Str->append ( (const char *)u8Buffer, writeCount );
390 utf16In += readCount;
391 utf16Len -= readCount;
392 }
393
394 } // FromUTF16Native
395
396 // =================================================================================================
397
FromUTF32(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str,bool bigEndian)398 extern void FromUTF32 ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str, bool bigEndian )
399 {
400 UTF32_to_UTF8_Proc Converter = UTF32LE_to_UTF8;
401 if ( bigEndian ) Converter = UTF32BE_to_UTF8;
402
403 enum { kBufferSize = 16*1024 };
404 UTF8Unit u8Buffer[kBufferSize];
405 size_t readCount, writeCount;
406
407 utf8Str->erase();
408 utf8Str->reserve ( 2*utf32Len ); // As good a guess as any.
409
410 while ( utf32Len > 0 ) {
411 Converter ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
412 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
413 utf8Str->append ( (const char *)u8Buffer, writeCount );
414 utf32In += readCount;
415 utf32Len -= readCount;
416 }
417
418 } // FromUTF32
419
420 // =================================================================================================
421
FromUTF32Native(const UTF32Unit * utf32In,size_t utf32Len,std::string * utf8Str)422 extern void FromUTF32Native ( const UTF32Unit * utf32In, size_t utf32Len, std::string * utf8Str )
423 {
424 enum { kBufferSize = 16*1024 };
425 UTF8Unit u8Buffer[kBufferSize];
426 size_t readCount, writeCount;
427
428 utf8Str->erase();
429 utf8Str->reserve ( 2*utf32Len ); // As good a guess as any.
430
431 while ( utf32Len > 0 ) {
432 UTF32Nat_to_UTF8 ( utf32In, utf32Len, u8Buffer, kBufferSize, &readCount, &writeCount );
433 if ( writeCount == 0 ) UC_Throw ( "Incomplete Unicode at end of string", kXMPErr_BadUnicode );
434 utf8Str->append ( (const char *)u8Buffer, writeCount );
435 utf32In += readCount;
436 utf32Len -= readCount;
437 }
438
439 } // FromUTF32Native
440
441 // =================================================================================================
442
CodePoint_to_UTF8_Multi(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)443 static void CodePoint_to_UTF8_Multi ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
444 {
445 size_t unitCount = 0;
446
447 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
448 if ( (0xD800 <= cpIn) && (cpIn <= 0xDFFF) ) UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
449
450 // Compute the number of bytes using 6 data bits each. Then see if the highest order bits will
451 // fit into the leading byte. Write the UTF-8 sequence if there is enough room.
452
453 UTF32Unit temp, mask;
454 size_t bytesNeeded = 0;
455 for ( temp = cpIn; temp != 0; temp = temp >> 6 ) ++bytesNeeded;
456
457 temp = cpIn >> ((bytesNeeded-1)*6); // The highest order data bits.
458 mask = (0x80 >> bytesNeeded) - 1; // Available data bits in the leading byte.
459 if ( temp > mask ) ++bytesNeeded;
460
461 if ( bytesNeeded > utf8Len ) goto Done; // Not enough room for the output.
462 unitCount = bytesNeeded;
463
464 temp = cpIn;
465 for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded ) {
466 utf8Out[bytesNeeded] = 0x80 | UTF8Unit ( temp & 0x3F );
467 temp = temp >> 6;
468 }
469
470 mask = ~((1 << (8-unitCount)) - 1);
471 utf8Out[0] = UTF8Unit ( mask | temp );
472
473 Done:
474 *utf8Written = unitCount;
475 return;
476
477 } // CodePoint_to_UTF8_Multi
478
479 // =================================================================================================
480
CodePoint_to_UTF8(const UTF32Unit cpIn,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf8Written)481 void CodePoint_to_UTF8 ( const UTF32Unit cpIn, UTF8Unit * utf8Out, const size_t utf8Len, size_t * utf8Written )
482 {
483 size_t unitCount = 0;
484
485 UC_Assert ( (utf8Out != 0) && (utf8Written != 0) );
486 if ( utf8Len == 0 ) goto Done;
487 if ( cpIn > 0x7F ) goto MultiByte; // ! Force linear execution path for ASCII.
488
489 unitCount = 1;
490 *utf8Out = UTF8Unit(cpIn);
491
492 Done:
493 *utf8Written = unitCount;
494 return;
495
496 MultiByte:
497 CodePoint_to_UTF8_Multi( cpIn, utf8Out, utf8Len, utf8Written );
498 return;
499
500 } // CodePoint_to_UTF8
501
502 // =================================================================================================
503
CodePoint_from_UTF8_Multi(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)504 static void CodePoint_from_UTF8_Multi ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
505 {
506 UTF8Unit inUnit = *utf8In;
507 size_t unitCount = 0;
508 UTF32Unit cp; // ! Avoid gcc complaints about declarations after goto's.
509 const UTF8Unit * utf8Pos;
510
511 // -------------------------------------------------------------------------------------
512 // We've got a multibyte UTF-8 character. The first byte has the number of bytes and the
513 // highest order data bits. The other bytes each add 6 more data bits.
514
515 #if 0 // This might be a more effcient way to count the bytes.
516 static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
517 size_t bytesNeeded = kByteCounts [ inUnit >> 4 ];
518 if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((inUnit & 0x08) != 0)) ) {
519 UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
520 }
521 #endif
522
523 size_t bytesNeeded = 0; // Count the leading 1 bits in the first byte.
524 for ( UTF8Unit temp = inUnit; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
525 // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
526
527 if ( (bytesNeeded < 2) || (bytesNeeded > 4) ) UC_Throw ( "Invalid UTF-8 sequence length", kXMPErr_BadParam );
528 if ( bytesNeeded > utf8Len ) goto Done; // Not enough input in this buffer.
529 unitCount = bytesNeeded;
530
531 cp = inUnit & ((1 << (7-unitCount)) - 1); // Isolate the initial data bits in the bottom of cp.
532
533 utf8Pos = utf8In + 1; // We've absorbed the first byte.
534 for ( --bytesNeeded; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
535 inUnit = *utf8Pos;
536 if ( (inUnit & UTF8Unit(0xC0)) != UTF8Unit(0x80) ) UC_Throw ( "Invalid UTF-8 data byte", kXMPErr_BadParam );
537 cp = (cp << 6) | (inUnit & 0x3F);
538 }
539
540 if ( cp >= 0xD800 ) { // Skip the next comparisons most of the time.
541 if ( (0xD800 <= cp) && (cp <= 0xDFFF) ) UC_Throw ( "Bad UTF-8 - surrogate code point", kXMPErr_BadParam );
542 if ( cp > 0x10FFFF ) UC_Throw ( "Bad UTF-8 - out of range", kXMPErr_BadParam );
543 }
544
545 *cpOut = cp; // ! Don't put after Done, don't write if no input.
546
547 Done:
548 *utf8Read = unitCount;
549 return;
550
551 } // CodePoint_from_UTF8_Multi
552
553 // =================================================================================================
554
CodePoint_from_UTF8(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * cpOut,size_t * utf8Read)555 void CodePoint_from_UTF8 ( const UTF8Unit * utf8In, const size_t utf8Len, UTF32Unit * cpOut, size_t * utf8Read )
556 {
557 UTF8Unit inUnit; // ! Don't read until we know there is input.
558 size_t unitCount = 0;
559
560 UC_Assert ( (utf8In != 0) && (cpOut != 0) && (utf8Read != 0) );
561 if ( utf8Len == 0 ) goto Done;
562 inUnit = *utf8In;
563 if ( inUnit >= 0x80 ) goto MultiByte; // ! Force linear execution path for ASCII.
564
565 unitCount = 1;
566 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
567
568 Done:
569 *utf8Read = unitCount;
570 return;
571
572 MultiByte:
573 CodePoint_from_UTF8_Multi ( utf8In, utf8Len, cpOut, utf8Read );
574 return;
575
576 } // CodePoint_from_UTF8
577
578 // =================================================================================================
579
CodePoint_to_UTF16Nat_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)580 static void CodePoint_to_UTF16Nat_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
581 {
582 size_t unitCount = 0;
583 UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
584
585 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
586 if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
587
588 unitCount = 2;
589 temp = cpIn - 0x10000;
590 utf16Out[0] = 0xD800 | UTF16Unit ( temp >> 10 );
591 utf16Out[1] = 0xDC00 | UTF16Unit ( temp & 0x3FF );
592
593 Done:
594 *utf16Written = unitCount;
595 return;
596
597 } // CodePoint_to_UTF16Nat_Surrogate
598
599 // =================================================================================================
600
CodePoint_to_UTF16Nat(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)601 static void CodePoint_to_UTF16Nat ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
602 {
603 size_t unitCount = 0;
604
605 UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
606 if ( utf16Len == 0 ) goto Done;
607 if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
608
609 InBMP:
610 unitCount = 1;
611 *utf16Out = UTF16Unit(cpIn);
612
613 Done:
614 *utf16Written = unitCount;
615 return;
616
617 CheckSurrogate:
618 if ( cpIn > 0xFFFF ) goto SurrogatePair;
619 if ( cpIn > 0xDFFF ) goto InBMP;
620 UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
621
622 SurrogatePair:
623 CodePoint_to_UTF16Nat_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
624 return;
625
626 } // CodePoint_to_UTF16Nat
627
628 // =================================================================================================
629
CodePoint_from_UTF16Nat_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)630 static void CodePoint_from_UTF16Nat_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
631 {
632 UTF16Unit hiUnit = *utf16In;
633 size_t unitCount = 0;
634 UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
635 UTF32Unit cp;
636
637 // ----------------------------------
638 // We've got a UTF-16 surrogate pair.
639
640 if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
641 if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
642
643 loUnit = *(utf16In+1);
644 if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
645
646 unitCount = 2;
647 cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
648
649 *cpOut = cp; // ! Don't put after Done, don't write if no input.
650
651 Done:
652 *utf16Read = unitCount;
653 return;
654
655 } // CodePoint_from_UTF16Nat_Surrogate
656
657 // =================================================================================================
658
CodePoint_from_UTF16Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)659 static void CodePoint_from_UTF16Nat ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
660 {
661 UTF16Unit inUnit; // ! Don't read until we know there is input.
662 size_t unitCount = 0;
663
664 UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
665 if ( utf16Len == 0 ) goto Done;
666 inUnit = *utf16In;
667 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
668
669 unitCount = 1;
670 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
671
672 Done:
673 *utf16Read = unitCount;
674 return;
675
676 SurrogatePair:
677 CodePoint_from_UTF16Nat_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
678 return;
679
680 } // CodePoint_from_UTF16Nat
681
682 // =================================================================================================
683
UTF8_to_UTF16Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)684 static void UTF8_to_UTF16Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
685 UTF16Unit * utf16Out, const size_t utf16Len,
686 size_t * utf8Read, size_t * utf16Written )
687 {
688 const UTF8Unit * utf8Pos = utf8In;
689 UTF16Unit * utf16Pos = utf16Out;
690
691 size_t utf8Left = utf8Len;
692 size_t utf16Left = utf16Len;
693
694 UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
695
696 while ( (utf8Left > 0) && (utf16Left > 0) ) {
697
698 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
699 size_t i, limit = utf8Left;
700 if ( limit > utf16Left ) limit = utf16Left;
701 for ( i = 0; i < limit; ++i ) {
702 UTF8Unit inUnit = *utf8Pos;
703 if ( inUnit > 0x7F ) break;
704 *utf16Pos = inUnit;
705 ++utf8Pos;
706 ++utf16Pos;
707 }
708 utf8Left -= i;
709 utf16Left -= i;
710
711 // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
712 while ( (utf8Left > 0) && (utf16Left > 0) ) {
713 UTF32Unit cp;
714 size_t len8, len16;
715 UTF8Unit inUnit = *utf8Pos;
716 if ( inUnit <= 0x7F ) break;
717 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
718 if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character.
719 if ( cp <= 0xFFFF ) {
720 *utf16Pos = UTF16Unit(cp);
721 len16 = 1;
722 } else {
723 CodePoint_to_UTF16Nat_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
724 if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
725 }
726 utf8Left -= len8;
727 utf8Pos += len8;
728 utf16Left -= len16;
729 utf16Pos += len16;
730 }
731
732 }
733
734 Done: // Set the output lengths.
735 *utf8Read = utf8Len - utf8Left;
736 *utf16Written = utf16Len - utf16Left;
737
738 } // UTF8_to_UTF16Nat
739
740 // =================================================================================================
741
UTF8_to_UTF32Nat(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)742 static void UTF8_to_UTF32Nat ( const UTF8Unit * utf8In, const size_t utf8Len,
743 UTF32Unit * utf32Out, const size_t utf32Len,
744 size_t * utf8Read, size_t * utf32Written )
745 {
746 const UTF8Unit * utf8Pos = utf8In;
747 UTF32Unit * utf32Pos = utf32Out;
748
749 size_t utf8Left = utf8Len;
750 size_t utf32Left = utf32Len;
751
752 UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
753
754 while ( (utf8Left > 0) && (utf32Left > 0) ) {
755
756 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
757 size_t i, limit = utf8Left;
758 if ( limit > utf32Left ) limit = utf32Left;
759 for ( i = 0; i < limit; ++i ) {
760 UTF8Unit inUnit = *utf8Pos;
761 if ( inUnit > 0x7F ) break;
762 *utf32Pos = inUnit;
763 ++utf8Pos;
764 ++utf32Pos;
765 }
766 utf8Left -= i;
767 utf32Left -= i;
768
769 // Do a run of non-ASCII, it copies variable input into 1 output unit.
770 while ( (utf8Left > 0) && (utf32Left > 0) ) {
771 size_t len;
772 UTF8Unit inUnit = *utf8Pos;
773 if ( inUnit <= 0x7F ) break;
774 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, utf32Pos, &len );
775 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
776 utf8Left -= len;
777 utf8Pos += len;
778 utf32Left -= 1;
779 utf32Pos += 1;
780 }
781
782 }
783
784 Done: // Set the output lengths.
785 *utf8Read = utf8Len - utf8Left;
786 *utf32Written = utf32Len - utf32Left;
787
788 } // UTF8_to_UTF32Nat
789
790 // =================================================================================================
791
UTF16Nat_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)792 static void UTF16Nat_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
793 UTF8Unit * utf8Out, const size_t utf8Len,
794 size_t * utf16Read, size_t * utf8Written )
795 {
796 const UTF16Unit * utf16Pos = utf16In;
797 UTF8Unit * utf8Pos = utf8Out;
798
799 size_t utf16Left = utf16Len;
800 size_t utf8Left = utf8Len;
801
802 UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
803
804 while ( (utf16Left > 0) && (utf8Left > 0) ) {
805
806 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
807 size_t i, limit = utf16Left;
808 if ( limit > utf8Left ) limit = utf8Left;
809 for ( i = 0; i < limit; ++i ) {
810 UTF16Unit inUnit = *utf16Pos;
811 if ( inUnit > 0x7F ) break;
812 *utf8Pos = UTF8Unit(inUnit);
813 ++utf16Pos;
814 ++utf8Pos;
815 }
816 utf16Left -= i;
817 utf8Left -= i;
818
819 // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
820 while ( (utf16Left > 0) && (utf8Left > 0) ) {
821 size_t len8;
822 UTF16Unit inUnit = *utf16Pos;
823 if ( inUnit <= 0x7F ) break;
824 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
825 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
826 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
827 utf16Left -= 1;
828 utf16Pos += 1;
829 utf8Left -= len8;
830 utf8Pos += len8;
831 }
832
833 // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
834 while ( (utf16Left > 0) && (utf8Left > 0) ) {
835 UTF32Unit cp;
836 size_t len16, len8;
837 UTF16Unit inUnit = *utf16Pos;
838 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
839 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
840 if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
841 UC_Assert ( len16 == 2 );
842 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
843 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
844 utf16Left -= len16;
845 utf16Pos += len16;
846 utf8Left -= len8;
847 utf8Pos += len8;
848 }
849
850 }
851
852 Done: // Set the output lengths.
853 *utf16Read = utf16Len - utf16Left;
854 *utf8Written = utf8Len - utf8Left;
855
856 } // UTF16Nat_to_UTF8
857
858 // =================================================================================================
859
UTF32Nat_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)860 static void UTF32Nat_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
861 UTF8Unit * utf8Out, const size_t utf8Len,
862 size_t * utf32Read, size_t * utf8Written )
863 {
864 const UTF32Unit * utf32Pos = utf32In;
865 UTF8Unit * utf8Pos = utf8Out;
866
867 size_t utf32Left = utf32Len;
868 size_t utf8Left = utf8Len;
869
870 UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
871
872 while ( (utf32Left > 0) && (utf8Left > 0) ) {
873
874 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
875 size_t i, limit = utf32Left;
876 if ( limit > utf8Left ) limit = utf8Left;
877 for ( i = 0; i < limit; ++i ) {
878 UTF32Unit inUnit = *utf32Pos;
879 if ( inUnit > 0x7F ) break;
880 *utf8Pos = UTF8Unit(inUnit);
881 ++utf32Pos;
882 ++utf8Pos;
883 }
884 utf32Left -= i;
885 utf8Left -= i;
886
887 // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
888 while ( (utf32Left > 0) && (utf8Left > 0) ) {
889 size_t len;
890 UTF32Unit inUnit = *utf32Pos;
891 if ( inUnit <= 0x7F ) break;
892 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len );
893 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
894 utf32Left -= 1;
895 utf32Pos += 1;
896 utf8Left -= len;
897 utf8Pos += len;
898 }
899
900 }
901
902 Done: // Set the output lengths.
903 *utf32Read = utf32Len - utf32Left;
904 *utf8Written = utf8Len - utf8Left;
905
906 } // UTF32Nat_to_UTF8
907
908 // =================================================================================================
909
UTF16Nat_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)910 static void UTF16Nat_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
911 UTF32Unit * utf32Out, const size_t utf32Len,
912 size_t * utf16Read, size_t * utf32Written )
913 {
914 const UTF16Unit * utf16Pos = utf16In;
915 UTF32Unit * utf32Pos = utf32Out;
916
917 size_t utf16Left = utf16Len;
918 size_t utf32Left = utf32Len;
919
920 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
921
922 while ( (utf16Left > 0) && (utf32Left > 0) ) {
923
924 // Do a run of BMP, it copies 1 input unit into 1 output unit.
925 size_t i, limit = utf16Left;
926 if ( limit > utf32Left ) limit = utf32Left;
927 for ( i = 0; i < limit; ++i ) {
928 UTF16Unit inUnit = *utf16Pos;
929 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
930 *utf32Pos = inUnit;
931 ++utf16Pos;
932 ++utf32Pos;
933 }
934 utf16Left -= i;
935 utf32Left -= i;
936
937 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
938 while ( (utf16Left > 0) && (utf32Left > 0) ) {
939 size_t len;
940 UTF16Unit inUnit = *utf16Pos;
941 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
942 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
943 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
944 UC_Assert ( len == 2 );
945 utf16Left -= len;
946 utf16Pos += len;
947 utf32Left -= 1;
948 utf32Pos += 1;
949 }
950
951 }
952
953 Done: // Set the output lengths.
954 *utf16Read = utf16Len - utf16Left;
955 *utf32Written = utf32Len - utf32Left;
956
957 } // UTF16Nat_to_UTF32Nat
958
959 // =================================================================================================
960
UTF32Nat_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)961 static void UTF32Nat_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
962 UTF16Unit * utf16Out, const size_t utf16Len,
963 size_t * utf32Read, size_t * utf16Written )
964 {
965 const UTF32Unit * utf32Pos = utf32In;
966 UTF16Unit * utf16Pos = utf16Out;
967
968 size_t utf32Left = utf32Len;
969 size_t utf16Left = utf16Len;
970
971 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
972
973 while ( (utf32Left > 0) && (utf16Left > 0) ) {
974
975 // Do a run of BMP, it copies 1 input unit into 1 output unit.
976 size_t i, limit = utf32Left;
977 if ( limit > utf16Left ) limit = utf16Left;
978 for ( i = 0; i < limit; ++i ) {
979 UTF32Unit inUnit = *utf32Pos;
980 if ( inUnit > 0xFFFF ) break;
981 *utf16Pos = UTF16Unit(inUnit);
982 ++utf32Pos;
983 ++utf16Pos;
984 }
985 utf32Left -= i;
986 utf16Left -= i;
987
988 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
989 while ( (utf32Left > 0) && (utf16Left > 0) ) {
990 size_t len;
991 UTF32Unit inUnit = *utf32Pos;
992 if ( inUnit <= 0xFFFF ) break;
993 CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
994 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
995 UC_Assert ( len == 2 );
996 utf32Left -= 1;
997 utf32Pos += 1;
998 utf16Left -= 2;
999 utf16Pos += 2;
1000 }
1001
1002 }
1003
1004 Done: // Set the output lengths.
1005 *utf32Read = utf32Len - utf32Left;
1006 *utf16Written = utf16Len - utf16Left;
1007
1008 } // UTF32Nat_to_UTF16Nat
1009
1010 // =================================================================================================
1011
CodePoint_to_UTF16Swp_Surrogate(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1012 static void CodePoint_to_UTF16Swp_Surrogate ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1013 {
1014 size_t unitCount = 0;
1015 UTF32Unit temp; // ! Avoid gcc complaints about declarations after goto's.
1016
1017 if ( cpIn > 0x10FFFF ) UC_Throw ( "Bad UTF-32 - out of range", kXMPErr_BadParam );
1018 if ( utf16Len < 2 ) goto Done; // Not enough room for the output.
1019
1020 unitCount = 2;
1021 temp = cpIn - 0x10000;
1022 UTF16OutSwap ( &utf16Out[0], (0xD800 | UTF16Unit ( temp >> 10 )) );
1023 UTF16OutSwap ( &utf16Out[1], (0xDC00 | UTF16Unit ( temp & 0x3FF)) );
1024
1025 Done:
1026 *utf16Written = unitCount;
1027 return;
1028
1029 } // CodePoint_to_UTF16Swp_Surrogate
1030
1031 // =================================================================================================
1032
CodePoint_to_UTF16Swp(const UTF32Unit cpIn,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf16Written)1033 static void CodePoint_to_UTF16Swp ( const UTF32Unit cpIn, UTF16Unit * utf16Out, const size_t utf16Len, size_t * utf16Written )
1034 {
1035 size_t unitCount = 0;
1036
1037 UC_Assert ( (utf16Out != 0) && (utf16Written != 0) );
1038 if ( utf16Len == 0 ) goto Done;
1039 if ( cpIn >= 0xD800 ) goto CheckSurrogate; // ! Force linear execution path for the BMP.
1040
1041 InBMP:
1042 unitCount = 1;
1043 UTF16OutSwap ( utf16Out, UTF16Unit(cpIn) );
1044
1045 Done:
1046 *utf16Written = unitCount;
1047 return;
1048
1049 CheckSurrogate:
1050 if ( cpIn > 0xFFFF ) goto SurrogatePair;
1051 if ( cpIn > 0xDFFF ) goto InBMP;
1052 UC_Throw ( "Bad UTF-32 - surrogate code point", kXMPErr_BadParam );
1053
1054 SurrogatePair:
1055 CodePoint_to_UTF16Swp_Surrogate ( cpIn, utf16Out, utf16Len, utf16Written );
1056 return;
1057
1058 } // CodePoint_to_UTF16Swp
1059
1060 // =================================================================================================
1061
CodePoint_from_UTF16Swp_Surrogate(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1062 static void CodePoint_from_UTF16Swp_Surrogate ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1063 {
1064 UTF16Unit hiUnit = UTF16InSwap(utf16In);
1065 size_t unitCount = 0;
1066 UTF16Unit loUnit; // ! Avoid gcc complaints about declarations after goto's.
1067 UTF32Unit cp;
1068
1069 // ----------------------------------
1070 // We've got a UTF-16 surrogate pair.
1071
1072 if ( hiUnit > 0xDBFF ) UC_Throw ( "Bad UTF-16 - leading low surrogate", kXMPErr_BadParam );
1073 if ( utf16Len < 2 ) goto Done; // Not enough input in this buffer.
1074
1075 loUnit = UTF16InSwap(utf16In+1);
1076 if ( (loUnit < 0xDC00) || (0xDFFF < loUnit) ) UC_Throw ( "Bad UTF-16 - missing low surrogate", kXMPErr_BadParam );
1077
1078 unitCount = 2;
1079 cp = (((hiUnit & 0x3FF) << 10) | (loUnit & 0x3FF)) + 0x10000;
1080
1081 *cpOut = cp; // ! Don't put after Done, don't write if no input.
1082
1083 Done:
1084 *utf16Read = unitCount;
1085 return;
1086
1087 } // CodePoint_from_UTF16Swp_Surrogate
1088
1089 // =================================================================================================
1090
CodePoint_from_UTF16Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * cpOut,size_t * utf16Read)1091 static void CodePoint_from_UTF16Swp ( const UTF16Unit * utf16In, const size_t utf16Len, UTF32Unit * cpOut, size_t * utf16Read )
1092 {
1093 UTF16Unit inUnit; // ! Don't read until we know there is input.
1094 size_t unitCount = 0;
1095
1096 UC_Assert ( (utf16In != 0) && (cpOut != 0) && (utf16Read != 0) );
1097 if ( utf16Len == 0 ) goto Done;
1098 inUnit = UTF16InSwap(utf16In);
1099 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) goto SurrogatePair; // ! Force linear execution path for the BMP.
1100
1101 unitCount = 1;
1102 *cpOut = inUnit; // ! Don't put after Done, don't write if no input.
1103
1104 Done:
1105 *utf16Read = unitCount;
1106 return;
1107
1108 SurrogatePair:
1109 CodePoint_from_UTF16Swp_Surrogate ( utf16In, utf16Len, cpOut, utf16Read );
1110 return;
1111
1112 } // CodePoint_from_UTF16Swp
1113
1114 // =================================================================================================
1115
UTF8_to_UTF16Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf8Read,size_t * utf16Written)1116 static void UTF8_to_UTF16Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
1117 UTF16Unit * utf16Out, const size_t utf16Len,
1118 size_t * utf8Read, size_t * utf16Written )
1119 {
1120 const UTF8Unit * utf8Pos = utf8In;
1121 UTF16Unit * utf16Pos = utf16Out;
1122
1123 size_t utf8Left = utf8Len;
1124 size_t utf16Left = utf16Len;
1125
1126 UC_Assert ( (utf8In != 0) && (utf16Out != 0) && (utf8Read != 0) && (utf16Written != 0) );
1127
1128 while ( (utf8Left > 0) && (utf16Left > 0) ) {
1129
1130 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1131 size_t i, limit = utf8Left;
1132 if ( limit > utf16Left ) limit = utf16Left;
1133 for ( i = 0; i < limit; ++i ) {
1134 UTF8Unit inUnit = *utf8Pos;
1135 if ( inUnit > 0x7F ) break;
1136 *utf16Pos = UTF16Unit(inUnit) << 8; // Better than: UTF16OutSwap ( utf16Pos, inUnit );
1137 ++utf8Pos;
1138 ++utf16Pos;
1139 }
1140 utf8Left -= i;
1141 utf16Left -= i;
1142
1143 // Do a run of non-ASCII, it copies multiple input units into 1 or 2 output units.
1144 while ( (utf8Left > 0) && (utf16Left > 0) ) {
1145 UTF32Unit cp;
1146 size_t len8, len16;
1147 UTF8Unit inUnit = *utf8Pos;
1148 if ( inUnit <= 0x7F ) break;
1149 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len8 );
1150 if ( len8 == 0 ) goto Done; // The input buffer ends in the middle of a character.
1151 if ( cp <= 0xFFFF ) {
1152 UTF16OutSwap ( utf16Pos, UTF16Unit(cp) );
1153 len16 = 1;
1154 } else {
1155 CodePoint_to_UTF16Swp_Surrogate ( cp, utf16Pos, utf16Left, &len16 );
1156 if ( len16 == 0 ) goto Done; // Not enough room in the output buffer.
1157 }
1158 utf8Left -= len8;
1159 utf8Pos += len8;
1160 utf16Left -= len16;
1161 utf16Pos += len16;
1162 }
1163
1164 }
1165
1166 Done: // Set the output lengths.
1167 *utf8Read = utf8Len - utf8Left;
1168 *utf16Written = utf16Len - utf16Left;
1169
1170 } // UTF8_to_UTF16Swp
1171
1172 // =================================================================================================
1173
UTF8_to_UTF32Swp(const UTF8Unit * utf8In,const size_t utf8Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf8Read,size_t * utf32Written)1174 static void UTF8_to_UTF32Swp ( const UTF8Unit * utf8In, const size_t utf8Len,
1175 UTF32Unit * utf32Out, const size_t utf32Len,
1176 size_t * utf8Read, size_t * utf32Written )
1177 {
1178 const UTF8Unit * utf8Pos = utf8In;
1179 UTF32Unit * utf32Pos = utf32Out;
1180
1181 size_t utf8Left = utf8Len;
1182 size_t utf32Left = utf32Len;
1183
1184 UC_Assert ( (utf8In != 0) && (utf32Out != 0) && (utf8Read != 0) && (utf32Written != 0) );
1185
1186 while ( (utf8Left > 0) && (utf32Left > 0) ) {
1187
1188 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1189 size_t i, limit = utf8Left;
1190 if ( limit > utf32Left ) limit = utf32Left;
1191 for ( i = 0; i < limit; ++i ) {
1192 UTF8Unit inUnit = *utf8Pos;
1193 if ( inUnit > 0x7F ) break;
1194 *utf32Pos = UTF32Unit(inUnit) << 24; // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1195 ++utf8Pos;
1196 ++utf32Pos;
1197 }
1198 utf8Left -= i;
1199 utf32Left -= i;
1200
1201 // Do a run of non-ASCII, it copies variable input into 1 output unit.
1202 while ( (utf8Left > 0) && (utf32Left > 0) ) {
1203 size_t len;
1204 UTF32Unit cp;
1205 UTF8Unit inUnit = *utf8Pos;
1206 if ( inUnit <= 0x7F ) break;
1207 CodePoint_from_UTF8_Multi ( utf8Pos, utf8Left, &cp, &len );
1208 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a character.
1209 UTF32OutSwap ( utf32Pos, cp );
1210 utf8Left -= len;
1211 utf8Pos += len;
1212 utf32Left -= 1;
1213 utf32Pos += 1;
1214 }
1215
1216 }
1217
1218 Done: // Set the output lengths.
1219 *utf8Read = utf8Len - utf8Left;
1220 *utf32Written = utf32Len - utf32Left;
1221
1222 } // UTF8_to_UTF32Swp
1223
1224 // =================================================================================================
1225
UTF16Swp_to_UTF8(const UTF16Unit * utf16In,const size_t utf16Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf16Read,size_t * utf8Written)1226 static void UTF16Swp_to_UTF8 ( const UTF16Unit * utf16In, const size_t utf16Len,
1227 UTF8Unit * utf8Out, const size_t utf8Len,
1228 size_t * utf16Read, size_t * utf8Written )
1229 {
1230 const UTF16Unit * utf16Pos = utf16In;
1231 UTF8Unit * utf8Pos = utf8Out;
1232
1233 size_t utf16Left = utf16Len;
1234 size_t utf8Left = utf8Len;
1235
1236 UC_Assert ( (utf16In != 0) && (utf8Out != 0) && (utf16Read != 0) && (utf8Written != 0) );
1237
1238 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1239
1240 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1241 size_t i, limit = utf16Left;
1242 if ( limit > utf8Left ) limit = utf8Left;
1243 for ( i = 0; i < limit; ++i ) {
1244 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1245 if ( inUnit > 0x7F ) break;
1246 *utf8Pos = UTF8Unit(inUnit);
1247 ++utf16Pos;
1248 ++utf8Pos;
1249 }
1250 utf16Left -= i;
1251 utf8Left -= i;
1252
1253 // Do a run of non-ASCII inside the BMP, it copies 1 input unit into multiple output units.
1254 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1255 size_t len8;
1256 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1257 if ( inUnit <= 0x7F ) break;
1258 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1259 CodePoint_to_UTF8_Multi ( inUnit, utf8Pos, utf8Left, &len8 );
1260 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
1261 utf16Left -= 1;
1262 utf16Pos += 1;
1263 utf8Left -= len8;
1264 utf8Pos += len8;
1265 }
1266
1267 // Do a run of surrogate pairs, it copies 2 input units into multiple output units.
1268 while ( (utf16Left > 0) && (utf8Left > 0) ) {
1269 UTF32Unit cp;
1270 size_t len16, len8;
1271 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1272 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1273 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len16 );
1274 if ( len16 == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1275 UC_Assert ( len16 == 2 );
1276 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len8 );
1277 if ( len8 == 0 ) goto Done; // Not enough room in the output buffer.
1278 utf16Left -= len16;
1279 utf16Pos += len16;
1280 utf8Left -= len8;
1281 utf8Pos += len8;
1282 }
1283
1284 }
1285
1286 Done: // Set the output lengths.
1287 *utf16Read = utf16Len - utf16Left;
1288 *utf8Written = utf8Len - utf8Left;
1289
1290 } // UTF16Swp_to_UTF8
1291
1292 // =================================================================================================
1293
UTF32Swp_to_UTF8(const UTF32Unit * utf32In,const size_t utf32Len,UTF8Unit * utf8Out,const size_t utf8Len,size_t * utf32Read,size_t * utf8Written)1294 static void UTF32Swp_to_UTF8 ( const UTF32Unit * utf32In, const size_t utf32Len,
1295 UTF8Unit * utf8Out, const size_t utf8Len,
1296 size_t * utf32Read, size_t * utf8Written )
1297 {
1298 const UTF32Unit * utf32Pos = utf32In;
1299 UTF8Unit * utf8Pos = utf8Out;
1300
1301 size_t utf32Left = utf32Len;
1302 size_t utf8Left = utf8Len;
1303
1304 UC_Assert ( (utf32In != 0) && (utf8Out != 0) && (utf32Read != 0) && (utf8Written != 0) );
1305
1306 while ( (utf32Left > 0) && (utf8Left > 0) ) {
1307
1308 // Do a run of ASCII, it copies 1 input unit into 1 output unit.
1309 size_t i, limit = utf32Left;
1310 if ( limit > utf8Left ) limit = utf8Left;
1311 for ( i = 0; i < limit; ++i ) {
1312 UTF32Unit cp = UTF32InSwap(utf32Pos);
1313 if ( cp > 0x7F ) break;
1314 *utf8Pos = UTF8Unit(cp);
1315 ++utf32Pos;
1316 ++utf8Pos;
1317 }
1318 utf32Left -= i;
1319 utf8Left -= i;
1320
1321 // Do a run of non-ASCII, it copies 1 input unit into multiple output units.
1322 while ( (utf32Left > 0) && (utf8Left > 0) ) {
1323 size_t len;
1324 UTF32Unit cp = UTF32InSwap(utf32Pos);
1325 if ( cp <= 0x7F ) break;
1326 CodePoint_to_UTF8_Multi ( cp, utf8Pos, utf8Left, &len );
1327 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1328 utf32Left -= 1;
1329 utf32Pos += 1;
1330 utf8Left -= len;
1331 utf8Pos += len;
1332 }
1333
1334 }
1335
1336 Done: // Set the output lengths.
1337 *utf32Read = utf32Len - utf32Left;
1338 *utf8Written = utf8Len - utf8Left;
1339
1340 } // UTF32Swp_to_UTF8
1341
1342 // =================================================================================================
1343
UTF16Swp_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1344 static void UTF16Swp_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
1345 UTF32Unit * utf32Out, const size_t utf32Len,
1346 size_t * utf16Read, size_t * utf32Written )
1347 {
1348 const UTF16Unit * utf16Pos = utf16In;
1349 UTF32Unit * utf32Pos = utf32Out;
1350
1351 size_t utf16Left = utf16Len;
1352 size_t utf32Left = utf32Len;
1353
1354 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1355
1356 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1357
1358 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1359 size_t i, limit = utf16Left;
1360 if ( limit > utf32Left ) limit = utf32Left;
1361 for ( i = 0; i < limit; ++i ) {
1362 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1363 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1364 *utf32Pos = UTF32Unit(*utf16Pos) << 16; // Better than: UTF32OutSwap ( utf32Pos, inUnit );
1365 ++utf16Pos;
1366 ++utf32Pos;
1367 }
1368 utf16Left -= i;
1369 utf32Left -= i;
1370
1371 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1372 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1373 size_t len;
1374 UTF32Unit cp;
1375 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1376 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1377 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1378 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1379 UTF32OutSwap ( utf32Pos, cp );
1380 UC_Assert ( len == 2 );
1381 utf16Left -= len;
1382 utf16Pos += len;
1383 utf32Left -= 1;
1384 utf32Pos += 1;
1385 }
1386
1387 }
1388
1389 Done: // Set the output lengths.
1390 *utf16Read = utf16Len - utf16Left;
1391 *utf32Written = utf32Len - utf32Left;
1392
1393 } // UTF16Swp_to_UTF32Swp
1394
1395 // =================================================================================================
1396
UTF32Swp_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1397 static void UTF32Swp_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
1398 UTF16Unit * utf16Out, const size_t utf16Len,
1399 size_t * utf32Read, size_t * utf16Written )
1400 {
1401 const UTF32Unit * utf32Pos = utf32In;
1402 UTF16Unit * utf16Pos = utf16Out;
1403
1404 size_t utf32Left = utf32Len;
1405 size_t utf16Left = utf16Len;
1406
1407 const size_t k32to16Offset = swap32to16Offset; // ! Make sure compiler treats as an invariant.
1408
1409 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1410
1411 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1412
1413 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1414 size_t i, limit = utf32Left;
1415 if ( limit > utf16Left ) limit = utf16Left;
1416 for ( i = 0; i < limit; ++i ) {
1417 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1418 if ( inUnit > 0xFFFF ) break;
1419 *utf16Pos = *(((UTF16Unit*)utf32Pos) + k32to16Offset); // Better than: UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1420 ++utf32Pos;
1421 ++utf16Pos;
1422 }
1423 utf32Left -= i;
1424 utf16Left -= i;
1425
1426 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1427 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1428 size_t len;
1429 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1430 if ( inUnit <= 0xFFFF ) break;
1431 CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1432 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1433 UC_Assert ( len == 2 );
1434 utf32Left -= 1;
1435 utf32Pos += 1;
1436 utf16Left -= 2;
1437 utf16Pos += 2;
1438 }
1439
1440 }
1441
1442 Done: // Set the output lengths.
1443 *utf32Read = utf32Len - utf32Left;
1444 *utf16Written = utf16Len - utf16Left;
1445
1446 } // UTF32Swp_to_UTF16Swp
1447
1448 // =================================================================================================
1449
UTF16Nat_to_UTF32Swp(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1450 static void UTF16Nat_to_UTF32Swp ( const UTF16Unit * utf16In, const size_t utf16Len,
1451 UTF32Unit * utf32Out, const size_t utf32Len,
1452 size_t * utf16Read, size_t * utf32Written )
1453 {
1454 const UTF16Unit * utf16Pos = utf16In;
1455 UTF32Unit * utf32Pos = utf32Out;
1456
1457 size_t utf16Left = utf16Len;
1458 size_t utf32Left = utf32Len;
1459
1460 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1461
1462 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1463
1464 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1465 size_t i, limit = utf16Left;
1466 if ( limit > utf32Left ) limit = utf32Left;
1467 for ( i = 0; i < limit; ++i ) {
1468 UTF16Unit inUnit = *utf16Pos;
1469 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1470 UTF32OutSwap ( utf32Pos, inUnit );
1471 ++utf16Pos;
1472 ++utf32Pos;
1473 }
1474 utf16Left -= i;
1475 utf32Left -= i;
1476
1477 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1478 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1479 size_t len;
1480 UTF32Unit cp;
1481 UTF16Unit inUnit = *utf16Pos;
1482 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1483 CodePoint_from_UTF16Nat_Surrogate ( utf16Pos, utf16Left, &cp, &len );
1484 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1485 UC_Assert ( len == 2 );
1486 UTF32OutSwap ( utf32Pos, cp );
1487 utf16Left -= len;
1488 utf16Pos += len;
1489 utf32Left -= 1;
1490 utf32Pos += 1;
1491 }
1492
1493 }
1494
1495 Done: // Set the output lengths.
1496 *utf16Read = utf16Len - utf16Left;
1497 *utf32Written = utf32Len - utf32Left;
1498
1499 } // UTF16Nat_to_UTF32Swp
1500
1501 // =================================================================================================
1502
UTF16Swp_to_UTF32Nat(const UTF16Unit * utf16In,const size_t utf16Len,UTF32Unit * utf32Out,const size_t utf32Len,size_t * utf16Read,size_t * utf32Written)1503 static void UTF16Swp_to_UTF32Nat ( const UTF16Unit * utf16In, const size_t utf16Len,
1504 UTF32Unit * utf32Out, const size_t utf32Len,
1505 size_t * utf16Read, size_t * utf32Written )
1506 {
1507 const UTF16Unit * utf16Pos = utf16In;
1508 UTF32Unit * utf32Pos = utf32Out;
1509
1510 size_t utf16Left = utf16Len;
1511 size_t utf32Left = utf32Len;
1512
1513 UC_Assert ( (utf16In != 0) && (utf32Out != 0) && (utf16Read != 0) && (utf32Written != 0) );
1514
1515 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1516
1517 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1518 size_t i, limit = utf16Left;
1519 if ( limit > utf32Left ) limit = utf32Left;
1520 for ( i = 0; i < limit; ++i ) {
1521 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1522 if ( (0xD800 <= inUnit) && (inUnit <= 0xDFFF) ) break;
1523 *utf32Pos = inUnit;
1524 ++utf16Pos;
1525 ++utf32Pos;
1526 }
1527 utf16Left -= i;
1528 utf32Left -= i;
1529
1530 // Do a run of surrogate pairs, it copies 2 input units into 1 output unit.
1531 while ( (utf16Left > 0) && (utf32Left > 0) ) {
1532 size_t len;
1533 UTF16Unit inUnit = UTF16InSwap(utf16Pos);
1534 if ( (inUnit < 0xD800) || (0xDFFF < inUnit) ) break;
1535 CodePoint_from_UTF16Swp_Surrogate ( utf16Pos, utf16Left, utf32Pos, &len );
1536 if ( len == 0 ) goto Done; // The input buffer ends in the middle of a surrogate pair.
1537 UC_Assert ( len == 2 );
1538 utf16Left -= len;
1539 utf16Pos += len;
1540 utf32Left -= 1;
1541 utf32Pos += 1;
1542 }
1543
1544 }
1545
1546 Done: // Set the output lengths.
1547 *utf16Read = utf16Len - utf16Left;
1548 *utf32Written = utf32Len - utf32Left;
1549
1550 } // UTF16Swp_to_UTF32Nat
1551
1552 // =================================================================================================
1553
UTF32Nat_to_UTF16Swp(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1554 static void UTF32Nat_to_UTF16Swp ( const UTF32Unit * utf32In, const size_t utf32Len,
1555 UTF16Unit * utf16Out, const size_t utf16Len,
1556 size_t * utf32Read, size_t * utf16Written )
1557 {
1558 const UTF32Unit * utf32Pos = utf32In;
1559 UTF16Unit * utf16Pos = utf16Out;
1560
1561 size_t utf32Left = utf32Len;
1562 size_t utf16Left = utf16Len;
1563
1564 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1565
1566 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1567
1568 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1569 size_t i, limit = utf32Left;
1570 if ( limit > utf16Left ) limit = utf16Left;
1571 for ( i = 0; i < limit; ++i ) {
1572 UTF32Unit inUnit = *utf32Pos;
1573 if ( inUnit > 0xFFFF ) break;
1574 UTF16OutSwap ( utf16Pos, UTF16Unit(inUnit) );
1575 ++utf32Pos;
1576 ++utf16Pos;
1577 }
1578 utf32Left -= i;
1579 utf16Left -= i;
1580
1581 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1582 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1583 size_t len;
1584 UTF32Unit inUnit = *utf32Pos;
1585 if ( inUnit <= 0xFFFF ) break;
1586 CodePoint_to_UTF16Swp_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1587 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1588 UC_Assert ( len == 2 );
1589 utf32Left -= 1;
1590 utf32Pos += 1;
1591 utf16Left -= 2;
1592 utf16Pos += 2;
1593 }
1594
1595 }
1596
1597 Done: // Set the output lengths.
1598 *utf32Read = utf32Len - utf32Left;
1599 *utf16Written = utf16Len - utf16Left;
1600
1601 } // UTF32Nat_to_UTF16Swp
1602
1603 // =================================================================================================
1604
UTF32Swp_to_UTF16Nat(const UTF32Unit * utf32In,const size_t utf32Len,UTF16Unit * utf16Out,const size_t utf16Len,size_t * utf32Read,size_t * utf16Written)1605 static void UTF32Swp_to_UTF16Nat ( const UTF32Unit * utf32In, const size_t utf32Len,
1606 UTF16Unit * utf16Out, const size_t utf16Len,
1607 size_t * utf32Read, size_t * utf16Written )
1608 {
1609 const UTF32Unit * utf32Pos = utf32In;
1610 UTF16Unit * utf16Pos = utf16Out;
1611
1612 size_t utf32Left = utf32Len;
1613 size_t utf16Left = utf16Len;
1614
1615 UC_Assert ( (utf32In != 0) && (utf16Out != 0) && (utf32Read != 0) && (utf16Written != 0) );
1616
1617 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1618
1619 // Do a run of BMP, it copies 1 input unit into 1 output unit.
1620 size_t i, limit = utf32Left;
1621 if ( limit > utf16Left ) limit = utf16Left;
1622 for ( i = 0; i < limit; ++i ) {
1623 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1624 if ( inUnit > 0xFFFF ) break;
1625 *utf16Pos = UTF16Unit(inUnit);
1626 ++utf32Pos;
1627 ++utf16Pos;
1628 }
1629 utf32Left -= i;
1630 utf16Left -= i;
1631
1632 // Do a run of non-BMP, it copies 1 input unit into 2 output units.
1633 while ( (utf32Left > 0) && (utf16Left > 0) ) {
1634 size_t len;
1635 UTF32Unit inUnit = UTF32InSwap(utf32Pos);
1636 if ( inUnit <= 0xFFFF ) break;
1637 CodePoint_to_UTF16Nat_Surrogate ( inUnit, utf16Pos, utf16Left, &len );
1638 if ( len == 0 ) goto Done; // Not enough room in the output buffer.
1639 UC_Assert ( len == 2 );
1640 utf32Left -= 1;
1641 utf32Pos += 1;
1642 utf16Left -= 2;
1643 utf16Pos += 2;
1644 }
1645
1646 }
1647
1648 Done: // Set the output lengths.
1649 *utf32Read = utf32Len - utf32Left;
1650 *utf16Written = utf16Len - utf16Left;
1651
1652 } // UTF32Swp_to_UTF16Nat
1653
1654 // =================================================================================================
1655