1 /*---------------------------------------------------------------------------*
2 | PDFlib - A library for generating PDF on the fly |
3 +---------------------------------------------------------------------------+
4 | Copyright (c) 1997-2006 Thomas Merz and PDFlib GmbH. All rights reserved. |
5 +---------------------------------------------------------------------------+
6 | |
7 | This software is subject to the PDFlib license. It is NOT in the |
8 | public domain. Extended versions and commercial licenses are |
9 | available, please check http://www.pdflib.com. |
10 | |
11 *---------------------------------------------------------------------------*/
12
13 /* $Id: pc_unicode.c,v 1.179.2.32 2009/04/01 19:33:27 kurt Exp $
14 *
15 * PDFlib Unicode converting routines
16 *
17 */
18
19 #define PC_UNICODE_C
20
21 #include "pc_util.h"
22
23 #if defined(WIN32)
24 #define WIN32_LEAN_AND_MEAN
25 #include <windows.h>
26 #endif /* WIN32 */
27
28 /*
29 * The following source is based on Unicode's original source
30 * code ConvertUTF.c. It has been adapted to PDFlib programming
31 * conventions.
32 *
33 * The original file had the following notice:
34 *
35 * Copyright 2001 Unicode, Inc.
36 *
37 * Limitations on Rights to Redistribute This Code
38 *
39 * Author: Mark E. Davis, 1994.
40 * Rev History: Rick McGowan, fixes & updates May 2001.
41 *
42 *
43 * Functions for conversions between UTF32, UTF-16, and UTF-8.
44 * These functions form a complete set of conversions between
45 * the three formats. UTF-7 is not included here.
46 *
47 * Each of these routines takes pointers to input buffers and output
48 * buffers. The input buffers are const.
49 *
50 * Each routine converts the text between *sourceStart and sourceEnd,
51 * putting the result into the buffer between *targetStart and
52 * targetEnd. Note: the end pointers are *after* the last item: e.g.
53 * *(sourceEnd - 1) is the last item.
54 *
55 * The return result indicates whether the conversion was successful,
56 * and if not, whether the problem was in the source or target buffers.
57 * (Only the first encountered problem is indicated.)
58 *
59 * After the conversion, *sourceStart and *targetStart are both
60 * updated to point to the end of last text successfully converted in
61 * the respective buffers.
62 *
63 * Input parameters:
64 * sourceStart - pointer to a pointer to the source buffer.
65 * The contents of this are modified on return so that
66 * it points at the next thing to be converted.
67 * targetStart - similarly, pointer to pointer to the target buffer.
68 * sourceEnd, targetEnd - respectively pointers to the ends of the
69 * two buffers, for overflow checking only.
70 *
71 * These conversion functions take a pdc_convers_flags argument. When this
72 * flag is set to strict, both irregular sequences and isolated surrogates
73 * will cause an error. When the flag is set to lenient, both irregular
74 * sequences and isolated surrogates are converted.
75 *
76 * Whether the flag is strict or lenient, all illegal sequences will cause
77 * an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
78 * or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
79 * must check for illegal sequences.
80 *
81 * When the flag is set to lenient, characters over 0x10FFFF are converted
82 * to the replacement character; otherwise (when the flag is set to strict)
83 * they constitute an error.
84 *
85 * Output parameters:
86 * The value "sourceIllegal" is returned from some routines if the input
87 * sequence is malformed. When "sourceIllegal" is returned, the source
88 * value will point to the illegal value that caused the problem. E.g.,
89 * in UTF-8 when a sequence is malformed, it points to the start of the
90 * malformed sequence.
91 *
92 * Author: Mark E. Davis, 1994.
93 * Rev History: Rick McGowan, fixes & updates May 2001.
94 *
95 */
96
97 /*
98 * The following 4 definitions are compiler-specific.
99 * The C standard does not guarantee that wchar_t has at least
100 * 16 bits, so wchar_t is no less portable than unsigned short!
101 * All should be unsigned values to avoid sign extension during
102 * bit mask & shift operations.
103 */
104
105 /* Unicode original:
106 typedef unsigned long UTF32; at least 32 bits
107 typedef unsigned short UTF16; at least 16 bits
108 */
109
typedef unsigned int    UTF32;  /* 32 bits */
typedef unsigned short  UTF16;  /* 16 bits */
typedef unsigned char   UTF8;   /* typically 8 bits */

/*
 * Some fundamental constants.
 *
 * Each replacement list is fully parenthesized so that the cast always
 * binds to its constant, whatever expression the macro is expanded in
 * (e.g. "sizeof UNI_MAX_BMP").
 */
#define UNI_SUR_HIGH_START      ((UTF32) 0xD800)        /* first high surrogate */
#define UNI_SUR_HIGH_END        ((UTF32) 0xDBFF)        /* last high surrogate */
#define UNI_SUR_LOW_START       ((UTF32) 0xDC00)        /* first low surrogate */
#define UNI_SUR_LOW_END         ((UTF32) 0xDFFF)        /* last low surrogate */
#define UNI_REPLACEMENT_CHAR    ((UTF32) 0x0000FFFD)
#define UNI_MAX_BMP             ((UTF32) 0x0000FFFF)
#define UNI_MAX_UTF16           ((UTF32) 0x0010FFFF)
#define UNI_MAX_UTF32           ((UTF32) 0x7FFFFFFF)

/* number of payload bits carried by each half of a surrogate pair */
static const int halfShift = 10;

/* value subtracted from a supplementary character before it is split */
static const UTF32 halfBase = 0x0010000UL;

/* mask selecting the low halfShift bits */
static const UTF32 halfMask = 0x3FFUL;
128
129
130 /* --------------------------------------------------------------------- */
131
132 static pdc_convers_result
pdc_convertUTF32toUTF16(UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,const UTF16 * targetEnd,const pdc_convers_flags flags)133 pdc_convertUTF32toUTF16 (
134 UTF32** sourceStart, const UTF32* sourceEnd,
135 UTF16** targetStart, const UTF16* targetEnd,
136 const pdc_convers_flags flags) {
137 pdc_convers_result result = conversionOK;
138 UTF32* source = *sourceStart;
139 UTF16* target = *targetStart;
140 while (source < sourceEnd) {
141 UTF32 ch;
142 if (target >= targetEnd) {
143 result = targetExhausted; break;
144 }
145 ch = *source++;
146 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
147 if ((flags == strictConversion) &&
148 (ch >= UNI_SUR_HIGH_START &&
149 ch <= UNI_SUR_LOW_END)) {
150 --source; /* return to the illegal value itself */
151 result = sourceIllegal;
152 break;
153 } else {
154 *target++ = (UTF16) ch; /* normal case */
155 }
156 } else if (ch > UNI_MAX_UTF16) {
157 if (flags == strictConversion) {
158 result = sourceIllegal;
159 } else {
160 *target++ = UNI_REPLACEMENT_CHAR;
161 }
162 } else {
163 /* target is a character in range 0xFFFF - 0x10FFFF. */
164 if (target + 1 >= targetEnd) {
165 result = targetExhausted;
166 break;
167 }
168 ch -= halfBase;
169 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
170 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
171 }
172 }
173 *sourceStart = source;
174 *targetStart = target;
175 return result;
176 }
177
178 /* --------------------------------------------------------------------- */
179
180 static pdc_convers_result
pdc_convertUTF16toUTF32(UTF16 ** sourceStart,UTF16 * sourceEnd,UTF32 ** targetStart,const UTF32 * targetEnd,const pdc_convers_flags flags)181 pdc_convertUTF16toUTF32 (
182 UTF16** sourceStart, UTF16* sourceEnd,
183 UTF32** targetStart, const UTF32* targetEnd,
184 const pdc_convers_flags flags) {
185 pdc_convers_result result = conversionOK;
186 UTF16* source = *sourceStart;
187 UTF32* target = *targetStart;
188 UTF32 ch, ch2;
189 while (source < sourceEnd) {
190 ch = *source++;
191 if (ch >= UNI_SUR_HIGH_START &&
192 ch <= UNI_SUR_HIGH_END &&
193 source < sourceEnd) {
194 ch2 = *source;
195 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
196 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
197 + (ch2 - UNI_SUR_LOW_START) + halfBase;
198 ++source;
199 } else if (flags == strictConversion) {
200 /* it's an unpaired high surrogate */
201 --source; /* return to the illegal value itself */
202 result = sourceIllegal;
203 break;
204 }
205 } else if ((flags == strictConversion) &&
206 (ch >= UNI_SUR_LOW_START &&
207 ch <= UNI_SUR_LOW_END)) {
208 /* an unpaired low surrogate */
209 --source; /* return to the illegal value itself */
210 result = sourceIllegal;
211 break;
212 }
213 if (target >= targetEnd) {
214 result = targetExhausted;
215 break;
216 }
217 *target++ = ch;
218 }
219 *sourceStart = source;
220 *targetStart = target;
221 #ifdef CVTUTF_DEBUG
222 if (result == sourceIllegal) {
223 fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
224 ch, ch2);
225 fflush(stderr);
226 }
227 #endif
228 return result;
229 }
230
231 /* --------------------------------------------------------------------- */
232
/*
 * Number of trailing bytes that are supposed to follow a given first byte
 * of a UTF-8 sequence. The table is indexed by the value of that first
 * byte; one row below covers 16 consecutive byte values.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
    3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5
};
247
#if 0
/* Disabled accessor: looks up entry i of trailingBytesForUTF8. */
static const char
pdc_get_trailingBytesForUTF8(int i)
{
    const char ntrailing = trailingBytesForUTF8[i];

    return ntrailing;
}
#endif
254
255 /*
256 * Magic values subtracted from a buffer value during UTF8 conversion.
257 * This table contains as many values as there might be trailing bytes
258 * in a UTF-8 sequence.
259 */
260 static const UTF32 offsetsFromUTF8[6] = {
261 0x00000000UL, 0x00003080UL, 0x000E2080UL,
262 0x03C82080UL, 0xFA082080UL, 0x82082080UL
263 };
264
265 /*
266 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
267 * into the first byte, depending on how many bytes follow. There are
268 * as many entries in this table as there are UTF-8 sequence types.
269 * (I.e., one byte sequence, two byte... six byte sequence.)
270 */
271 static const UTF8 firstByteMark[7] = {
272 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
273 };
274
275 /* --------------------------------------------------------------------- */
276
277 /* The interface converts a whole buffer to avoid function-call overhead.
278 * Constants have been gathered. Loops & conditionals have been removed as
279 * much as possible for efficiency, in favor of drop-through switches.
280 * (See "Note A" at the bottom of the file for equivalent code.)
281 * If your compiler supports it, the "pdc_islegalUTF8" call can be turned
282 * into an inline function.
283 */
284
285 /* --------------------------------------------------------------------- */
286
287 static pdc_convers_result
pdc_convertUTF16toUTF8(UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,const UTF8 * targetEnd,const pdc_convers_flags flags)288 pdc_convertUTF16toUTF8 (
289 UTF16** sourceStart, const UTF16* sourceEnd,
290 UTF8** targetStart, const UTF8* targetEnd,
291 const pdc_convers_flags flags) {
292 pdc_convers_result result = conversionOK;
293 UTF16* source = *sourceStart;
294 UTF8* target = *targetStart;
295 while (source < sourceEnd) {
296 UTF32 ch;
297 unsigned short bytesToWrite = 0;
298 const UTF32 byteMask = 0xBF;
299 const UTF32 byteMark = 0x80;
300 ch = *source++;
301 /* If we have a surrogate pair, convert to UTF32 first. */
302 if (ch >= UNI_SUR_HIGH_START &&
303 ch <= UNI_SUR_HIGH_END &&
304 source < sourceEnd) {
305 UTF32 ch2 = *source;
306 if (ch2 >= UNI_SUR_LOW_START &&
307 ch2 <= UNI_SUR_LOW_END) {
308 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
309 + (ch2 - UNI_SUR_LOW_START) + halfBase;
310 ++source;
311 } else if (flags == strictConversion) {
312 /* it's an unpaired high surrogate */
313 --source; /* return to the illegal value itself */
314 result = sourceIllegal;
315 break;
316 }
317 } else if ((flags == strictConversion) &&
318 (ch >= UNI_SUR_LOW_START &&
319 ch <= UNI_SUR_LOW_END)) {
320 --source; /* return to the illegal value itself */
321 result = sourceIllegal;
322 break;
323 }
324 /* Figure out how many bytes the result will require */
325 if (ch < (UTF32)0x80) { bytesToWrite = 1;
326 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
327 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
328 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
329 } else { bytesToWrite = 2;
330 ch = UNI_REPLACEMENT_CHAR;
331 }
332
333 target += bytesToWrite;
334 if (target > targetEnd) {
335 target -= bytesToWrite; result = targetExhausted; break;
336 }
337 switch (bytesToWrite) { /* note: everything falls through. */
338 case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
339 case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
340 case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
341 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
342 }
343 target += bytesToWrite;
344 }
345 *sourceStart = source;
346 *targetStart = target;
347 return result;
348 }
349
350 /* --------------------------------------------------------------------- */
351
352 /*
353 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
354 * This must be called with the length pre-determined by the first byte.
355 * If not calling this from pdc_convertUTF8to*, then the length can be set by:
356 * length = trailingBytesForUTF8[*source]+1;
357 * and the sequence is illegal right away if there aren't that many bytes
358 * available.
359 * If presented with a length > 4, this returns pdc_false. The Unicode
360 * definition of UTF-8 goes up to 4-byte sequences.
361 */
362
363 static pdc_bool
pdc_islegalUTF8(UTF8 * source,int length)364 pdc_islegalUTF8(UTF8 *source, int length) {
365 UTF8 a;
366 UTF8 *srcptr = source+length;
367 switch (length) {
368 default: return pdc_false;
369 /* Everything else falls through when "pdc_true"... */
370 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
371 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
372 case 2: if ((a = (*--srcptr)) > 0xBF) return pdc_false;
373 switch (*source) {
374 /* no fall-through in this inner switch */
375 case 0xE0: if (a < 0xA0) return pdc_false; break;
376 case 0xF0: if (a < 0x90) return pdc_false; break;
377 case 0xF4: if (a > 0x8F) return pdc_false; break;
378 default: if (a < 0x80) return pdc_false;
379 }
380 case 1: if (*source >= 0x80 && *source < 0xC2) return pdc_false;
381 if (*source > 0xF4) return pdc_false;
382 }
383 return pdc_true;
384 }
385
386 /* --------------------------------------------------------------------- */
387
/*
 * Disabled function telling whether the UTF-8 sequence starting at source
 * is legal. A sequence which does not fit in front of sourceEnd counts as
 * not legal.
 */
#if 0
static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd)
{
    const int seqlen = trailingBytesForUTF8[*source] + 1;

    return (source + seqlen > sourceEnd) ?
        pdc_false : pdc_islegalUTF8(source, seqlen);
}
#endif
401
402 /* --------------------------------------------------------------------- */
403
404 static pdc_convers_result
pdc_convertUTF8toUTF16(UTF8 ** sourceStart,UTF8 * sourceEnd,UTF16 ** targetStart,const UTF16 * targetEnd,const pdc_convers_flags flags)405 pdc_convertUTF8toUTF16 (
406 UTF8** sourceStart, UTF8* sourceEnd,
407 UTF16** targetStart, const UTF16* targetEnd,
408 const pdc_convers_flags flags) {
409 pdc_convers_result result = conversionOK;
410 UTF8* source = *sourceStart;
411 UTF16* target = *targetStart;
412 while (source < sourceEnd) {
413 UTF32 ch = 0L;
414 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
415 if (source + extraBytesToRead >= sourceEnd) {
416 result = sourceExhausted;
417 break;
418 }
419 /* Do this check whether lenient or strict */
420 if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
421 result = sourceIllegal;
422 break;
423 }
424 /*
425 * The cases all fall through. See "Note A" below.
426 */
427 switch (extraBytesToRead) {
428 case 3: ch += *source++; ch <<= 6;
429 case 2: ch += *source++; ch <<= 6;
430 case 1: ch += *source++; ch <<= 6;
431 case 0: ch += *source++;
432 }
433 ch -= offsetsFromUTF8[extraBytesToRead];
434
435 if (target >= targetEnd) {
436 result = targetExhausted;
437 break;
438 }
439 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
440 if ((flags == strictConversion) &&
441 (ch >= UNI_SUR_HIGH_START &&
442 ch <= UNI_SUR_LOW_END)) {
443 --source; /* return to the illegal value itself */
444 result = sourceIllegal;
445 break;
446 } else {
447 *target++ = (UTF16) ch; /* normal case */
448 }
449 } else if (ch > UNI_MAX_UTF16) {
450 if (flags == strictConversion) {
451 result = sourceIllegal;
452 source -= extraBytesToRead; /* return to the start */
453 } else {
454 *target++ = UNI_REPLACEMENT_CHAR;
455 }
456 } else {
457 /* target is a character in range 0xFFFF - 0x10FFFF. */
458 if (target + 1 >= targetEnd) {
459 result = targetExhausted;
460 break;
461 }
462 ch -= halfBase;
463 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
464 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
465 }
466 }
467 *sourceStart = source;
468 *targetStart = target;
469 return result;
470 }
471
472 /* --------------------------------------------------------------------- */
473
474 static pdc_convers_result
pdc_convertUTF32toUTF8(UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,const UTF8 * targetEnd,const pdc_convers_flags flags)475 pdc_convertUTF32toUTF8 (
476 UTF32** sourceStart, const UTF32* sourceEnd,
477 UTF8** targetStart, const UTF8* targetEnd,
478 const pdc_convers_flags flags) {
479 pdc_convers_result result = conversionOK;
480 UTF32* source = *sourceStart;
481 UTF8* target = *targetStart;
482 while (source < sourceEnd) {
483 UTF32 ch;
484 unsigned short bytesToWrite = 0;
485 const UTF32 byteMask = 0x000000BF;
486 const UTF32 byteMark = 0x00000080;
487 ch = *source++;
488 /* surrogates of any stripe are not legal UTF32 characters */
489 if (flags == strictConversion ) {
490 if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
491 --source; /* return to the illegal value itself */
492 result = sourceIllegal;
493 break;
494 }
495 }
496 /* Figure out how many bytes the result will require */
497 if (ch < (UTF32)0x80) { bytesToWrite = 1;
498 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
499 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
500 } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
501 } else { bytesToWrite = 2;
502 ch = UNI_REPLACEMENT_CHAR;
503 }
504
505 target += bytesToWrite;
506 if (target > targetEnd) {
507 target -= bytesToWrite; result = targetExhausted; break;
508 }
509 switch (bytesToWrite) { /* note: everything falls through. */
510 case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
511 case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
512 case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
513 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
514 }
515 target += bytesToWrite;
516 }
517 *sourceStart = source;
518 *targetStart = target;
519 return result;
520 }
521
522 /* --------------------------------------------------------------------- */
523
524 static pdc_convers_result
pdc_convertUTF8toUTF32(UTF8 ** sourceStart,UTF8 * sourceEnd,UTF32 ** targetStart,const UTF32 * targetEnd,const pdc_convers_flags flags)525 pdc_convertUTF8toUTF32 (
526 UTF8** sourceStart, UTF8* sourceEnd,
527 UTF32** targetStart, const UTF32* targetEnd,
528 const pdc_convers_flags flags) {
529 pdc_convers_result result = conversionOK;
530 UTF8* source = *sourceStart;
531 UTF32* target = *targetStart;
532
533 (void) flags;
534
535 while (source < sourceEnd) {
536 UTF32 ch = 0;
537 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
538 if (source + extraBytesToRead >= sourceEnd) {
539 result = sourceExhausted; break;
540 }
541 /* Do this check whether lenient or strict */
542 if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
543 result = sourceIllegal;
544 break;
545 }
546 /*
547 * The cases all fall through. See "Note A" below.
548 */
549 switch (extraBytesToRead) {
550 case 3: ch += *source++; ch <<= 6;
551 case 2: ch += *source++; ch <<= 6;
552 case 1: ch += *source++; ch <<= 6;
553 case 0: ch += *source++;
554 }
555 ch -= offsetsFromUTF8[extraBytesToRead];
556
557 if (target >= targetEnd) {
558 result = targetExhausted;
559 break;
560 }
561 if (ch <= UNI_MAX_UTF32) {
562 *target++ = ch;
563 } else if (ch > UNI_MAX_UTF32) {
564 *target++ = UNI_REPLACEMENT_CHAR;
565 } else {
566 if (target + 1 >= targetEnd) {
567 result = targetExhausted;
568 break;
569 }
570 ch -= halfBase;
571 *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
572 *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
573 }
574 }
575 *sourceStart = source;
576 *targetStart = target;
577 return result;
578 }
579
580 /* ---------------------------------------------------------------------
581
582 Note A.
583 The fall-through switches in UTF-8 reading code save a
584 temp variable, some decrements & conditionals. The switches
585 are equivalent to the following loop:
586 {
587 int tmpBytesToRead = extraBytesToRead+1;
588 do {
589 ch += *source++;
590 --tmpBytesToRead;
591 if (tmpBytesToRead) ch <<= 6;
592 } while (tmpBytesToRead > 0);
593 }
594 In UTF-8 writing code, the switches on "bytesToWrite" are
595 similarly unrolled loops.
596
597 --------------------------------------------------------------------- */
598
599 const char *
pdc_get_textformat(int textformat)600 pdc_get_textformat(int textformat)
601 {
602 return pdc_get_keyword(textformat, pdc_textformat_keylist);
603 }
604
605 static const pdc_keyconn pdc_utfformat_keylist[] =
606 {
607 {"8", pdc_utf8},
608 {"16", pdc_utf16},
609 {"32", pdc_utf32},
610 {NULL, 0}
611 };
612
613
614 /*
615 * pdc_convert_string converts a arbitrary encoded string (maybe UTF) to
616 * another encoded string.
617 *
618 * The new converted string is allocated and terminated by the required
619 * number of zeros.
620 *
621 * The caller is responsible for freeing the resulting string buffer.
622 *
623 *
624 * LBP: low byte picking
625 *
626 * Input-Parameter:
627 *
628 * inutf: input string format (see pc_unicode.h):
629 *
630 * pdc_auto: If codepage != 0:
631 * see above.
632 * Otherwise:
633 * If a BOM is recognized:
634 * pdc_utf8 or pdc_utf16xx resp.
635 * Otherwise if input encoding <inev> is specified
636 * and flag PDC_CONV_FORCEUTF16 not set:
637 * pdc_bytes
638 * Otherwise:
639 * pdc_utf16
640 *
641 * pdc_auto2: If input encoding is not specified:
642 * pdc_utf16
643 * Otherwise after successful LBP:
644 * pdc_auto
645 * Otherwise:
646 * pdc_utf16
647 *
648 * pdc_bytes: 8-bit string. Encoding is <inev> if specified.
649 *
650 * pdc_bytes2: After successful LBP:
651 * pdc_bytes
652 * Otherwise:
653 * pdc_utf16
654 *
655 * pdc_utf8: UTF-8 formatted string.
656 *
657 * pdc_ebcdicutf8: EBCDIC-UTF-8 formatted string.
658 *
659 * pdc_utf16: If a UTF16 BOM is recognized:
660 * pdc_utf16be or pdc_utf16le
661 * Otherwise UTF-16 machine byte ordered string.
662 *
663 * pdc_utf16be UTF-16 big endian formatted string.
664 *
665 * pdc_utf16le UTF-16 little endian formatted string.
666 *
667 * codepage: OEM multi byte code-page number. If > 0 and
668 * <inutf> = pdc_auto, text will be converted to UTF-16.
669 *
670 * inev: Encoding vector for input pdc_bytes string.
671 *
672 * glyphtab: Mapping table for character reference names
673 *
674 * tabsize: Size of mapping table
675 *
676 * replchar: Treatment of non resolvable character references:
677 * >= 0: replacement character
678 * == text_error: error message
679 * == text_nocheck: will be ignored
680 * (see also pdc_charref2unicodelist())
681 *
682 * instring: Input string.
683 *
684 * inlen: Length of input string in byte.
685 *
686 * oututf: Target format for output string.
687 * pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
688 *
689 * outev: Encoding vector for output pdc_bytes string.
690 *
691 * flags: PDC_CONV_FORCEUTF16:
692 * In the case of <inutf> = pdc_auto[2] and <inev> != NULL
693 * <inutf> = pdc_utf16 will be forced.
694 *
695 * PDC_CONV_TRY7BYTES:
696 * UTF-8 output strings will have no BOM if each byte
697 * is smaller than x80.
698 * *oututf: pdc_byte.
699 *
700 * PDC_CONV_TRYBYTES:
701 * UTF-16xx output strings will be converted by LBP
702 * if each character is smaller than x0100.
703 * *oututf: pdc_byte.
704 *
705 * PDC_CONV_WITHBOM:
706 * UTF-8 or UTF-16xx output strings will be armed
707 * with an appropriate BOM.
708 *
709 * PDC_CONV_NOBOM:
710 * In UTF-8 or UTF-16xx output strings any BOM sequence
711 * will be removed. PDC_CONV_WITHBOM is dominant.
712 *
713 * PDC_CONV_AUTOBOM:
714 * BOM sequence will be set automatically if input string
715 * has a BOM.
716 *
717 * PDC_CONV_ANALYZE:
718 * Only analyzing BOMs of input string and dissolving auto
719 * textformats.
720 *
721 * PDC_CONV_TMPALLOC
722 * Temporary memory functions (pdc_malloc_tmp) are used
723 * rather than pdc_malloc etc.
724 *
725 * PDC_CONV_HTMLCHAR
726 * If input encoding vector is specified HTML character
727 * entities will be substituted.
728 *
729 * PDC_CONV_NEWALLOC
730 * Input string must be allocated at first to guarantee
731 * pointer alignment.
732 *
733 * PDC_CONV_INFLATE
734 * Invalid UTF-8 to UTF-16xx conversion will not cause
735 * an exception but rather an inflated byte string will
736 * be output.
737 *
738 * PDC_CONV_ESCSEQU
739 * Unicode sequences framed by escape character U+001B
740 * (found in PDF text strings) will be skipped.
741 *
742 * PDC_CONV_BSSEQU
743 * Code sequences beginning with backslash '\'
744 * will be substituted.
745 *
746 * PDC_CONV_ENCERROR
747 * If an 8-bit code cannot be converted to Unicode by <inev>
748 * or a Unicode cannot be converted to an 8-bit code by <outev>
749 * an error message will be created.
750 *
751 * PDC_CONV_KEEPLBCHAR
752 * In the case of PDC_CONV_ENCERROR relevant characters for
753 * line breaking do not lead to an error message.
754 *
755 * PDC_CONV_LOGGING
756 * Enables logging.
757 *
758 * verbose: Error messages are put out. Otherwise they are saved only.
759 *
760 * Output-Parameter:
761 *
762 * oututf: Reached format for output string.
763 *
764 * outstring: Pointer of allocated output string
765 *
766 * outlen: Length of output string.
767 *
768 */
769
770 #if defined(_MSC_VER) && defined(_MANAGED)
771 #pragma unmanaged
772 #endif
773 int
pdc_convert_string(pdc_core * pdc,pdc_text_format inutf,int codepage,pdc_encodingvector * inev,pdc_byte * instring,int inlen,pdc_text_format * oututf_p,pdc_encodingvector * outev,pdc_byte ** outstring,int * outlen,int flags,pdc_bool verbose)774 pdc_convert_string(pdc_core *pdc,
775 pdc_text_format inutf, int codepage,
776 pdc_encodingvector *inev,
777 pdc_byte *instring, int inlen,
778 pdc_text_format *oututf_p, pdc_encodingvector *outev,
779 pdc_byte **outstring, int *outlen, int flags,
780 pdc_bool verbose)
781 {
782 /* text_nocheck: see bug #1664 */
783 return pdc_convert_textstring(pdc, inutf, codepage, inev,
784 NULL, 0, text_nocheck, instring, inlen, oututf_p, outev,
785 outstring, outlen, flags, verbose);
786 }
787
788 int
pdc_convert_textstring(pdc_core * pdc,pdc_text_format inutf,int codepage,pdc_encodingvector * inev,const pdc_glyph_tab * glyphtab,int tabsize,int replchar,pdc_byte * instring,int inlen,pdc_text_format * oututf_p,pdc_encodingvector * outev,pdc_byte ** outstring,int * outlen,int flags,pdc_bool verbose)789 pdc_convert_textstring(pdc_core *pdc,
790 pdc_text_format inutf, int codepage,
791 pdc_encodingvector *inev,
792 const pdc_glyph_tab *glyphtab, int tabsize, int replchar,
793 pdc_byte *instring, int inlen,
794 pdc_text_format *oututf_p, pdc_encodingvector *outev,
795 pdc_byte **outstring, int *outlen, int flags,
796 pdc_bool verbose)
797 {
798 static const char *fn = "pdc_convert_textstring";
799 pdc_bool logg = flags & PDC_CONV_LOGGING;
800 const char *stemp1 = NULL, *stemp2 = NULL;
801 char sbuf[64];
802 pdc_text_format oututf = *oututf_p;
803 pdc_text_format oututf_s;
804 pdc_ushort *usinstr = (pdc_ushort *) instring;
805 pdc_ushort uv = 0;
806 pdc_byte *instr = NULL;
807 pdc_bool inalloc = pdc_false;
808 pdc_bool hasbom = pdc_false;
809 pdc_bool toswap = pdc_false;
810 int errcode = 0;
811 int i, j, n, len = 0;
812
813 (void) glyphtab;
814 (void) tabsize;
815 (void) replchar;
816
817 if (logg || pdc_logg_is_enabled(pdc, 5, trc_encoding))
818 {
819 pdc_logg(pdc, "\n");
820 if (!logg)
821 pdc_logg(pdc, "\t\ttext string of length %d will be converted...\n",
822 inlen);
823 logg = pdc_true;
824 }
825
826 if (logg)
827 {
828 pdc_logg(pdc, "\t\tinput textformat for string conversion: %s\n",
829 pdc_get_keyword(inutf, pdc_textformat_keylist));
830
831 if (inev != NULL)
832 pdc_logg(pdc, "\t\tinput encoding: %s\n", inev->apiname);
833
834 if (outev != NULL)
835 pdc_logg(pdc, "\t\toutput encoding: %s\n", outev->apiname);
836 }
837
838 /* prophylactic */
839 if (!inlen)
840 {
841 instring = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
842 pdc_calloc_tmp(pdc, 4, fn, NULL, NULL) :
843 pdc_calloc(pdc, 4, fn));
844
845 inalloc = pdc_true;
846 }
847 else if ((flags & PDC_CONV_NEWALLOC) ||
848 (flags & PDC_CONV_TMPALLOC) ||
849 (flags & PDC_CONV_BSSEQU))
850 {
851 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
852 pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
853 pdc_calloc(pdc, (size_t) (inlen + 2), fn));
854 memcpy(instr, instring, (size_t) inlen);
855
856 inalloc = pdc_true;
857 instring = instr;
858 instr = NULL;
859 usinstr = (pdc_ushort *) instring;
860 }
861
862 switch(inutf)
863 {
864 /* analyzing 2 byte textformat */
865 case pdc_auto2:
866 case pdc_bytes2:
867 if ((inutf == pdc_auto2 &&
868 (inev == NULL || (flags & PDC_CONV_FORCEUTF16))) ||
869 (flags & PDC_CONV_ANALYZE))
870 {
871 inutf = pdc_utf16;
872 }
873 else
874 {
875 if (logg)
876 pdc_logg(pdc, "\t\ttry to pick low bytes\n");
877
878 len = inlen / 2;
879 if (2 * len != inlen)
880 {
881 errcode = PDC_E_CONV_ILLUTF16;
882 goto PDC_CONV_ERROR;
883 }
884 for (i = 0; i < len; i++)
885 if (usinstr[i] > PDC_UNICODE_MAXLATIN1)
886 break;
887
888 /* low byte picking */
889 if (i == len)
890 {
891 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
892 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
893 pdc_calloc(pdc, (size_t) (len + 2), fn));
894 for (i = 0; i < len; i++)
895 instr[i] = (pdc_byte) usinstr[i];
896
897 if (inalloc)
898 {
899 if (flags & PDC_CONV_TMPALLOC)
900 pdc_free_tmp(pdc, instring);
901 else
902 pdc_free(pdc, instring);
903 }
904
905 inalloc = pdc_true;
906 instring = instr;
907 instr = NULL;
908 inlen = len;
909
910 if (inutf == pdc_bytes2)
911 inutf = pdc_bytes;
912 else
913 inutf = pdc_auto;
914 }
915 else
916 {
917 inutf = pdc_utf16;
918 }
919 }
920 break;
921
922 /* OEM multi byte text strings */
923 case pdc_auto:
924 case pdc_bytes:
925 if (codepage > 0)
926 {
927 #if defined(WIN32)
928 if (!(flags & PDC_CONV_ANALYZE) && inlen > 0)
929 {
930 if (logg)
931 pdc_logg(pdc,
932 "\t\tconverting according Windows codepage %d\n",
933 codepage);
934
935 len = MultiByteToWideChar((UINT) codepage, (DWORD) 0,
936 (LPCSTR) instring, inlen, NULL, 0);
937 if (len == 0)
938 {
939 DWORD lasterror = GetLastError();
940
941 stemp1 = pdc_errprintf(pdc, "cp%d", codepage);
942 if (lasterror == ERROR_INVALID_PARAMETER)
943 {
944 errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
945 }
946 else
947 {
948 errcode = PDC_E_CONV_ILL_MBTEXTSTRING;
949 }
950 goto PDC_CONV_ERROR;
951 }
952
953 len *= 2;
954 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
955 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn,
956 NULL, NULL) :
957 pdc_calloc(pdc, (size_t) (len + 2), fn));
958 MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR)
959 instring, inlen,
960 (LPWSTR) instr, len);
961
962 if (inalloc)
963 {
964 if (flags & PDC_CONV_TMPALLOC)
965 pdc_free_tmp(pdc, instring);
966 else
967 pdc_free(pdc, instring);
968 }
969
970 inalloc = pdc_true;
971 instring = instr;
972 instr = NULL;
973 inlen = len;
974
975 inutf = pdc_utf16;
976 }
977 else
978 {
979 inutf = pdc_bytes;
980 }
981 #else /* WIN32 */
982 errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
983 goto PDC_CONV_ERROR;
984 #endif /* !WIN32 */
985 }
986 break;
987
988 default:
989 break;
990 }
991
992 /* analyzing UTF-16 textformat */
993 if (inutf == pdc_utf16)
994 {
995 if (pdc_is_utf16be_unicode(instring))
996 inutf = pdc_utf16be;
997 else if (pdc_is_utf16le_unicode(instring))
998 inutf = pdc_utf16le;
999 }
1000
1001 /* analyzing auto textformat */
1002 else if (inutf == pdc_auto)
1003 {
1004 if (pdc_is_utf8_bytecode(instring))
1005 inutf = PDC_UTF8;
1006 else if (pdc_is_utf16be_unicode(instring))
1007 inutf = pdc_utf16be;
1008 else if (pdc_is_utf16le_unicode(instring))
1009 inutf = pdc_utf16le;
1010 else if (inev && !(flags & PDC_CONV_FORCEUTF16))
1011 inutf = pdc_bytes;
1012 else
1013 inutf = pdc_utf16;
1014 }
1015
1016 if (logg)
1017 pdc_logg(pdc, "\t\tdetermined textformat: %s\n",
1018 pdc_get_keyword(inutf, pdc_textformat_keylist));
1019
1020 /* only analyzing */
1021 if (flags & PDC_CONV_ANALYZE)
1022 goto PDC_CONV_EXIT;
1023
1024 /* conversion to UTF-16 by swapping */
1025 if ((inutf == pdc_utf16be || inutf == pdc_utf16le) &&
1026 (inutf != oututf || flags & PDC_CONV_TRYBYTES ||
1027 flags & PDC_CONV_HTMLCHAR))
1028 {
1029 if (inlen &&
1030 ((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) ||
1031 (inutf == pdc_utf16le && PDC_ISBIGENDIAN)))
1032 {
1033 if (inalloc)
1034 pdc_swap_bytes2((char *) instring, inlen, NULL);
1035 else
1036 {
1037 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1038 pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
1039 pdc_calloc(pdc, (size_t) (inlen + 2), fn));
1040 pdc_swap_bytes2((char *) instring, inlen, (char *) instr);
1041
1042 inalloc = pdc_true;
1043 instring = instr;
1044 instr = NULL;
1045 }
1046 }
1047 inutf = pdc_utf16;
1048 }
1049
1050 /* conversion to UTF-32 by swapping */
1051 if (inlen && inutf == pdc_utf32)
1052 {
1053
1054 if ((pdc_is_utf32be_unicode(instring) && !PDC_ISBIGENDIAN) ||
1055 (pdc_is_utf32le_unicode(instring) && PDC_ISBIGENDIAN))
1056 {
1057 if (inalloc)
1058 pdc_swap_bytes4((char *) instring, inlen, NULL);
1059 else
1060 {
1061 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1062 pdc_calloc_tmp(pdc, (size_t) (inlen + 4), fn, NULL, NULL) :
1063 pdc_calloc(pdc, (size_t) (inlen + 4), fn));
1064 pdc_swap_bytes4((char *) instring, inlen, (char *) instr);
1065
1066 inalloc = pdc_true;
1067 instring = instr;
1068 instr = NULL;
1069 }
1070 }
1071 }
1072
1073 /* illegal UTF-16 / UTF-32 */
1074 if (inutf >= pdc_utf16 && inlen % 2)
1075 {
1076 if (inutf == pdc_utf32 && inlen % 4)
1077 errcode = PDC_E_CONV_ILLUTF32;
1078 else
1079 errcode = PDC_E_CONV_ILLUTF16;
1080 goto PDC_CONV_ERROR;
1081 }
1082
1083
1084 /* conversion to UTF-16 by inflation or encoding vector */
1085 if (inutf == pdc_bytes &&
1086 (oututf != pdc_bytes || flags & PDC_CONV_HTMLCHAR || inev != outev))
1087 {
1088 if (logg)
1089 {
1090 if (flags & PDC_CONV_HTMLCHAR)
1091 pdc_logg(pdc, "\t\tbyte character entity substitution\n");
1092 }
1093
1094 len = 2 * inlen;
1095 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1096 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1097 pdc_calloc(pdc, (size_t) (len + 2), fn));
1098 usinstr = (pdc_ushort *) instr;
1099
1100 j = 0;
1101 for (i = 0; i < inlen; i++)
1102 {
1103 uv = (pdc_ushort) instring[i];
1104 if (inev)
1105 {
1106 uv = inev->codes[uv];
1107 if (!uv && (flags & PDC_CONV_ENCERROR) &&
1108 (!(flags & PDC_CONV_KEEPLBCHAR) ||
1109 !pdc_is_linebreaking_relchar(uv)))
1110 {
1111 errcode = PDC_E_ENC_NOTDEF_CODE;
1112 stemp1 = pdc_errprintf(pdc, "x%02X", instring[i]);
1113 stemp2 = inev->apiname;
1114 goto PDC_CONV_ERROR;
1115 }
1116 }
1117
1118
1119 usinstr[j] = uv;
1120 j++;
1121 }
1122
1123 if (inalloc)
1124 {
1125 if (flags & PDC_CONV_TMPALLOC)
1126 pdc_free_tmp(pdc, instring);
1127 else
1128 pdc_free(pdc, instring);
1129 }
1130
1131 inalloc = pdc_true;
1132 instring = instr;
1133 instr = NULL;
1134 inlen = 2 * j;
1135 inutf = pdc_utf16;
1136 }
1137
1138
1139
1140 /* UTF conversion */
1141 oututf_s = oututf;
1142 if ((oututf_s == pdc_bytes && inutf == pdc_utf8) ||
1143 oututf_s == pdc_utf16be || oututf_s == pdc_utf16le)
1144 oututf_s = pdc_utf16;
1145 if (inutf != oututf_s && oututf_s != pdc_bytes)
1146 {
1147 len = 4 * (inlen + 1);
1148 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1149 pdc_calloc_tmp(pdc, (size_t) len, fn, NULL, NULL) :
1150 pdc_calloc(pdc, (size_t) len, fn));
1151
1152 if (inlen)
1153 {
1154 pdc_convers_result result = conversionOK;
1155 pdc_byte *instringa, *instra, *instringe, *instre;
1156 UTF8 *isa8 = NULL, *ise8 = NULL;
1157 UTF16 *isa16, *ise16;
1158 UTF32 *isa32, *ise32;
1159
1160 if (logg)
1161 pdc_logg(pdc, "\t\tUTF conversion\n");
1162
1163 instringa = instring;
1164 instringe = instring + inlen;
1165 instra = instr;
1166 instre = instr + len;
1167
1168 if (inutf == pdc_utf8)
1169 {
1170 isa8 = (UTF8 *) instringa;
1171 ise8 = (UTF8 *) instringe;
1172 if (oututf_s == pdc_utf16)
1173 {
1174 isa16 = (UTF16 *) instra;
1175 ise16 = (UTF16 *) instre;
1176 result = pdc_convertUTF8toUTF16(&isa8, ise8,
1177 &isa16, ise16,
1178 strictConversion);
1179 instra = (pdc_byte *) isa16;
1180 instre = (pdc_byte *) ise16;
1181 }
1182 else
1183 {
1184 isa32 = (UTF32 *) instra;
1185 ise32 = (UTF32 *) instre;
1186 result = pdc_convertUTF8toUTF32(&isa8, ise8,
1187 &isa32, ise32,
1188 strictConversion);
1189 instra = (pdc_byte *) isa32;
1190 instre = (pdc_byte *) ise32;
1191 }
1192 }
1193 else if (inutf == pdc_utf16)
1194 {
1195 isa16 = (UTF16 *) instringa;
1196 ise16 = (UTF16 *) instringe;
1197 if (oututf_s == pdc_utf8)
1198 {
1199 isa8 = (UTF8 *) instra;
1200 ise8 = (UTF8 *) instre;
1201 result = pdc_convertUTF16toUTF8(&isa16, ise16, &isa8, ise8,
1202 strictConversion);
1203 instra = (pdc_byte *) isa8;
1204 instre = (pdc_byte *) ise8;
1205 }
1206 else
1207 {
1208 isa32 = (UTF32 *) instra;
1209 ise32 = (UTF32 *) instre;
1210 result = pdc_convertUTF16toUTF32(&isa16, ise16,
1211 &isa32, ise32,
1212 strictConversion);
1213 instra = (pdc_byte *) isa32;
1214 instre = (pdc_byte *) ise32;
1215 }
1216 }
1217 else if (inutf == pdc_utf32)
1218 {
1219 isa32 = (UTF32 *) instringa;
1220 ise32 = (UTF32 *) instringe;
1221 if (oututf_s == pdc_utf8)
1222 {
1223 isa8 = (UTF8 *) instra;
1224 ise8 = (UTF8 *) instre;
1225 result = pdc_convertUTF32toUTF8(&isa32, ise32,
1226 &isa8, ise8,
1227 strictConversion);
1228 instra = (pdc_byte *) isa8;
1229 instre = (pdc_byte *) ise8;
1230 }
1231 else
1232 {
1233 isa16 = (UTF16 *) instra;
1234 ise16 = (UTF16 *) instre;
1235 result = pdc_convertUTF32toUTF16(&isa32, ise32,
1236 &isa16, ise16,
1237 strictConversion);
1238 instra = (pdc_byte *) isa16;
1239 instre = (pdc_byte *) ise16;
1240 }
1241 }
1242
1243 switch (result)
1244 {
1245 case targetExhausted:
1246 errcode = PDC_E_CONV_MEMOVERFLOW;
1247 break;
1248
1249 case sourceExhausted:
1250 case sourceIllegal:
1251 if (inutf == pdc_utf8)
1252 {
1253 UTF8 *bp, *bpe;
1254 char *sb = sbuf;
1255
1256 bpe = MIN(ise8 - 1, isa8 + 3);
1257 for (bp = isa8; bp <= bpe; bp++)
1258 sb += sprintf(sb, "\\x%02X", *bp);
1259 if (*bp)
1260 sb += sprintf(sb, "...");
1261 sb += sprintf(sb, " (");
1262 for (bp = isa8; bp <= bpe; bp++)
1263 sb += sprintf(sb, "%c", *bp);
1264 if (*bp)
1265 sb += sprintf(sb, "...");
1266 sb += sprintf(sb, ")");
1267 stemp1 = sbuf;
1268
1269 stemp2 = pdc_errprintf(pdc, "%d", isa8 - (UTF8 *)instringa);
1270
1271 if (flags & PDC_CONV_INFLATE)
1272 {
1273 pdc_warning(pdc, PDC_E_CONV_ILLUTF8SEQU, stemp1, stemp2,
1274 0, 0);
1275
1276 pdc_inflate_ascii((char *) instring, inlen,
1277 (char *) instr, pdc_utf16);
1278 instra = instr + 2 * inlen;
1279 }
1280 else
1281 {
1282 errcode = PDC_E_CONV_ILLUTF8SEQU;
1283 }
1284 }
1285 else
1286 {
1287 stemp1 = pdc_get_keyword((int)inutf, pdc_utfformat_keylist);
1288 errcode = PDC_E_CONV_ILLUTF;
1289 }
1290 break;
1291
1292 default:
1293 break;
1294 }
1295
1296 if (errcode)
1297 {
1298 if (logg)
1299 pdc_logg(pdc, "\t\tUTF conversion error %d\n", result);
1300
1301 goto PDC_CONV_ERROR;
1302 }
1303
1304 inlen = instra - instr;
1305 }
1306
1307 if (inalloc)
1308 {
1309 if (flags & PDC_CONV_TMPALLOC)
1310 pdc_free_tmp(pdc, instring);
1311 else
1312 pdc_free(pdc, instring);
1313 }
1314
1315 len = (oututf == pdc_utf32) ? inlen + 4 : inlen + 2;
1316 if (inlen + 4 != len)
1317 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1318 pdc_realloc_tmp(pdc, instr, (size_t) len, fn) :
1319 pdc_realloc(pdc, instr, (size_t) len, fn));
1320 instr[inlen] = 0;
1321 instr[inlen + 1] = 0;
1322 if (oututf == pdc_utf32)
1323 {
1324 instr[inlen + 2] = 0;
1325 instr[inlen + 3] = 0;
1326 }
1327
1328 inalloc = pdc_true;
1329 instring = instr;
1330 instr = NULL;
1331 inutf = oututf_s;
1332 }
1333
1334 if (inutf == pdc_bytes)
1335 {
1336 if (!inalloc)
1337 {
1338 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1339 pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
1340 pdc_calloc(pdc, (size_t) (inlen + 2), fn));
1341 memcpy(instr, instring, (size_t) inlen);
1342
1343 inalloc = pdc_true;
1344 instring = instr;
1345 instr = NULL;
1346 }
1347 }
1348
1349 /* trying to reduce UTF-16 string to bytes string */
1350 if (inutf == pdc_utf16 &&
1351 (oututf == pdc_bytes || flags & PDC_CONV_TRYBYTES))
1352 {
1353 if (logg)
1354 pdc_logg(pdc, "\t\ttry to reduce UTF-16 to bytes\n");
1355
1356 if (pdc_is_utf16be_unicode(instring) ||
1357 pdc_is_utf16le_unicode(instring))
1358 n = 1;
1359 else
1360 n = 0;
1361
1362 len = (inlen - n) / 2;
1363 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1364 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1365 pdc_calloc(pdc, (size_t) (len + 2), fn));
1366 usinstr = (pdc_ushort *) instring;
1367
1368 for (i = 0; i < len; i++)
1369 {
1370 uv = usinstr[i + n];
1371 if (outev && uv)
1372 {
1373 j = pdc_get_encoding_bytecode(pdc, outev, uv);
1374 if (j < 0 && (flags & PDC_CONV_ENCERROR) && oututf == pdc_bytes)
1375 {
1376 errcode = PDC_E_ENC_NOTDEF_UNICODE;
1377 stemp1 = pdc_errprintf(pdc, "%04X", uv);
1378 stemp2 = outev->apiname;
1379 goto PDC_CONV_ERROR;
1380 }
1381 uv = (pdc_ushort) j;
1382 }
1383 if (uv > PDC_UNICODE_MAXLATIN1)
1384 break;
1385
1386 instr[i] = (pdc_byte) uv;
1387 }
1388
1389 if (i == len)
1390 {
1391 if (inalloc)
1392 {
1393 if (flags & PDC_CONV_TMPALLOC)
1394 pdc_free_tmp(pdc, instring);
1395 else
1396 pdc_free(pdc, instring);
1397 }
1398
1399 inalloc = pdc_true;
1400 instring = instr;
1401 instr = NULL;
1402 inlen = len;
1403 inutf = pdc_bytes;
1404 }
1405 else
1406 {
1407 if (flags & PDC_CONV_TMPALLOC)
1408 pdc_free_tmp(pdc, instr);
1409 else
1410 pdc_free(pdc, instr);
1411 instr = NULL;
1412 }
1413 }
1414
1415 /* UTF-8 format */
1416 if (inutf == pdc_utf8)
1417 {
1418 hasbom = pdc_is_utf8_unicode(instring);
1419
1420 if (flags & PDC_CONV_TRY7BYTES)
1421 {
1422 if (logg)
1423 pdc_logg(pdc, "\t\ttry to reduce UTF-8 to 7-bit\n");
1424
1425 for (i = hasbom ? 3 : 0; i < inlen; i++)
1426 if (instring[i] > PDC_UNICODE_MAXASCII)
1427 break;
1428 if (i == inlen)
1429 {
1430 flags &= ~PDC_CONV_WITHBOM;
1431 flags |= PDC_CONV_NOBOM;
1432 inutf = pdc_bytes;
1433 }
1434 }
1435 else if (hasbom && (flags & PDC_CONV_AUTOBOM))
1436 {
1437 flags &= ~PDC_CONV_NOBOM;
1438 flags |= PDC_CONV_WITHBOM;
1439 }
1440 else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
1441 {
1442 flags &= ~PDC_CONV_NOBOM;
1443 }
1444
1445 if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1446 {
1447 i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0;
1448 j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0;
1449
1450 len = inlen + i - j;
1451 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1452 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1453 pdc_calloc(pdc, (size_t) (len + 2), fn));
1454 memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1455 instr[len] = 0;
1456
1457 if (inalloc)
1458 {
1459 if (flags & PDC_CONV_TMPALLOC)
1460 pdc_free_tmp(pdc, instring);
1461 else
1462 pdc_free(pdc, instring);
1463 }
1464
1465 inalloc = pdc_true;
1466 instring = instr;
1467 instr = NULL;
1468 inlen = len;
1469
1470 hasbom = (flags & PDC_CONV_WITHBOM);
1471 }
1472
1473 if (hasbom)
1474 {
1475 instring[0] = PDF_BOM2;
1476 instring[1] = PDF_BOM3;
1477 instring[2] = PDF_BOM4;
1478 }
1479
1480 }
1481
1482 /* UTF-16 formats */
1483 if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le)
1484 {
1485 hasbom = pdc_is_utf16be_unicode(instring) ||
1486 pdc_is_utf16le_unicode(instring);
1487
1488 if (hasbom && (flags & PDC_CONV_AUTOBOM))
1489 {
1490 flags &= ~PDC_CONV_NOBOM;
1491 flags |= PDC_CONV_WITHBOM;
1492 }
1493 else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
1494 {
1495 flags &= ~PDC_CONV_NOBOM;
1496 }
1497
1498 if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le ||
1499 flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1500 {
1501 i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0;
1502 j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0;
1503
1504 len = inlen + i - j;
1505 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1506 pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1507 pdc_calloc(pdc, (size_t) (len + 2), fn));
1508 memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1509
1510 if (inalloc)
1511 {
1512 if (flags & PDC_CONV_TMPALLOC)
1513 pdc_free_tmp(pdc, instring);
1514 else
1515 pdc_free(pdc, instring);
1516 }
1517
1518 instring = instr;
1519 instr = NULL;
1520 inlen = len;
1521
1522 hasbom = (flags & PDC_CONV_WITHBOM);
1523 }
1524
1525 i = hasbom ? 2 : 0;
1526 if (inutf == pdc_utf16)
1527 {
1528 if (oututf == pdc_utf16be)
1529 {
1530 inutf = pdc_utf16be;
1531 toswap = !PDC_ISBIGENDIAN;
1532 }
1533 if (oututf == pdc_utf16le)
1534 {
1535 inutf = pdc_utf16le;
1536 toswap = PDC_ISBIGENDIAN;
1537 }
1538 if (toswap)
1539 pdc_swap_bytes2((char *) &instring[i], inlen - i, NULL);
1540 }
1541
1542 if (hasbom)
1543 {
1544 if (inutf == pdc_utf16be ||
1545 (inutf == pdc_utf16 && PDC_ISBIGENDIAN))
1546 {
1547 instring[0] = PDF_BOM0;
1548 instring[1] = PDF_BOM1;
1549 }
1550 if (inutf == pdc_utf16le ||
1551 (inutf == pdc_utf16 && !PDC_ISBIGENDIAN))
1552 {
1553 instring[0] = PDF_BOM1;
1554 instring[1] = PDF_BOM0;
1555 }
1556 }
1557 }
1558
1559 if (logg)
1560 pdc_logg(pdc, "\t\ttextformat of converted string: %s\n",
1561 pdc_get_keyword(inutf, pdc_textformat_keylist));
1562
1563 PDC_CONV_EXIT:
1564 *oututf_p = inutf;
1565 if (outlen)
1566 *outlen = inlen;
1567 *outstring = instring;
1568 return 0;
1569
1570 PDC_CONV_ERROR:
1571 if (outlen)
1572 *outlen = 0;
1573 *outstring = NULL;
1574
1575 if (errcode > 0)
1576 pdc_set_errmsg(pdc, errcode, stemp1, stemp2, 0, 0);
1577
1578 if (instr != NULL)
1579 {
1580 if (flags & PDC_CONV_TMPALLOC)
1581 pdc_free_tmp(pdc, instr);
1582 else
1583 pdc_free(pdc, instr);
1584 }
1585
1586 if (inalloc)
1587 {
1588 if (flags & PDC_CONV_TMPALLOC)
1589 pdc_free_tmp(pdc, instring);
1590 else
1591 pdc_free(pdc, instring);
1592 }
1593
1594 if (verbose)
1595 PDC_RETHROW(pdc);
1596
1597 return errcode;
1598 }
1599 #if defined(_MSC_VER) && defined(_MANAGED)
1600 #pragma managed
1601 #endif
1602
1603
1604 /*
1605 * pdc_convert_name_ext converts a string of name data type to UTF-8
1606 *
1607 * flags & PDC_CONV_EBCDIC: converts to EBCDIC-UTF-8
1608 *
1609 * len == 0: If the string has a [EBCDIC-]UTF-8 BOM or
1610 * flags & PDC_CONV_ISUTF8 is set the string will be duplicated.
1611 * Otherwise the string has encoding enc and codepage
1612 * codepage.
1613 * If enc == pdc_unicode the string is "UTF-16" encoded.
1614 * Otherwise: If enc < pdc_winansi the string is "host" encoded.
1615 *
1616 * len > 0: The string is a UTF-16 string of len bytes.
1617 *
1618 */
1619 char *
pdc_convert_name_ext(pdc_core * pdc,const char * name,int len,pdc_encoding enc,int codepage,int flags)1620 pdc_convert_name_ext(pdc_core *pdc, const char *name, int len,
1621 pdc_encoding enc, int codepage, int flags)
1622 {
1623 static const char fn[] = "pdc_convert_name_ext";
1624 pdc_encodingvector *ev = NULL;
1625 pdc_text_format nameformat = pdc_utf16;
1626 pdc_text_format outnameformat = pdc_utf8;
1627 pdc_byte *convname;
1628 char *outname = NULL;
1629 int outlen;
1630
1631 if (name == NULL)
1632 return NULL;
1633
1634 if (len == 0)
1635 {
1636 /* already [EBCDIC-]UTF-8 encoded */
1637 if ((flags & PDC_CONV_ISUTF8) || pdc_is_utf8_bytecode(name))
1638 {
1639 if (!(flags & PDC_CONV_WITHBOM))
1640 flags |= PDC_CONV_NOBOM;
1641
1642 if (!(flags & PDC_CONV_EBCDIC))
1643 flags |= PDC_CONV_ASCII;
1644
1645 /* On EBCDIC platforms EBCDIC-UTF-8 name strings are expected */
1646 outname = pdc_strdup_ext(pdc, name, (flags & ~PDC_CONV_EBCDIC), fn);
1647
1648 if (outname != NULL)
1649 return outname;
1650 }
1651
1652 /* see bug #1486 */
1653 if (enc == pdc_unicode)
1654 {
1655 /* UTF-16 encoded string */
1656 len = (int) pdc_wstrlen(name);
1657 }
1658 else
1659 {
1660 /* 8-bit encoded string */
1661 nameformat = pdc_bytes;
1662 if (enc < pdc_winansi)
1663 ev = pdc_get_encoding_vector(pdc,pdc_find_encoding(pdc,"host"));
1664 else
1665 ev = pdc_get_encoding_vector(pdc, enc);
1666
1667 len = (int) strlen(name);
1668 }
1669 }
1670
1671 if (flags & PDC_CONV_EBCDIC)
1672 outnameformat = PDC_UTF8;
1673
1674 flags |= PDC_CONV_TRY7BYTES;
1675 if (pdc->charref)
1676 flags |= PDC_CONV_HTMLCHAR;
1677 if (pdc->escapesequ)
1678 flags |= PDC_CONV_BSSEQU;
1679
1680 /* convert to UTF-8 */
1681 pdc_convert_string(pdc, nameformat, codepage, ev, (pdc_byte *) name, len,
1682 &outnameformat, NULL, &convname, &outlen, flags,
1683 pdc_true);
1684
1685 return (char *) convname;
1686 }
1687
1688 char *
pdc_convert_name(pdc_core * pdc,const char * name,int len,int flags)1689 pdc_convert_name(pdc_core *pdc, const char *name, int len, int flags)
1690 {
1691 return pdc_convert_name_ext(pdc, name, len, pdc_invalidenc, 0, flags);
1692 }
1693
1694 /* returned string is temporary allocated
1695 */
1696 char *
pdc_utf8_to_hostbytes(pdc_core * pdc,pdc_bool honorlang,char * name)1697 pdc_utf8_to_hostbytes(pdc_core *pdc, pdc_bool honorlang, char *name)
1698 {
1699 static const char fn[] = "pdc_utf8_to_hostbytes";
1700 pdc_encoding outenc = pdc_invalidenc;
1701 pdc_encodingvector *outev = NULL;
1702 pdc_text_format informat = PDC_UTF8;
1703 pdc_text_format outformat = pdc_utf16;
1704 pdc_byte *outname = NULL;
1705 int len = (int) strlen(name);
1706
1707 {
1708 (void) fn;
1709 (void) honorlang;
1710 outenc = pdc_find_encoding(pdc, "host");
1711 }
1712
1713 outev = pdc_get_encoding_vector(pdc, outenc);
1714
1715 pdc_convert_string(pdc, informat, 0, NULL, (pdc_byte *) name, len,
1716 &outformat, outev, &outname, &len,
1717 PDC_CONV_TRYBYTES | PDC_CONV_NOBOM | PDC_CONV_TMPALLOC,
1718 pdc_true);
1719 if (outformat == pdc_utf16)
1720 {
1721 pdc_free_tmp(pdc, outname);
1722 outname = NULL;
1723 }
1724
1725 return (char *) outname;
1726 }
1727
1728 /* returned string is temporary allocated
1729 */
1730 char *
pdc_hostbytes_to_utf8(pdc_core * pdc,pdc_bool honorlang,char * name)1731 pdc_hostbytes_to_utf8(pdc_core *pdc, pdc_bool honorlang, char *name)
1732 {
1733 static const char fn[] = "pdc_hostbytes_to_utf8";
1734 pdc_encoding inenc = pdc_invalidenc;
1735 pdc_encodingvector *inev = NULL;
1736 pdc_text_format informat = pdc_bytes;
1737 pdc_text_format outformat = PDC_UTF8;
1738 pdc_byte *outname = NULL;
1739 int len = (int) strlen(name);
1740
1741 {
1742 (void) fn;
1743 (void) honorlang;
1744 inenc = pdc_find_encoding(pdc, "host");
1745 }
1746
1747 inev = pdc_get_encoding_vector(pdc, inenc);
1748
1749 pdc_convert_string(pdc, informat, 0, inev, (pdc_byte *) name, len,
1750 &outformat, NULL, &outname, &len,
1751 PDC_CONV_NOBOM | PDC_CONV_TMPALLOC, pdc_true);
1752
1753 return (char *) outname;
1754 }
1755
1756 /* --------------------- basic UTF conversion functions --------------------- */
1757
1758 char *
pdc_utf16_to_utf8(pdc_core * pdc,const char * utf16string,int len,int flags,int * size)1759 pdc_utf16_to_utf8(pdc_core *pdc, const char *utf16string, int len, int flags,
1760 int *size)
1761 {
1762 pdc_text_format outtextformat = pdc_utf8;
1763 pdc_byte *utf8string = NULL;
1764 int outlen;
1765
1766 if (!utf16string)
1767 pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);
1768
1769 if (flags & PDC_CONV_EBCDIC)
1770 outtextformat = PDC_UTF8;
1771
1772 flags |= PDC_CONV_AUTOBOM;
1773 pdc_convert_string(pdc, pdc_utf16, 0, NULL,
1774 (pdc_byte *) utf16string, len,
1775 &outtextformat, NULL, &utf8string, &outlen,
1776 flags, pdc_true);
1777
1778 if (size) *size = outlen;
1779
1780 return (char *) utf8string;
1781 }
1782
1783 char *
pdc_utf8_to_utf16(pdc_core * pdc,const char * utf8string,const char * format,int flags,int * size)1784 pdc_utf8_to_utf16(pdc_core *pdc, const char *utf8string, const char *format,
1785 int flags, int *size)
1786 {
1787 pdc_text_format textformat = pdc_utf8;
1788 pdc_text_format outtextformat = pdc_utf16;
1789 pdc_byte *utf16string = NULL;
1790 int len;
1791
1792 if (!utf8string)
1793 pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf8string", 0, 0, 0);
1794 len = (int) strlen(utf8string);
1795
1796 if (format && *format)
1797 {
1798 int k = pdc_get_keycode_ci(format, pdc_textformat_keylist);
1799
1800 /* see bug #2175 */
1801 if (k == PDC_KEY_NOTFOUND)
1802 {
1803 char **sfl;
1804 const char *sf;
1805 int ns, i;
1806
1807 sf = NULL;
1808 ns = pdc_split_stringlist(pdc, format, NULL, 0, &sfl);
1809 for (i = 0; i < ns; i++)
1810 {
1811 if (!strcmp(sfl[i], "inflate"))
1812 flags |= PDC_CONV_INFLATE;
1813 else
1814 sf = sfl[i];
1815 }
1816 if (sf != NULL)
1817 k = pdc_get_keycode_ci(sf, pdc_textformat_keylist);
1818 else
1819 k = pdc_utf16;
1820
1821 pdc_cleanup_stringlist(pdc, sfl);
1822 }
1823
1824 if (k == PDC_KEY_NOTFOUND ||
1825 ((pdc_text_format) k != pdc_utf16 &&
1826 (pdc_text_format) k != pdc_utf16be &&
1827 (pdc_text_format) k != pdc_utf16le))
1828 pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
1829
1830 outtextformat = (pdc_text_format) k;
1831 }
1832
1833 if (flags & PDC_CONV_EBCDIC)
1834 textformat = PDC_UTF8;
1835
1836 if (outtextformat == pdc_utf16)
1837 flags |= PDC_CONV_AUTOBOM;
1838 else
1839 flags |= PDC_CONV_WITHBOM;
1840 pdc_convert_string(pdc, textformat, 0, NULL,
1841 (pdc_byte *) utf8string, len,
1842 &outtextformat, NULL, &utf16string, size,
1843 flags, pdc_true);
1844
1845 return (char *) utf16string;
1846 }
1847
1848 char *
pdc_utf16_to_utf32(pdc_core * pdc,const char * utf16string,int len,int * size)1849 pdc_utf16_to_utf32(pdc_core *pdc, const char *utf16string, int len, int *size)
1850 {
1851 pdc_text_format outtextformat = pdc_utf32;
1852 pdc_byte *utf32string = NULL;
1853
1854 if (!utf16string)
1855 pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);
1856
1857 pdc_convert_string(pdc, pdc_utf16, 0, NULL,
1858 (pdc_byte *) utf16string, len,
1859 &outtextformat, NULL, &utf32string, size,
1860 0, pdc_true);
1861
1862 return (char *) utf32string;
1863 }
1864
1865 char *
pdc_utf32_to_utf8(pdc_core * pdc,const char * utf32string,int len,int flags,int * size)1866 pdc_utf32_to_utf8(pdc_core *pdc, const char *utf32string, int len, int flags,
1867 int *size)
1868 {
1869 pdc_text_format outtextformat = pdc_utf8;
1870 pdc_byte *utf8string = NULL;
1871 int outlen;
1872
1873 if (!utf32string)
1874 pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);
1875
1876 if (flags & PDC_CONV_EBCDIC)
1877 outtextformat = PDC_UTF8;
1878
1879 flags |= PDC_CONV_AUTOBOM;
1880 pdc_convert_string(pdc, pdc_utf32, 0, NULL,
1881 (pdc_byte *) utf32string, len,
1882 &outtextformat, NULL, &utf8string, &outlen,
1883 flags, pdc_true);
1884
1885 if (size) *size = outlen;
1886
1887 return (char *) utf8string;
1888 }
1889
1890 char *
pdc_utf32_to_utf16(pdc_core * pdc,const char * utf32string,int len,const char * format,int flags,int * size)1891 pdc_utf32_to_utf16(pdc_core *pdc, const char *utf32string, int len,
1892 const char *format, int flags, int *size)
1893 {
1894 pdc_text_format textformat = pdc_utf32;
1895 pdc_text_format outtextformat = pdc_utf16;
1896 pdc_byte *utf16string = NULL;
1897
1898 if (!utf32string)
1899 pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);
1900
1901 if (format && *format)
1902 {
1903 int k = pdc_get_keycode_ci(format, pdc_textformat_keylist);
1904 if (k == PDC_KEY_NOTFOUND ||
1905 ((pdc_text_format) k != pdc_utf16 &&
1906 (pdc_text_format) k != pdc_utf16be &&
1907 (pdc_text_format) k != pdc_utf16le))
1908 pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
1909 outtextformat = (pdc_text_format) k;
1910 }
1911
1912 if (outtextformat == pdc_utf16)
1913 flags |= PDC_CONV_AUTOBOM;
1914 else
1915 flags |= PDC_CONV_WITHBOM;
1916 pdc_convert_string(pdc, textformat, 0, NULL,
1917 (pdc_byte *) utf32string, len,
1918 &outtextformat, NULL, &utf16string, size,
1919 flags, pdc_true);
1920
1921 return (char *) utf16string;
1922 }
1923
1924 int
pdc_char16_to_char32(pdc_core * pdc,const pdc_ushort * ustext,int * ic,int len,pdc_bool verbose)1925 pdc_char16_to_char32(pdc_core *pdc, const pdc_ushort *ustext, int *ic, int len,
1926 pdc_bool verbose)
1927 {
1928 pdc_ushort uvh = ustext[*ic];
1929
1930 if (uvh < PDC_UNICODE_MINHIGHSUR || uvh > PDC_UNICODE_MAXLOWSUR)
1931 {
1932 return (int) uvh;
1933 }
1934 else
1935 {
1936 UTF16 *isa16 = (UTF16 *) &ustext[*ic];
1937 pdc_ushort uvl = 0;
1938 int icn = *ic + 1;
1939
1940 if (icn < len)
1941 {
1942 uvl = ustext[icn];
1943 if (uvh <= PDC_UNICODE_MAXHIGHSUR)
1944 {
1945 if (uvl >= PDC_UNICODE_MINLOWSUR &&
1946 uvl <= PDC_UNICODE_MAXLOWSUR)
1947 {
1948 int usv;
1949 UTF16 *ise16 = isa16 + 2;
1950 UTF32 *isa32 = (UTF32 *) &usv;
1951 UTF32 *ise32 = isa32 + 1;
1952
1953 pdc_convers_result result = pdc_convertUTF16toUTF32(
1954 &isa16, ise16, &isa32, ise32, strictConversion);
1955 if (result == conversionOK)
1956 {
1957 *ic = icn;
1958 return usv;
1959 }
1960 }
1961 }
1962 }
1963
1964 pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF16SUR,
1965 pdc_errprintf(pdc, "%04X", uvh),
1966 pdc_errprintf(pdc, "%04X", uvl), 0, 0);
1967
1968 if (verbose)
1969 pdc_error(pdc, -1, 0, 0, 0, 0);
1970 }
1971
1972 return -1;
1973 }
1974
1975 int
pdc_char32_to_char16(pdc_core * pdc,int usv,pdc_ushort * uvlist,pdc_bool verbose)1976 pdc_char32_to_char16(pdc_core *pdc, int usv, pdc_ushort *uvlist,
1977 pdc_bool verbose)
1978 {
1979 if (usv < PDC_NUM_BMPVAL)
1980 {
1981 uvlist[0] = (pdc_ushort) usv;
1982 return 1;
1983 }
1984 else
1985 {
1986 UTF32 *isa32 = (UTF32 *) &usv;
1987 UTF32 *ise32 = isa32 + 1;
1988 UTF16 *isa16 = (UTF16 *) uvlist;
1989 UTF16 *ise16 = isa16 + 2;
1990
1991 pdc_convers_result result = pdc_convertUTF32toUTF16(
1992 &isa32, ise32, &isa16, ise16, strictConversion);
1993 if (result == conversionOK)
1994 {
1995 return 2;
1996 }
1997
1998 pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF32CHAR,
1999 pdc_errprintf(pdc, "%05X", usv), 0, 0, 0);
2000
2001 if (verbose)
2002 pdc_error(pdc, -1, 0, 0, 0, 0);
2003 }
2004
2005 return 0;
2006 }
2007