1 /*---------------------------------------------------------------------------*
2  |              PDFlib - A library for generating PDF on the fly             |
3  +---------------------------------------------------------------------------+
4  | Copyright (c) 1997-2006 Thomas Merz and PDFlib GmbH. All rights reserved. |
5  +---------------------------------------------------------------------------+
6  |                                                                           |
7  |    This software is subject to the PDFlib license. It is NOT in the       |
8  |    public domain. Extended versions and commercial licenses are           |
9  |    available, please check http://www.pdflib.com.                         |
10  |                                                                           |
11  *---------------------------------------------------------------------------*/
12 
13 /* $Id: pc_unicode.c,v 1.179.2.32 2009/04/01 19:33:27 kurt Exp $
14  *
15  * PDFlib Unicode converting routines
16  *
17  */
18 
19 #define PC_UNICODE_C
20 
21 #include "pc_util.h"
22 
23 #if defined(WIN32)
24 #define WIN32_LEAN_AND_MEAN
25 #include <windows.h>
26 #endif /* WIN32 */
27 
28 /*
29  *  The following source is based on Unicode's original source
30  *  code ConvertUTF.c. It has been adapted to PDFlib programming
31  *  conventions.
32  *
33  *  The original file had the following notice:
34  *
35  *      Copyright 2001 Unicode, Inc.
36  *
37  *      Limitations on Rights to Redistribute This Code
38  *
39  *      Author: Mark E. Davis, 1994.
40  *      Rev History: Rick McGowan, fixes & updates May 2001.
41  *
42  *
43  *  Functions for conversions between UTF32, UTF-16, and UTF-8.
 *  These functions form a complete set of conversions between
 *  the three formats. UTF-7 is not included here.
46  *
47  *  Each of these routines takes pointers to input buffers and output
48  *  buffers. The input buffers are const.
49  *
50  *  Each routine converts the text between *sourceStart and sourceEnd,
51  *  putting the result into the buffer between *targetStart and
52  *  targetEnd. Note: the end pointers are *after* the last item: e.g.
53  *  *(sourceEnd - 1) is the last item.
54  *
55  *  The return result indicates whether the conversion was successful,
56  *  and if not, whether the problem was in the source or target buffers.
57  *  (Only the first encountered problem is indicated.)
58  *
59  *  After the conversion, *sourceStart and *targetStart are both
60  *  updated to point to the end of last text successfully converted in
61  *  the respective buffers.
62  *
63  *  Input parameters:
64  *      sourceStart - pointer to a pointer to the source buffer.
65  *              The contents of this are modified on return so that
66  *              it points at the next thing to be converted.
67  *      targetStart - similarly, pointer to pointer to the target buffer.
68  *      sourceEnd, targetEnd - respectively pointers to the ends of the
69  *              two buffers, for overflow checking only.
70  *
71  *  These conversion functions take a pdc_convers_flags argument. When this
72  *  flag is set to strict, both irregular sequences and isolated surrogates
73  *  will cause an error.  When the flag is set to lenient, both irregular
74  *  sequences and isolated surrogates are converted.
75  *
76  *  Whether the flag is strict or lenient, all illegal sequences will cause
77  *  an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
78  *  or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
79  *  must check for illegal sequences.
80  *
81  *  When the flag is set to lenient, characters over 0x10FFFF are converted
82  *  to the replacement character; otherwise (when the flag is set to strict)
83  *  they constitute an error.
84  *
85  *  Output parameters:
86  *      The value "sourceIllegal" is returned from some routines if the input
87  *      sequence is malformed.  When "sourceIllegal" is returned, the source
88  *      value will point to the illegal value that caused the problem. E.g.,
89  *      in UTF-8 when a sequence is malformed, it points to the start of the
90  *      malformed sequence.
91  *
92  *  Author: Mark E. Davis, 1994.
93  *  Rev History: Rick McGowan, fixes & updates May 2001.
94  *
95  */
96 
97 /*
98  * The following 4 definitions are compiler-specific.
99  * The C standard does not guarantee that wchar_t has at least
100  * 16 bits, so wchar_t is no less portable than unsigned short!
101  * All should be unsigned values to avoid sign extension during
102  * bit mask & shift operations.
103  */
104 
105 /* Unicode original:
106 typedef unsigned long   UTF32;   at least 32 bits
107 typedef unsigned short  UTF16;   at least 16 bits
108 */
109 
typedef unsigned int    UTF32;  /* 32 bits */
typedef unsigned short  UTF16;  /* 16 bits */
typedef unsigned char   UTF8;   /* typically 8 bits */

/*
 * Some fundamental constants.
 *
 * Every expansion is wrapped in its own pair of parentheses so that the
 * cast stays attached to its value wherever the macro is used (for
 * instance as the operand of sizeof or in front of a postfix operator).
 */
#define UNI_SUR_HIGH_START      ((UTF32)0xD800)     /* first high surrogate */
#define UNI_SUR_HIGH_END        ((UTF32)0xDBFF)     /* last high surrogate */
#define UNI_SUR_LOW_START       ((UTF32)0xDC00)     /* first low surrogate */
#define UNI_SUR_LOW_END         ((UTF32)0xDFFF)     /* last low surrogate */
#define UNI_REPLACEMENT_CHAR    ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP             ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16           ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32           ((UTF32)0x7FFFFFFF)

static const int halfShift      = 10; /* used for shifting by 10 bits */

/* value removed from / added to a code point around surrogate splitting */
static const UTF32 halfBase     = 0x0010000UL;
/* selects the low 10 bits, i.e. the payload of one surrogate */
static const UTF32 halfMask     = 0x3FFUL;
128 
129 
130 /* --------------------------------------------------------------------- */
131 
132 static pdc_convers_result
pdc_convertUTF32toUTF16(UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,const UTF16 * targetEnd,const pdc_convers_flags flags)133 pdc_convertUTF32toUTF16 (
134                 UTF32** sourceStart, const UTF32* sourceEnd,
135                 UTF16** targetStart, const UTF16* targetEnd,
136                 const pdc_convers_flags flags) {
137     pdc_convers_result result = conversionOK;
138     UTF32* source = *sourceStart;
139     UTF16* target = *targetStart;
140     while (source < sourceEnd) {
141         UTF32 ch;
142         if (target >= targetEnd) {
143             result = targetExhausted; break;
144         }
145         ch = *source++;
146         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
147             if ((flags == strictConversion) &&
148                 (ch >= UNI_SUR_HIGH_START &&
149                  ch <= UNI_SUR_LOW_END)) {
150                 --source; /* return to the illegal value itself */
151                 result = sourceIllegal;
152                 break;
153             } else {
154                 *target++ = (UTF16) ch;     /* normal case */
155             }
156         } else if (ch > UNI_MAX_UTF16) {
157             if (flags == strictConversion) {
158                 result = sourceIllegal;
159             } else {
160                 *target++ = UNI_REPLACEMENT_CHAR;
161             }
162         } else {
163             /* target is a character in range 0xFFFF - 0x10FFFF. */
164             if (target + 1 >= targetEnd) {
165                 result = targetExhausted;
166                 break;
167             }
168             ch -= halfBase;
169             *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
170             *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
171         }
172     }
173     *sourceStart = source;
174     *targetStart = target;
175     return result;
176 }
177 
178 /* --------------------------------------------------------------------- */
179 
180 static pdc_convers_result
pdc_convertUTF16toUTF32(UTF16 ** sourceStart,UTF16 * sourceEnd,UTF32 ** targetStart,const UTF32 * targetEnd,const pdc_convers_flags flags)181 pdc_convertUTF16toUTF32 (
182                 UTF16** sourceStart, UTF16* sourceEnd,
183                 UTF32** targetStart, const UTF32* targetEnd,
184                 const pdc_convers_flags flags) {
185     pdc_convers_result result = conversionOK;
186     UTF16* source = *sourceStart;
187     UTF32* target = *targetStart;
188     UTF32 ch, ch2;
189     while (source < sourceEnd) {
190         ch = *source++;
191         if (ch >= UNI_SUR_HIGH_START &&
192             ch <= UNI_SUR_HIGH_END &&
193             source < sourceEnd) {
194             ch2 = *source;
195             if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
196                 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
197                       + (ch2 - UNI_SUR_LOW_START) + halfBase;
198                 ++source;
199             } else if (flags == strictConversion) {
200                 /* it's an unpaired high surrogate */
201                 --source; /* return to the illegal value itself */
202                 result = sourceIllegal;
203                 break;
204             }
205         } else if ((flags == strictConversion) &&
206                    (ch >= UNI_SUR_LOW_START &&
207                     ch <= UNI_SUR_LOW_END)) {
208             /* an unpaired low surrogate */
209             --source; /* return to the illegal value itself */
210             result = sourceIllegal;
211             break;
212         }
213         if (target >= targetEnd) {
214             result = targetExhausted;
215             break;
216         }
217         *target++ = ch;
218     }
219     *sourceStart = source;
220     *targetStart = target;
221 #ifdef CVTUTF_DEBUG
222 if (result == sourceIllegal) {
223     fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
224             ch, ch2);
225     fflush(stderr);
226 }
227 #endif
228     return result;
229 }
230 
231 /* --------------------------------------------------------------------- */
232 
233 /*
234  * Index into the table below with the first byte of a UTF-8 sequence to
235  * get the number of trailing bytes that are supposed to follow it.
236  */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0 */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};

#if 0
/* Disabled accessor: looks up the trailing byte count for lead byte i. */
static const char
pdc_get_trailingBytesForUTF8(int i)
{
    const char count = trailingBytesForUTF8[i];

    return count;
}
#endif
254 
255 /*
256  * Magic values subtracted from a buffer value during UTF8 conversion.
257  * This table contains as many values as there might be trailing bytes
258  * in a UTF-8 sequence.
259  */
260 static const UTF32 offsetsFromUTF8[6] = {
261     0x00000000UL, 0x00003080UL, 0x000E2080UL,
262     0x03C82080UL, 0xFA082080UL, 0x82082080UL
263 };
264 
265 /*
266  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
267  * into the first byte, depending on how many bytes follow.  There are
268  * as many entries in this table as there are UTF-8 sequence types.
269  * (I.e., one byte sequence, two byte... six byte sequence.)
270  */
271 static const UTF8 firstByteMark[7] = {
272     0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
273 };
274 
275 /* --------------------------------------------------------------------- */
276 
277 /* The interface converts a whole buffer to avoid function-call overhead.
278  * Constants have been gathered. Loops & conditionals have been removed as
279  * much as possible for efficiency, in favor of drop-through switches.
280  * (See "Note A" at the bottom of the file for equivalent code.)
281  * If your compiler supports it, the "pdc_islegalUTF8" call can be turned
282  * into an inline function.
283  */
284 
285 /* --------------------------------------------------------------------- */
286 
287 static pdc_convers_result
pdc_convertUTF16toUTF8(UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,const UTF8 * targetEnd,const pdc_convers_flags flags)288 pdc_convertUTF16toUTF8 (
289                 UTF16** sourceStart, const UTF16* sourceEnd,
290                 UTF8** targetStart, const UTF8* targetEnd,
291                 const pdc_convers_flags flags) {
292     pdc_convers_result result = conversionOK;
293     UTF16* source = *sourceStart;
294     UTF8* target = *targetStart;
295     while (source < sourceEnd) {
296         UTF32 ch;
297         unsigned short bytesToWrite = 0;
298         const UTF32 byteMask = 0xBF;
299         const UTF32 byteMark = 0x80;
300         ch = *source++;
301         /* If we have a surrogate pair, convert to UTF32 first. */
302         if (ch >= UNI_SUR_HIGH_START &&
303             ch <= UNI_SUR_HIGH_END &&
304             source < sourceEnd) {
305             UTF32 ch2 = *source;
306             if (ch2 >= UNI_SUR_LOW_START &&
307                 ch2 <= UNI_SUR_LOW_END) {
308                 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
309                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
310                 ++source;
311             } else if (flags == strictConversion) {
312                 /* it's an unpaired high surrogate */
313                 --source; /* return to the illegal value itself */
314                 result = sourceIllegal;
315                 break;
316             }
317         } else if ((flags == strictConversion) &&
318                    (ch >= UNI_SUR_LOW_START &&
319                     ch <= UNI_SUR_LOW_END)) {
320             --source; /* return to the illegal value itself */
321             result = sourceIllegal;
322             break;
323         }
324         /* Figure out how many bytes the result will require */
325         if (ch < (UTF32)0x80) {                 bytesToWrite = 1;
326         } else if (ch < (UTF32)0x800) {         bytesToWrite = 2;
327         } else if (ch < (UTF32)0x10000) {       bytesToWrite = 3;
328         } else if (ch < (UTF32)0x200000) {      bytesToWrite = 4;
329         } else {                                bytesToWrite = 2;
330                                                 ch = UNI_REPLACEMENT_CHAR;
331         }
332 
333         target += bytesToWrite;
334         if (target > targetEnd) {
335             target -= bytesToWrite; result = targetExhausted; break;
336         }
337         switch (bytesToWrite) { /* note: everything falls through. */
338             case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
339             case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
340             case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
341             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
342         }
343         target += bytesToWrite;
344     }
345     *sourceStart = source;
346     *targetStart = target;
347     return result;
348 }
349 
350 /* --------------------------------------------------------------------- */
351 
352 /*
353  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
354  * This must be called with the length pre-determined by the first byte.
355  * If not calling this from pdc_convertUTF8to*, then the length can be set by:
356  *      length = trailingBytesForUTF8[*source]+1;
357  * and the sequence is illegal right away if there aren't that many bytes
358  * available.
359  * If presented with a length > 4, this returns pdc_false.  The Unicode
360  * definition of UTF-8 goes up to 4-byte sequences.
361  */
362 
363 static pdc_bool
pdc_islegalUTF8(UTF8 * source,int length)364 pdc_islegalUTF8(UTF8 *source, int length) {
365     UTF8 a;
366     UTF8 *srcptr = source+length;
367     switch (length) {
368     default: return pdc_false;
369         /* Everything else falls through when "pdc_true"... */
370     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
371     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
372     case 2: if ((a = (*--srcptr)) > 0xBF) return pdc_false;
373         switch (*source) {
374             /* no fall-through in this inner switch */
375             case 0xE0: if (a < 0xA0) return pdc_false; break;
376             case 0xF0: if (a < 0x90) return pdc_false; break;
377             case 0xF4: if (a > 0x8F) return pdc_false; break;
378             default:  if (a < 0x80) return pdc_false;
379         }
380     case 1: if (*source >= 0x80 && *source < 0xC2) return pdc_false;
381             if (*source > 0xF4) return pdc_false;
382     }
383     return pdc_true;
384 }
385 
386 /* --------------------------------------------------------------------- */
387 
388 /*
389  * Exported function to return whether a UTF-8 sequence is legal or not.
390  * This is not used here; it's just exported.
391  */
#if 0
/*
 * Disabled: derives the sequence length from the lead byte and rejects
 * the sequence if it would extend beyond sourceEnd, otherwise defers to
 * pdc_islegalUTF8().
 */
static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd) {
    const int seqlen = trailingBytesForUTF8[*source]+1;

    return (source+seqlen > sourceEnd)
        ? pdc_false
        : pdc_islegalUTF8(source, seqlen);
}
#endif
401 
402 /* --------------------------------------------------------------------- */
403 
404 static pdc_convers_result
pdc_convertUTF8toUTF16(UTF8 ** sourceStart,UTF8 * sourceEnd,UTF16 ** targetStart,const UTF16 * targetEnd,const pdc_convers_flags flags)405 pdc_convertUTF8toUTF16 (
406                 UTF8** sourceStart, UTF8* sourceEnd,
407                 UTF16** targetStart, const UTF16* targetEnd,
408                 const pdc_convers_flags flags) {
409     pdc_convers_result result = conversionOK;
410     UTF8* source = *sourceStart;
411     UTF16* target = *targetStart;
412     while (source < sourceEnd) {
413         UTF32 ch = 0L;
414         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
415         if (source + extraBytesToRead >= sourceEnd) {
416             result = sourceExhausted;
417             break;
418         }
419         /* Do this check whether lenient or strict */
420         if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
421             result = sourceIllegal;
422             break;
423         }
424         /*
425          * The cases all fall through. See "Note A" below.
426          */
427         switch (extraBytesToRead) {
428             case 3: ch += *source++; ch <<= 6;
429             case 2: ch += *source++; ch <<= 6;
430             case 1: ch += *source++; ch <<= 6;
431             case 0: ch += *source++;
432         }
433         ch -= offsetsFromUTF8[extraBytesToRead];
434 
435         if (target >= targetEnd) {
436             result = targetExhausted;
437             break;
438         }
439         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
440             if ((flags == strictConversion) &&
441                 (ch >= UNI_SUR_HIGH_START &&
442                  ch <= UNI_SUR_LOW_END)) {
443                 --source; /* return to the illegal value itself */
444                 result = sourceIllegal;
445                 break;
446             } else {
447                 *target++ = (UTF16) ch;     /* normal case */
448             }
449         } else if (ch > UNI_MAX_UTF16) {
450             if (flags == strictConversion) {
451                     result = sourceIllegal;
452                     source -= extraBytesToRead; /* return to the start */
453             } else {
454                     *target++ = UNI_REPLACEMENT_CHAR;
455             }
456         } else {
457             /* target is a character in range 0xFFFF - 0x10FFFF. */
458             if (target + 1 >= targetEnd) {
459                     result = targetExhausted;
460                     break;
461             }
462             ch -= halfBase;
463             *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
464             *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
465         }
466     }
467     *sourceStart = source;
468     *targetStart = target;
469     return result;
470 }
471 
472 /* --------------------------------------------------------------------- */
473 
474 static pdc_convers_result
pdc_convertUTF32toUTF8(UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,const UTF8 * targetEnd,const pdc_convers_flags flags)475 pdc_convertUTF32toUTF8 (
476                 UTF32** sourceStart, const UTF32* sourceEnd,
477                 UTF8** targetStart, const UTF8* targetEnd,
478                 const pdc_convers_flags flags) {
479     pdc_convers_result result = conversionOK;
480     UTF32* source = *sourceStart;
481     UTF8* target = *targetStart;
482     while (source < sourceEnd) {
483         UTF32 ch;
484         unsigned short bytesToWrite = 0;
485         const UTF32 byteMask = 0x000000BF;
486         const UTF32 byteMark = 0x00000080;
487         ch = *source++;
488         /* surrogates of any stripe are not legal UTF32 characters */
489         if (flags == strictConversion ) {
490             if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
491                 --source; /* return to the illegal value itself */
492                 result = sourceIllegal;
493                 break;
494             }
495         }
496         /* Figure out how many bytes the result will require */
497         if (ch < (UTF32)0x80) {                 bytesToWrite = 1;
498         } else if (ch < (UTF32)0x800) {         bytesToWrite = 2;
499         } else if (ch < (UTF32)0x10000) {       bytesToWrite = 3;
500         } else if (ch < (UTF32)0x200000) {      bytesToWrite = 4;
501         } else {                                bytesToWrite = 2;
502                                                 ch = UNI_REPLACEMENT_CHAR;
503         }
504 
505         target += bytesToWrite;
506         if (target > targetEnd) {
507             target -= bytesToWrite; result = targetExhausted; break;
508         }
509         switch (bytesToWrite) { /* note: everything falls through. */
510             case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
511             case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
512             case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
513             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
514         }
515         target += bytesToWrite;
516     }
517     *sourceStart = source;
518     *targetStart = target;
519     return result;
520 }
521 
522 /* --------------------------------------------------------------------- */
523 
524 static pdc_convers_result
pdc_convertUTF8toUTF32(UTF8 ** sourceStart,UTF8 * sourceEnd,UTF32 ** targetStart,const UTF32 * targetEnd,const pdc_convers_flags flags)525 pdc_convertUTF8toUTF32 (
526                 UTF8** sourceStart, UTF8* sourceEnd,
527                 UTF32** targetStart, const UTF32* targetEnd,
528                 const pdc_convers_flags flags) {
529     pdc_convers_result result = conversionOK;
530     UTF8* source = *sourceStart;
531     UTF32* target = *targetStart;
532 
533     (void) flags;
534 
535     while (source < sourceEnd) {
536         UTF32 ch = 0;
537         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
538         if (source + extraBytesToRead >= sourceEnd) {
539             result = sourceExhausted; break;
540         }
541         /* Do this check whether lenient or strict */
542         if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
543             result = sourceIllegal;
544             break;
545         }
546         /*
547          * The cases all fall through. See "Note A" below.
548          */
549         switch (extraBytesToRead) {
550             case 3: ch += *source++; ch <<= 6;
551             case 2: ch += *source++; ch <<= 6;
552             case 1: ch += *source++; ch <<= 6;
553             case 0: ch += *source++;
554         }
555         ch -= offsetsFromUTF8[extraBytesToRead];
556 
557         if (target >= targetEnd) {
558             result = targetExhausted;
559             break;
560         }
561         if (ch <= UNI_MAX_UTF32) {
562             *target++ = ch;
563         } else if (ch > UNI_MAX_UTF32) {
564             *target++ = UNI_REPLACEMENT_CHAR;
565         } else {
566             if (target + 1 >= targetEnd) {
567                 result = targetExhausted;
568                 break;
569             }
570             ch -= halfBase;
571             *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
572             *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
573         }
574     }
575     *sourceStart = source;
576     *targetStart = target;
577     return result;
578 }
579 
580 /* ---------------------------------------------------------------------
581 
582         Note A.
583         The fall-through switches in UTF-8 reading code save a
584         temp variable, some decrements & conditionals.  The switches
585         are equivalent to the following loop:
586                 {
587                         int tmpBytesToRead = extraBytesToRead+1;
588                         do {
589                                 ch += *source++;
590                                 --tmpBytesToRead;
591                                 if (tmpBytesToRead) ch <<= 6;
592                         } while (tmpBytesToRead > 0);
593                 }
594         In UTF-8 writing code, the switches on "bytesToWrite" are
595         similarly unrolled loops.
596 
597    --------------------------------------------------------------------- */
598 
599 const char *
pdc_get_textformat(int textformat)600 pdc_get_textformat(int textformat)
601 {
602     return pdc_get_keyword(textformat, pdc_textformat_keylist);
603 }
604 
605 static const pdc_keyconn pdc_utfformat_keylist[] =
606 {
607     {"8",     pdc_utf8},
608     {"16",    pdc_utf16},
609     {"32",    pdc_utf32},
610     {NULL, 0}
611 };
612 
613 
614 /*
 *  pdc_convert_string converts an arbitrarily encoded string (possibly UTF)
 *  to another encoded string.
617  *
618  *  The new converted string is allocated and terminated by the required
619  *  number of zeros.
620  *
621  *  The caller is responsible for freeing the resulting string buffer.
622  *
623  *
624  *  LBP: low byte picking
625  *
626  *  Input-Parameter:
627  *
628  *  inutf:      input string format (see pc_unicode.h):
629  *
630  *              pdc_auto:     If codepage != 0:
631  *                                see above.
632  *                            Otherwise:
633  *                            If a BOM is recognized:
634  *                                pdc_utf8 or pdc_utf16xx resp.
635  *                            Otherwise if input encoding <inev> is specified
636  *                            and flag PDC_CONV_FORCEUTF16 not set:
637  *                                pdc_bytes
638  *                            Otherwise:
639  *                                pdc_utf16
640  *
641  *              pdc_auto2:    If input encoding is not specified:
642  *                                pdc_utf16
 *                            Otherwise after successful LBP:
644  *                                pdc_auto
645  *                            Otherwise:
646  *                                pdc_utf16
647  *
648  *              pdc_bytes:    8-bit string. Encoding is <inev> if specified.
649  *
 *              pdc_bytes2:   After successful LBP:
651  *                                pdc_bytes
652  *                            Otherwise:
653  *                                pdc_utf16
654  *
655  *              pdc_utf8:     UTF-8 formatted string.
656  *
657  *              pdc_ebcdicutf8: EBCDIC-UTF-8 formatted string.
658  *
659  *              pdc_utf16:    If a UTF16 BOM is recognized:
660  *                                pdc_utf16be or pdc_utf16le
661  *                            Otherwise UTF-16 machine byte ordered string.
662  *
663  *              pdc_utf16be   UTF-16 big endian formatted string.
664  *
665  *              pdc_utf16le   UTF-16 little endian formatted string.
666  *
667  *  codepage:   OEM multi byte code-page number. If > 0 and
668  *              <inutf> = pdc_auto, text will be converted to UTF-16.
669  *
670  *  inev:       Encoding vector for input pdc_bytes string.
671  *
672  *  glyphtab:   Mapping table for character reference names
673  *
674  *  tabsize:    Size of mapping table
675  *
676  *  replchar:   Treatment of non resolvable character references:
677  *              >= 0: replacement character
678  *              == text_error: error message
679  *              == text_nocheck: will be ignored
680  *              (see also pdc_charref2unicodelist())
681  *
682  *  instring:   Input string.
683  *
684  *  inlen:      Length of input string in byte.
685  *
686  *  oututf:     Target format for output string.
687  *              pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
688  *
689  *  outev:      Encoding vector for output pdc_bytes string.
690  *
691  *  flags:      PDC_CONV_FORCEUTF16:
692  *              In the case of <inutf> = pdc_auto[2] and <inev> != NULL
693  *              <inutf> = pdc_utf16 will be forced.
694  *
695  *              PDC_CONV_TRY7BYTES:
696  *              UTF-8 output strings will have no BOM if each byte
697  *              is smaller than x80.
698  *              *oututf: pdc_byte.
699  *
700  *              PDC_CONV_TRYBYTES:
 *              UTF-16xx output strings will be converted by LBP
 *              if each character is smaller than x0100.
703  *              *oututf: pdc_byte.
704  *
705  *              PDC_CONV_WITHBOM:
 *              UTF-8 or UTF-16xx output strings will be prefixed
 *              with an appropriate BOM.
708  *
709  *              PDC_CONV_NOBOM:
 *              In UTF-8 or UTF-16xx output strings any BOM sequence
 *              will be removed. PDC_CONV_WITHBOM takes precedence.
712  *
713  *              PDC_CONV_AUTOBOM:
714  *              BOM sequence will be set automatically if input string
715  *              has a BOM.
716  *
717  *              PDC_CONV_ANALYZE:
718  *              Only analyzing BOMs of input string and dissolving auto
719  *              textformats.
720  *
721  *              PDC_CONV_TMPALLOC
722  *              Temporary memory functions (pdc_malloc_tmp) are used
723  *              rather than pdc_malloc etc.
724  *
725  *              PDC_CONV_HTMLCHAR
726  *              If input encoding vector is specified HTML character
727  *              entities will be substituted.
728  *
729  *              PDC_CONV_NEWALLOC
730  *              Input string must be allocated at first to guarantee
731  *              pointer alignment.
732  *
733  *              PDC_CONV_INFLATE
734  *              Invalid UTF-8 to UTF-16xx conversion will not cause
735  *              an exception but rather an inflated byte string will
736  *              be output.
737  *
738  *              PDC_CONV_ESCSEQU
739  *              Unicode sequences framed by escape character U+001B
740  *              (found in PDF text strings) will be skipped.
741  *
742  *              PDC_CONV_BSSEQU
743  *              Code sequences beginning with backslash '\'
744  *              will be substituted.
745  *
746  *              PDC_CONV_ENCERROR
747  *              If an 8-bit code cannot be converted to Unicode by <inev>
748  *              or a Unicode cannot be converted to an 8-bit code by <outev>
749  *              an error message will be created.
750  *
751  *              PDC_CONV_KEEPLBCHAR
752  *              In the case of PDC_CONV_ENCERROR relevant characters for
753  *              line breaking do not lead to an error message.
754  *
755  *              PDC_CONV_LOGGING
756  *              Enables logging.
757  *
758  *  verbose:    Error messages are put out. Otherwise they are saved only.
759  *
760  *  Output-Parameter:
761  *
762  *  oututf:     Reached format for output string.
763  *
764  *  outstring:  Pointer of allocated output string
765  *
766  *  outlen:     Length of output string.
767  *
768  */
769 
770 #if defined(_MSC_VER) && defined(_MANAGED)
771 #pragma unmanaged
772 #endif
773 int
pdc_convert_string(pdc_core * pdc,pdc_text_format inutf,int codepage,pdc_encodingvector * inev,pdc_byte * instring,int inlen,pdc_text_format * oututf_p,pdc_encodingvector * outev,pdc_byte ** outstring,int * outlen,int flags,pdc_bool verbose)774 pdc_convert_string(pdc_core *pdc,
775                    pdc_text_format inutf, int codepage,
776                    pdc_encodingvector *inev,
777                    pdc_byte *instring, int inlen,
778                    pdc_text_format *oututf_p, pdc_encodingvector *outev,
779                    pdc_byte **outstring, int *outlen, int flags,
780                    pdc_bool verbose)
781 {
782     /* text_nocheck: see bug #1664 */
783     return pdc_convert_textstring(pdc, inutf, codepage, inev,
784                    NULL, 0, text_nocheck, instring, inlen, oututf_p, outev,
785                    outstring, outlen, flags, verbose);
786 }
787 
788 int
pdc_convert_textstring(pdc_core * pdc,pdc_text_format inutf,int codepage,pdc_encodingvector * inev,const pdc_glyph_tab * glyphtab,int tabsize,int replchar,pdc_byte * instring,int inlen,pdc_text_format * oututf_p,pdc_encodingvector * outev,pdc_byte ** outstring,int * outlen,int flags,pdc_bool verbose)789 pdc_convert_textstring(pdc_core *pdc,
790                    pdc_text_format inutf, int codepage,
791                    pdc_encodingvector *inev,
792                    const pdc_glyph_tab *glyphtab, int tabsize, int replchar,
793                    pdc_byte *instring, int inlen,
794                    pdc_text_format *oututf_p, pdc_encodingvector *outev,
795                    pdc_byte **outstring, int *outlen, int flags,
796                    pdc_bool verbose)
797 {
798     static const char *fn = "pdc_convert_textstring";
799     pdc_bool logg = flags & PDC_CONV_LOGGING;
800     const char *stemp1 = NULL, *stemp2 = NULL;
801     char sbuf[64];
802     pdc_text_format oututf = *oututf_p;
803     pdc_text_format oututf_s;
804     pdc_ushort *usinstr = (pdc_ushort *) instring;
805     pdc_ushort uv = 0;
806     pdc_byte *instr = NULL;
807     pdc_bool inalloc = pdc_false;
808     pdc_bool hasbom = pdc_false;
809     pdc_bool toswap = pdc_false;
810     int errcode = 0;
811     int i, j, n, len = 0;
812 
813     (void) glyphtab;
814     (void) tabsize;
815     (void) replchar;
816 
817     if (logg || pdc_logg_is_enabled(pdc, 5, trc_encoding))
818     {
819         pdc_logg(pdc, "\n");
820         if (!logg)
821             pdc_logg(pdc, "\t\ttext string of length %d will be converted...\n",
822                      inlen);
823         logg = pdc_true;
824     }
825 
826     if (logg)
827     {
828         pdc_logg(pdc, "\t\tinput textformat for string conversion: %s\n",
829                  pdc_get_keyword(inutf, pdc_textformat_keylist));
830 
831         if (inev != NULL)
832             pdc_logg(pdc, "\t\tinput encoding: %s\n", inev->apiname);
833 
834         if (outev != NULL)
835             pdc_logg(pdc, "\t\toutput encoding: %s\n", outev->apiname);
836     }
837 
838     /* prophylactic */
839     if (!inlen)
840     {
841         instring = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
842              pdc_calloc_tmp(pdc, 4, fn, NULL, NULL) :
843              pdc_calloc(pdc, 4, fn));
844 
845         inalloc = pdc_true;
846     }
847     else if ((flags & PDC_CONV_NEWALLOC) ||
848              (flags & PDC_CONV_TMPALLOC) ||
849              (flags & PDC_CONV_BSSEQU))
850     {
851         instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
852              pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
853              pdc_calloc(pdc, (size_t) (inlen + 2), fn));
854         memcpy(instr, instring, (size_t) inlen);
855 
856         inalloc = pdc_true;
857         instring = instr;
858         instr = NULL;
859         usinstr = (pdc_ushort *) instring;
860     }
861 
862     switch(inutf)
863     {
864         /* analyzing 2 byte textformat */
865         case pdc_auto2:
866         case pdc_bytes2:
867         if ((inutf == pdc_auto2 &&
868              (inev == NULL || (flags & PDC_CONV_FORCEUTF16))) ||
869             (flags & PDC_CONV_ANALYZE))
870         {
871             inutf = pdc_utf16;
872         }
873         else
874         {
875             if (logg)
876                 pdc_logg(pdc, "\t\ttry to pick low bytes\n");
877 
878             len = inlen / 2;
879             if (2 * len != inlen)
880             {
881                 errcode = PDC_E_CONV_ILLUTF16;
882                 goto PDC_CONV_ERROR;
883             }
884             for (i = 0; i < len; i++)
885                 if (usinstr[i] > PDC_UNICODE_MAXLATIN1)
886                     break;
887 
888             /* low byte picking */
889             if (i == len)
890             {
891                 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
892                      pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
893                      pdc_calloc(pdc, (size_t) (len + 2), fn));
894                 for (i = 0; i < len; i++)
895                     instr[i] = (pdc_byte) usinstr[i];
896 
897                 if (inalloc)
898                 {
899                     if (flags & PDC_CONV_TMPALLOC)
900                         pdc_free_tmp(pdc, instring);
901                     else
902                         pdc_free(pdc, instring);
903                 }
904 
905                 inalloc = pdc_true;
906                 instring = instr;
907                 instr = NULL;
908                 inlen = len;
909 
910                 if (inutf == pdc_bytes2)
911                     inutf = pdc_bytes;
912                 else
913                     inutf = pdc_auto;
914             }
915             else
916             {
917                 inutf = pdc_utf16;
918             }
919         }
920         break;
921 
922         /* OEM multi byte text strings */
923         case pdc_auto:
924         case pdc_bytes:
925         if (codepage > 0)
926         {
927 #if defined(WIN32)
928             if (!(flags & PDC_CONV_ANALYZE) && inlen > 0)
929             {
930                 if (logg)
931                     pdc_logg(pdc,
932                         "\t\tconverting according Windows codepage %d\n",
933                         codepage);
934 
935                 len = MultiByteToWideChar((UINT) codepage, (DWORD) 0,
936                                           (LPCSTR) instring, inlen, NULL, 0);
937                 if (len == 0)
938                 {
939                     DWORD lasterror = GetLastError();
940 
941                     stemp1 = pdc_errprintf(pdc, "cp%d", codepage);
942                     if (lasterror == ERROR_INVALID_PARAMETER)
943                     {
944                         errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
945                     }
946                     else
947                     {
948                         errcode = PDC_E_CONV_ILL_MBTEXTSTRING;
949                     }
950                     goto PDC_CONV_ERROR;
951                 }
952 
953                 len *= 2;
954                 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
955                              pdc_calloc_tmp(pdc, (size_t) (len + 2), fn,
956                                             NULL, NULL) :
957                              pdc_calloc(pdc, (size_t) (len + 2), fn));
958                 MultiByteToWideChar((UINT) codepage, (DWORD) 0, (LPCSTR)
959                                     instring, inlen,
960                                     (LPWSTR) instr, len);
961 
962                 if (inalloc)
963                 {
964                     if (flags & PDC_CONV_TMPALLOC)
965                         pdc_free_tmp(pdc, instring);
966                     else
967                         pdc_free(pdc, instring);
968                 }
969 
970                 inalloc = pdc_true;
971                 instring = instr;
972                 instr = NULL;
973                 inlen = len;
974 
975                 inutf = pdc_utf16;
976             }
977             else
978             {
979                 inutf = pdc_bytes;
980             }
981 #else   /* WIN32 */
982             errcode = PDC_E_CONV_UNSUPP_MBTEXTFORM;
983             goto PDC_CONV_ERROR;
984 #endif  /* !WIN32 */
985         }
986         break;
987 
988         default:
989         break;
990     }
991 
992     /* analyzing UTF-16 textformat */
993     if (inutf == pdc_utf16)
994     {
995         if (pdc_is_utf16be_unicode(instring))
996             inutf = pdc_utf16be;
997         else if (pdc_is_utf16le_unicode(instring))
998             inutf = pdc_utf16le;
999     }
1000 
1001     /* analyzing auto textformat */
1002     else if (inutf == pdc_auto)
1003     {
1004         if (pdc_is_utf8_bytecode(instring))
1005             inutf = PDC_UTF8;
1006         else if (pdc_is_utf16be_unicode(instring))
1007             inutf = pdc_utf16be;
1008         else if (pdc_is_utf16le_unicode(instring))
1009             inutf = pdc_utf16le;
1010         else if (inev && !(flags & PDC_CONV_FORCEUTF16))
1011             inutf = pdc_bytes;
1012         else
1013             inutf = pdc_utf16;
1014     }
1015 
1016     if (logg)
1017         pdc_logg(pdc, "\t\tdetermined textformat: %s\n",
1018                  pdc_get_keyword(inutf, pdc_textformat_keylist));
1019 
1020     /* only analyzing */
1021     if (flags & PDC_CONV_ANALYZE)
1022         goto PDC_CONV_EXIT;
1023 
1024     /* conversion to UTF-16 by swapping */
1025     if ((inutf == pdc_utf16be  || inutf == pdc_utf16le) &&
1026         (inutf != oututf || flags & PDC_CONV_TRYBYTES ||
1027          flags & PDC_CONV_HTMLCHAR))
1028     {
1029         if (inlen &&
1030             ((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) ||
1031              (inutf == pdc_utf16le &&  PDC_ISBIGENDIAN)))
1032         {
1033             if (inalloc)
1034                 pdc_swap_bytes2((char *) instring, inlen, NULL);
1035             else
1036             {
1037                 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1038                      pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
1039                      pdc_calloc(pdc, (size_t) (inlen + 2), fn));
1040                 pdc_swap_bytes2((char *) instring, inlen, (char *) instr);
1041 
1042                 inalloc = pdc_true;
1043                 instring = instr;
1044                 instr = NULL;
1045             }
1046         }
1047         inutf = pdc_utf16;
1048     }
1049 
1050     /* conversion to UTF-32 by swapping */
1051     if (inlen && inutf == pdc_utf32)
1052     {
1053 
1054         if ((pdc_is_utf32be_unicode(instring) && !PDC_ISBIGENDIAN) ||
1055             (pdc_is_utf32le_unicode(instring) &&  PDC_ISBIGENDIAN))
1056         {
1057             if (inalloc)
1058                 pdc_swap_bytes4((char *) instring, inlen, NULL);
1059             else
1060             {
1061                 instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1062                      pdc_calloc_tmp(pdc, (size_t) (inlen + 4), fn, NULL, NULL) :
1063                      pdc_calloc(pdc, (size_t) (inlen + 4), fn));
1064                 pdc_swap_bytes4((char *) instring, inlen, (char *) instr);
1065 
1066                 inalloc = pdc_true;
1067                 instring = instr;
1068                 instr = NULL;
1069             }
1070         }
1071     }
1072 
1073     /* illegal UTF-16 / UTF-32 */
1074     if (inutf >= pdc_utf16 && inlen % 2)
1075     {
1076         if (inutf == pdc_utf32 && inlen % 4)
1077             errcode = PDC_E_CONV_ILLUTF32;
1078         else
1079             errcode = PDC_E_CONV_ILLUTF16;
1080         goto PDC_CONV_ERROR;
1081     }
1082 
1083 
1084     /* conversion to UTF-16 by inflation or encoding vector */
1085     if (inutf == pdc_bytes &&
1086         (oututf != pdc_bytes || flags & PDC_CONV_HTMLCHAR || inev != outev))
1087     {
1088         if (logg)
1089         {
1090             if (flags & PDC_CONV_HTMLCHAR)
1091                 pdc_logg(pdc, "\t\tbyte character entity substitution\n");
1092         }
1093 
1094         len = 2 * inlen;
1095         instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1096              pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1097              pdc_calloc(pdc, (size_t) (len + 2), fn));
1098         usinstr = (pdc_ushort *) instr;
1099 
1100         j = 0;
1101         for (i = 0; i < inlen; i++)
1102         {
1103             uv = (pdc_ushort) instring[i];
1104             if (inev)
1105             {
1106                 uv = inev->codes[uv];
1107                 if (!uv && (flags & PDC_CONV_ENCERROR) &&
1108                     (!(flags & PDC_CONV_KEEPLBCHAR) ||
1109                      !pdc_is_linebreaking_relchar(uv)))
1110                 {
1111                     errcode = PDC_E_ENC_NOTDEF_CODE;
1112                     stemp1 = pdc_errprintf(pdc, "x%02X", instring[i]);
1113                     stemp2 = inev->apiname;
1114                     goto PDC_CONV_ERROR;
1115                 }
1116             }
1117 
1118 
1119             usinstr[j] = uv;
1120             j++;
1121         }
1122 
1123         if (inalloc)
1124         {
1125             if (flags & PDC_CONV_TMPALLOC)
1126                 pdc_free_tmp(pdc, instring);
1127             else
1128                 pdc_free(pdc, instring);
1129         }
1130 
1131         inalloc = pdc_true;
1132         instring = instr;
1133         instr = NULL;
1134         inlen = 2 * j;
1135         inutf = pdc_utf16;
1136     }
1137 
1138 
1139 
1140     /* UTF conversion */
1141     oututf_s = oututf;
1142     if ((oututf_s == pdc_bytes && inutf == pdc_utf8) ||
1143          oututf_s == pdc_utf16be || oututf_s == pdc_utf16le)
1144         oututf_s = pdc_utf16;
1145     if (inutf != oututf_s && oututf_s != pdc_bytes)
1146     {
1147         len = 4 * (inlen + 1);
1148         instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1149              pdc_calloc_tmp(pdc, (size_t) len, fn, NULL, NULL) :
1150              pdc_calloc(pdc, (size_t) len, fn));
1151 
1152         if (inlen)
1153         {
1154             pdc_convers_result result = conversionOK;
1155             pdc_byte *instringa, *instra, *instringe, *instre;
1156             UTF8 *isa8 = NULL, *ise8 = NULL;
1157             UTF16 *isa16, *ise16;
1158             UTF32 *isa32, *ise32;
1159 
1160             if (logg)
1161                pdc_logg(pdc, "\t\tUTF conversion\n");
1162 
1163             instringa = instring;
1164             instringe = instring + inlen;
1165             instra = instr;
1166             instre = instr + len;
1167 
1168             if (inutf == pdc_utf8)
1169             {
1170                 isa8 = (UTF8 *) instringa;
1171                 ise8 = (UTF8 *) instringe;
1172                 if (oututf_s == pdc_utf16)
1173                 {
1174                     isa16 = (UTF16 *) instra;
1175                     ise16 = (UTF16 *) instre;
1176                     result = pdc_convertUTF8toUTF16(&isa8, ise8,
1177                                                     &isa16, ise16,
1178                                                     strictConversion);
1179                     instra = (pdc_byte *) isa16;
1180                     instre = (pdc_byte *) ise16;
1181                 }
1182                 else
1183                 {
1184                     isa32 = (UTF32 *) instra;
1185                     ise32 = (UTF32 *) instre;
1186                     result = pdc_convertUTF8toUTF32(&isa8, ise8,
1187                                                     &isa32, ise32,
1188                                                     strictConversion);
1189                     instra = (pdc_byte *) isa32;
1190                     instre = (pdc_byte *) ise32;
1191                 }
1192             }
1193             else if (inutf == pdc_utf16)
1194             {
1195                 isa16 = (UTF16 *) instringa;
1196                 ise16 = (UTF16 *) instringe;
1197                 if (oututf_s == pdc_utf8)
1198                 {
1199                     isa8 = (UTF8 *) instra;
1200                     ise8 = (UTF8 *) instre;
1201                     result = pdc_convertUTF16toUTF8(&isa16, ise16, &isa8, ise8,
1202                                                     strictConversion);
1203                     instra = (pdc_byte *) isa8;
1204                     instre = (pdc_byte *) ise8;
1205                 }
1206                 else
1207                 {
1208                     isa32 = (UTF32 *) instra;
1209                     ise32 = (UTF32 *) instre;
1210                     result = pdc_convertUTF16toUTF32(&isa16, ise16,
1211                                                      &isa32, ise32,
1212                                                      strictConversion);
1213                     instra = (pdc_byte *) isa32;
1214                     instre = (pdc_byte *) ise32;
1215                 }
1216             }
1217             else if (inutf == pdc_utf32)
1218             {
1219                 isa32 = (UTF32 *) instringa;
1220                 ise32 = (UTF32 *) instringe;
1221                 if (oututf_s == pdc_utf8)
1222                 {
1223                     isa8 = (UTF8 *) instra;
1224                     ise8 = (UTF8 *) instre;
1225                     result = pdc_convertUTF32toUTF8(&isa32, ise32,
1226                                                     &isa8, ise8,
1227                                                     strictConversion);
1228                     instra = (pdc_byte *) isa8;
1229                     instre = (pdc_byte *) ise8;
1230                 }
1231                 else
1232                 {
1233                     isa16 = (UTF16 *) instra;
1234                     ise16 = (UTF16 *) instre;
1235                     result = pdc_convertUTF32toUTF16(&isa32, ise32,
1236                                                      &isa16, ise16,
1237                                                      strictConversion);
1238                     instra = (pdc_byte *) isa16;
1239                     instre = (pdc_byte *) ise16;
1240                 }
1241             }
1242 
1243             switch (result)
1244             {
1245                 case targetExhausted:
1246                 errcode = PDC_E_CONV_MEMOVERFLOW;
1247                 break;
1248 
1249                 case sourceExhausted:
1250                 case sourceIllegal:
1251                 if (inutf == pdc_utf8)
1252                 {
1253                     UTF8 *bp, *bpe;
1254                     char *sb = sbuf;
1255 
1256                     bpe = MIN(ise8 - 1, isa8 + 3);
1257                     for (bp = isa8; bp <= bpe; bp++)
1258                         sb += sprintf(sb, "\\x%02X", *bp);
1259                     if (*bp)
1260                         sb += sprintf(sb, "...");
1261                     sb += sprintf(sb, " (");
1262                     for (bp = isa8; bp <= bpe; bp++)
1263                         sb += sprintf(sb, "%c", *bp);
1264                     if (*bp)
1265                         sb += sprintf(sb, "...");
1266                     sb += sprintf(sb, ")");
1267                     stemp1 = sbuf;
1268 
1269                     stemp2 = pdc_errprintf(pdc, "%d", isa8 - (UTF8 *)instringa);
1270 
1271                     if (flags & PDC_CONV_INFLATE)
1272                     {
1273                         pdc_warning(pdc, PDC_E_CONV_ILLUTF8SEQU, stemp1, stemp2,
1274                                     0, 0);
1275 
1276                         pdc_inflate_ascii((char *) instring, inlen,
1277                                           (char *) instr, pdc_utf16);
1278                         instra = instr + 2 * inlen;
1279                     }
1280                     else
1281                     {
1282                         errcode = PDC_E_CONV_ILLUTF8SEQU;
1283                     }
1284                 }
1285                 else
1286                 {
1287                     stemp1 = pdc_get_keyword((int)inutf, pdc_utfformat_keylist);
1288                     errcode = PDC_E_CONV_ILLUTF;
1289                 }
1290                 break;
1291 
1292                 default:
1293                 break;
1294             }
1295 
1296             if (errcode)
1297             {
1298                 if (logg)
1299                    pdc_logg(pdc, "\t\tUTF conversion error %d\n", result);
1300 
1301                 goto PDC_CONV_ERROR;
1302             }
1303 
1304             inlen = instra - instr;
1305         }
1306 
1307         if (inalloc)
1308         {
1309             if (flags & PDC_CONV_TMPALLOC)
1310                 pdc_free_tmp(pdc, instring);
1311             else
1312                 pdc_free(pdc, instring);
1313         }
1314 
1315         len = (oututf == pdc_utf32) ? inlen + 4 : inlen + 2;
1316         if (inlen + 4 != len)
1317             instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1318                  pdc_realloc_tmp(pdc, instr, (size_t) len, fn) :
1319                  pdc_realloc(pdc, instr, (size_t) len, fn));
1320         instr[inlen] = 0;
1321         instr[inlen + 1] = 0;
1322         if (oututf == pdc_utf32)
1323         {
1324             instr[inlen + 2] = 0;
1325             instr[inlen + 3] = 0;
1326         }
1327 
1328         inalloc = pdc_true;
1329         instring = instr;
1330         instr = NULL;
1331         inutf = oututf_s;
1332     }
1333 
1334     if (inutf == pdc_bytes)
1335     {
1336         if (!inalloc)
1337         {
1338             instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1339                  pdc_calloc_tmp(pdc, (size_t) (inlen + 2), fn, NULL, NULL) :
1340                  pdc_calloc(pdc, (size_t) (inlen + 2), fn));
1341             memcpy(instr, instring, (size_t) inlen);
1342 
1343             inalloc = pdc_true;
1344             instring = instr;
1345             instr = NULL;
1346         }
1347     }
1348 
1349     /* trying to reduce UTF-16 string to bytes string */
1350     if (inutf == pdc_utf16 &&
1351         (oututf == pdc_bytes || flags & PDC_CONV_TRYBYTES))
1352     {
1353         if (logg)
1354            pdc_logg(pdc, "\t\ttry to reduce UTF-16 to bytes\n");
1355 
1356         if (pdc_is_utf16be_unicode(instring) ||
1357             pdc_is_utf16le_unicode(instring))
1358             n = 1;
1359         else
1360             n = 0;
1361 
1362         len = (inlen - n) / 2;
1363         instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1364              pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1365              pdc_calloc(pdc, (size_t) (len + 2), fn));
1366         usinstr = (pdc_ushort *) instring;
1367 
1368         for (i = 0; i < len; i++)
1369         {
1370             uv = usinstr[i + n];
1371             if (outev && uv)
1372             {
1373                 j = pdc_get_encoding_bytecode(pdc, outev, uv);
1374                 if (j < 0 && (flags & PDC_CONV_ENCERROR) && oututf == pdc_bytes)
1375                 {
1376                     errcode = PDC_E_ENC_NOTDEF_UNICODE;
1377                     stemp1 = pdc_errprintf(pdc, "%04X", uv);
1378                     stemp2 = outev->apiname;
1379                     goto PDC_CONV_ERROR;
1380                 }
1381                 uv = (pdc_ushort) j;
1382             }
1383             if (uv > PDC_UNICODE_MAXLATIN1)
1384                 break;
1385 
1386             instr[i] = (pdc_byte) uv;
1387         }
1388 
1389         if (i == len)
1390         {
1391             if (inalloc)
1392             {
1393                 if (flags & PDC_CONV_TMPALLOC)
1394                     pdc_free_tmp(pdc, instring);
1395                 else
1396                     pdc_free(pdc, instring);
1397             }
1398 
1399             inalloc = pdc_true;
1400             instring = instr;
1401             instr = NULL;
1402             inlen = len;
1403             inutf = pdc_bytes;
1404         }
1405         else
1406         {
1407             if (flags & PDC_CONV_TMPALLOC)
1408                 pdc_free_tmp(pdc, instr);
1409             else
1410                 pdc_free(pdc, instr);
1411             instr = NULL;
1412         }
1413     }
1414 
1415     /* UTF-8 format */
1416     if (inutf == pdc_utf8)
1417     {
1418         hasbom = pdc_is_utf8_unicode(instring);
1419 
1420         if (flags & PDC_CONV_TRY7BYTES)
1421         {
1422             if (logg)
1423                pdc_logg(pdc, "\t\ttry to reduce UTF-8 to 7-bit\n");
1424 
1425             for (i = hasbom ? 3 : 0; i < inlen; i++)
1426                 if (instring[i] > PDC_UNICODE_MAXASCII)
1427                     break;
1428             if (i == inlen)
1429             {
1430                 flags &= ~PDC_CONV_WITHBOM;
1431                 flags |= PDC_CONV_NOBOM;
1432                 inutf = pdc_bytes;
1433             }
1434         }
1435         else if (hasbom && (flags & PDC_CONV_AUTOBOM))
1436         {
1437             flags &= ~PDC_CONV_NOBOM;
1438             flags |= PDC_CONV_WITHBOM;
1439         }
1440         else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
1441         {
1442             flags &= ~PDC_CONV_NOBOM;
1443         }
1444 
1445         if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1446         {
1447             i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0;
1448             j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0;
1449 
1450             len = inlen + i - j;
1451             instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1452                  pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1453                  pdc_calloc(pdc, (size_t) (len + 2), fn));
1454             memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1455             instr[len] = 0;
1456 
1457             if (inalloc)
1458             {
1459                 if (flags & PDC_CONV_TMPALLOC)
1460                     pdc_free_tmp(pdc, instring);
1461                 else
1462                     pdc_free(pdc, instring);
1463             }
1464 
1465             inalloc = pdc_true;
1466             instring = instr;
1467             instr = NULL;
1468             inlen = len;
1469 
1470             hasbom = (flags & PDC_CONV_WITHBOM);
1471         }
1472 
1473         if (hasbom)
1474         {
1475             instring[0] = PDF_BOM2;
1476             instring[1] = PDF_BOM3;
1477             instring[2] = PDF_BOM4;
1478         }
1479 
1480     }
1481 
1482     /* UTF-16 formats */
1483     if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le)
1484     {
1485         hasbom = pdc_is_utf16be_unicode(instring) ||
1486                  pdc_is_utf16le_unicode(instring);
1487 
1488         if (hasbom && (flags & PDC_CONV_AUTOBOM))
1489         {
1490             flags &= ~PDC_CONV_NOBOM;
1491             flags |= PDC_CONV_WITHBOM;
1492         }
1493         else if ((flags & PDC_CONV_WITHBOM) && (flags & PDC_CONV_NOBOM))
1494         {
1495             flags &= ~PDC_CONV_NOBOM;
1496         }
1497 
1498         if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le ||
1499             flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1500         {
1501             i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0;
1502             j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0;
1503 
1504             len = inlen + i - j;
1505             instr = (pdc_byte *) ((flags & PDC_CONV_TMPALLOC) ?
1506                  pdc_calloc_tmp(pdc, (size_t) (len + 2), fn, NULL, NULL) :
1507                  pdc_calloc(pdc, (size_t) (len + 2), fn));
1508             memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1509 
1510             if (inalloc)
1511             {
1512                 if (flags & PDC_CONV_TMPALLOC)
1513                     pdc_free_tmp(pdc, instring);
1514                 else
1515                     pdc_free(pdc, instring);
1516             }
1517 
1518             instring = instr;
1519             instr = NULL;
1520             inlen = len;
1521 
1522             hasbom = (flags & PDC_CONV_WITHBOM);
1523         }
1524 
1525         i = hasbom ? 2 : 0;
1526         if (inutf == pdc_utf16)
1527         {
1528             if (oututf == pdc_utf16be)
1529             {
1530                 inutf = pdc_utf16be;
1531                 toswap = !PDC_ISBIGENDIAN;
1532             }
1533             if (oututf == pdc_utf16le)
1534             {
1535                 inutf = pdc_utf16le;
1536                 toswap = PDC_ISBIGENDIAN;
1537             }
1538             if (toswap)
1539                 pdc_swap_bytes2((char *) &instring[i], inlen - i, NULL);
1540         }
1541 
1542         if (hasbom)
1543         {
1544             if (inutf == pdc_utf16be ||
1545                 (inutf == pdc_utf16 && PDC_ISBIGENDIAN))
1546             {
1547                 instring[0] = PDF_BOM0;
1548                 instring[1] = PDF_BOM1;
1549             }
1550             if (inutf == pdc_utf16le ||
1551                 (inutf == pdc_utf16 && !PDC_ISBIGENDIAN))
1552             {
1553                 instring[0] = PDF_BOM1;
1554                 instring[1] = PDF_BOM0;
1555             }
1556         }
1557     }
1558 
1559     if (logg)
1560         pdc_logg(pdc, "\t\ttextformat of converted string: %s\n",
1561                  pdc_get_keyword(inutf, pdc_textformat_keylist));
1562 
1563     PDC_CONV_EXIT:
1564     *oututf_p = inutf;
1565     if (outlen)
1566         *outlen = inlen;
1567     *outstring = instring;
1568     return 0;
1569 
1570     PDC_CONV_ERROR:
1571     if (outlen)
1572         *outlen = 0;
1573     *outstring = NULL;
1574 
1575     if (errcode > 0)
1576         pdc_set_errmsg(pdc, errcode, stemp1, stemp2, 0, 0);
1577 
1578     if (instr != NULL)
1579     {
1580         if (flags & PDC_CONV_TMPALLOC)
1581             pdc_free_tmp(pdc, instr);
1582         else
1583             pdc_free(pdc, instr);
1584     }
1585 
1586     if (inalloc)
1587     {
1588         if (flags & PDC_CONV_TMPALLOC)
1589             pdc_free_tmp(pdc, instring);
1590         else
1591             pdc_free(pdc, instring);
1592     }
1593 
1594     if (verbose)
1595         PDC_RETHROW(pdc);
1596 
1597     return errcode;
1598 }
1599 #if defined(_MSC_VER) && defined(_MANAGED)
1600 #pragma managed
1601 #endif
1602 
1603 
1604 /*
1605  *  pdc_convert_name_ext converts a string of name data type to UTF-8
1606  *
1607  *  flags & PDC_CONV_EBCDIC: converts to EBCDIC-UTF-8
1608  *
1609  *  len == 0: If the string has a [EBCDIC-]UTF-8 BOM or
1610  *            flags & PDC_CONV_ISUTF8 is set the string will be duplicated.
1611  *            Otherwise the string has encoding enc and codepage
1612  *            codepage.
1613  *            If enc == pdc_unicode the string is "UTF-16" encoded.
1614  *            Otherwise: If enc < pdc_winansi the string is "host" encoded.
1615  *
1616  *  len  > 0: The string is a UTF-16 string of len bytes.
1617  *
1618  */
1619 char *
pdc_convert_name_ext(pdc_core * pdc,const char * name,int len,pdc_encoding enc,int codepage,int flags)1620 pdc_convert_name_ext(pdc_core *pdc, const char *name, int len,
1621                      pdc_encoding enc, int codepage, int flags)
1622 {
1623     static const char fn[] = "pdc_convert_name_ext";
1624     pdc_encodingvector *ev = NULL;
1625     pdc_text_format nameformat = pdc_utf16;
1626     pdc_text_format outnameformat = pdc_utf8;
1627     pdc_byte *convname;
1628     char *outname = NULL;
1629     int outlen;
1630 
1631     if (name == NULL)
1632         return NULL;
1633 
1634     if (len == 0)
1635     {
1636         /* already [EBCDIC-]UTF-8 encoded */
1637         if ((flags & PDC_CONV_ISUTF8) || pdc_is_utf8_bytecode(name))
1638         {
1639             if (!(flags & PDC_CONV_WITHBOM))
1640                 flags |= PDC_CONV_NOBOM;
1641 
1642             if (!(flags & PDC_CONV_EBCDIC))
1643                 flags |= PDC_CONV_ASCII;
1644 
1645             /* On EBCDIC platforms EBCDIC-UTF-8 name strings are expected */
1646             outname = pdc_strdup_ext(pdc, name, (flags & ~PDC_CONV_EBCDIC), fn);
1647 
1648             if (outname != NULL)
1649                 return outname;
1650         }
1651 
1652         /* see bug #1486 */
1653         if (enc == pdc_unicode)
1654         {
1655             /* UTF-16 encoded string */
1656             len = (int) pdc_wstrlen(name);
1657         }
1658         else
1659         {
1660             /* 8-bit encoded string */
1661             nameformat = pdc_bytes;
1662             if (enc < pdc_winansi)
1663                 ev = pdc_get_encoding_vector(pdc,pdc_find_encoding(pdc,"host"));
1664             else
1665                 ev = pdc_get_encoding_vector(pdc, enc);
1666 
1667             len = (int) strlen(name);
1668         }
1669     }
1670 
1671     if (flags & PDC_CONV_EBCDIC)
1672         outnameformat = PDC_UTF8;
1673 
1674     flags |= PDC_CONV_TRY7BYTES;
1675     if (pdc->charref)
1676         flags |= PDC_CONV_HTMLCHAR;
1677     if (pdc->escapesequ)
1678         flags |= PDC_CONV_BSSEQU;
1679 
1680     /* convert to UTF-8 */
1681     pdc_convert_string(pdc, nameformat, codepage, ev, (pdc_byte *) name, len,
1682                 &outnameformat, NULL, &convname, &outlen, flags,
1683                 pdc_true);
1684 
1685     return (char *) convname;
1686 }
1687 
1688 char *
pdc_convert_name(pdc_core * pdc,const char * name,int len,int flags)1689 pdc_convert_name(pdc_core *pdc, const char *name, int len, int flags)
1690 {
1691     return pdc_convert_name_ext(pdc, name, len, pdc_invalidenc, 0, flags);
1692 }
1693 
1694 /* returned string is temporary allocated
1695 */
1696 char *
pdc_utf8_to_hostbytes(pdc_core * pdc,pdc_bool honorlang,char * name)1697 pdc_utf8_to_hostbytes(pdc_core *pdc, pdc_bool honorlang, char *name)
1698 {
1699     static const char fn[] = "pdc_utf8_to_hostbytes";
1700     pdc_encoding outenc = pdc_invalidenc;
1701     pdc_encodingvector *outev = NULL;
1702     pdc_text_format informat = PDC_UTF8;
1703     pdc_text_format outformat = pdc_utf16;
1704     pdc_byte *outname = NULL;
1705     int len = (int) strlen(name);
1706 
1707     {
1708         (void) fn;
1709         (void) honorlang;
1710         outenc = pdc_find_encoding(pdc, "host");
1711     }
1712 
1713     outev = pdc_get_encoding_vector(pdc, outenc);
1714 
1715     pdc_convert_string(pdc, informat, 0, NULL, (pdc_byte *) name, len,
1716                 &outformat, outev, &outname, &len,
1717                 PDC_CONV_TRYBYTES | PDC_CONV_NOBOM | PDC_CONV_TMPALLOC,
1718                 pdc_true);
1719     if (outformat == pdc_utf16)
1720     {
1721         pdc_free_tmp(pdc, outname);
1722         outname = NULL;
1723     }
1724 
1725     return (char *) outname;
1726 }
1727 
1728 /* returned string is temporary allocated
1729 */
1730 char *
pdc_hostbytes_to_utf8(pdc_core * pdc,pdc_bool honorlang,char * name)1731 pdc_hostbytes_to_utf8(pdc_core *pdc, pdc_bool honorlang, char *name)
1732 {
1733     static const char fn[] = "pdc_hostbytes_to_utf8";
1734     pdc_encoding inenc = pdc_invalidenc;
1735     pdc_encodingvector *inev = NULL;
1736     pdc_text_format informat = pdc_bytes;
1737     pdc_text_format outformat = PDC_UTF8;
1738     pdc_byte *outname = NULL;
1739     int len = (int) strlen(name);
1740 
1741     {
1742         (void) fn;
1743         (void) honorlang;
1744         inenc = pdc_find_encoding(pdc, "host");
1745     }
1746 
1747     inev = pdc_get_encoding_vector(pdc, inenc);
1748 
1749     pdc_convert_string(pdc, informat, 0, inev, (pdc_byte *) name, len,
1750                 &outformat, NULL, &outname, &len,
1751                 PDC_CONV_NOBOM | PDC_CONV_TMPALLOC, pdc_true);
1752 
1753     return (char *) outname;
1754 }
1755 
1756 /* --------------------- basic UTF conversion functions --------------------- */
1757 
1758 char *
pdc_utf16_to_utf8(pdc_core * pdc,const char * utf16string,int len,int flags,int * size)1759 pdc_utf16_to_utf8(pdc_core *pdc, const char *utf16string, int len, int flags,
1760                   int *size)
1761 {
1762     pdc_text_format outtextformat = pdc_utf8;
1763     pdc_byte *utf8string = NULL;
1764     int outlen;
1765 
1766     if (!utf16string)
1767         pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);
1768 
1769     if (flags & PDC_CONV_EBCDIC)
1770         outtextformat = PDC_UTF8;
1771 
1772     flags |= PDC_CONV_AUTOBOM;
1773     pdc_convert_string(pdc, pdc_utf16, 0, NULL,
1774                        (pdc_byte *) utf16string, len,
1775                        &outtextformat, NULL, &utf8string, &outlen,
1776                        flags, pdc_true);
1777 
1778     if (size) *size = outlen;
1779 
1780     return (char *) utf8string;
1781 }
1782 
1783 char *
pdc_utf8_to_utf16(pdc_core * pdc,const char * utf8string,const char * format,int flags,int * size)1784 pdc_utf8_to_utf16(pdc_core *pdc, const char *utf8string, const char *format,
1785                   int flags, int *size)
1786 {
1787     pdc_text_format textformat = pdc_utf8;
1788     pdc_text_format outtextformat = pdc_utf16;
1789     pdc_byte *utf16string = NULL;
1790     int len;
1791 
1792     if (!utf8string)
1793         pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf8string", 0, 0, 0);
1794     len = (int) strlen(utf8string);
1795 
1796     if (format && *format)
1797     {
1798         int k = pdc_get_keycode_ci(format, pdc_textformat_keylist);
1799 
1800         /* see bug #2175 */
1801         if (k == PDC_KEY_NOTFOUND)
1802         {
1803             char **sfl;
1804             const char *sf;
1805             int ns, i;
1806 
1807             sf = NULL;
1808             ns = pdc_split_stringlist(pdc, format, NULL, 0, &sfl);
1809             for (i = 0; i < ns; i++)
1810             {
1811                 if (!strcmp(sfl[i], "inflate"))
1812                     flags |= PDC_CONV_INFLATE;
1813                 else
1814                     sf = sfl[i];
1815             }
1816             if (sf != NULL)
1817                 k = pdc_get_keycode_ci(sf, pdc_textformat_keylist);
1818             else
1819                 k = pdc_utf16;
1820 
1821             pdc_cleanup_stringlist(pdc, sfl);
1822         }
1823 
1824         if (k == PDC_KEY_NOTFOUND ||
1825             ((pdc_text_format) k != pdc_utf16 &&
1826              (pdc_text_format) k != pdc_utf16be &&
1827              (pdc_text_format) k != pdc_utf16le))
1828             pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
1829 
1830         outtextformat = (pdc_text_format) k;
1831     }
1832 
1833     if (flags & PDC_CONV_EBCDIC)
1834         textformat = PDC_UTF8;
1835 
1836     if (outtextformat == pdc_utf16)
1837         flags |= PDC_CONV_AUTOBOM;
1838     else
1839         flags |= PDC_CONV_WITHBOM;
1840     pdc_convert_string(pdc, textformat, 0, NULL,
1841                       (pdc_byte *) utf8string, len,
1842                       &outtextformat, NULL, &utf16string, size,
1843                       flags, pdc_true);
1844 
1845     return (char *) utf16string;
1846 }
1847 
1848 char *
pdc_utf16_to_utf32(pdc_core * pdc,const char * utf16string,int len,int * size)1849 pdc_utf16_to_utf32(pdc_core *pdc, const char *utf16string, int len, int *size)
1850 {
1851     pdc_text_format outtextformat = pdc_utf32;
1852     pdc_byte *utf32string = NULL;
1853 
1854     if (!utf16string)
1855         pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf16string", 0, 0, 0);
1856 
1857     pdc_convert_string(pdc, pdc_utf16, 0, NULL,
1858                        (pdc_byte *) utf16string, len,
1859                        &outtextformat, NULL, &utf32string, size,
1860                        0, pdc_true);
1861 
1862     return (char *) utf32string;
1863 }
1864 
1865 char *
pdc_utf32_to_utf8(pdc_core * pdc,const char * utf32string,int len,int flags,int * size)1866 pdc_utf32_to_utf8(pdc_core *pdc, const char *utf32string, int len, int flags,
1867                   int *size)
1868 {
1869     pdc_text_format outtextformat = pdc_utf8;
1870     pdc_byte *utf8string = NULL;
1871     int outlen;
1872 
1873     if (!utf32string)
1874         pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);
1875 
1876     if (flags & PDC_CONV_EBCDIC)
1877         outtextformat = PDC_UTF8;
1878 
1879     flags |= PDC_CONV_AUTOBOM;
1880     pdc_convert_string(pdc, pdc_utf32, 0, NULL,
1881                        (pdc_byte *) utf32string, len,
1882                        &outtextformat, NULL, &utf8string, &outlen,
1883                        flags, pdc_true);
1884 
1885     if (size) *size = outlen;
1886 
1887     return (char *) utf8string;
1888 }
1889 
1890 char *
pdc_utf32_to_utf16(pdc_core * pdc,const char * utf32string,int len,const char * format,int flags,int * size)1891 pdc_utf32_to_utf16(pdc_core *pdc, const char *utf32string, int len,
1892                    const char *format, int flags, int *size)
1893 {
1894     pdc_text_format textformat = pdc_utf32;
1895     pdc_text_format outtextformat = pdc_utf16;
1896     pdc_byte *utf16string = NULL;
1897 
1898     if (!utf32string)
1899         pdc_error(pdc, PDC_E_ILLARG_EMPTY, "utf32string", 0, 0, 0);
1900 
1901     if (format && *format)
1902     {
1903         int k = pdc_get_keycode_ci(format, pdc_textformat_keylist);
1904         if (k == PDC_KEY_NOTFOUND ||
1905             ((pdc_text_format) k != pdc_utf16 &&
1906              (pdc_text_format) k != pdc_utf16be &&
1907              (pdc_text_format) k != pdc_utf16le))
1908             pdc_error(pdc, PDC_E_ILLARG_STRING, "format", format, 0, 0);
1909         outtextformat = (pdc_text_format) k;
1910     }
1911 
1912     if (outtextformat == pdc_utf16)
1913         flags |= PDC_CONV_AUTOBOM;
1914     else
1915         flags |= PDC_CONV_WITHBOM;
1916     pdc_convert_string(pdc, textformat, 0, NULL,
1917                       (pdc_byte *) utf32string, len,
1918                       &outtextformat, NULL, &utf16string, size,
1919                       flags, pdc_true);
1920 
1921     return (char *) utf16string;
1922 }
1923 
1924 int
pdc_char16_to_char32(pdc_core * pdc,const pdc_ushort * ustext,int * ic,int len,pdc_bool verbose)1925 pdc_char16_to_char32(pdc_core *pdc, const pdc_ushort *ustext, int *ic, int len,
1926                      pdc_bool verbose)
1927 {
1928     pdc_ushort uvh = ustext[*ic];
1929 
1930     if (uvh < PDC_UNICODE_MINHIGHSUR || uvh > PDC_UNICODE_MAXLOWSUR)
1931     {
1932         return (int) uvh;
1933     }
1934     else
1935     {
1936         UTF16 *isa16 = (UTF16 *) &ustext[*ic];
1937         pdc_ushort uvl = 0;
1938         int icn = *ic + 1;
1939 
1940         if (icn < len)
1941         {
1942             uvl = ustext[icn];
1943             if (uvh <= PDC_UNICODE_MAXHIGHSUR)
1944             {
1945                 if (uvl >= PDC_UNICODE_MINLOWSUR &&
1946                     uvl <= PDC_UNICODE_MAXLOWSUR)
1947                 {
1948                     int usv;
1949                     UTF16 *ise16 = isa16 + 2;
1950                     UTF32 *isa32 = (UTF32 *) &usv;
1951                     UTF32 *ise32 = isa32 + 1;
1952 
1953                     pdc_convers_result result = pdc_convertUTF16toUTF32(
1954                                 &isa16, ise16, &isa32, ise32, strictConversion);
1955                     if (result == conversionOK)
1956                     {
1957                         *ic = icn;
1958                         return usv;
1959                     }
1960                 }
1961             }
1962         }
1963 
1964         pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF16SUR,
1965                        pdc_errprintf(pdc, "%04X", uvh),
1966                        pdc_errprintf(pdc, "%04X", uvl), 0, 0);
1967 
1968         if (verbose)
1969             pdc_error(pdc, -1, 0, 0, 0, 0);
1970     }
1971 
1972     return -1;
1973 }
1974 
1975 int
pdc_char32_to_char16(pdc_core * pdc,int usv,pdc_ushort * uvlist,pdc_bool verbose)1976 pdc_char32_to_char16(pdc_core *pdc, int usv, pdc_ushort *uvlist,
1977                      pdc_bool verbose)
1978 {
1979     if (usv < PDC_NUM_BMPVAL)
1980     {
1981         uvlist[0] = (pdc_ushort) usv;
1982         return 1;
1983     }
1984     else
1985     {
1986         UTF32 *isa32 = (UTF32 *) &usv;
1987         UTF32 *ise32 = isa32 + 1;
1988         UTF16 *isa16 = (UTF16 *) uvlist;
1989         UTF16 *ise16 = isa16 + 2;
1990 
1991         pdc_convers_result result = pdc_convertUTF32toUTF16(
1992                     &isa32, ise32, &isa16, ise16, strictConversion);
1993         if (result == conversionOK)
1994         {
1995             return 2;
1996         }
1997 
1998         pdc_set_errmsg(pdc, PDC_E_CONV_ILLUTF32CHAR,
1999                        pdc_errprintf(pdc, "%05X", usv), 0, 0, 0);
2000 
2001         if (verbose)
2002             pdc_error(pdc, -1, 0, 0, 0, 0);
2003     }
2004 
2005     return 0;
2006 }
2007