1 /**
2   @file uemf_utf.c
3 
4   @brief Functions for manipulating UTF and various types of text.
5 
6 
  Compile with "U_VALGRIND" defined to enable code which lets valgrind check each record for
  uninitialized data.
9 
10   Compile with "SOL8" defined for Solaris 8 or 9 (Sparc).
11 */
12 
13 /*
14 File:      uemf_utf.c
15 Version:   0.0.5
16 Date:      29-JAN-2014
17 Author:    David Mathog, Biology Division, Caltech
18 email:     mathog@caltech.edu
19 Copyright: 2014 David Mathog and California Institute of Technology (Caltech)
20 */
21 
22 #ifdef __cplusplus
23 extern "C" {
24 #endif
25 
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <iconv.h>
30 #include <wchar.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <limits.h> // for INT_MAX, INT_MIN
34 #include <math.h>   // for U_ROUND()
35 #include "uemf_utf.h"
36 
37 //! \cond
38 /* Prototypes for functions used here and defined in uemf_endian.c, but which are not supposed
39 to be used in end user code. */
40 
41 void U_swap2(void *ul, unsigned int count);
42 //! \endcond
43 
44 /* ******************************************************************************************** */
45 
46 /** \cond */
47 /* iconv() has a funny cast on some older systems, on most recent ones
48    it is just char **.  This tries to work around the issue.  If you build this
49    on another funky system this code may need to be modified, or define ICONV_CAST
50    on the compile line(but it may be tricky).
51 */
52 #if _LIBICONV_VERSION == 0x0109
53 # define ICONV_CAST (const char **)
54 #endif  // _LIBICONV_VERSION 0x0109
55 #ifdef SOL8
56 # define ICONV_CAST (const char **)
57 #endif  //SOL8
58 #if !defined(ICONV_CAST)
59 # define ICONV_CAST (char **)
60 #endif  //ICONV_CAST
61 /** \endcond */
62 
63 /* **********************************************************************************************
These functions are used for development and debugging and should not be included in production code.
65 *********************************************************************************************** */
66 
/**
    \brief Dump a UTF8 string to stdout, one byte per line.  Not for use in production code.
    \param src string to examine, a NULL pointer is reported and not dereferenced

    A header line is printed first.  Each following line holds the position of
    a byte in the string, then that byte as a decimal number, then as hexadecimal.
    The terminator itself is not printed.
*/
void wchar8show(
      const char *src
   ){
   if(!src){
      printf("char show <NULL>\n");
      return;
   }
   printf("char show\n");
   for(size_t idx = 0; src[idx]; idx++){
      printf("%d %d %x\n",(int) idx,src[idx],src[idx]);
   }
}
83 
/**
    \brief Dump a UTF16 string to stdout, one 16 bit unit per line.  Not for use in production code.
    \param src string to examine, a NULL pointer is reported and not dereferenced

    A header line is printed first.  Each following line holds the position of
    a unit in the string, then that unit as a decimal number, then as hexadecimal.
    The terminator itself is not printed.
*/
void wchar16show(
      const uint16_t *src
   ){
   if(!src){
      printf("uint16_t show <NULL>\n");
      return;
   }
   printf("uint16_t show\n");
   for(size_t idx = 0; src[idx]; idx++){
      printf("%d %d %x\n",(int) idx,src[idx],src[idx]);
   }
}
100 
/**
    \brief Dump a UTF32 string to stdout, one 32 bit unit per line.  Not for use in production code.
    \param src string to examine, a NULL pointer is reported and not dereferenced

    A header line is printed first.  Each following line holds the position of
    a unit in the string, then that unit as a decimal number, then as hexadecimal.
    The terminator itself is not printed.
*/
void wchar32show(
      const uint32_t *src
   ){
   if(!src){
      printf("uint32_t show <NULL>\n");
      return;
   }
   printf("uint32_t show\n");
   for(size_t idx = 0; src[idx]; idx++){
      printf("%d %d %x\n",(int) idx,src[idx],src[idx]);
   }
}
116 
/**
    \brief Dump a wchar_t string to stdout, one character per line.  Not for use in production code.
    \param src string to examine, a NULL pointer is reported and not dereferenced

    A header line is printed first.  Each following line holds the position of
    a character in the string, then its value as a decimal number, then as hexadecimal.
    The terminator itself is not printed.
*/
void wchartshow(
      const wchar_t *src
   ){
   if(!src){
      printf("wchar_t show <NULL>\n");
      return;
   }
   printf("wchar_t show\n");
   for(size_t idx = 0; src[idx]; idx++){
      // wchar_t is not strictly an integer type, so handing it straight to printf can cause
      // warnings.  Copy it into a fixed width integer first.
      uint32_t code = src[idx];
      printf("%d %d %x\n",(int) idx,code,code);
   }
}
140 
141 /* **********************************************************************************************
142 These functions are used for character type conversions, Image conversions, and other
143 utility operations
144 *********************************************************************************************** */
145 
/**
    \brief Count the 16 bit storage units in a string, up to but not including the terminator.
    \return number of units before the first zero unit, 0 if src is NULL
    \param src string to examine
*/
size_t wchar16len(
      const uint16_t *src
   ){
   if(!src){ return(0); }
   const uint16_t *end = src;
   while(*end){ end++; }              // walk to the terminator
   return((size_t)(end - src));
}
159 
/**
    \brief Count the 32 bit storage units in a string, up to but not including the terminator.
    \return number of units before the first zero unit, 0 if src is NULL
    \param src string to examine
*/
size_t wchar32len(
      const uint32_t *src
   ){
   if(!src){ return(0); }
   const uint32_t *end = src;
   while(*end){ end++; }              // walk to the terminator
   return((size_t)(end - src));
}
173 
/**
    \brief Strncpy for wchar16 (UTF16).
    \param dst destination (already allocated)
    \param src source, if NULL nothing is copied
    \param nchars maximum number of characters to copy

    Copying stops after the terminator has been copied or after nchars
    characters, whichever comes first.  Unlike the standard strncpy() the
    remainder of dst is NOT padded, and dst is not terminated when src holds
    nchars or more characters.
*/
void   wchar16strncpy(
      uint16_t       *dst,
      const uint16_t *src,
      size_t          nchars
   ){
   if(!src){ return; }
   size_t i = 0;
   while(i < nchars){
      uint16_t unit = src[i];
      dst[i] = unit;
      if(unit == 0){ break; }          // terminator has been copied, done
      i++;
   }
}
192 
/**
    \brief Fill the output string with N characters, if the input string is shorter than N, pad with nulls.
    \param dst destination (already allocated, room for at least nchars characters)
    \param src source, if NULL dst is left untouched
    \param nchars number of characters to write to dst

    Exactly nchars characters are written.  When src holds nchars or more
    characters before its terminator, dst is NOT terminated.  At most nchars
    characters of src are examined, so src may be an unterminated field of
    exactly nchars characters.
*/
void   wchar16strncpypad(
      uint16_t       *dst,
      const uint16_t *src,
      size_t          nchars
   ){
   if(!src){ return; }
   size_t i = 0;
   // The count is tested before src[i] is read so that src is never examined past nchars characters.
   for(; i < nchars && src[i]; i++){ dst[i] = src[i]; }
   for(; i < nchars; i++){           dst[i] = 0;      }  // Pad the remainder
}
210 
/*  For the following conversion functions, remember that iconv() modifies ALL of its parameters,
212     so save a pointer to the destination buffer!!!!
213     It isn't clear that terminators are being
214     copied properly, so be sure allocated space is a bit larger and cleared.
215 */
216 
217 /**
218     \brief Convert a UTF32LE string to a UTF16LE string.
219     \returns pointer to new string or NULL if it fails
220     \param src wchar_t string to convert
221     \param max number of characters to convert, if 0, until terminator
222     \param len number of characters in new string, NOT including terminator
223 */
U_Utf32leToUtf16le(const uint32_t * src,size_t max,size_t * len)224 uint16_t *U_Utf32leToUtf16le(
225       const uint32_t *src,
226       size_t          max,
227       size_t         *len
228    ){
229    char *dst,*dst2;
230    char *src2 = (char *) src;
231    size_t srclen,dstlen,status;
232 
233    if(!src)return(NULL);
234    if(max){ srclen = 4*max; }
235    else {   srclen = 4 + 4*wchar32len(src); } //include terminator, length in BYTES
236 
237    dstlen = 2 + srclen;                     // this will always work, but may waste space
238    dst2  = dst = calloc(dstlen,1);          // so there will be at least one terminator
239    if(dst){
240       iconv_t conv = iconv_open("UTF-16LE", "UTF-32LE");
241       if ( conv == (iconv_t) -1){
242          free(dst2);
243          dst2=NULL;
244       }
245       else {
246          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
247          iconv_close(conv);
248          if(status == (size_t) -1){
249             free(dst2);
250             dst2 = NULL;
251          }
252          else if(len){
253             *len=wchar16len((uint16_t *)dst2);
254          }
255       }
256    }
257    return((uint16_t *)dst2);
258 }
259 
260 /**
261     \brief  Convert a UTF16LE string to a UTF32LE string.
262     \return pointer to new string or NULL if it fails
263     \param src UTF16LE string to convert
264     \param max number of characters to convert, if 0, until terminator
265     \param len number of characters in new string, NOT including terminator
266 */
U_Utf16leToUtf32le(const uint16_t * src,size_t max,size_t * len)267 uint32_t *U_Utf16leToUtf32le(
268       const uint16_t *src,
269       size_t          max,
270       size_t         *len
271    ){
272    char *dst,*dst2;
273    char *src2 = (char *) src;
274    size_t srclen,dstlen,status;
275 
276    if(!src)return(NULL);
277    if(max){ srclen = 2*max; }
278    else {   srclen = 2*wchar16len(src)+2; } // include terminator, length in BYTES
279    dstlen = 2*(2 + srclen);                 // This should always work
280    dst2 = dst = calloc(dstlen,1);
281    if(dst){
282       iconv_t conv = iconv_open("UTF-32LE",   "UTF-16LE");
283       if ( conv == (iconv_t) -1){
284          free(dst2);
285          dst2=NULL;
286       }
287       else {
288          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
289          iconv_close(conv);
290          if(status == (size_t) -1){
291             free(dst2);
292             dst2 = NULL;
293          }
294          else if(len){
295             *len=wchar32len((uint32_t *)dst2);
296          }
297       }
298    }
299    return((uint32_t *) dst2);
300 }
301 
302 /**
303     \brief  Convert a Latin1 string to a UTF32LE string.
304     \return pointer to new string or NULL if it fails
305     \param src Latin1 string to convert
306     \param max number of characters to convert, if 0, until terminator
307     \param len number of characters in new string, NOT including terminator
308 
309 
310     U_EMR_EXTTEXTOUTA records are "8 bit ASCII".  In theory that is ASCII in an 8
311     bit character, but numerous applications store Latin1 in them, and some
312     _may_ store UTF-8 in them.  Since very vew Latin1 strings are valid UTF-8 strings,
313     call U_Utf8ToUtf32le first, and if it fails, then call this function.
314 */
U_Latin1ToUtf32le(const char * src,size_t max,size_t * len)315 uint32_t *U_Latin1ToUtf32le(
316       const char *src,
317       size_t      max,
318       size_t     *len
319    ){
320    char *dst,*dst2;
321    char *src2 = (char *) src;
322    size_t srclen,dstlen,status;
323 
324    if(!src)return(NULL);
325    if(max){ srclen = max; }
326    else {   srclen = strlen(src)+1; }       // include terminator, length in BYTES
327    dstlen = sizeof(uint32_t)*(1 + srclen);  // This should always work but might waste some space
328    dst2 = dst = calloc(dstlen,1);
329    if(dst){
330       iconv_t conv = iconv_open("UTF-32LE",   "LATIN1");
331       if ( conv == (iconv_t) -1){
332          free(dst2);
333          dst2=NULL;
334       }
335       else {
336          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
337          iconv_close(conv);
338          if(status == (size_t) -1){
339             free(dst2);
340             dst2 = NULL;
341          }
342          else if(len){
343             *len=wchar32len((uint32_t *)dst2);
344          }
345       }
346    }
347    return((uint32_t *) dst2);
348 }
349 
350 /**
351     \brief  Convert a UTF8 string to a UTF32LE string.
352     \return pointer to new string or NULL if it fails
353     \param src UTF8 string to convert
354     \param max number of characters to convert, if 0, until terminator
355     \param len number of characters in new string, NOT including terminator
356 */
U_Utf8ToUtf32le(const char * src,size_t max,size_t * len)357 uint32_t *U_Utf8ToUtf32le(
358       const char *src,
359       size_t      max,
360       size_t     *len
361    ){
362    char *dst,*dst2;
363    char *src2 = (char *) src;
364    size_t srclen,dstlen,status;
365 
366    if(!src)return(NULL);
367    if(max){ srclen = max; }
368    else {   srclen = strlen(src)+1; }       // include terminator, length in BYTES
369    dstlen = sizeof(uint32_t)*(1 + srclen);  // This should always work but might waste some space
370    dst2 = dst = calloc(dstlen,1);
371    if(dst){
372       iconv_t conv = iconv_open("UTF-32LE",   "UTF-8");
373       if ( conv == (iconv_t) -1){
374          free(dst2);
375          dst2=NULL;
376       }
377       else {
378          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
379          iconv_close(conv);
380          if(status == (size_t) -1){
381             free(dst2);
382             dst2 = NULL;
383          }
384          else if(len){
385             *len=wchar32len((uint32_t *)dst2);
386          }
387       }
388    }
389    return((uint32_t *) dst2);
390 }
391 
392 /**
393     \brief  Convert a UTF32LE string to a UTF8 string.
394     \return pointer to new string or NULL if it fails
395     \param src wchar_t string to convert
396     \param max number of characters to convert, if 0, until terminator
397     \param len number of characters in new string, NOT including terminator
398 */
U_Utf32leToUtf8(const uint32_t * src,size_t max,size_t * len)399 char *U_Utf32leToUtf8(
400       const uint32_t *src,
401       size_t          max,
402       size_t         *len
403    ){
404    char *dst,*dst2;
405    char *src2 = (char *) src;
406    size_t srclen,dstlen,status;
407 
408    if(!src)return(NULL);
409    if(max){ srclen = 4*max; }
410    else {   srclen = 4*(1 + wchar32len(src)); } //include terminator, length in BYTES
411    dstlen = 1 + srclen;                         // This should always work but might waste some space
412    dst2 = dst = calloc(dstlen,1);
413    if(dst){
414       iconv_t conv = iconv_open("UTF-8",   "UTF-32LE");
415       if ( conv == (iconv_t) -1){
416          free(dst2);
417          dst2=NULL;
418       }
419       else {
420          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
421          iconv_close(conv);
422          if(status == (size_t) -1){
423             free(dst2);
424             dst2 = NULL;
425          }
426          else if(len){
427             *len=strlen(dst2);
428          }
429       }
430    }
431    return(dst2);
432 }
433 
434 /**
435    \brief Convert a UTF-8 string to a UTF16-LE string.
436    \return pointer to new string or NULL if it fails
437    \param src UTF8 string to convert
438    \param max number of characters to convert, if 0, until terminator
439    \param len number of characters in new string, NOT including terminator
440 */
U_Utf8ToUtf16le(const char * src,size_t max,size_t * len)441 uint16_t *U_Utf8ToUtf16le(
442       const char   *src,
443       size_t        max,
444       size_t       *len
445    ){
446    char *dst,*dst2;
447    char *src2 = (char *) src;
448    size_t srclen,dstlen,status;
449 
450    if(!src)return(NULL);
451    if(max){ srclen = max; }
452    else {   srclen = strlen(src)+1; }       // include terminator, length in BYTES
453    dstlen = 2 * (1 + srclen);               // this will always work, but may waste space
454    dst2 = dst =calloc(dstlen,1);            // so there will always be a terminator
455    if(dst){
456       iconv_t conv = iconv_open("UTF-16LE", "UTF-8");
457       if ( conv == (iconv_t) -1){
458          free(dst2);
459          dst2=NULL;
460       }
461       else {
462          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
463          iconv_close(conv);
464          if(status == (size_t) -1){
465             free(dst2);
466             dst2 = NULL;
467          }
468          else if(len){
469             *len=wchar16len((uint16_t *)dst2);
470          }
471       }
472    }
473    return((uint16_t *)dst2);
474 }
475 
476 /**
477     \brief Convert a UTF16LE string to a UTF8 string.
478     \return pointer to new UTF8 string or NULL if it fails
479     \param src UTF16LE string to convert
480     \param max number of characters to convert, if 0, until terminator
481     \param len number of characters in new string, NOT including terminator
482 */
U_Utf16leToUtf8(const uint16_t * src,size_t max,size_t * len)483 char *U_Utf16leToUtf8(
484       const uint16_t *src,
485       size_t          max,
486       size_t         *len
487    ){
488    char *dst, *dst2;
489    char *src2 = (char *) src;
490    size_t srclen,dstlen,status;
491 
492    if(!src)return(NULL);
493    if(max){ srclen = 2*max; }
494    else {   srclen = 2*(1 +wchar16len(src)); } //include terminator, length in BYTES
495    dstlen = 1 + 2*srclen;                      // this will always work, but may waste space
496                                                // worst case is all glyphs (==max) need 4 UTF-8 encoded bytes + terminator.
497    dst2 = dst = (char *) calloc(dstlen,1);
498    if(dst){
499       iconv_t conv = iconv_open("UTF-8", "UTF-16LE");
500       if ( conv == (iconv_t) -1){
501          free(dst2);
502          dst2=NULL;
503       }
504       else {
505          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
506          iconv_close(conv);
507          if(status == (size_t) -1){
508             free(dst2);
509             dst2 = NULL;
510          }
511          else if(len){
512             *len=strlen(dst2);
513             dst = dst2;
514             dst2 = U_strdup(dst); // make a string of exactly the right size
515             free(dst);            // free the one which was probably too big
516          }
517       }
518    }
519    return(dst2);
520 }
521 
522 /**
523     \brief Convert a UTF16LE string to a LATIN1 string.
524     \return pointer to new UTF8 string or NULL if it fails
525     \param src UTF16LE string to convert
526     \param max number of characters to convert, if 0, until terminator
527     \param len number of characters in new string, NOT including terminator
528 */
U_Utf16leToLatin1(const uint16_t * src,size_t max,size_t * len)529 char *U_Utf16leToLatin1(
530       const uint16_t *src,
531       size_t          max,
532       size_t         *len
533    ){
534    char *dst, *dst2;
535    char *src2 = (char *) src;
536    size_t srclen,dstlen,status;
537 
538    if(!src)return(NULL);
539    if(max){ srclen = 2*max; }
540    else {   srclen = 2*(1 +wchar16len(src)); } //include terminator, length in BYTES
541    dstlen = 1 + srclen;                        // this will always work as latin1 is always 1 byte/character
542    dst2 = dst = (char *) calloc(dstlen,1);
543    if(dst){
544       iconv_t conv = iconv_open("LATIN1//TRANSLIT",   "UTF-16LE");
545       if ( conv == (iconv_t) -1){
546          free(dst2);
547          dst2=NULL;
548       }
549       else {
550          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
551          iconv_close(conv);
552          if(status == (size_t) -1){
553             free(dst2);
554             dst2 = NULL;
555          }
556          else if(len){
557             *len=strlen(dst2);
558             dst = dst2;
559             dst2 = U_strdup(dst); // make a string of exactly the right size
560             free(dst);            // free the one which was probably too big
561          }
562       }
563    }
564    return(dst2);
565 }
/**
    \brief Put a single 16 bit character into UTF-16LE form.

    Used in conjunction with U_Utf16leEdit(), because the character
    representation would otherwise be dependent on machine Endianness.

    When U_BYTE_SWAP is nonzero at compile time a copy of the character is
    passed through U_swap2() (defined in uemf_endian.c), otherwise the
    character is returned unchanged.

    \return UTF16LE representation of the character.
    \param src 16 bit character

*/
uint16_t U_Utf16le(const uint16_t src){
#if U_BYTE_SWAP
    uint16_t swapped = src;
    U_swap2(&swapped,1);
    return(swapped);
#else
    return(src);
#endif
}
583 
584 /**
585     \brief  Convert a UTF8 string to a Latin1 string.
586     \return pointer to new string or NULL if it fails
587     \param src Latin1 string to convert
588     \param max number of characters to convert, if 0, until terminator
589     \param len number of characters in new string, NOT including terminator
590 
591 
592     WMF uses latin1, others UTF-8, only some utf-8 can be converted to latin1.
593 
594 */
U_Utf8ToLatin1(const char * src,size_t max,size_t * len)595 char *U_Utf8ToLatin1(
596       const char *src,
597       size_t      max,
598       size_t     *len
599    ){
600    char *dst,*dst2;
601    char *src2 = (char *) src;
602    size_t srclen,dstlen,status;
603    if(max){ srclen = max; }
604    else {   srclen = strlen(src)+1; }       // include terminator, length in BYTES
605    dstlen = (1 + srclen);                   // This should always work but might waste some space
606    dst2 = dst = calloc(dstlen,1);
607    if(dst){
608       iconv_t conv = iconv_open("LATIN1//TRANSLIT",   "UTF-8"); // translate what can be, fill in with something close for the rest
609       if ( conv == (iconv_t) -1){
610          free(dst2);
611          dst2=NULL;
612       }
613       else {
614          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
615          iconv_close(conv);
616          if(status == (size_t) -1){
617             free(dst2);
618             dst2 = NULL;
619          }
620          else if(len){
621             *len=strlen(dst2);
622          }
623       }
624    }
625    return((char *) dst2);
626 }
627 
628 /**
629     \brief  Convert a Latin1 string to a UTF8 string.
630     \return pointer to new string or NULL if it fails
631     \param src Latin1 string to convert
632     \param max number of characters to convert, if 0, until terminator
633     \param len number of characters in new string, NOT including terminator
634 
635 
636     WMF uses latin1, others UTF-8, all Latin1 should be able to convert to utf-8.
637 
638 */
U_Latin1ToUtf8(const char * src,size_t max,size_t * len)639 char *U_Latin1ToUtf8(
640       const char *src,
641       size_t      max,
642       size_t     *len
643    ){
644    char *dst,*dst2;
645    char *src2 = (char *) src;
646    size_t srclen,dstlen,status;
647    if(max){ srclen = max; }
648    else {   srclen = strlen(src)+1; }       // include terminator, will waste some space
649    dstlen = (1 + 2*srclen);                 // This should always work because all latin1 convert to 1 or 2 byte UTF8, it might waste some space
650    dst2 = dst = calloc(dstlen,1);
651    if(dst){
652       iconv_t conv = iconv_open("UTF-8", "LATIN1"); // everything should translate
653       if ( conv == (iconv_t) -1){
654          free(dst2);
655          dst2=NULL;
656       }
657       else {
658          status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
659          iconv_close(conv);
660          if(status == (size_t) -1){
661             free(dst2);
662             dst2 = NULL;
663          }
664          else if(len){
665             *len=strlen(dst2);
666          }
667       }
668    }
669    return((char *) dst2);
670 }
671 
/**
    \brief Single character replacement in a UTF-16LE string.

    Used solely for the Description field which contains
    embedded nulls, which makes it difficult to manipulate.  Use some other character and then swap it.

    The string is edited in place.  Scanning stops at the first character
    which is zero when it is examined.

    \return number of substitutions, or -1 if src is not defined
    \param src UTF16LE string to edit
    \param find character to replace
    \param replace substitute character

*/
int U_Utf16leEdit(
      uint16_t *src,
      uint16_t  find,
      uint16_t  replace
   ){
   if(!src){ return(-1); }
   int hits = 0;
   for(uint16_t *cursor = src; *cursor; cursor++){
      if(*cursor != find){ continue; }
      *cursor = replace;
      hits++;
   }
   return(hits);
}
697 
/**
    \brief strdup for when strict C99 compliance is enforced
    \returns duplicate string (allocated with malloc) or NULL on error or if s is NULL
    \param s string to duplicate
*/
char *U_strdup(const char *s){
   if(!s){ return(NULL); }
   size_t nbytes = strlen(s) + 1;     // the terminator is copied too
   char  *copy   = malloc(nbytes);
   if(copy){ memcpy(copy,s,nbytes); }
   return(copy);
}
716 
717 
718 #ifdef __cplusplus
719 }
720 #endif
721