/**
  @file uemf_utf.c

  @brief Functions for manipulating UTF and various types of text.


  Compile with "U_VALGRIND" defined to enable code which lets valgrind check each record for
  uninitialized data.

  Compile with "SOL8" defined for Solaris 8 or 9 (Sparc).
*/
12
13 /*
14 File: uemf_utf.c
15 Version: 0.0.5
16 Date: 29-JAN-2014
17 Author: David Mathog, Biology Division, Caltech
18 email: mathog@caltech.edu
19 Copyright: 2014 David Mathog and California Institute of Technology (Caltech)
20 */
21
22 #ifdef __cplusplus
23 extern "C" {
24 #endif
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <iconv.h>
30 #include <wchar.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <limits.h> // for INT_MAX, INT_MIN
34 #include <math.h> // for U_ROUND()
35 #include "uemf_utf.h"
36
37 //! \cond
38 /* Prototypes for functions used here and defined in uemf_endian.c, but which are not supposed
39 to be used in end user code. */
40
41 void U_swap2(void *ul, unsigned int count);
42 //! \endcond
43
44 /* ******************************************************************************************** */
45
/** \cond */
/* The second argument of iconv() has a funny cast on some older systems
   ("const char **"); on most recent ones it is just "char **".  ICONV_CAST is
   the cast written in front of that argument at each iconv() call.

   The const form is chosen for libiconv version 0x0109 or when SOL8 is defined.
   Otherwise, unless ICONV_CAST is already defined (for instance on the compile
   line, which may be tricky), the plain form is used.  If this is built on
   another funky system this selection may need to be modified.
*/
#if (_LIBICONV_VERSION == 0x0109) || defined(SOL8)
#  define ICONV_CAST (const char **)
#endif // _LIBICONV_VERSION 0x0109 or SOL8
#if !defined(ICONV_CAST)
#  define ICONV_CAST (char **)
#endif // ICONV_CAST
/** \endcond */
62
/* **********************************************************************************************
These functions are used for development and debugging and should NOT be included in production code.
*********************************************************************************************** */
66
/**
    \brief Dump a UTF8 string.  Not for use in production code.

    Prints the header "char show", then one line per byte up to (not including)
    the terminating NUL: the byte's index, its value in decimal, and its value in
    hexadecimal.  When \a src is NULL only "char show <NULL>" is printed.

    \param src string to examine
*/
void wchar8show(
      const char *src
   ){
    if(!src){
        printf("char show <NULL>\n");
        return;
    }
    printf("char show\n");
    for(size_t idx = 0; src[idx]; idx++){
        /* Look at the byte as unsigned.  Plain char may be a signed type, in
           which case bytes above 0x7F (every UTF-8 lead and continuation byte)
           would be sign extended and show up as negative decimal values and
           as hex values padded with leading "ff" digits. */
        unsigned int byte = (unsigned char) src[idx];
        printf("%d %d %x\n",(int) idx,(int) byte,byte);
    }
}
83
/**
    \brief Dump a UTF16 string.  Not for use in production code.

    Prints the header "uint16_t show", then one line per 16 bit unit up to (not
    including) the terminating zero: index, decimal value, hexadecimal value.
    When \a src is NULL only "uint16_t show <NULL>" is printed.

    \param src string to examine
*/
void wchar16show(
      const uint16_t *src
   ){
    if(!src){
        printf("uint16_t show <NULL>\n");
        return;
    }
    printf("uint16_t show\n");
    size_t idx = 0;
    for(const uint16_t *unit = src; *unit; unit++, idx++){
        printf("%d %d %x\n",(int) idx,*unit,*unit);
    }
}
100
/**
    \brief Dump a UTF32 string.  Not for use in production code.

    Prints the header "uint32_t show", then one line per 32 bit unit up to (not
    including) the terminating zero: index, decimal value, hexadecimal value.
    When \a src is NULL only "uint32_t show <NULL>" is printed.

    \param src string to examine
*/
void wchar32show(
      const uint32_t *src
   ){
    if(!src){
        printf("uint32_t show <NULL>\n");
        return;
    }
    printf("uint32_t show\n");
    for(size_t idx = 0; src[idx]; idx++){
        printf("%d %d %x\n",(int) idx,src[idx],src[idx]);
    }
}
116
/**
    \brief Dump a wchar_t string.  Not for use in production code.

    Prints the header "wchar_t show", then one line per wchar_t up to (not
    including) the terminating zero: index, decimal value, hexadecimal value.
    When \a src is NULL only "wchar_t show <NULL>" is printed.

    \param src string to examine
*/
void wchartshow(
      const wchar_t *src
   ){
    if(!src){
        printf("wchar_t show <NULL>\n");
        return;
    }
    /* src is known to be non-NULL from here on, so no second NULL test is needed. */
    printf("wchar_t show\n");
    for(size_t idx = 0; src[idx]; idx++){
        /* wchar_t is not strictly an integer type; passing it straight to
           printf can cause warnings, so copy it into a uint32_t first. */
        uint32_t val = src[idx];
        printf("%d %d %x\n",(int) idx,val,val);
    }
}
140
141 /* **********************************************************************************************
142 These functions are used for character type conversions, Image conversions, and other
143 utility operations
144 *********************************************************************************************** */
145
/**
    \brief Find the number of (storage) characters in a 16 bit character string, not including terminator.

    Counts 16 bit units, so a surrogate pair counts as two.

    \return number of 16 bit units before the first zero unit, or 0 if \a src is NULL
    \param src string to examine
*/
size_t wchar16len(
      const uint16_t *src
   ){
    if(!src){ return 0; }
    const uint16_t *end = src;
    while(*end){ end++; }
    return (size_t)(end - src);
}
159
/**
    \brief Find the number of (storage) characters in a 32 bit character string, not including terminator.

    \return number of 32 bit units before the first zero unit, or 0 if \a src is NULL
    \param src string to examine
*/
size_t wchar32len(
      const uint32_t *src
   ){
    size_t count = 0;
    if(!src){ return count; }
    for(; src[count]; count++){ /* just counting */ }
    return count;
}
173
/**
    \brief Strncpy for wchar16 (UTF16).

    Copies 16 bit units from \a src to \a dst until either \a nchars units have
    been written or the terminating zero has been copied, whichever comes first.
    Unlike strncpy() the rest of \a dst is NOT padded, and \a dst is left
    unterminated when \a src holds \a nchars or more units before its terminator.
    Nothing is written when \a src is NULL.

    \param dst    destination (already allocated)
    \param src    source
    \param nchars number of characters to copy
*/
void wchar16strncpy(
      uint16_t       *dst,
      const uint16_t *src,
      size_t          nchars
   ){
    if(!src){ return; }
    for(size_t i = 0; i < nchars; i++){
        dst[i] = src[i];
        if(src[i] == 0){ break; }   // terminator copied, done
    }
}
192
/**
    \brief Fill the output string with N characters, if the input string is shorter than N, pad with nulls.

    Exactly \a nchars 16 bit units of \a dst are written: first the units of
    \a src up to (not including) its terminator, then zeros.  When \a src holds
    \a nchars or more units, \a dst receives no terminator.  Nothing is written
    when \a src is NULL.

    \param dst    destination (already allocated)
    \param src    source
    \param nchars number of characters to copy
*/
void wchar16strncpypad(
      uint16_t       *dst,
      const uint16_t *src,
      size_t          nchars
   ){
    if(!src){ return; }
    size_t i = 0;
    /* Test the count BEFORE looking at src[i], so that src[nchars] is never
       read.  This matters when src is a full, unterminated field of exactly
       nchars units, and when nchars is 0. */
    while(i < nchars && src[i]){
        dst[i] = src[i];
        i++;
    }
    while(i < nchars){ dst[i++] = 0; }   // Pad the remainder
}
210
/* For the following conversion functions, remember that iconv() modifies ALL of its parameters,
   so save a pointer to the destination buffer!!!!
   It isn't clear that terminators are being
   copied properly, so be sure allocated space is a bit larger and cleared.
*/
216
217 /**
218 \brief Convert a UTF32LE string to a UTF16LE string.
219 \returns pointer to new string or NULL if it fails
220 \param src wchar_t string to convert
221 \param max number of characters to convert, if 0, until terminator
222 \param len number of characters in new string, NOT including terminator
223 */
U_Utf32leToUtf16le(const uint32_t * src,size_t max,size_t * len)224 uint16_t *U_Utf32leToUtf16le(
225 const uint32_t *src,
226 size_t max,
227 size_t *len
228 ){
229 char *dst,*dst2;
230 char *src2 = (char *) src;
231 size_t srclen,dstlen,status;
232
233 if(!src)return(NULL);
234 if(max){ srclen = 4*max; }
235 else { srclen = 4 + 4*wchar32len(src); } //include terminator, length in BYTES
236
237 dstlen = 2 + srclen; // this will always work, but may waste space
238 dst2 = dst = calloc(dstlen,1); // so there will be at least one terminator
239 if(dst){
240 iconv_t conv = iconv_open("UTF-16LE", "UTF-32LE");
241 if ( conv == (iconv_t) -1){
242 free(dst2);
243 dst2=NULL;
244 }
245 else {
246 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
247 iconv_close(conv);
248 if(status == (size_t) -1){
249 free(dst2);
250 dst2 = NULL;
251 }
252 else if(len){
253 *len=wchar16len((uint16_t *)dst2);
254 }
255 }
256 }
257 return((uint16_t *)dst2);
258 }
259
260 /**
261 \brief Convert a UTF16LE string to a UTF32LE string.
262 \return pointer to new string or NULL if it fails
263 \param src UTF16LE string to convert
264 \param max number of characters to convert, if 0, until terminator
265 \param len number of characters in new string, NOT including terminator
266 */
U_Utf16leToUtf32le(const uint16_t * src,size_t max,size_t * len)267 uint32_t *U_Utf16leToUtf32le(
268 const uint16_t *src,
269 size_t max,
270 size_t *len
271 ){
272 char *dst,*dst2;
273 char *src2 = (char *) src;
274 size_t srclen,dstlen,status;
275
276 if(!src)return(NULL);
277 if(max){ srclen = 2*max; }
278 else { srclen = 2*wchar16len(src)+2; } // include terminator, length in BYTES
279 dstlen = 2*(2 + srclen); // This should always work
280 dst2 = dst = calloc(dstlen,1);
281 if(dst){
282 iconv_t conv = iconv_open("UTF-32LE", "UTF-16LE");
283 if ( conv == (iconv_t) -1){
284 free(dst2);
285 dst2=NULL;
286 }
287 else {
288 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
289 iconv_close(conv);
290 if(status == (size_t) -1){
291 free(dst2);
292 dst2 = NULL;
293 }
294 else if(len){
295 *len=wchar32len((uint32_t *)dst2);
296 }
297 }
298 }
299 return((uint32_t *) dst2);
300 }
301
302 /**
303 \brief Convert a Latin1 string to a UTF32LE string.
304 \return pointer to new string or NULL if it fails
305 \param src Latin1 string to convert
306 \param max number of characters to convert, if 0, until terminator
307 \param len number of characters in new string, NOT including terminator
308
309
310 U_EMR_EXTTEXTOUTA records are "8 bit ASCII". In theory that is ASCII in an 8
311 bit character, but numerous applications store Latin1 in them, and some
312 _may_ store UTF-8 in them. Since very vew Latin1 strings are valid UTF-8 strings,
313 call U_Utf8ToUtf32le first, and if it fails, then call this function.
314 */
U_Latin1ToUtf32le(const char * src,size_t max,size_t * len)315 uint32_t *U_Latin1ToUtf32le(
316 const char *src,
317 size_t max,
318 size_t *len
319 ){
320 char *dst,*dst2;
321 char *src2 = (char *) src;
322 size_t srclen,dstlen,status;
323
324 if(!src)return(NULL);
325 if(max){ srclen = max; }
326 else { srclen = strlen(src)+1; } // include terminator, length in BYTES
327 dstlen = sizeof(uint32_t)*(1 + srclen); // This should always work but might waste some space
328 dst2 = dst = calloc(dstlen,1);
329 if(dst){
330 iconv_t conv = iconv_open("UTF-32LE", "LATIN1");
331 if ( conv == (iconv_t) -1){
332 free(dst2);
333 dst2=NULL;
334 }
335 else {
336 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
337 iconv_close(conv);
338 if(status == (size_t) -1){
339 free(dst2);
340 dst2 = NULL;
341 }
342 else if(len){
343 *len=wchar32len((uint32_t *)dst2);
344 }
345 }
346 }
347 return((uint32_t *) dst2);
348 }
349
350 /**
351 \brief Convert a UTF8 string to a UTF32LE string.
352 \return pointer to new string or NULL if it fails
353 \param src UTF8 string to convert
354 \param max number of characters to convert, if 0, until terminator
355 \param len number of characters in new string, NOT including terminator
356 */
U_Utf8ToUtf32le(const char * src,size_t max,size_t * len)357 uint32_t *U_Utf8ToUtf32le(
358 const char *src,
359 size_t max,
360 size_t *len
361 ){
362 char *dst,*dst2;
363 char *src2 = (char *) src;
364 size_t srclen,dstlen,status;
365
366 if(!src)return(NULL);
367 if(max){ srclen = max; }
368 else { srclen = strlen(src)+1; } // include terminator, length in BYTES
369 dstlen = sizeof(uint32_t)*(1 + srclen); // This should always work but might waste some space
370 dst2 = dst = calloc(dstlen,1);
371 if(dst){
372 iconv_t conv = iconv_open("UTF-32LE", "UTF-8");
373 if ( conv == (iconv_t) -1){
374 free(dst2);
375 dst2=NULL;
376 }
377 else {
378 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
379 iconv_close(conv);
380 if(status == (size_t) -1){
381 free(dst2);
382 dst2 = NULL;
383 }
384 else if(len){
385 *len=wchar32len((uint32_t *)dst2);
386 }
387 }
388 }
389 return((uint32_t *) dst2);
390 }
391
392 /**
393 \brief Convert a UTF32LE string to a UTF8 string.
394 \return pointer to new string or NULL if it fails
395 \param src wchar_t string to convert
396 \param max number of characters to convert, if 0, until terminator
397 \param len number of characters in new string, NOT including terminator
398 */
U_Utf32leToUtf8(const uint32_t * src,size_t max,size_t * len)399 char *U_Utf32leToUtf8(
400 const uint32_t *src,
401 size_t max,
402 size_t *len
403 ){
404 char *dst,*dst2;
405 char *src2 = (char *) src;
406 size_t srclen,dstlen,status;
407
408 if(!src)return(NULL);
409 if(max){ srclen = 4*max; }
410 else { srclen = 4*(1 + wchar32len(src)); } //include terminator, length in BYTES
411 dstlen = 1 + srclen; // This should always work but might waste some space
412 dst2 = dst = calloc(dstlen,1);
413 if(dst){
414 iconv_t conv = iconv_open("UTF-8", "UTF-32LE");
415 if ( conv == (iconv_t) -1){
416 free(dst2);
417 dst2=NULL;
418 }
419 else {
420 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
421 iconv_close(conv);
422 if(status == (size_t) -1){
423 free(dst2);
424 dst2 = NULL;
425 }
426 else if(len){
427 *len=strlen(dst2);
428 }
429 }
430 }
431 return(dst2);
432 }
433
434 /**
435 \brief Convert a UTF-8 string to a UTF16-LE string.
436 \return pointer to new string or NULL if it fails
437 \param src UTF8 string to convert
438 \param max number of characters to convert, if 0, until terminator
439 \param len number of characters in new string, NOT including terminator
440 */
U_Utf8ToUtf16le(const char * src,size_t max,size_t * len)441 uint16_t *U_Utf8ToUtf16le(
442 const char *src,
443 size_t max,
444 size_t *len
445 ){
446 char *dst,*dst2;
447 char *src2 = (char *) src;
448 size_t srclen,dstlen,status;
449
450 if(!src)return(NULL);
451 if(max){ srclen = max; }
452 else { srclen = strlen(src)+1; } // include terminator, length in BYTES
453 dstlen = 2 * (1 + srclen); // this will always work, but may waste space
454 dst2 = dst =calloc(dstlen,1); // so there will always be a terminator
455 if(dst){
456 iconv_t conv = iconv_open("UTF-16LE", "UTF-8");
457 if ( conv == (iconv_t) -1){
458 free(dst2);
459 dst2=NULL;
460 }
461 else {
462 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
463 iconv_close(conv);
464 if(status == (size_t) -1){
465 free(dst2);
466 dst2 = NULL;
467 }
468 else if(len){
469 *len=wchar16len((uint16_t *)dst2);
470 }
471 }
472 }
473 return((uint16_t *)dst2);
474 }
475
476 /**
477 \brief Convert a UTF16LE string to a UTF8 string.
478 \return pointer to new UTF8 string or NULL if it fails
479 \param src UTF16LE string to convert
480 \param max number of characters to convert, if 0, until terminator
481 \param len number of characters in new string, NOT including terminator
482 */
U_Utf16leToUtf8(const uint16_t * src,size_t max,size_t * len)483 char *U_Utf16leToUtf8(
484 const uint16_t *src,
485 size_t max,
486 size_t *len
487 ){
488 char *dst, *dst2;
489 char *src2 = (char *) src;
490 size_t srclen,dstlen,status;
491
492 if(!src)return(NULL);
493 if(max){ srclen = 2*max; }
494 else { srclen = 2*(1 +wchar16len(src)); } //include terminator, length in BYTES
495 dstlen = 1 + 2*srclen; // this will always work, but may waste space
496 // worst case is all glyphs (==max) need 4 UTF-8 encoded bytes + terminator.
497 dst2 = dst = (char *) calloc(dstlen,1);
498 if(dst){
499 iconv_t conv = iconv_open("UTF-8", "UTF-16LE");
500 if ( conv == (iconv_t) -1){
501 free(dst2);
502 dst2=NULL;
503 }
504 else {
505 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
506 iconv_close(conv);
507 if(status == (size_t) -1){
508 free(dst2);
509 dst2 = NULL;
510 }
511 else if(len){
512 *len=strlen(dst2);
513 dst = dst2;
514 dst2 = U_strdup(dst); // make a string of exactly the right size
515 free(dst); // free the one which was probably too big
516 }
517 }
518 }
519 return(dst2);
520 }
521
522 /**
523 \brief Convert a UTF16LE string to a LATIN1 string.
524 \return pointer to new UTF8 string or NULL if it fails
525 \param src UTF16LE string to convert
526 \param max number of characters to convert, if 0, until terminator
527 \param len number of characters in new string, NOT including terminator
528 */
U_Utf16leToLatin1(const uint16_t * src,size_t max,size_t * len)529 char *U_Utf16leToLatin1(
530 const uint16_t *src,
531 size_t max,
532 size_t *len
533 ){
534 char *dst, *dst2;
535 char *src2 = (char *) src;
536 size_t srclen,dstlen,status;
537
538 if(!src)return(NULL);
539 if(max){ srclen = 2*max; }
540 else { srclen = 2*(1 +wchar16len(src)); } //include terminator, length in BYTES
541 dstlen = 1 + srclen; // this will always work as latin1 is always 1 byte/character
542 dst2 = dst = (char *) calloc(dstlen,1);
543 if(dst){
544 iconv_t conv = iconv_open("LATIN1//TRANSLIT", "UTF-16LE");
545 if ( conv == (iconv_t) -1){
546 free(dst2);
547 dst2=NULL;
548 }
549 else {
550 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
551 iconv_close(conv);
552 if(status == (size_t) -1){
553 free(dst2);
554 dst2 = NULL;
555 }
556 else if(len){
557 *len=strlen(dst2);
558 dst = dst2;
559 dst2 = U_strdup(dst); // make a string of exactly the right size
560 free(dst); // free the one which was probably too big
561 }
562 }
563 }
564 return(dst2);
565 }
/**
    \brief Put a single 16 bit character into UTF-16LE form.

    Used in conjunction with U_Utf16leEdit(), because the character
    representation would otherwise be dependent on machine Endianness.
    When U_BYTE_SWAP is nonzero the two bytes are exchanged with U_swap2();
    otherwise the value is returned unchanged.

    \return UTF16LE representation of the character.
    \param  src 16 bit character

*/
uint16_t U_Utf16le(const uint16_t src){
#if U_BYTE_SWAP
    uint16_t swapped = src;
    U_swap2(&swapped,1);
    return swapped;
#else
    return src;
#endif
}
583
584 /**
585 \brief Convert a UTF8 string to a Latin1 string.
586 \return pointer to new string or NULL if it fails
587 \param src Latin1 string to convert
588 \param max number of characters to convert, if 0, until terminator
589 \param len number of characters in new string, NOT including terminator
590
591
592 WMF uses latin1, others UTF-8, only some utf-8 can be converted to latin1.
593
594 */
U_Utf8ToLatin1(const char * src,size_t max,size_t * len)595 char *U_Utf8ToLatin1(
596 const char *src,
597 size_t max,
598 size_t *len
599 ){
600 char *dst,*dst2;
601 char *src2 = (char *) src;
602 size_t srclen,dstlen,status;
603 if(max){ srclen = max; }
604 else { srclen = strlen(src)+1; } // include terminator, length in BYTES
605 dstlen = (1 + srclen); // This should always work but might waste some space
606 dst2 = dst = calloc(dstlen,1);
607 if(dst){
608 iconv_t conv = iconv_open("LATIN1//TRANSLIT", "UTF-8"); // translate what can be, fill in with something close for the rest
609 if ( conv == (iconv_t) -1){
610 free(dst2);
611 dst2=NULL;
612 }
613 else {
614 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
615 iconv_close(conv);
616 if(status == (size_t) -1){
617 free(dst2);
618 dst2 = NULL;
619 }
620 else if(len){
621 *len=strlen(dst2);
622 }
623 }
624 }
625 return((char *) dst2);
626 }
627
628 /**
629 \brief Convert a Latin1 string to a UTF8 string.
630 \return pointer to new string or NULL if it fails
631 \param src Latin1 string to convert
632 \param max number of characters to convert, if 0, until terminator
633 \param len number of characters in new string, NOT including terminator
634
635
636 WMF uses latin1, others UTF-8, all Latin1 should be able to convert to utf-8.
637
638 */
U_Latin1ToUtf8(const char * src,size_t max,size_t * len)639 char *U_Latin1ToUtf8(
640 const char *src,
641 size_t max,
642 size_t *len
643 ){
644 char *dst,*dst2;
645 char *src2 = (char *) src;
646 size_t srclen,dstlen,status;
647 if(max){ srclen = max; }
648 else { srclen = strlen(src)+1; } // include terminator, will waste some space
649 dstlen = (1 + 2*srclen); // This should always work because all latin1 convert to 1 or 2 byte UTF8, it might waste some space
650 dst2 = dst = calloc(dstlen,1);
651 if(dst){
652 iconv_t conv = iconv_open("UTF-8", "LATIN1"); // everything should translate
653 if ( conv == (iconv_t) -1){
654 free(dst2);
655 dst2=NULL;
656 }
657 else {
658 status = iconv(conv, ICONV_CAST &src2, &srclen, &dst, &dstlen);
659 iconv_close(conv);
660 if(status == (size_t) -1){
661 free(dst2);
662 dst2 = NULL;
663 }
664 else if(len){
665 *len=strlen(dst2);
666 }
667 }
668 }
669 return((char *) dst2);
670 }
671
/**
    \brief Single character replacement in a UTF-16LE string.

    Used solely for the Description field which contains
    embedded nulls, which makes it difficult to manipulate.  Use some other character and then swap it.
    The string is edited in place, up to (not including) its first zero unit.

    \return number of substitutions, or -1 if src is not defined
    \param src     UTF16LE string to edit
    \param find    character to replace
    \param replace substitute character

*/
int U_Utf16leEdit(
      uint16_t *src,
      uint16_t  find,
      uint16_t  replace
   ){
    if(!src){ return -1; }
    int hits = 0;
    for(uint16_t *unit = src; *unit; unit++){
        if(*unit != find){ continue; }
        *unit = replace;
        hits++;
    }
    return hits;
}
697
/**
    \brief strdup for when strict C99 compliance is enforced

    The copy is allocated with malloc() and must be released by the caller
    with free().

    \returns duplicate string or NULL on error (NULL \a s or allocation failure)
    \param s string to duplicate
*/
char *U_strdup(const char *s){
    if(!s){ return NULL; }
    size_t nbytes = strlen(s) + 1;      // include the terminator!
    char  *copy   = malloc(nbytes);
    if(copy){ memcpy(copy,s,nbytes); }
    return copy;
}
716
717
718 #ifdef __cplusplus
719 }
720 #endif
721