1 /* mchar.c
2 * Codeset and wide character processing
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 *
18 * REFS:
19 * http://mail.nl.linux.org/linux-utf8/2001-04/msg00083.html
20 * http://www.cl.cam.ac.uk/~mgk25/unicode.html
21 * http://mail.nl.linux.org/linux-utf8/2001-06/msg00020.html
22 * http://mail.nl.linux.org/linux-utf8/2001-04/msg00254.html
23 */
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <stdarg.h>
28 #include <stddef.h>
29 #include "srconfig.h"
30 #if defined HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <ctype.h>
34 #if defined HAVE_LOCALE_CHARSET
35 #include <localcharset.h>
36 #elif defined HAVE_LANGINFO_CODESET
37 #include <langinfo.h>
38 #endif
39 #include <locale.h>
40 #include <time.h>
41 #include <errno.h>
42 #include "debug.h"
43 #include "srtypes.h"
44 #include "mchar.h"
45
46 #if WIN32
47 #define ICONV_WCHAR "UCS-2-INTERNAL"
48 #define vsnprintf _vsnprintf
49 // #define vswprintf _vsnwprintf
50 #else
51 #define ICONV_WCHAR "WCHAR_T"
52 /* This prototype is missing in some systems */
53 // int vswprintf (wchar_t * ws, size_t n, const wchar_t * format, va_list arg);
54 #endif
55
56
57 /*****************************************************************************
58 * Public functions
59 *****************************************************************************/
60 char *left_str(char *str, int len);
61 char *subnstr_until(const char *str, char *until, char *newstr, int maxlen);
62 char *format_byte_size(char *str, long size);
63 void trim(char *str);
64
65
66 /*****************************************************************************
67 * These functions are NOT mchar related
68 *****************************************************************************/
/* Copy the leading part of str into newstr, stopping just before the first
 * occurrence of the delimiter string `until`, at the end of str, or after
 * maxlen-1 bytes have been copied, whichever comes first.
 *
 *   str     - NUL-terminated input string
 *   until   - NUL-terminated delimiter; an empty delimiter matches
 *             immediately, so newstr becomes the empty string
 *   newstr  - destination buffer of at least maxlen bytes
 *   maxlen  - size of newstr in bytes, including room for the terminator
 *
 * Returns newstr, which is NUL-terminated whenever maxlen > 0.  When
 * maxlen <= 0 nothing is written, since there is no room for a terminator.
 */
char*
subnstr_until(const char *str, char *until, char *newstr, int maxlen)
{
    const size_t until_len = strlen(until);   /* loop-invariant */
    const char *p = str;
    int len = 0;

    if (maxlen <= 0)
        return newstr;

    /* Stop at the end of str as well: without this check a missing
       delimiter would make the loop read past the terminator. */
    while (len < maxlen - 1
           && *p != '\0'
           && strncmp(p, until, until_len) != 0) {
        newstr[len++] = *p++;
    }
    newstr[len] = '\0';

    return newstr;
}
84
/* Truncate str in place so that it holds at most len bytes.
 * A string that is already short enough is left untouched.  A negative
 * len is treated as 0 (the result is the empty string) rather than
 * writing before the start of the buffer.
 * Returns str.
 */
char *left_str(char *str, int len)
{
    const size_t keep = (len < 0) ? 0 : (size_t) len;

    if (strlen(str) > keep)
        str[keep] = '\0';

    return str;
}
95
/* Write a short human-readable rendering of a byte count into str:
 *   below 1024            -> "<n>b"
 *   below 1024*1024       -> "<n>kb"   (integer division by 1024)
 *   otherwise             -> "<x.xx>M" (two decimals)
 * str must be large enough for the result; no bound is checked here.
 * Returns str.
 */
char *format_byte_size(char *str, long size)
{
    enum { KILO = 1024 };
    const long mega = (long) KILO * KILO;

    if (size >= mega) {
        sprintf(str, "%.2fM", (float) size / mega);
    } else if (size >= KILO) {
        sprintf(str, "%ldkb", size / KILO);
    } else {
        sprintf(str, "%ldb", size);
    }

    return str;
}
110
/* Strip leading and trailing whitespace (as classified by isspace) from
 * str, in place.  The buffer is only written to when something actually
 * has to be removed, so an already-trimmed string is left untouched.
 */
void trim(char *str)
{
    char *start = str;
    char *end;
    size_t kept;

    /* Skip over initial whitespace.  The cast matters: passing a negative
       char value (any byte >= 0x80 where char is signed) to isspace()
       is undefined behavior. */
    while (isspace((unsigned char) *start))
        ++start;

    /* Locate the end of the string, then back up over final whitespace. */
    end = start + strlen(start);
    while (end > start && isspace((unsigned char) end[-1]))
        --end;

    kept = (size_t) (end - start);

    /* Shift the text down if leading whitespace was skipped; nothing
       needs to move when the result is empty. */
    if (start != str && kept > 0)
        memmove(str, start, kept);

    /* Re-terminate unless the string was not modified at all. */
    if (start != str || *end != '\0')
        str[kept] = '\0';
}
152
/* Bounded string copy.  This is a little different from standard strncpy:
   1) bytes are copied one at a time from low to high addresses, so the
      result is predictable when dst & src overlap
   2) at most n-1 characters are copied
   3) dst is then always null-terminated (and is not zero-padded)
   If n <= 0 there is no room even for the terminator, so dst is left
   untouched.
 */
void
sr_strncpy (char* dst, char* src, int n)
{
    int i;

    if (n <= 0)
        return;

    for (i = 0; i < n - 1; i++) {
        dst[i] = src[i];
        if (dst[i] == '\0')
            return;     /* terminator already copied */
    }
    dst[i] = '\0';
}
169
170 /*****************************************************************************
171 * Lstring functions
172 *****************************************************************************/
173 void
lstring_initialize(Lstring * lstring)174 lstring_initialize (Lstring* lstring)
175 {
176 lstring->num_bytes = 0;
177 lstring->data = 0;
178 }
179
180
181 /*****************************************************************************
182 * These functions ARE mchar related
183 *****************************************************************************/
184 void
gmem_concat(gchar ** base_mem,gsize base_bytes,char * concat_mem,gsize concat_bytes)185 gmem_concat
186 (
187 gchar** base_mem, /* In/Out: The memory to be concatenated to */
188 gsize base_bytes, /* Input: Size of base memory (in bytes) */
189 char* concat_mem, /* Input: The memory to concatenate */
190 gsize concat_bytes /* Input: Size of concat_mem (in bytes) */
191 )
192 {
193 *base_mem = g_realloc (*base_mem, base_bytes + concat_bytes);
194 memcpy (*base_mem + base_bytes, concat_mem, concat_bytes);
195 }
196
197 /* Convert a string, replacing unconvertable characters.
198 Returns a gchar* string, which must be freed by caller using g_free. */
199 void
convert_string_with_replacement(gchar ** output_string,gsize * output_bytes,char * input_string,gsize input_bytes,char * from_codeset,char * to_codeset,char * repl)200 convert_string_with_replacement
201 (
202 gchar** output_string, /* Output: The converted string */
203 gsize* output_bytes, /* Output: Size of output string (in bytes) */
204 char* input_string, /* Input: String to convert */
205 gsize input_bytes, /* Input: Length of input string (in bytes) */
206 char* from_codeset, /* Input: Codeset of input string */
207 char* to_codeset, /* Input: Codeset of output string */
208 char* repl /* Input: Replacement character (zero terminated,
209 in utf-8) */
210 )
211 {
212 GIConv giconv;
213 gsize cur = 0; /* Current byte to convert */
214 GError *error = 0;
215 gsize bytes_to_convert = input_bytes;
216 int need_repl = 1;
217 gsize br, bw;
218 gchar* repl_string;
219 gsize repl_bytes;
220
221 *output_string = 0;
222 *output_bytes = 0;
223
224 giconv = g_iconv_open (to_codeset, from_codeset);
225 if (giconv == (GIConv) -1) {
226 /* Not sure why this would happen */
227 debug_printf ("g_iconv_open returned zero\n");
228 return;
229 }
230
231 /* Convert replacement character from UTF-8 to to_codeset */
232 repl_string = g_convert (repl, strlen (repl),
233 to_codeset, "UTF-8",
234 &br, &bw, &error);
235 if (repl_string && br == strlen (repl)) {
236 repl_bytes = bw;
237 } else {
238 repl_bytes = 0;
239 if (repl_string) {
240 g_free (repl_string);
241 }
242 }
243
244 while (cur < input_bytes) {
245 gchar* os;
246 int drop_byte = 0; /* Should we drop a byte? */
247
248 br = 0;
249 os = g_convert_with_iconv (&input_string[cur], bytes_to_convert,
250 giconv, &br, &bw, &error);
251 debug_printf ("cur=%d, btc=%d, br=%d, bw=%d (b1=0x%02x)\n",
252 cur, bytes_to_convert, br, bw, input_string[cur]);
253
254 /* If the conversion was unsuccessful, usually it means that
255 either the input byte doesn't belong to the from_codeset,
256 or the code point doesn't belong to the to_codeset.
257 */
258 if (error) {
259 /* There could be a partial output -- the specs aren't clear
260 about this. Better free it just in case. */
261 if (os) {
262 g_free (os);
263 }
264 switch (error->code) {
265 case G_CONVERT_ERROR_ILLEGAL_SEQUENCE:
266 case G_CONVERT_ERROR_FAILED:
267 case G_CONVERT_ERROR_PARTIAL_INPUT:
268 debug_printf ("g_convert_with_iconv returned error: %d (%s)\n",
269 error->code, error->message);
270 break;
271 case G_CONVERT_ERROR_NO_CONVERSION:
272 default:
273 /* This shouldn't happen, as GNU inconv guarantees
274 conversion to/from UTF-8. */
275 debug_printf ("g_convert_with_iconv returned error: %d (%s)\n",
276 error->code, error->message);
277 g_error_free (error);
278 g_free (repl_string);
279 return;
280 }
281 /* How many bytes to try next time? */
282 switch (bytes_to_convert) {
283 case 4:
284 case 3:
285 case 2:
286 bytes_to_convert --;
287 break;
288 case 1:
289 /* Crapped out. Drop current byte, and add "?" to string. */
290 drop_byte = 1;
291 break;
292 default:
293 /* Best guess based on br value returned from iconv */
294 if (br < bytes_to_convert && br > 0) {
295 bytes_to_convert = br;
296 } else {
297 bytes_to_convert = 4;
298 }
299 }
300 g_error_free (error);
301 error = 0;
302 } else {
303 if (br == 0) {
304 /* glib 2.16.5 (and probably other versions) doesn't properly
305 return G_CONVERT_ERROR_PARTIAL_INPUT with partially
306 translated characters. We'll detect this condition
307 when there is no error, but br is 0. */
308 drop_byte = 1;
309 } else {
310 /* A successful conversion. */
311 debug_printf ("Successful conversion: %d bytes read\n", br);
312 gmem_concat (output_string, *output_bytes, os, bw);
313 g_free (os);
314 cur += br;
315 bytes_to_convert = input_bytes - cur;
316 *output_bytes += bw;
317 need_repl = 1;
318 }
319 }
320
321 /* drop_byte will be true if a conversion failure happened.
322 Drop current byte from input, and append replacement
323 character into output. But only append a single replacement
324 character for each group of dropped bytes. */
325 if (drop_byte) {
326 cur ++;
327 bytes_to_convert = input_bytes - cur;
328 if (need_repl) {
329 gmem_concat (output_string, *output_bytes, repl, repl_bytes);
330 *output_bytes += repl_bytes;
331 need_repl = 0;
332 }
333 }
334 }
335 g_free (repl_string);
336 g_iconv_close (giconv);
337 debug_printf ("convert_string_with_replacement |%s| -> |%s| (%s -> %s)\n",
338 input_string, *output_string, from_codeset, to_codeset);
339 return;
340 }
341
342 /* Assumes src is valid utf8 */
343 int
utf8cpy(gchar * dst,gchar * src,int dst_len)344 utf8cpy (gchar* dst, gchar* src, int dst_len)
345 {
346 gchar *s = src;
347 gchar *d = dst;
348 gunichar c;
349 gint dlen = 0;
350 gint clen;
351
352 while (dst_len > 6) {
353 c = g_utf8_get_char(s);
354 if (!c) break;
355 clen = g_unichar_to_utf8 (c, d);
356 d += clen;
357 dlen += clen;
358 dst_len -= clen;
359 s = g_utf8_next_char (s);
360 }
361 *d = 0;
362 return dlen;
363 }
364
365 void
lstring_from_lstring(Lstring * lstring_out,Lstring * lstring_in,char * from_codeset,char * to_codeset)366 lstring_from_lstring (Lstring* lstring_out,/* Output: Output string */
367 Lstring* lstring_in, /* Input: Input string */
368 char* from_codeset, /* Input: Codeset of input string */
369 char* to_codeset) /* Input: Codeset of output string */
370 {
371 convert_string_with_replacement (&lstring_out->data,
372 &lstring_out->num_bytes,
373 lstring_in->data,
374 lstring_out->num_bytes,
375 from_codeset,
376 to_codeset,
377 "?");
378 }
379
380 void
lstring_from_gstring(Lstring * lstring_out,gchar * gstring_in,char * to_codeset)381 lstring_from_gstring (Lstring* lstring_out,/* Output: Output string */
382 gchar* gstring_in, /* Input: Input string */
383 char* to_codeset) /* Input: Codeset of output string */
384 {
385 convert_string_with_replacement (&lstring_out->data,
386 &lstring_out->num_bytes,
387 gstring_in,
388 strlen (gstring_in) + 1,
389 "UTF-8",
390 to_codeset,
391 "?");
392 }
393
394 /* Input value mlen is measured in mchar, not bytes.
395 Return value is the number of mchar occupied by the converted string,
396 not including the null character.
397
398 For GLIB UTF8, it makes more sense to return the dynamically allocated
399 string, and pass in the codeset string itself rather than
400 rmi & codeset type.
401
402 GLIB UTF8 returns number of bytes.
403 */
404 int
gstring_from_string(RIP_MANAGER_INFO * rmi,mchar * m,int mlen,char * c,int codeset_type)405 gstring_from_string (RIP_MANAGER_INFO* rmi, mchar* m, int mlen,
406 char* c, int codeset_type)
407 {
408 CODESET_OPTIONS* mchar_cs = &rmi->mchar_cs;
409 if (mlen < 0) return 0;
410 *m = 0;
411 if (!c) return 0;
412
413 {
414 gchar* gstring;
415 gsize gstring_len;
416 char* src_codeset;
417 int rc;
418
419 switch (codeset_type) {
420 case CODESET_UTF8:
421 src_codeset = "UTF-8";
422 break;
423 case CODESET_LOCALE:
424 src_codeset = mchar_cs->codeset_locale;
425 break;
426 case CODESET_FILESYS:
427 src_codeset = mchar_cs->codeset_filesys;
428 break;
429 case CODESET_ID3:
430 src_codeset = mchar_cs->codeset_id3;
431 break;
432 case CODESET_METADATA:
433 src_codeset = mchar_cs->codeset_metadata;
434 break;
435 case CODESET_RELAY:
436 src_codeset = mchar_cs->codeset_relay;
437 break;
438 default:
439 printf ("Program error. Bad codeset m->c (%d)\n", codeset_type);
440 exit (-1);
441 }
442 /* GCS FIX: This is not correct, as strlen(c) won't work
443 for UTF-16. */
444 convert_string_with_replacement (&gstring, &gstring_len,
445 c, strlen(c) + 1,
446 src_codeset, "UTF-8",
447 "?");
448 if (!gstring) {
449 debug_printf ("Error converting gstring_from_string\n");
450 return 0;
451 }
452 rc = utf8cpy (m, gstring, mlen);
453 g_free (gstring);
454 return rc;
455 }
456 }
457
458 /* Return value is the number of char occupied by the converted string,
459 not including the null character. */
460 int
string_from_gstring(RIP_MANAGER_INFO * rmi,char * c,int clen,mchar * m,int codeset_type)461 string_from_gstring (RIP_MANAGER_INFO* rmi, char* c, int clen, mchar* m, int codeset_type)
462 {
463 CODESET_OPTIONS* mchar_cs = &rmi->mchar_cs;
464 if (clen <= 0) return 0;
465 *c = 0;
466 if (!m) return 0;
467 {
468 gchar* cstring;
469 gsize cstring_len;
470 char* tgt_codeset;
471
472 switch (codeset_type) {
473 case CODESET_UTF8:
474 tgt_codeset = "UTF-8";
475 break;
476 case CODESET_LOCALE:
477 tgt_codeset = mchar_cs->codeset_locale;
478 break;
479 case CODESET_FILESYS:
480 tgt_codeset = mchar_cs->codeset_filesys;
481 break;
482 case CODESET_ID3:
483 tgt_codeset = mchar_cs->codeset_id3;
484 break;
485 case CODESET_METADATA:
486 tgt_codeset = mchar_cs->codeset_metadata;
487 break;
488 case CODESET_RELAY:
489 tgt_codeset = mchar_cs->codeset_relay;
490 break;
491 default:
492 printf ("Program error. Bad codeset m->c (%d)\n", codeset_type);
493 exit (-1);
494 }
495 /* This is the new method */
496 convert_string_with_replacement (&cstring, &cstring_len,
497 m, strlen(m) + 1,
498 "UTF-8",
499 tgt_codeset,
500 "?");
501 if (!cstring) {
502 debug_printf ("Error converting string_from_gstring\n");
503 return 0;
504 }
505 /* GCS FIX: truncation can chop multibyte string */
506 /* This will be fixed by using dynamic memory here... */
507 if (cstring_len >= clen) {
508 cstring_len = clen - 1;
509 }
510 memcpy (c, cstring, cstring_len);
511 /* GCS FIX: If converting to UTF-16, need to add 00 to end */
512 c[cstring_len] = 0;
513 g_free (cstring);
514 return cstring_len;
515 }
516 }
517
/* Determine the system's default codeset name.
   Uses locale_charset() or nl_langinfo(CODESET), whichever the build
   configuration provides; falls back to "ISO-8859-1" when neither is
   available or the reported name is empty.  The returned string is not
   allocated here and must not be freed by the caller. */
const char*
default_codeset (void)
{
    const char* codeset = NULL;

#if defined HAVE_LOCALE_CHARSET
    debug_printf ("Using locale_charset() to get system codeset.\n");
    codeset = locale_charset ();
#elif defined HAVE_LANGINFO_CODESET
    debug_printf ("Using nl_langinfo() to get system codeset.\n");
    codeset = nl_langinfo (CODESET);
#else
    debug_printf ("No way to get system codeset.\n");
#endif

    if (codeset && codeset[0]) {
        debug_printf ("Found default codeset %s\n", codeset);
    } else {
        debug_printf ("No default codeset, using ISO-8859-1.\n");
        codeset = "ISO-8859-1";
    }

#if defined (WIN32)
    {
        /* This is just for debugging */
        LCID system_lcid;
        LCID user_lcid;

        system_lcid = GetSystemDefaultLCID ();
        debug_printf ("SystemDefaultLCID: %04x\n", system_lcid);
        user_lcid = GetUserDefaultLCID ();
        debug_printf ("UserDefaultLCID: %04x\n", user_lcid);
    }
#endif

    return codeset;
}
558
/* Switch the process to the locale named by the environment (first for
   all categories, then once more for LC_CTYPE) and log the locale string
   that is in effect afterwards. */
void
sr_set_locale (void)
{
    const char* active_locale;

    setlocale (LC_ALL, "");
    setlocale (LC_CTYPE, "");

    active_locale = setlocale (LC_ALL, NULL);
    debug_printf ("LOCALE is %s\n", active_locale);
}
566
567 void
set_codesets_default(CODESET_OPTIONS * cs_opt)568 set_codesets_default (CODESET_OPTIONS* cs_opt)
569 {
570 const char* fromcode = 0;
571
572 /* Set default codesets */
573 fromcode = default_codeset ();
574 if (fromcode) {
575 strncpy (cs_opt->codeset_locale, fromcode, MAX_CODESET_STRING);
576 strncpy (cs_opt->codeset_filesys, fromcode, MAX_CODESET_STRING);
577 strncpy (cs_opt->codeset_metadata, fromcode, MAX_CODESET_STRING);
578 strncpy (cs_opt->codeset_relay, fromcode, MAX_CODESET_STRING);
579 }
580
581 /* Always use UTF-16 for id3 */
582 strncpy (cs_opt->codeset_id3, "UTF-16", MAX_CODESET_STRING);
583
584 /* Don't use UTF-8 for metadata */
585 if (!strcmp (fromcode, "UTF-8")) {
586 strncpy (cs_opt->codeset_metadata, "ISO-8859-1", MAX_CODESET_STRING);
587 strncpy (cs_opt->codeset_relay, "ISO-8859-1", MAX_CODESET_STRING);
588 }
589 }
590
591 void
register_codesets(RIP_MANAGER_INFO * rmi,CODESET_OPTIONS * cs_opt)592 register_codesets (RIP_MANAGER_INFO* rmi, CODESET_OPTIONS* cs_opt)
593 {
594 CODESET_OPTIONS* mchar_cs = &rmi->mchar_cs;
595
596 /* For ID3, force UCS-2, UCS-2LE, UCS-2BE, UTF-16LE, and UTF-16BE
597 to be UTF-16. This way, we get the BOM like we need.
598 This might change if we upgrade to id3v2.4, which allows
599 UTF-8 and UTF-16 without BOM. */
600 if (!strncmp (cs_opt->codeset_id3, "UCS-2", strlen("UCS-2")) ||
601 !strncmp (cs_opt->codeset_id3, "UTF-16", strlen("UTF-16"))) {
602 strcpy (cs_opt->codeset_id3, "UTF-16");
603 }
604
605 strcpy (mchar_cs->codeset_locale, cs_opt->codeset_locale);
606 strcpy (mchar_cs->codeset_filesys, cs_opt->codeset_filesys);
607 strcpy (mchar_cs->codeset_id3, cs_opt->codeset_id3);
608 strcpy (mchar_cs->codeset_metadata, cs_opt->codeset_metadata);
609 strcpy (mchar_cs->codeset_relay, cs_opt->codeset_relay);
610 debug_printf ("Locale codeset: %s\n", mchar_cs->codeset_locale);
611 debug_printf ("Filesys codeset: %s\n", mchar_cs->codeset_filesys);
612 debug_printf ("ID3 codeset: %s\n", mchar_cs->codeset_id3);
613 debug_printf ("Metadata codeset: %s\n", mchar_cs->codeset_metadata);
614 debug_printf ("Relay codeset: %s\n", mchar_cs->codeset_relay);
615 }
616
617 /* This is used to set the codeset byte for id3v2 frames */
618 int
is_id3_unicode(RIP_MANAGER_INFO * rmi)619 is_id3_unicode (RIP_MANAGER_INFO* rmi)
620 {
621 CODESET_OPTIONS* mchar_cs = &rmi->mchar_cs;
622 if (!strcmp ("UTF-16", mchar_cs->codeset_id3)) {
623 return 1;
624 }
625 return 0;
626 }
627
628 void
mstrncpy(mchar * dst,mchar * src,int n)629 mstrncpy (mchar* dst, mchar* src, int n)
630 {
631 int i = 0;
632 for (i = 0; i < n-1; i++) {
633 if (!(dst[i] = src[i])) {
634 return;
635 }
636 }
637 dst[i] = 0;
638 }
639
640 mchar*
mstrdup(mchar * src)641 mstrdup (mchar* src)
642 {
643 return strdup (src);
644 }
645
646 mchar*
mstrcpy(mchar * dest,const mchar * src)647 mstrcpy (mchar* dest, const mchar* src)
648 {
649 return strcpy (dest, src);
650 }
651
652 size_t
mstrlen(mchar * s)653 mstrlen (mchar* s)
654 {
655 return strlen (s);
656 }
657
658 /* GCS FIX: gcc can give a warning about vswprintf. This may require
659 setting gcc -std=c99, or gcc -lang-c99 */
660 int
msnprintf(mchar * dest,size_t n,const mchar * fmt,...)661 msnprintf (mchar* dest, size_t n, const mchar* fmt, ...)
662 {
663 int rc;
664 va_list ap;
665 va_start (ap, fmt);
666 rc = vsnprintf (dest, n, fmt, ap);
667 va_end (ap);
668 return rc;
669 }
670
671 mchar*
mstrchr(const mchar * ws,mchar wc)672 mstrchr (const mchar* ws, mchar wc)
673 {
674 return g_utf8_strchr (ws, -1, g_utf8_get_char(&wc));
675 }
676
677 mchar*
mstrrchr(const mchar * ws,mchar wc)678 mstrrchr (const mchar* ws, mchar wc)
679 {
680 return g_utf8_strrchr (ws, -1, g_utf8_get_char(&wc));
681 }
682
683 mchar*
mstrncat(mchar * ws1,const mchar * ws2,size_t n)684 mstrncat (mchar* ws1, const mchar* ws2, size_t n)
685 {
686 return strncat (ws1, ws2, n);
687 }
688
689 int
mstrcmp(const mchar * ws1,const mchar * ws2)690 mstrcmp (const mchar* ws1, const mchar* ws2)
691 {
692 return strcmp (ws1, ws2);
693 }
694
695 long int
mtol(const mchar * string)696 mtol (const mchar* string)
697 {
698 return strtol (string, 0, 0);
699 }
700