1 /**
2 * @file
3 * Conversion between different character encodings
4 *
5 * @authors
6 * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
7 *
8 * @copyright
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU General Public License as published by the Free Software
11 * Foundation, either version 2 of the License, or (at your option) any later
12 * version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program. If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 /**
24 * @page mutt_charset Conversion between different character encodings
25 *
26 * Conversion between different character encodings
27 */
28
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <stdbool.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include "config/lib.h"
39 #include "core/lib.h"
40 #include "charset.h"
41 #include "buffer.h"
42 #include "memory.h"
43 #include "queue.h"
44 #include "regex3.h"
45 #include "string2.h"
46 #ifdef ENABLE_NLS
47 #include <libintl.h>
48 #endif
49
50 #ifndef EILSEQ
51 #define EILSEQ EINVAL
52 #endif
53
/**
 * ReplacementChar - Character shown in place of one that can't be displayed
 *
 * Initially '?'.
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - True when the user's current character set is utf-8
 *
 * Initially false.
 */
bool CharsetIsUtf8 = false;
63
64 /**
65 * struct Lookup - Regex to String lookup table
66 *
67 * This is used by 'charset-hook' and 'iconv-hook'.
68 */
69 struct Lookup
70 {
71 enum LookupType type; ///< Lookup type
72 struct Regex regex; ///< Regular expression
73 char *replacement; ///< Alternative charset to use
74 TAILQ_ENTRY(Lookup) entries; ///< Linked list
75 };
76 TAILQ_HEAD(LookupList, Lookup);
77
78 static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
79
/**
 * struct MimeNames - One entry of the MIME name lookup table
 */
struct MimeNames
{
  const char *key;  ///< Charset name to look for
  const char *pref; ///< Preferred name to use in its place
};
88
89 /**
90 * PreferredMimeNames - Lookup table of preferred charsets
91 *
92 * The following list has been created manually from the data under:
93 * http://www.isi.edu/in-notes/iana/assignments/character-sets
94 * Last update: 2000-09-07
95 *
96 * @note It includes only the subset of character sets for which a preferred
97 * MIME name is given.
98 */
99 const struct MimeNames PreferredMimeNames[] = {
100 // clang-format off
101 { "ansi_x3.4-1968", "us-ascii" },
102 { "iso-ir-6", "us-ascii" },
103 { "iso_646.irv:1991", "us-ascii" },
104 { "ascii", "us-ascii" },
105 { "iso646-us", "us-ascii" },
106 { "us", "us-ascii" },
107 { "ibm367", "us-ascii" },
108 { "cp367", "us-ascii" },
109 { "csASCII", "us-ascii" },
110
111 { "csISO2022KR", "iso-2022-kr" },
112 { "csEUCKR", "euc-kr" },
113 { "csISO2022JP", "iso-2022-jp" },
114 { "csISO2022JP2", "iso-2022-jp-2" },
115
116 { "ISO_8859-1:1987", "iso-8859-1" },
117 { "iso-ir-100", "iso-8859-1" },
118 { "iso_8859-1", "iso-8859-1" },
119 { "latin1", "iso-8859-1" },
120 { "l1", "iso-8859-1" },
121 { "IBM819", "iso-8859-1" },
122 { "CP819", "iso-8859-1" },
123 { "csISOLatin1", "iso-8859-1" },
124
125 { "ISO_8859-2:1987", "iso-8859-2" },
126 { "iso-ir-101", "iso-8859-2" },
127 { "iso_8859-2", "iso-8859-2" },
128 { "latin2", "iso-8859-2" },
129 { "l2", "iso-8859-2" },
130 { "csISOLatin2", "iso-8859-2" },
131
132 { "ISO_8859-3:1988", "iso-8859-3" },
133 { "iso-ir-109", "iso-8859-3" },
134 { "ISO_8859-3", "iso-8859-3" },
135 { "latin3", "iso-8859-3" },
136 { "l3", "iso-8859-3" },
137 { "csISOLatin3", "iso-8859-3" },
138
139 { "ISO_8859-4:1988", "iso-8859-4" },
140 { "iso-ir-110", "iso-8859-4" },
141 { "ISO_8859-4", "iso-8859-4" },
142 { "latin4", "iso-8859-4" },
143 { "l4", "iso-8859-4" },
144 { "csISOLatin4", "iso-8859-4" },
145
146 { "ISO_8859-6:1987", "iso-8859-6" },
147 { "iso-ir-127", "iso-8859-6" },
148 { "iso_8859-6", "iso-8859-6" },
149 { "ECMA-114", "iso-8859-6" },
150 { "ASMO-708", "iso-8859-6" },
151 { "arabic", "iso-8859-6" },
152 { "csISOLatinArabic", "iso-8859-6" },
153
154 { "ISO_8859-7:1987", "iso-8859-7" },
155 { "iso-ir-126", "iso-8859-7" },
156 { "ISO_8859-7", "iso-8859-7" },
157 { "ELOT_928", "iso-8859-7" },
158 { "ECMA-118", "iso-8859-7" },
159 { "greek", "iso-8859-7" },
160 { "greek8", "iso-8859-7" },
161 { "csISOLatinGreek", "iso-8859-7" },
162
163 { "ISO_8859-8:1988", "iso-8859-8" },
164 { "iso-ir-138", "iso-8859-8" },
165 { "ISO_8859-8", "iso-8859-8" },
166 { "hebrew", "iso-8859-8" },
167 { "csISOLatinHebrew", "iso-8859-8" },
168
169 { "ISO_8859-5:1988", "iso-8859-5" },
170 { "iso-ir-144", "iso-8859-5" },
171 { "ISO_8859-5", "iso-8859-5" },
172 { "cyrillic", "iso-8859-5" },
173 { "csISOLatinCyrillic", "iso-8859-5" },
174
175 { "ISO_8859-9:1989", "iso-8859-9" },
176 { "iso-ir-148", "iso-8859-9" },
177 { "ISO_8859-9", "iso-8859-9" },
178 { "latin5", "iso-8859-9" }, /* this is not a bug */
179 { "l5", "iso-8859-9" },
180 { "csISOLatin5", "iso-8859-9" },
181
182 { "ISO_8859-10:1992", "iso-8859-10" },
183 { "iso-ir-157", "iso-8859-10" },
184 { "latin6", "iso-8859-10" }, /* this is not a bug */
185 { "l6", "iso-8859-10" },
186 { "csISOLatin6", "iso-8859-10" },
187
188 { "csKOI8r", "koi8-r" },
189
190 { "MS_Kanji", "Shift_JIS" }, /* Note the underscore! */
191 { "csShiftJis", "Shift_JIS" },
192
193 { "Extended_UNIX_Code_Packed_Format_for_Japanese",
194 "euc-jp" },
195 { "csEUCPkdFmtJapanese", "euc-jp" },
196
197 { "csGB2312", "gb2312" },
198 { "csbig5", "big5" },
199
200 /* End of official brain damage.
201 * What follows has been taken from glibc's localedata files. */
202
203 { "iso_8859-13", "iso-8859-13" },
204 { "iso-ir-179", "iso-8859-13" },
205 { "latin7", "iso-8859-13" }, /* this is not a bug */
206 { "l7", "iso-8859-13" },
207
208 { "iso_8859-14", "iso-8859-14" },
209 { "latin8", "iso-8859-14" }, /* this is not a bug */
210 { "l8", "iso-8859-14" },
211
212 { "iso_8859-15", "iso-8859-15" },
213 { "latin9", "iso-8859-15" }, /* this is not a bug */
214
215 /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
216 { "latin0", "iso-8859-15" }, /* this is not a bug */
217
218 { "iso_8859-16", "iso-8859-16" },
219 { "latin10", "iso-8859-16" }, /* this is not a bug */
220
221 { "646", "us-ascii" },
222
223 /* http://www.sun.com/software/white-papers/wp-unicode/ */
224
225 { "eucJP", "euc-jp" },
226 { "PCK", "Shift_JIS" },
227 { "ko_KR-euc", "euc-kr" },
228 { "zh_TW-big5", "big5" },
229
230 /* seems to be common on some systems */
231
232 { "sjis", "Shift_JIS" },
233 { "euc-jp-ms", "eucJP-ms" },
234
235 /* If you happen to encounter system-specific brain-damage with respect to
236 * character set naming, please add it above this comment, and submit a patch
237 * to <neomutt-devel@neomutt.org> */
238
239 { NULL, NULL },
240 // clang-format on
241 };
242
243 /**
244 * lookup_new - Create a new Lookup
245 * @retval ptr New Lookup
246 */
lookup_new(void)247 static struct Lookup *lookup_new(void)
248 {
249 return mutt_mem_calloc(1, sizeof(struct Lookup));
250 }
251
252 /**
253 * lookup_free - Free a Lookup
254 * @param ptr Lookup to free
255 */
lookup_free(struct Lookup ** ptr)256 static void lookup_free(struct Lookup **ptr)
257 {
258 if (!ptr || !*ptr)
259 return;
260
261 struct Lookup *l = *ptr;
262 FREE(&l->replacement);
263 FREE(&l->regex.pattern);
264 if (l->regex.regex)
265 regfree(l->regex.regex);
266 FREE(&l->regex.regex);
267 FREE(&l->regex);
268
269 FREE(ptr);
270 }
271
272 /**
273 * lookup_charset - Look for a preferred character set name
274 * @param type Type, e.g. #MUTT_LOOKUP_CHARSET
275 * @param cs Character set
276 * @retval ptr Charset string
277 *
278 * If the character set matches one of the regexes,
279 * then return the replacement name.
280 */
lookup_charset(enum LookupType type,const char * cs)281 static const char *lookup_charset(enum LookupType type, const char *cs)
282 {
283 if (!cs)
284 return NULL;
285
286 struct Lookup *l = NULL;
287
288 TAILQ_FOREACH(l, &Lookups, entries)
289 {
290 if (l->type != type)
291 continue;
292 if (mutt_regex_match(&l->regex, cs))
293 return l->replacement;
294 }
295 return NULL;
296 }
297
298 /**
299 * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets
300 * @param[in,out] ps String to be converted
301 * @retval 0 Success
302 * @retval -1 Error
303 *
304 * Work through `$assumed_charset` looking for a character set conversion that
305 * works. Failing that, try mutt_ch_get_default_charset().
306 */
mutt_ch_convert_nonmime_string(char ** ps)307 int mutt_ch_convert_nonmime_string(char **ps)
308 {
309 if (!ps)
310 return -1;
311
312 char *u = *ps;
313 const size_t ulen = mutt_str_len(u);
314 if (ulen == 0)
315 return 0;
316
317 const char *c1 = NULL;
318
319 const char *const c_assumed_charset =
320 cs_subset_string(NeoMutt->sub, "assumed_charset");
321 const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
322 for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
323 {
324 c1 = strchr(c, ':');
325 size_t n = c1 ? c1 - c : mutt_str_len(c);
326 if (n == 0)
327 return 0;
328 char *fromcode = mutt_mem_malloc(n + 1);
329 mutt_str_copy(fromcode, c, n + 1);
330 char *s = mutt_strn_dup(u, ulen);
331 int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
332 FREE(&fromcode);
333 if (m == 0)
334 {
335 FREE(ps);
336 *ps = s;
337 return 0;
338 }
339 FREE(&s);
340 }
341 mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
342 c_charset, MUTT_ICONV_HOOK_FROM);
343 return -1;
344 }
345
346 /**
347 * mutt_ch_canonical_charset - Canonicalise the charset of a string
348 * @param buf Buffer for canonical character set name
349 * @param buflen Length of buffer
350 * @param name Name to be canonicalised
351 *
352 * This first ties off any charset extension such as "//TRANSLIT",
353 * canonicalizes the charset and re-adds the extension
354 */
mutt_ch_canonical_charset(char * buf,size_t buflen,const char * name)355 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
356 {
357 if (!buf || !name)
358 return;
359
360 char in[1024], scratch[1024 + 10];
361
362 mutt_str_copy(in, name, sizeof(in));
363 char *ext = strchr(in, '/');
364 if (ext)
365 *ext++ = '\0';
366
367 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
368 {
369 mutt_str_copy(buf, "utf-8", buflen);
370 goto out;
371 }
372
373 /* catch some common iso-8859-something misspellings */
374 size_t plen;
375 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
376 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
377 else if ((plen = mutt_istr_startswith(in, "8859-")))
378 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
379 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
380 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
381 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
382 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
383 else
384 mutt_str_copy(scratch, in, sizeof(scratch));
385
386 for (size_t i = 0; PreferredMimeNames[i].key; i++)
387 {
388 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
389 {
390 mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
391 goto out;
392 }
393 }
394
395 mutt_str_copy(buf, scratch, buflen);
396
397 /* for cosmetics' sake, transform to lowercase. */
398 for (char *p = buf; *p; p++)
399 *p = tolower(*p);
400
401 out:
402 if (ext && *ext)
403 {
404 mutt_str_cat(buf, buflen, "/");
405 mutt_str_cat(buf, buflen, ext);
406 }
407 }
408
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set
 * @param cs2 Second character set
 * @retval true  Names are equivalent
 * @retval false Names differ, or either name is NULL
 *
 * `cs1` is canonicalised with mutt_ch_canonical_charset(), which keeps any
 * extension; `cs2` is used as given.  It is expected to come from neomutt
 * code rather than user input, i.e. to have no extension.  The names count as
 * equivalent when the shorter is a case-insensitive prefix of the longer.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);

  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cs2_len);

  return mutt_istrn_equal(cs2, canon, canon_len);
}
436
437 /**
438 * mutt_ch_get_default_charset - Get the default character set
439 * @retval ptr Name of the default character set
440 *
441 * @warning This returns a pointer to a static buffer. Do not free it.
442 */
mutt_ch_get_default_charset(void)443 char *mutt_ch_get_default_charset(void)
444 {
445 static char fcharset[128];
446 const char *const c_assumed_charset =
447 cs_subset_string(NeoMutt->sub, "assumed_charset");
448 const char *c = c_assumed_charset;
449 const char *c1 = NULL;
450
451 if (c)
452 {
453 c1 = strchr(c, ':');
454
455 size_t copysize;
456 if (c1)
457 copysize = MIN((c1 - c + 1), sizeof(fcharset));
458 else
459 copysize = sizeof(fcharset);
460 mutt_str_copy(fcharset, c, copysize);
461 return fcharset;
462 }
463 return strcpy(fcharset, "us-ascii");
464 }
465
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * Canonicalise the codeset reported by nl_langinfo(CODESET).
 * If that yields an empty name, "iso-8859-1" is used instead.
 * The caller must free the returned string.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char canon[1024] = { 0 };

  mutt_ch_canonical_charset(canon, sizeof(canon), nl_langinfo(CODESET));

  const char *result = (canon[0] != '\0') ? canon : "iso-8859-1";
  return mutt_str_dup(result);
}
484
485 /**
486 * mutt_ch_lookup_add - Add a new character set lookup
487 * @param type Type of character set, e.g. #MUTT_LOOKUP_CHARSET
488 * @param pat Pattern to match
489 * @param replace Replacement string
490 * @param err Buffer for error message
491 * @retval true Lookup added to list
492 * @retval false Regex string was invalid
493 *
494 * Add a regex for a character set and a replacement name.
495 */
mutt_ch_lookup_add(enum LookupType type,const char * pat,const char * replace,struct Buffer * err)496 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
497 const char *replace, struct Buffer *err)
498 {
499 if (!pat || !replace)
500 return false;
501
502 regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
503 int rc = REG_COMP(rx, pat, REG_ICASE);
504 if (rc != 0)
505 {
506 regerror(rc, rx, err->data, err->dsize);
507 FREE(&rx);
508 return false;
509 }
510
511 struct Lookup *l = lookup_new();
512 l->type = type;
513 l->replacement = mutt_str_dup(replace);
514 l->regex.pattern = mutt_str_dup(pat);
515 l->regex.regex = rx;
516 l->regex.pat_not = false;
517
518 TAILQ_INSERT_TAIL(&Lookups, l, entries);
519
520 return true;
521 }
522
523 /**
524 * mutt_ch_lookup_remove - Remove all the character set lookups
525 *
526 * Empty the list of replacement character set names.
527 */
mutt_ch_lookup_remove(void)528 void mutt_ch_lookup_remove(void)
529 {
530 struct Lookup *l = NULL;
531 struct Lookup *tmp = NULL;
532
533 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
534 {
535 TAILQ_REMOVE(&Lookups, l, entries);
536 lookup_free(&l);
537 }
538 }
539
540 /**
541 * mutt_ch_charset_lookup - Look for a replacement character set
542 * @param chs Character set to lookup
543 * @retval ptr Replacement character set (if a 'charset-hook' matches)
544 * @retval NULL No matching hook
545 *
546 * Look through all the 'charset-hook's.
547 * If one matches return the replacement character set.
548 */
mutt_ch_charset_lookup(const char * chs)549 const char *mutt_ch_charset_lookup(const char *chs)
550 {
551 return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
552 }
553
554 /**
555 * mutt_ch_iconv_open - Set up iconv for conversions
556 * @param tocode Current character set
557 * @param fromcode Target character set
558 * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
559 * @retval ptr iconv handle for the conversion
560 *
561 * Like iconv_open, but canonicalises the charsets, applies charset-hooks,
562 * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips
563 * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers
564 * should use flags=0 when fromcode can safely be considered true, either some
565 * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be
566 * used only when fromcode is unsure, taken from a possibly wrong incoming MIME
567 * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
568 * in some setups.
569 *
570 * @note By design charset-hooks should never be, and are never, applied
571 * to tocode.
572 *
573 * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks,
574 * not at all on iconv-hooks.
575 */
mutt_ch_iconv_open(const char * tocode,const char * fromcode,uint8_t flags)576 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
577 {
578 char tocode1[128];
579 char fromcode1[128];
580 const char *tocode2 = NULL, *fromcode2 = NULL;
581 const char *tmp = NULL;
582
583 iconv_t cd;
584
585 /* transform to MIME preferred charset names */
586 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
587 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
588
589 /* maybe apply charset-hooks and recanonicalise fromcode,
590 * but only when caller asked us to sanitize a potentially wrong
591 * charset name incoming from the wild exterior. */
592 if (flags & MUTT_ICONV_HOOK_FROM)
593 {
594 tmp = mutt_ch_charset_lookup(fromcode1);
595 if (tmp)
596 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
597 }
598
599 /* always apply iconv-hooks to suit system's iconv tastes */
600 tocode2 = mutt_ch_iconv_lookup(tocode1);
601 tocode2 = tocode2 ? tocode2 : tocode1;
602 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
603 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
604
605 /* call system iconv with names it appreciates */
606 cd = iconv_open(tocode2, fromcode2);
607 if (cd != (iconv_t) -1)
608 return cd;
609
610 return (iconv_t) -1;
611 }
612
/**
 * mutt_ch_iconv - Change the encoding of a string
 * @param[in]     cd           Iconv conversion descriptor
 * @param[in,out] inbuf        Buffer to convert
 * @param[in,out] inbytesleft  Length of buffer to convert
 * @param[in,out] outbuf       Buffer for the result
 * @param[in,out] outbytesleft Length of result buffer
 * @param[in]     inrepls      Input replacement characters (NULL-terminated list)
 * @param[in]     outrepl      Output replacement characters
 * @param[out]    iconverrno   Errno of the last iconv() call, 0 if it succeeded
 * @retval num Sum of the iconv() return values, plus one for each replacement
 *
 * Like iconv, but keeps going even when the input is invalid
 * If you're supplying inrepls, the source charset should be stateless;
 * if you're supplying an outrepl, the target charset should be.
 *
 * On an invalid input sequence (EILSEQ), one input byte is skipped and
 * replaced: first by trying to convert each of `inrepls` in turn, and failing
 * that by writing `outrepl` (default "?") straight into the output.
 */
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
                     char **outbuf, size_t *outbytesleft, const char **inrepls,
                     const char *outrepl, int *iconverrno)
{
  size_t rc = 0;
  const char *ib = *inbuf;
  size_t ibl = *inbytesleft;
  char *ob = *outbuf;
  size_t obl = *outbytesleft;

  while (true)
  {
    errno = 0;
    const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
    if (ret1 != (size_t) -1)
      rc += ret1;
    if (iconverrno)
      *iconverrno = errno;

    if (ibl && obl && (errno == EILSEQ))
    {
      if (inrepls)
      {
        /* Try replacing the input */
        const char **t = NULL;
        for (t = inrepls; *t; t++)
        {
          const char *ib1 = *t;
          size_t ibl1 = strlen(*t);
          char *ob1 = ob;
          size_t obl1 = obl;
          iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
          if (ibl1 == 0)
          {
            /* The replacement converted fully: skip the bad input byte and
             * keep the output it produced */
            ib++;
            ibl--;
            ob = ob1;
            obl = obl1;
            rc++;
            break;
          }
        }
        if (*t)
          continue;
      }
      /* Replace the output */
      if (!outrepl)
        outrepl = "?";
      iconv(cd, NULL, NULL, &ob, &obl);
      if (obl)
      {
        /* size_t, so the length isn't narrowed and the comparison with obl
         * isn't between signed and unsigned values */
        size_t n = strlen(outrepl);
        if (n > obl)
        {
          outrepl = "?";
          n = 1;
        }
        memcpy(ob, outrepl, n);
        ib++;
        ibl--;
        ob += n;
        obl -= n;
        rc++;
        iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
        continue;
      }
    }
    *inbuf = ib;
    *inbytesleft = ibl;
    *outbuf = ob;
    *outbytesleft = obl;
    return rc;
  }
}
703
704 /**
705 * mutt_ch_iconv_lookup - Look for a replacement character set
706 * @param chs Character set to lookup
707 * @retval ptr Replacement character set (if a 'iconv-hook' matches)
708 * @retval NULL No matching hook
709 *
710 * Look through all the 'iconv-hook's.
711 * If one matches return the replacement character set.
712 */
mutt_ch_iconv_lookup(const char * chs)713 const char *mutt_ch_iconv_lookup(const char *chs)
714 {
715 return lookup_charset(MUTT_LOOKUP_ICONV, chs);
716 }
717
718 /**
719 * mutt_ch_check - Check whether a string can be converted between encodings
720 * @param[in] s String to check
721 * @param[in] slen Length of the string to check
722 * @param[in] from Current character set
723 * @param[in] to Target character set
724 * @retval 0 Success
725 * @retval -1 Error in iconv_open()
726 * @retval >0 Errno as set by iconv()
727 */
mutt_ch_check(const char * s,size_t slen,const char * from,const char * to)728 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
729 {
730 if (!s || !from || !to)
731 return -1;
732
733 int rc = 0;
734 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
735 if (cd == (iconv_t) -1)
736 return -1;
737
738 size_t outlen = MB_LEN_MAX * slen;
739 char *out = mutt_mem_malloc(outlen + 1);
740 char *saved_out = out;
741
742 const size_t convlen =
743 iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
744 if (convlen == -1)
745 rc = errno;
746
747 FREE(&saved_out);
748 iconv_close(cd);
749 return rc;
750 }
751
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0     Success (also for a NULL or empty string)
 * @retval -1    Invalid arguments or failure to open an iconv channel
 * @retval errno Failure in iconv conversion
 *
 * Parameter flags is given as-is to mutt_ch_iconv_open().
 * See there for its meaning and usage policy.
 *
 * Once the iconv channel is open, `*ps` is always freed and replaced by a
 * newly allocated string, even when the conversion reports an error.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *src = *ps;
  if (!src || (*src == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  /* Pick what to substitute for unconvertible input:
   * "\357\277\275" is U+FFFD REPLACEMENT CHARACTER encoded as UTF-8 */
  const char *repls[] = { "\357\277\275", "?", 0 };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* The terminating NUL is part of the input handed to iconv */
  const char *ib = src;
  size_t ibl = strlen(src) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *dest = mutt_mem_malloc(obl + 1);
  char *ob = dest;

  int rc = 0;
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = dest;

  mutt_str_adjust(ps);
  return rc;
}
817
818 /**
819 * mutt_ch_check_charset - Does iconv understand a character set?
820 * @param cs Character set to check
821 * @param strict Check strictly by using iconv
822 * @retval true Character set is valid
823 *
824 * If `strict` is false, then finding a matching character set in
825 * #PreferredMimeNames will be enough.
826 * If `strict` is true, or the charset is not in #PreferredMimeNames, then
827 * iconv() with be run.
828 */
mutt_ch_check_charset(const char * cs,bool strict)829 bool mutt_ch_check_charset(const char *cs, bool strict)
830 {
831 if (!cs)
832 return false;
833
834 if (mutt_ch_is_utf8(cs))
835 return true;
836
837 if (!strict)
838 {
839 for (int i = 0; PreferredMimeNames[i].key; i++)
840 {
841 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
842 mutt_istr_equal(PreferredMimeNames[i].pref, cs))
843 {
844 return true;
845 }
846 }
847 }
848
849 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
850 if (cd != (iconv_t) (-1))
851 {
852 iconv_close(cd);
853 return true;
854 }
855
856 return false;
857 }
858
859 /**
860 * mutt_ch_fgetconv_open - Prepare a file for charset conversion
861 * @param fp FILE ptr to prepare
862 * @param from Current character set
863 * @param to Destination character set
864 * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
865 * @retval ptr fgetconv handle
866 *
867 * Parameter flags is given as-is to mutt_ch_iconv_open().
868 */
mutt_ch_fgetconv_open(FILE * fp,const char * from,const char * to,uint8_t flags)869 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
870 {
871 struct FgetConv *fc = NULL;
872 iconv_t cd = (iconv_t) -1;
873
874 if (from && to)
875 cd = mutt_ch_iconv_open(to, from, flags);
876
877 if (cd != (iconv_t) -1)
878 {
879 static const char *repls[] = { "\357\277\275", "?", 0 };
880
881 fc = mutt_mem_malloc(sizeof(struct FgetConv));
882 fc->p = fc->bufo;
883 fc->ob = fc->bufo;
884 fc->ib = fc->bufi;
885 fc->ibl = 0;
886 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
887 }
888 else
889 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
890 fc->fp = fp;
891 fc->cd = cd;
892 return fc;
893 }
894
895 /**
896 * mutt_ch_fgetconv_close - Close an fgetconv handle
897 * @param[out] fc fgetconv handle
898 */
mutt_ch_fgetconv_close(struct FgetConv ** fc)899 void mutt_ch_fgetconv_close(struct FgetConv **fc)
900 {
901 if (!fc || !*fc)
902 return;
903
904 if ((*fc)->cd != (iconv_t) -1)
905 iconv_close((*fc)->cd);
906 FREE(fc);
907 }
908
909 /**
910 * mutt_ch_fgetconv - Convert a file's character set
911 * @param fc FgetConv handle
912 * @retval num Next character in the converted file
913 * @retval EOF Error
914 *
915 * A file is read into a buffer and its character set is converted.
916 * Each call to this function will return one converted character.
917 * The buffer is refilled automatically when empty.
918 */
mutt_ch_fgetconv(struct FgetConv * fc)919 int mutt_ch_fgetconv(struct FgetConv *fc)
920 {
921 if (!fc)
922 return EOF;
923 if (fc->cd == (iconv_t) -1)
924 return fgetc(fc->fp);
925 if (!fc->p)
926 return EOF;
927 if (fc->p < fc->ob)
928 return (unsigned char) *(fc->p)++;
929
930 /* Try to convert some more */
931 fc->p = fc->bufo;
932 fc->ob = fc->bufo;
933 if (fc->ibl)
934 {
935 size_t obl = sizeof(fc->bufo);
936 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
937 if (fc->p < fc->ob)
938 return (unsigned char) *(fc->p)++;
939 }
940
941 /* If we trusted iconv a bit more, we would at this point
942 * ask why it had stopped converting ... */
943
944 /* Try to read some more */
945 if ((fc->ibl == sizeof(fc->bufi)) ||
946 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
947 {
948 fc->p = 0;
949 return EOF;
950 }
951 if (fc->ibl)
952 memcpy(fc->bufi, fc->ib, fc->ibl);
953 fc->ib = fc->bufi;
954 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
955
956 /* Try harder this time to convert some */
957 if (fc->ibl)
958 {
959 size_t obl = sizeof(fc->bufo);
960 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
961 fc->inrepls, 0, NULL);
962 if (fc->p < fc->ob)
963 return (unsigned char) *(fc->p)++;
964 }
965
966 /* Either the file has finished or one of the buffers is too small */
967 fc->p = 0;
968 return EOF;
969 }
970
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read a file into a buffer, converting the character set as it goes.
 * Reading stops after a newline (which is kept), at EOF, or when the buffer
 * is full.  The result is always NUL-terminated.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* With no room for even the terminator, writing buf[0] would overflow */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r;
  for (r = 0; (r + 1) < buflen;)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  if (r > 0)
    return buf;

  return NULL;
}
1003
1004 /**
1005 * mutt_ch_set_charset - Update the records for a new character set
1006 * @param charset New character set
1007 *
1008 * Check if this character set is utf-8 and pick a suitable replacement
1009 * character for unprintable characters.
1010 *
1011 * @note This calls `bind_textdomain_codeset()` which will affect future
1012 * message translations.
1013 */
mutt_ch_set_charset(const char * charset)1014 void mutt_ch_set_charset(const char *charset)
1015 {
1016 char buf[256];
1017
1018 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1019
1020 if (mutt_ch_is_utf8(buf))
1021 {
1022 CharsetIsUtf8 = true;
1023 ReplacementChar = 0xfffd; /* replacement character */
1024 }
1025 else
1026 {
1027 CharsetIsUtf8 = false;
1028 ReplacementChar = '?';
1029 }
1030
1031 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1032 bind_textdomain_codeset(PACKAGE, buf);
1033 #endif
1034 }
1035
1036 /**
1037 * mutt_ch_choose - Figure the best charset to encode a string
1038 * @param[in] fromcode Original charset of the string
1039 * @param[in] charsets Colon-separated list of potential charsets to use
1040 * @param[in] u String to encode
1041 * @param[in] ulen Length of the string to encode
1042 * @param[out] d If not NULL, point it to the converted string
1043 * @param[out] dlen If not NULL, point it to the length of the d string
1044 * @retval ptr Best performing charset
1045 * @retval NULL None could be found
1046 */
mutt_ch_choose(const char * fromcode,const char * charsets,const char * u,size_t ulen,char ** d,size_t * dlen)1047 char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
1048 size_t ulen, char **d, size_t *dlen)
1049 {
1050 if (!fromcode)
1051 return NULL;
1052
1053 char *e = NULL, *tocode = NULL;
1054 size_t elen = 0, bestn = 0;
1055 const char *q = NULL;
1056
1057 for (const char *p = charsets; p; p = q ? q + 1 : 0)
1058 {
1059 q = strchr(p, ':');
1060
1061 size_t n = q ? q - p : strlen(p);
1062 if (n == 0)
1063 continue;
1064
1065 char *t = mutt_mem_malloc(n + 1);
1066 memcpy(t, p, n);
1067 t[n] = '\0';
1068
1069 char *s = mutt_strn_dup(u, ulen);
1070 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1071 mutt_ch_check(s, ulen, fromcode, t);
1072 if (rc)
1073 {
1074 FREE(&t);
1075 FREE(&s);
1076 continue;
1077 }
1078 size_t slen = mutt_str_len(s);
1079
1080 if (!tocode || (n < bestn))
1081 {
1082 bestn = n;
1083 FREE(&tocode);
1084 tocode = t;
1085 if (d)
1086 {
1087 FREE(&e);
1088 e = s;
1089 }
1090 else
1091 FREE(&s);
1092 elen = slen;
1093 }
1094 else
1095 {
1096 FREE(&t);
1097 FREE(&s);
1098 }
1099 }
1100 if (tocode)
1101 {
1102 if (d)
1103 *d = e;
1104 if (dlen)
1105 *dlen = elen;
1106
1107 char canonical_buf[1024];
1108 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1109 mutt_str_replace(&tocode, canonical_buf);
1110 }
1111 return tocode;
1112 }
1113