1 /**
2  * @file
3  * Conversion between different character encodings
4  *
5  * @authors
6  * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
7  *
8  * @copyright
9  * This program is free software: you can redistribute it and/or modify it under
10  * the terms of the GNU General Public License as published by the Free Software
11  * Foundation, either version 2 of the License, or (at your option) any later
12  * version.
13  *
14  * This program is distributed in the hope that it will be useful, but WITHOUT
15  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
17  * details.
18  *
19  * You should have received a copy of the GNU General Public License along with
20  * this program.  If not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 /**
24  * @page mutt_charset Conversion between different character encodings
25  *
26  * Conversion between different character encodings
27  */
28 
29 #include "config.h"
30 #include <ctype.h>
31 #include <errno.h>
32 #include <iconv.h>
33 #include <langinfo.h>
34 #include <limits.h>
35 #include <stdbool.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include "config/lib.h"
39 #include "core/lib.h"
40 #include "charset.h"
41 #include "buffer.h"
42 #include "memory.h"
43 #include "queue.h"
44 #include "regex3.h"
45 #include "string2.h"
46 #ifdef ENABLE_NLS
47 #include <libintl.h>
48 #endif
49 
50 #ifndef EILSEQ
51 #define EILSEQ EINVAL
52 #endif
53 
/**
 * ReplacementChar - When a Unicode character can't be displayed, use this instead
 *
 * Starts out as an ASCII question mark; mutt_ch_set_charset() updates it when
 * the character set changes.
 */
wchar_t ReplacementChar = '?';

/**
 * CharsetIsUtf8 - Is the user's current character set utf-8?
 *
 * Starts out false; mutt_ch_set_charset() updates it when the character set
 * changes.
 */
bool CharsetIsUtf8 = false;
63 
/**
 * struct Lookup - Regex to String lookup table
 *
 * This is used by 'charset-hook' and 'iconv-hook'.
 *
 * Each entry says: a character set name of the given #type that matches
 * #regex should be replaced by #replacement.  The #regex member is embedded
 * in the Lookup (it is not a pointer); only the members inside it are
 * heap-allocated.
 */
struct Lookup
{
  enum LookupType type;        ///< Lookup type
  struct Regex regex;          ///< Regular expression
  char *replacement;           ///< Alternative charset to use
  TAILQ_ENTRY(Lookup) entries; ///< Linked list
};
TAILQ_HEAD(LookupList, Lookup);

/// Every registered Lookup, in the order mutt_ch_lookup_add() appended them
static struct LookupList Lookups = TAILQ_HEAD_INITIALIZER(Lookups);
79 
/**
 * struct MimeNames - MIME name lookup entry
 */
struct MimeNames
{
  const char *key;  ///< Character set name or alias (matched case-insensitively)
  const char *pref; ///< Preferred MIME name for that character set
};

/**
 * PreferredMimeNames - Lookup table of preferred charsets
 *
 * The following list has been created manually from the data under:
 * http://www.isi.edu/in-notes/iana/assignments/character-sets
 * Last update: 2000-09-07
 *
 * @note It includes only the subset of character sets for which a preferred
 * MIME name is given.
 *
 * @note mutt_ch_canonical_charset() rewrites the misspelling "iso8859-N" to
 * "iso_8859-N" and relies on this table to turn that into "iso-8859-N", so
 * each ISO-8859 part needs an "iso_8859-N" key.
 *
 * @note The table is terminated by an entry whose key is NULL.
 */
const struct MimeNames PreferredMimeNames[] = {
  // clang-format off
  { "ansi_x3.4-1968",        "us-ascii"      },
  { "iso-ir-6",              "us-ascii"      },
  { "iso_646.irv:1991",      "us-ascii"      },
  { "ascii",                 "us-ascii"      },
  { "iso646-us",             "us-ascii"      },
  { "us",                    "us-ascii"      },
  { "ibm367",                "us-ascii"      },
  { "cp367",                 "us-ascii"      },
  { "csASCII",               "us-ascii"      },

  { "csISO2022KR",           "iso-2022-kr"   },
  { "csEUCKR",               "euc-kr"        },
  { "csISO2022JP",           "iso-2022-jp"   },
  { "csISO2022JP2",          "iso-2022-jp-2" },

  { "ISO_8859-1:1987",       "iso-8859-1"    },
  { "iso-ir-100",            "iso-8859-1"    },
  { "iso_8859-1",            "iso-8859-1"    },
  { "latin1",                "iso-8859-1"    },
  { "l1",                    "iso-8859-1"    },
  { "IBM819",                "iso-8859-1"    },
  { "CP819",                 "iso-8859-1"    },
  { "csISOLatin1",           "iso-8859-1"    },

  { "ISO_8859-2:1987",       "iso-8859-2"    },
  { "iso-ir-101",            "iso-8859-2"    },
  { "iso_8859-2",            "iso-8859-2"    },
  { "latin2",                "iso-8859-2"    },
  { "l2",                    "iso-8859-2"    },
  { "csISOLatin2",           "iso-8859-2"    },

  { "ISO_8859-3:1988",       "iso-8859-3"    },
  { "iso-ir-109",            "iso-8859-3"    },
  { "ISO_8859-3",            "iso-8859-3"    },
  { "latin3",                "iso-8859-3"    },
  { "l3",                    "iso-8859-3"    },
  { "csISOLatin3",           "iso-8859-3"    },

  { "ISO_8859-4:1988",       "iso-8859-4"    },
  { "iso-ir-110",            "iso-8859-4"    },
  { "ISO_8859-4",            "iso-8859-4"    },
  { "latin4",                "iso-8859-4"    },
  { "l4",                    "iso-8859-4"    },
  { "csISOLatin4",           "iso-8859-4"    },

  { "ISO_8859-6:1987",       "iso-8859-6"    },
  { "iso-ir-127",            "iso-8859-6"    },
  { "iso_8859-6",            "iso-8859-6"    },
  { "ECMA-114",              "iso-8859-6"    },
  { "ASMO-708",              "iso-8859-6"    },
  { "arabic",                "iso-8859-6"    },
  { "csISOLatinArabic",      "iso-8859-6"    },

  { "ISO_8859-7:1987",       "iso-8859-7"    },
  { "iso-ir-126",            "iso-8859-7"    },
  { "ISO_8859-7",            "iso-8859-7"    },
  { "ELOT_928",              "iso-8859-7"    },
  { "ECMA-118",              "iso-8859-7"    },
  { "greek",                 "iso-8859-7"    },
  { "greek8",                "iso-8859-7"    },
  { "csISOLatinGreek",       "iso-8859-7"    },

  { "ISO_8859-8:1988",       "iso-8859-8"    },
  { "iso-ir-138",            "iso-8859-8"    },
  { "ISO_8859-8",            "iso-8859-8"    },
  { "hebrew",                "iso-8859-8"    },
  { "csISOLatinHebrew",      "iso-8859-8"    },

  { "ISO_8859-5:1988",       "iso-8859-5"    },
  { "iso-ir-144",            "iso-8859-5"    },
  { "ISO_8859-5",            "iso-8859-5"    },
  { "cyrillic",              "iso-8859-5"    },
  { "csISOLatinCyrillic",    "iso-8859-5"    },

  { "ISO_8859-9:1989",       "iso-8859-9"    },
  { "iso-ir-148",            "iso-8859-9"    },
  { "ISO_8859-9",            "iso-8859-9"    },
  { "latin5",                "iso-8859-9"    },  /* this is not a bug */
  { "l5",                    "iso-8859-9"    },
  { "csISOLatin5",           "iso-8859-9"    },

  { "ISO_8859-10:1992",      "iso-8859-10"   },
  { "iso-ir-157",            "iso-8859-10"   },
  { "latin6",                "iso-8859-10"   },  /* this is not a bug */
  { "l6",                    "iso-8859-10"   },
  { "csISOLatin6",           "iso-8859-10"   },

  { "csKOI8r",               "koi8-r"        },

  { "MS_Kanji",              "Shift_JIS"     },  /* Note the underscore! */
  { "csShiftJis",            "Shift_JIS"     },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
                             "euc-jp"        },
  { "csEUCPkdFmtJapanese",   "euc-jp"        },

  { "csGB2312",              "gb2312"        },
  { "csbig5",                "big5"          },

  /* End of official brain damage.
   * What follows has been taken from glibc's localedata files.  */

  /* Needed so that the "iso8859-10" misspelling canonicalises to
   * "iso-8859-10", like the other ISO-8859 parts */
  { "iso_8859-10",           "iso-8859-10"   },

  { "iso_8859-13",           "iso-8859-13"   },
  { "iso-ir-179",            "iso-8859-13"   },
  { "latin7",                "iso-8859-13"   },  /* this is not a bug */
  { "l7",                    "iso-8859-13"   },

  { "iso_8859-14",           "iso-8859-14"   },
  { "latin8",                "iso-8859-14"   },  /* this is not a bug */
  { "l8",                    "iso-8859-14"   },

  { "iso_8859-15",           "iso-8859-15"   },
  { "latin9",                "iso-8859-15"   },  /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0",                "iso-8859-15"   },  /* this is not a bug */

  { "iso_8859-16",           "iso-8859-16"   },
  { "latin10",               "iso-8859-16"   },  /* this is not a bug */

  { "646",                   "us-ascii"      },

  /* http://www.sun.com/software/white-papers/wp-unicode/ */

  { "eucJP",                 "euc-jp"        },
  { "PCK",                   "Shift_JIS"     },
  { "ko_KR-euc",             "euc-kr"        },
  { "zh_TW-big5",            "big5"          },

  /* seems to be common on some systems */

  { "sjis",                  "Shift_JIS"     },
  { "euc-jp-ms",             "eucJP-ms"      },

  /* If you happen to encounter system-specific brain-damage with respect to
   * character set naming, please add it above this comment, and submit a patch
   * to <neomutt-devel@neomutt.org> */

  { NULL, NULL },
  // clang-format on
};
242 
243 /**
244  * lookup_new - Create a new Lookup
245  * @retval ptr New Lookup
246  */
lookup_new(void)247 static struct Lookup *lookup_new(void)
248 {
249   return mutt_mem_calloc(1, sizeof(struct Lookup));
250 }
251 
252 /**
253  * lookup_free - Free a Lookup
254  * @param ptr Lookup to free
255  */
lookup_free(struct Lookup ** ptr)256 static void lookup_free(struct Lookup **ptr)
257 {
258   if (!ptr || !*ptr)
259     return;
260 
261   struct Lookup *l = *ptr;
262   FREE(&l->replacement);
263   FREE(&l->regex.pattern);
264   if (l->regex.regex)
265     regfree(l->regex.regex);
266   FREE(&l->regex.regex);
267   FREE(&l->regex);
268 
269   FREE(ptr);
270 }
271 
272 /**
273  * lookup_charset - Look for a preferred character set name
274  * @param type Type, e.g. #MUTT_LOOKUP_CHARSET
275  * @param cs   Character set
276  * @retval ptr Charset string
277  *
278  * If the character set matches one of the regexes,
279  * then return the replacement name.
280  */
lookup_charset(enum LookupType type,const char * cs)281 static const char *lookup_charset(enum LookupType type, const char *cs)
282 {
283   if (!cs)
284     return NULL;
285 
286   struct Lookup *l = NULL;
287 
288   TAILQ_FOREACH(l, &Lookups, entries)
289   {
290     if (l->type != type)
291       continue;
292     if (mutt_regex_match(&l->regex, cs))
293       return l->replacement;
294   }
295   return NULL;
296 }
297 
298 /**
299  * mutt_ch_convert_nonmime_string - Try to convert a string using a list of character sets
300  * @param[in,out] ps String to be converted
301  * @retval 0  Success
302  * @retval -1 Error
303  *
304  * Work through `$assumed_charset` looking for a character set conversion that
305  * works.  Failing that, try mutt_ch_get_default_charset().
306  */
mutt_ch_convert_nonmime_string(char ** ps)307 int mutt_ch_convert_nonmime_string(char **ps)
308 {
309   if (!ps)
310     return -1;
311 
312   char *u = *ps;
313   const size_t ulen = mutt_str_len(u);
314   if (ulen == 0)
315     return 0;
316 
317   const char *c1 = NULL;
318 
319   const char *const c_assumed_charset =
320       cs_subset_string(NeoMutt->sub, "assumed_charset");
321   const char *const c_charset = cs_subset_string(NeoMutt->sub, "charset");
322   for (const char *c = c_assumed_charset; c; c = c1 ? c1 + 1 : 0)
323   {
324     c1 = strchr(c, ':');
325     size_t n = c1 ? c1 - c : mutt_str_len(c);
326     if (n == 0)
327       return 0;
328     char *fromcode = mutt_mem_malloc(n + 1);
329     mutt_str_copy(fromcode, c, n + 1);
330     char *s = mutt_strn_dup(u, ulen);
331     int m = mutt_ch_convert_string(&s, fromcode, c_charset, MUTT_ICONV_NO_FLAGS);
332     FREE(&fromcode);
333     if (m == 0)
334     {
335       FREE(ps);
336       *ps = s;
337       return 0;
338     }
339     FREE(&s);
340   }
341   mutt_ch_convert_string(ps, (const char *) mutt_ch_get_default_charset(),
342                          c_charset, MUTT_ICONV_HOOK_FROM);
343   return -1;
344 }
345 
346 /**
347  * mutt_ch_canonical_charset - Canonicalise the charset of a string
348  * @param buf Buffer for canonical character set name
349  * @param buflen Length of buffer
350  * @param name Name to be canonicalised
351  *
352  * This first ties off any charset extension such as "//TRANSLIT",
353  * canonicalizes the charset and re-adds the extension
354  */
mutt_ch_canonical_charset(char * buf,size_t buflen,const char * name)355 void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
356 {
357   if (!buf || !name)
358     return;
359 
360   char in[1024], scratch[1024 + 10];
361 
362   mutt_str_copy(in, name, sizeof(in));
363   char *ext = strchr(in, '/');
364   if (ext)
365     *ext++ = '\0';
366 
367   if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
368   {
369     mutt_str_copy(buf, "utf-8", buflen);
370     goto out;
371   }
372 
373   /* catch some common iso-8859-something misspellings */
374   size_t plen;
375   if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
376     snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
377   else if ((plen = mutt_istr_startswith(in, "8859-")))
378     snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
379   else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
380     snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
381   else if ((plen = mutt_istr_startswith(in, "iso8859-")))
382     snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
383   else
384     mutt_str_copy(scratch, in, sizeof(scratch));
385 
386   for (size_t i = 0; PreferredMimeNames[i].key; i++)
387   {
388     if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
389     {
390       mutt_str_copy(buf, PreferredMimeNames[i].pref, buflen);
391       goto out;
392     }
393   }
394 
395   mutt_str_copy(buf, scratch, buflen);
396 
397   /* for cosmetics' sake, transform to lowercase. */
398   for (char *p = buf; *p; p++)
399     *p = tolower(*p);
400 
401 out:
402   if (ext && *ext)
403   {
404     mutt_str_cat(buf, buflen, "/");
405     mutt_str_cat(buf, buflen, ext);
406   }
407 }
408 
/**
 * mutt_ch_chscmp - Are the names of two character sets equivalent?
 * @param cs1 First character set
 * @param cs2 Second character set
 * @retval true  Names are equivalent
 * @retval false Names differ, or either name is NULL
 *
 * Only `cs1` is canonicalised.  It may carry an extension, which
 * mutt_ch_canonical_charset() leaves intact.  `cs2` is expected to come from
 * neomutt code rather than user input, i.e. to have no extension.
 *
 * The comparison therefore just checks, ignoring case, whether the shorter of
 * the two names is a prefix of the longer.
 */
bool mutt_ch_chscmp(const char *cs1, const char *cs2)
{
  if (!cs1 || !cs2)
    return false;

  char canon[256];
  mutt_ch_canonical_charset(canon, sizeof(canon), cs1);

  const int canon_len = mutt_str_len(canon);
  const int cs2_len = mutt_str_len(cs2);
  const int cmp_len = MIN(canon_len, cs2_len);

  if (canon_len > cs2_len)
    return mutt_istrn_equal(canon, cs2, cmp_len);

  return mutt_istrn_equal(cs2, canon, cmp_len);
}
436 
437 /**
438  * mutt_ch_get_default_charset - Get the default character set
439  * @retval ptr Name of the default character set
440  *
441  * @warning This returns a pointer to a static buffer.  Do not free it.
442  */
mutt_ch_get_default_charset(void)443 char *mutt_ch_get_default_charset(void)
444 {
445   static char fcharset[128];
446   const char *const c_assumed_charset =
447       cs_subset_string(NeoMutt->sub, "assumed_charset");
448   const char *c = c_assumed_charset;
449   const char *c1 = NULL;
450 
451   if (c)
452   {
453     c1 = strchr(c, ':');
454 
455     size_t copysize;
456     if (c1)
457       copysize = MIN((c1 - c + 1), sizeof(fcharset));
458     else
459       copysize = sizeof(fcharset);
460     mutt_str_copy(fcharset, c, copysize);
461     return fcharset;
462   }
463   return strcpy(fcharset, "us-ascii");
464 }
465 
/**
 * mutt_ch_get_langinfo_charset - Get the user's choice of character set
 * @retval ptr Charset string
 *
 * Canonicalises the codeset reported by `nl_langinfo(CODESET)`.
 * If that yields an empty name, "iso-8859-1" is used instead.
 *
 * The caller must free the returned string.
 */
char *mutt_ch_get_langinfo_charset(void)
{
  char buf[1024] = { 0 };

  mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));

  return mutt_str_dup((buf[0] != '\0') ? buf : "iso-8859-1");
}
484 
485 /**
486  * mutt_ch_lookup_add - Add a new character set lookup
487  * @param type    Type of character set, e.g. #MUTT_LOOKUP_CHARSET
488  * @param pat     Pattern to match
489  * @param replace Replacement string
490  * @param err     Buffer for error message
491  * @retval true  Lookup added to list
492  * @retval false Regex string was invalid
493  *
494  * Add a regex for a character set and a replacement name.
495  */
mutt_ch_lookup_add(enum LookupType type,const char * pat,const char * replace,struct Buffer * err)496 bool mutt_ch_lookup_add(enum LookupType type, const char *pat,
497                         const char *replace, struct Buffer *err)
498 {
499   if (!pat || !replace)
500     return false;
501 
502   regex_t *rx = mutt_mem_malloc(sizeof(regex_t));
503   int rc = REG_COMP(rx, pat, REG_ICASE);
504   if (rc != 0)
505   {
506     regerror(rc, rx, err->data, err->dsize);
507     FREE(&rx);
508     return false;
509   }
510 
511   struct Lookup *l = lookup_new();
512   l->type = type;
513   l->replacement = mutt_str_dup(replace);
514   l->regex.pattern = mutt_str_dup(pat);
515   l->regex.regex = rx;
516   l->regex.pat_not = false;
517 
518   TAILQ_INSERT_TAIL(&Lookups, l, entries);
519 
520   return true;
521 }
522 
523 /**
524  * mutt_ch_lookup_remove - Remove all the character set lookups
525  *
526  * Empty the list of replacement character set names.
527  */
mutt_ch_lookup_remove(void)528 void mutt_ch_lookup_remove(void)
529 {
530   struct Lookup *l = NULL;
531   struct Lookup *tmp = NULL;
532 
533   TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
534   {
535     TAILQ_REMOVE(&Lookups, l, entries);
536     lookup_free(&l);
537   }
538 }
539 
540 /**
541  * mutt_ch_charset_lookup - Look for a replacement character set
542  * @param chs Character set to lookup
543  * @retval ptr  Replacement character set (if a 'charset-hook' matches)
544  * @retval NULL No matching hook
545  *
546  * Look through all the 'charset-hook's.
547  * If one matches return the replacement character set.
548  */
mutt_ch_charset_lookup(const char * chs)549 const char *mutt_ch_charset_lookup(const char *chs)
550 {
551   return lookup_charset(MUTT_LOOKUP_CHARSET, chs);
552 }
553 
554 /**
555  * mutt_ch_iconv_open - Set up iconv for conversions
556  * @param tocode   Current character set
557  * @param fromcode Target character set
558  * @param flags    Flags, e.g. #MUTT_ICONV_HOOK_FROM
559  * @retval ptr iconv handle for the conversion
560  *
561  * Like iconv_open, but canonicalises the charsets, applies charset-hooks,
562  * recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips
563  * charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers
564  * should use flags=0 when fromcode can safely be considered true, either some
565  * constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be
566  * used only when fromcode is unsure, taken from a possibly wrong incoming MIME
567  * label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions
568  * in some setups.
569  *
570  * @note By design charset-hooks should never be, and are never, applied
571  * to tocode.
572  *
573  * @note The top-well-named MUTT_ICONV_HOOK_FROM acts on charset-hooks,
574  * not at all on iconv-hooks.
575  */
mutt_ch_iconv_open(const char * tocode,const char * fromcode,uint8_t flags)576 iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
577 {
578   char tocode1[128];
579   char fromcode1[128];
580   const char *tocode2 = NULL, *fromcode2 = NULL;
581   const char *tmp = NULL;
582 
583   iconv_t cd;
584 
585   /* transform to MIME preferred charset names */
586   mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
587   mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
588 
589   /* maybe apply charset-hooks and recanonicalise fromcode,
590    * but only when caller asked us to sanitize a potentially wrong
591    * charset name incoming from the wild exterior. */
592   if (flags & MUTT_ICONV_HOOK_FROM)
593   {
594     tmp = mutt_ch_charset_lookup(fromcode1);
595     if (tmp)
596       mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
597   }
598 
599   /* always apply iconv-hooks to suit system's iconv tastes */
600   tocode2 = mutt_ch_iconv_lookup(tocode1);
601   tocode2 = tocode2 ? tocode2 : tocode1;
602   fromcode2 = mutt_ch_iconv_lookup(fromcode1);
603   fromcode2 = fromcode2 ? fromcode2 : fromcode1;
604 
605   /* call system iconv with names it appreciates */
606   cd = iconv_open(tocode2, fromcode2);
607   if (cd != (iconv_t) -1)
608     return cd;
609 
610   return (iconv_t) -1;
611 }
612 
613 /**
614  * mutt_ch_iconv - Change the encoding of a string
615  * @param[in]     cd           Iconv conversion descriptor
616  * @param[in,out] inbuf        Buffer to convert
617  * @param[in,out] inbytesleft  Length of buffer to convert
618  * @param[in,out] outbuf       Buffer for the result
619  * @param[in,out] outbytesleft Length of result buffer
620  * @param[in]     inrepls      Input replacement characters
621  * @param[in]     outrepl      Output replacement characters
622  * @param[out]    iconverrno   Errno if iconv() fails, 0 if it succeeds
623  * @retval num Characters converted
624  *
625  * Like iconv, but keeps going even when the input is invalid
626  * If you're supplying inrepls, the source charset should be stateless;
627  * if you're supplying an outrepl, the target charset should be.
628  */
mutt_ch_iconv(iconv_t cd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,const char ** inrepls,const char * outrepl,int * iconverrno)629 size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft,
630                      char **outbuf, size_t *outbytesleft, const char **inrepls,
631                      const char *outrepl, int *iconverrno)
632 {
633   size_t rc = 0;
634   const char *ib = *inbuf;
635   size_t ibl = *inbytesleft;
636   char *ob = *outbuf;
637   size_t obl = *outbytesleft;
638 
639   while (true)
640   {
641     errno = 0;
642     const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
643     if (ret1 != (size_t) -1)
644       rc += ret1;
645     if (iconverrno)
646       *iconverrno = errno;
647 
648     if (ibl && obl && (errno == EILSEQ))
649     {
650       if (inrepls)
651       {
652         /* Try replacing the input */
653         const char **t = NULL;
654         for (t = inrepls; *t; t++)
655         {
656           const char *ib1 = *t;
657           size_t ibl1 = strlen(*t);
658           char *ob1 = ob;
659           size_t obl1 = obl;
660           iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
661           if (ibl1 == 0)
662           {
663             ib++;
664             ibl--;
665             ob = ob1;
666             obl = obl1;
667             rc++;
668             break;
669           }
670         }
671         if (*t)
672           continue;
673       }
674       /* Replace the output */
675       if (!outrepl)
676         outrepl = "?";
677       iconv(cd, NULL, NULL, &ob, &obl);
678       if (obl)
679       {
680         int n = strlen(outrepl);
681         if (n > obl)
682         {
683           outrepl = "?";
684           n = 1;
685         }
686         memcpy(ob, outrepl, n);
687         ib++;
688         ibl--;
689         ob += n;
690         obl -= n;
691         rc++;
692         iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
693         continue;
694       }
695     }
696     *inbuf = ib;
697     *inbytesleft = ibl;
698     *outbuf = ob;
699     *outbytesleft = obl;
700     return rc;
701   }
702 }
703 
704 /**
705  * mutt_ch_iconv_lookup - Look for a replacement character set
706  * @param chs Character set to lookup
707  * @retval ptr  Replacement character set (if a 'iconv-hook' matches)
708  * @retval NULL No matching hook
709  *
710  * Look through all the 'iconv-hook's.
711  * If one matches return the replacement character set.
712  */
mutt_ch_iconv_lookup(const char * chs)713 const char *mutt_ch_iconv_lookup(const char *chs)
714 {
715   return lookup_charset(MUTT_LOOKUP_ICONV, chs);
716 }
717 
718 /**
719  * mutt_ch_check - Check whether a string can be converted between encodings
720  * @param[in] s     String to check
721  * @param[in] slen  Length of the string to check
722  * @param[in] from  Current character set
723  * @param[in] to    Target character set
724  * @retval 0  Success
725  * @retval -1 Error in iconv_open()
726  * @retval >0 Errno as set by iconv()
727  */
mutt_ch_check(const char * s,size_t slen,const char * from,const char * to)728 int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
729 {
730   if (!s || !from || !to)
731     return -1;
732 
733   int rc = 0;
734   iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
735   if (cd == (iconv_t) -1)
736     return -1;
737 
738   size_t outlen = MB_LEN_MAX * slen;
739   char *out = mutt_mem_malloc(outlen + 1);
740   char *saved_out = out;
741 
742   const size_t convlen =
743       iconv(cd, (ICONV_CONST char **) &s, &slen, &out, (size_t *) &outlen);
744   if (convlen == -1)
745     rc = errno;
746 
747   FREE(&saved_out);
748   iconv_close(cd);
749   return rc;
750 }
751 
/**
 * mutt_ch_convert_string - Convert a string between encodings
 * @param[in,out] ps    String to convert
 * @param[in]     from  Current character set
 * @param[in]     to    Target character set
 * @param[in]     flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
 * @retval 0      Success (a NULL or empty string counts as success)
 * @retval -1     Invalid arguments or failure to open an iconv channel
 * @retval errno  Failure in iconv conversion
 *
 * Parameter flags is given as-is to mutt_ch_iconv_open().
 * See there for its meaning and usage policy.
 *
 * Once the iconv channel is open, `*ps` is always freed and replaced by the
 * newly allocated result, even if a non-zero errno is returned.
 */
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
{
  if (!ps)
    return -1;

  char *s = *ps;
  if (!s || (*s == '\0'))
    return 0;

  if (!to || !from)
    return -1;

  iconv_t cd = mutt_ch_iconv_open(to, from, flags);
  if (cd == (iconv_t) -1)
    return -1;

  /* Choose how unconvertible input is replaced:
   * - converting to utf-8: write U+FFFD straight into the output
   * - converting from utf-8: feed U+FFFD, then "?", through the converter
   * - otherwise: write "?" into the output */
  const char *repls[] = { "\357\277\275", "?", NULL };
  const char **inrepls = NULL;
  const char *outrepl = NULL;

  if (mutt_ch_is_utf8(to))
    outrepl = "\357\277\275";
  else if (mutt_ch_is_utf8(from))
    inrepls = repls;
  else
    outrepl = "?";

  /* The terminating NUL is converted too, hence the "+ 1" */
  const char *ib = s;
  size_t ibl = strlen(s) + 1;
  size_t obl = MB_LEN_MAX * ibl;
  char *buf = mutt_mem_malloc(obl + 1);
  char *ob = buf;

  int rc = 0;
  mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
  iconv_close(cd);

  *ob = '\0';

  FREE(ps);
  *ps = buf;

  mutt_str_adjust(ps);
  return rc;
}
817 
818 /**
819  * mutt_ch_check_charset - Does iconv understand a character set?
820  * @param cs     Character set to check
821  * @param strict Check strictly by using iconv
822  * @retval true Character set is valid
823  *
824  * If `strict` is false, then finding a matching character set in
825  * #PreferredMimeNames will be enough.
826  * If `strict` is true, or the charset is not in #PreferredMimeNames, then
827  * iconv() with be run.
828  */
mutt_ch_check_charset(const char * cs,bool strict)829 bool mutt_ch_check_charset(const char *cs, bool strict)
830 {
831   if (!cs)
832     return false;
833 
834   if (mutt_ch_is_utf8(cs))
835     return true;
836 
837   if (!strict)
838   {
839     for (int i = 0; PreferredMimeNames[i].key; i++)
840     {
841       if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
842           mutt_istr_equal(PreferredMimeNames[i].pref, cs))
843       {
844         return true;
845       }
846     }
847   }
848 
849   iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
850   if (cd != (iconv_t) (-1))
851   {
852     iconv_close(cd);
853     return true;
854   }
855 
856   return false;
857 }
858 
859 /**
860  * mutt_ch_fgetconv_open - Prepare a file for charset conversion
861  * @param fp    FILE ptr to prepare
862  * @param from  Current character set
863  * @param to    Destination character set
864  * @param flags Flags, e.g. #MUTT_ICONV_HOOK_FROM
865  * @retval ptr fgetconv handle
866  *
867  * Parameter flags is given as-is to mutt_ch_iconv_open().
868  */
mutt_ch_fgetconv_open(FILE * fp,const char * from,const char * to,uint8_t flags)869 struct FgetConv *mutt_ch_fgetconv_open(FILE *fp, const char *from, const char *to, uint8_t flags)
870 {
871   struct FgetConv *fc = NULL;
872   iconv_t cd = (iconv_t) -1;
873 
874   if (from && to)
875     cd = mutt_ch_iconv_open(to, from, flags);
876 
877   if (cd != (iconv_t) -1)
878   {
879     static const char *repls[] = { "\357\277\275", "?", 0 };
880 
881     fc = mutt_mem_malloc(sizeof(struct FgetConv));
882     fc->p = fc->bufo;
883     fc->ob = fc->bufo;
884     fc->ib = fc->bufi;
885     fc->ibl = 0;
886     fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
887   }
888   else
889     fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
890   fc->fp = fp;
891   fc->cd = cd;
892   return fc;
893 }
894 
895 /**
896  * mutt_ch_fgetconv_close - Close an fgetconv handle
897  * @param[out] fc fgetconv handle
898  */
mutt_ch_fgetconv_close(struct FgetConv ** fc)899 void mutt_ch_fgetconv_close(struct FgetConv **fc)
900 {
901   if (!fc || !*fc)
902     return;
903 
904   if ((*fc)->cd != (iconv_t) -1)
905     iconv_close((*fc)->cd);
906   FREE(fc);
907 }
908 
909 /**
910  * mutt_ch_fgetconv - Convert a file's character set
911  * @param fc FgetConv handle
912  * @retval num Next character in the converted file
913  * @retval EOF Error
914  *
915  * A file is read into a buffer and its character set is converted.
916  * Each call to this function will return one converted character.
917  * The buffer is refilled automatically when empty.
918  */
mutt_ch_fgetconv(struct FgetConv * fc)919 int mutt_ch_fgetconv(struct FgetConv *fc)
920 {
921   if (!fc)
922     return EOF;
923   if (fc->cd == (iconv_t) -1)
924     return fgetc(fc->fp);
925   if (!fc->p)
926     return EOF;
927   if (fc->p < fc->ob)
928     return (unsigned char) *(fc->p)++;
929 
930   /* Try to convert some more */
931   fc->p = fc->bufo;
932   fc->ob = fc->bufo;
933   if (fc->ibl)
934   {
935     size_t obl = sizeof(fc->bufo);
936     iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
937     if (fc->p < fc->ob)
938       return (unsigned char) *(fc->p)++;
939   }
940 
941   /* If we trusted iconv a bit more, we would at this point
942    * ask why it had stopped converting ... */
943 
944   /* Try to read some more */
945   if ((fc->ibl == sizeof(fc->bufi)) ||
946       (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
947   {
948     fc->p = 0;
949     return EOF;
950   }
951   if (fc->ibl)
952     memcpy(fc->bufi, fc->ib, fc->ibl);
953   fc->ib = fc->bufi;
954   fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
955 
956   /* Try harder this time to convert some */
957   if (fc->ibl)
958   {
959     size_t obl = sizeof(fc->bufo);
960     mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
961                   fc->inrepls, 0, NULL);
962     if (fc->p < fc->ob)
963       return (unsigned char) *(fc->p)++;
964   }
965 
966   /* Either the file has finished or one of the buffers is too small */
967   fc->p = 0;
968   return EOF;
969 }
970 
/**
 * mutt_ch_fgetconvs - Convert a file's charset into a string buffer
 * @param buf    Buffer for result
 * @param buflen Length of buffer
 * @param fc     FgetConv handle
 * @retval ptr  Success, result buffer
 * @retval NULL Error, or nothing could be read
 *
 * Read converted characters from the file, one at a time, until a newline has
 * been stored, the source returns EOF, or only the space for the terminating
 * NUL is left.  The newline, if any, is kept in the buffer.
 *
 * The result is always NUL-terminated.  NULL is returned when no characters
 * were read, when @a buf is NULL, or when @a buflen is 0.
 */
char *mutt_ch_fgetconvs(char *buf, size_t buflen, struct FgetConv *fc)
{
  /* A zero-length buffer cannot even hold the terminating NUL */
  if (!buf || (buflen == 0))
    return NULL;

  size_t r = 0;
  /* Always keep one byte in reserve for the terminator */
  while ((r + 1) < buflen)
  {
    const int c = mutt_ch_fgetconv(fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  return (r > 0) ? buf : NULL;
}
1003 
1004 /**
1005  * mutt_ch_set_charset - Update the records for a new character set
1006  * @param charset New character set
1007  *
1008  * Check if this character set is utf-8 and pick a suitable replacement
1009  * character for unprintable characters.
1010  *
1011  * @note This calls `bind_textdomain_codeset()` which will affect future
1012  * message translations.
1013  */
mutt_ch_set_charset(const char * charset)1014 void mutt_ch_set_charset(const char *charset)
1015 {
1016   char buf[256];
1017 
1018   mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1019 
1020   if (mutt_ch_is_utf8(buf))
1021   {
1022     CharsetIsUtf8 = true;
1023     ReplacementChar = 0xfffd; /* replacement character */
1024   }
1025   else
1026   {
1027     CharsetIsUtf8 = false;
1028     ReplacementChar = '?';
1029   }
1030 
1031 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1032   bind_textdomain_codeset(PACKAGE, buf);
1033 #endif
1034 }
1035 
1036 /**
1037  * mutt_ch_choose - Figure the best charset to encode a string
1038  * @param[in] fromcode Original charset of the string
1039  * @param[in] charsets Colon-separated list of potential charsets to use
1040  * @param[in] u        String to encode
1041  * @param[in] ulen     Length of the string to encode
1042  * @param[out] d       If not NULL, point it to the converted string
1043  * @param[out] dlen    If not NULL, point it to the length of the d string
1044  * @retval ptr  Best performing charset
1045  * @retval NULL None could be found
1046  */
mutt_ch_choose(const char * fromcode,const char * charsets,const char * u,size_t ulen,char ** d,size_t * dlen)1047 char *mutt_ch_choose(const char *fromcode, const char *charsets, const char *u,
1048                      size_t ulen, char **d, size_t *dlen)
1049 {
1050   if (!fromcode)
1051     return NULL;
1052 
1053   char *e = NULL, *tocode = NULL;
1054   size_t elen = 0, bestn = 0;
1055   const char *q = NULL;
1056 
1057   for (const char *p = charsets; p; p = q ? q + 1 : 0)
1058   {
1059     q = strchr(p, ':');
1060 
1061     size_t n = q ? q - p : strlen(p);
1062     if (n == 0)
1063       continue;
1064 
1065     char *t = mutt_mem_malloc(n + 1);
1066     memcpy(t, p, n);
1067     t[n] = '\0';
1068 
1069     char *s = mutt_strn_dup(u, ulen);
1070     const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1071                        mutt_ch_check(s, ulen, fromcode, t);
1072     if (rc)
1073     {
1074       FREE(&t);
1075       FREE(&s);
1076       continue;
1077     }
1078     size_t slen = mutt_str_len(s);
1079 
1080     if (!tocode || (n < bestn))
1081     {
1082       bestn = n;
1083       FREE(&tocode);
1084       tocode = t;
1085       if (d)
1086       {
1087         FREE(&e);
1088         e = s;
1089       }
1090       else
1091         FREE(&s);
1092       elen = slen;
1093     }
1094     else
1095     {
1096       FREE(&t);
1097       FREE(&s);
1098     }
1099   }
1100   if (tocode)
1101   {
1102     if (d)
1103       *d = e;
1104     if (dlen)
1105       *dlen = elen;
1106 
1107     char canonical_buf[1024];
1108     mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1109     mutt_str_replace(&tocode, canonical_buf);
1110   }
1111   return tocode;
1112 }
1113