1 /*
2  * Copyright (C) 1999-2002,2007 Thomas Roessler <roessler@does-not-exist.org>
3  *
4  *     This program is free software; you can redistribute it
5  *     and/or modify it under the terms of the GNU General Public
6  *     License as published by the Free Software Foundation; either
7  *     version 2 of the License, or (at your option) any later
8  *     version.
9  *
10  *     This program is distributed in the hope that it will be
11  *     useful, but WITHOUT ANY WARRANTY; without even the implied
12  *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13  *     PURPOSE.  See the GNU General Public License for more
14  *     details.
15  *
16  *     You should have received a copy of the GNU General Public
17  *     License along with this program; if not, write to the Free
18  *     Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  *     Boston, MA  02110-1301, USA.
20  */
21 
22 #if HAVE_CONFIG_H
23 # include "config.h"
24 #endif
25 
26 #include <string.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 
30 #include <ctype.h>
31 
32 #include <sys/types.h>
33 #include <dirent.h>
34 #include <unistd.h>
35 #include <errno.h>
36 
37 #include "mutt.h"
38 #include "charset.h"
39 
40 #ifndef EILSEQ
41 # define EILSEQ EINVAL
42 #endif
43 
44 /*
45  * The following list has been created manually from the data under:
46  * http://www.isi.edu/in-notes/iana/assignments/character-sets
47  * Last update: 2000-09-07
48  *
49  * Note that it includes only the subset of character sets for which
50  * a preferred MIME name is given.
51  */
52 
/*
 * Alias table: `key' is a charset name as it may be encountered,
 * `pref' is the preferred MIME name to use in its place.  The table is
 * searched linearly and case-insensitively and is terminated by a
 * { NULL, NULL } entry.  It is only ever read, hence const.
 */
static const struct
{
  const char *key;   /* alias to look up */
  const char *pref;  /* preferred MIME name for that alias */
}
PreferredMIMENames[] =
{
  { "ansi_x3.4-1968",       "us-ascii"      },
  { "iso-ir-6",             "us-ascii"      },
  { "iso_646.irv:1991",     "us-ascii"      },
  { "ascii",                "us-ascii"      },
  { "iso646-us",            "us-ascii"      },
  { "us",                   "us-ascii"      },
  { "ibm367",               "us-ascii"      },
  { "cp367",                "us-ascii"      },
  { "csASCII",              "us-ascii"      },

  { "csISO2022KR",          "iso-2022-kr"   },
  { "csEUCKR",              "euc-kr"        },
  { "csISO2022JP",          "iso-2022-jp"   },
  { "csISO2022JP2",         "iso-2022-jp-2" },

  { "ISO_8859-1:1987",      "iso-8859-1"    },
  { "iso-ir-100",           "iso-8859-1"    },
  { "iso_8859-1",           "iso-8859-1"    },
  { "latin1",               "iso-8859-1"    },
  { "l1",                   "iso-8859-1"    },
  { "IBM819",               "iso-8859-1"    },
  { "CP819",                "iso-8859-1"    },
  { "csISOLatin1",          "iso-8859-1"    },

  { "ISO_8859-2:1987",      "iso-8859-2"    },
  { "iso-ir-101",           "iso-8859-2"    },
  { "iso_8859-2",           "iso-8859-2"    },
  { "latin2",               "iso-8859-2"    },
  { "l2",                   "iso-8859-2"    },
  { "csISOLatin2",          "iso-8859-2"    },

  { "ISO_8859-3:1988",      "iso-8859-3"    },
  { "iso-ir-109",           "iso-8859-3"    },
  { "ISO_8859-3",           "iso-8859-3"    },
  { "latin3",               "iso-8859-3"    },
  { "l3",                   "iso-8859-3"    },
  { "csISOLatin3",          "iso-8859-3"    },

  { "ISO_8859-4:1988",      "iso-8859-4"    },
  { "iso-ir-110",           "iso-8859-4"    },
  { "ISO_8859-4",           "iso-8859-4"    },
  { "latin4",               "iso-8859-4"    },
  { "l4",                   "iso-8859-4"    },
  { "csISOLatin4",          "iso-8859-4"    },

  { "ISO_8859-6:1987",      "iso-8859-6"    },
  { "iso-ir-127",           "iso-8859-6"    },
  { "iso_8859-6",           "iso-8859-6"    },
  { "ECMA-114",             "iso-8859-6"    },
  { "ASMO-708",             "iso-8859-6"    },
  { "arabic",               "iso-8859-6"    },
  { "csISOLatinArabic",     "iso-8859-6"    },

  { "ISO_8859-7:1987",      "iso-8859-7"    },
  { "iso-ir-126",           "iso-8859-7"    },
  { "ISO_8859-7",           "iso-8859-7"    },
  { "ELOT_928",             "iso-8859-7"    },
  { "ECMA-118",             "iso-8859-7"    },
  { "greek",                "iso-8859-7"    },
  { "greek8",               "iso-8859-7"    },
  { "csISOLatinGreek",      "iso-8859-7"    },

  { "ISO_8859-8:1988",      "iso-8859-8"    },
  { "iso-ir-138",           "iso-8859-8"    },
  { "ISO_8859-8",           "iso-8859-8"    },
  { "hebrew",               "iso-8859-8"    },
  { "csISOLatinHebrew",     "iso-8859-8"    },

  { "ISO_8859-5:1988",      "iso-8859-5"    },
  { "iso-ir-144",           "iso-8859-5"    },
  { "ISO_8859-5",           "iso-8859-5"    },
  { "cyrillic",             "iso-8859-5"    },
  { "csISOLatinCyrillic",   "iso-8859-5"    },

  { "ISO_8859-9:1989",      "iso-8859-9"    },
  { "iso-ir-148",           "iso-8859-9"    },
  { "ISO_8859-9",           "iso-8859-9"    },
  { "latin5",               "iso-8859-9"    }, /* this is not a bug */
  { "l5",                   "iso-8859-9"    },
  { "csISOLatin5",          "iso-8859-9"    },

  { "ISO_8859-10:1992",     "iso-8859-10"   },
  { "iso-ir-157",           "iso-8859-10"   },
  { "latin6",               "iso-8859-10"   }, /* this is not a bug */
  { "l6",                   "iso-8859-10"   },
  { "csISOLatin6",          "iso-8859-10"   },

  { "csKOI8r",              "koi8-r"        },

  { "MS_Kanji",             "Shift_JIS"     }, /* Note the underscore! */
  { "csShiftJis",           "Shift_JIS"     },

  { "Extended_UNIX_Code_Packed_Format_for_Japanese",
                            "euc-jp"        },
  { "csEUCPkdFmtJapanese",
                            "euc-jp"        },

  { "csGB2312",             "gb2312"        },
  { "csbig5",               "big5"          },

  /*
   * End of official brain damage.  What follows has been taken
   * from glibc's localedata files.
   */

  { "iso_8859-13",          "iso-8859-13"   },
  { "iso-ir-179",           "iso-8859-13"   },
  { "latin7",               "iso-8859-13"   }, /* this is not a bug */
  { "l7",                   "iso-8859-13"   },

  { "iso_8859-14",          "iso-8859-14"   },
  { "latin8",               "iso-8859-14"   }, /* this is not a bug */
  { "l8",                   "iso-8859-14"   },

  { "iso_8859-15",          "iso-8859-15"   },
  { "latin9",               "iso-8859-15"   }, /* this is not a bug */

  /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
  { "latin0",               "iso-8859-15"   }, /* this is not a bug */

  { "iso_8859-16",          "iso-8859-16"   },
  { "latin10",              "iso-8859-16"   }, /* this is not a bug */

  /*
   * David Champion <dgc@uchicago.edu> has observed this with
   * nl_langinfo under SunOS 5.8.
   */

  { "646",                  "us-ascii"      },

  /*
   * http://www.sun.com/software/white-papers/wp-unicode/
   */

  { "eucJP",                "euc-jp"        },
  { "PCK",                  "Shift_JIS"     },
  { "ko_KR-euc",            "euc-kr"        },
  { "zh_TW-big5",           "big5"          },

  /* seems to be common on some systems */

  { "sjis",                 "Shift_JIS"     },
  { "euc-jp-ms",            "eucJP-ms"      },


  /*
   * If you happen to encounter system-specific brain-damage with
   * respect to character set naming, please add it above this
   * comment, and submit a patch to <mutt-dev@mutt.org>.
   */

  /* End of aliases.  Please keep this line last. */

  { NULL,                   NULL            }
};
215 
216 #ifdef HAVE_LANGINFO_CODESET
217 # include <langinfo.h>
218 
219 
mutt_set_langinfo_charset(void)220 void mutt_set_langinfo_charset (void)
221 {
222   char buff[LONG_STRING];
223   char buff2[LONG_STRING];
224 
225   strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
226   mutt_canonical_charset (buff2, sizeof (buff2), buff);
227 
228   /* finally, set $charset */
229   if (!(Charset = safe_strdup (buff2)))
230     Charset = safe_strdup ("iso-8859-1");
231 }
232 
233 #else
234 
mutt_set_langinfo_charset(void)235 void mutt_set_langinfo_charset (void)
236 {
237   Charset = safe_strdup ("iso-8859-1");
238 }
239 
240 #endif
241 
242 /* this first ties off any charset extension such as //TRANSLIT,
243    canonicalizes the charset and re-adds the extension */
mutt_canonical_charset(char * dest,size_t dlen,const char * name)244 void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
245 {
246   size_t i;
247   char *p, *ext;
248   char in[LONG_STRING], scratch[LONG_STRING];
249 
250   strfcpy (in, name, sizeof (in));
251   if ((ext = strchr (in, '/')))
252     *ext++ = 0;
253 
254   if (!ascii_strcasecmp (in, "utf-8") || !ascii_strcasecmp (in, "utf8"))
255   {
256     strfcpy (dest, "utf-8", dlen);
257     goto out;
258   }
259 
260   /* catch some common iso-8859-something misspellings */
261   if (!ascii_strncasecmp (in, "8859", 4) && in[4] != '-')
262     snprintf (scratch, sizeof (scratch), "iso-8859-%s", in +4);
263   else if (!ascii_strncasecmp (in, "8859-", 5))
264     snprintf (scratch, sizeof (scratch), "iso-8859-%s", in + 5);
265   else if (!ascii_strncasecmp (in, "iso8859", 7) && in[7] != '-')
266     snprintf (scratch, sizeof (scratch), "iso_8859-%s", in + 7);
267   else if (!ascii_strncasecmp (in, "iso8859-", 8))
268     snprintf (scratch, sizeof (scratch), "iso_8859-%s", in + 8);
269   else
270     strfcpy (scratch, in, sizeof (scratch));
271 
272   for (i = 0; PreferredMIMENames[i].key; i++)
273     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
274 	!mutt_strcasecmp (scratch, PreferredMIMENames[i].key))
275     {
276       strfcpy (dest, PreferredMIMENames[i].pref, dlen);
277       goto out;
278     }
279 
280   strfcpy (dest, scratch, dlen);
281 
282   /* for cosmetics' sake, transform to lowercase. */
283   for (p = dest; *p; p++)
284     *p = ascii_tolower (*p);
285 
286 out:
287   if (ext && *ext)
288   {
289     safe_strcat (dest, dlen, "/");
290     safe_strcat (dest, dlen, ext);
291   }
292 }
293 
mutt_chscmp(const char * s,const char * chs)294 int mutt_chscmp (const char *s, const char *chs)
295 {
296   char buffer[STRING];
297   int a, b;
298 
299   if (!s) return 0;
300 
301   /* charsets may have extensions mutt_canonical_charset()
302      leaves intact; we expect `chs' to originate from mutt
303      code, not user input (i.e. `chs' does _not_ have any
304      extension)
305      we simply check if the shorter string is a prefix for
306      the longer */
307   mutt_canonical_charset (buffer, sizeof (buffer), s);
308   a = mutt_strlen (buffer);
309   b = mutt_strlen (chs);
310   return !ascii_strncasecmp (a > b ? buffer : chs,
311 			     a > b ? chs : buffer, MIN(a,b));
312 }
313 
mutt_get_default_charset()314 char *mutt_get_default_charset ()
315 {
316   static char fcharset[SHORT_STRING];
317   const char *c = AssumedCharset;
318   const char *c1;
319 
320   if (c && *c) {
321     c1 = strchr (c, ':');
322     strfcpy (fcharset, c, c1 ? (c1 - c + 1) : sizeof (fcharset));
323     return fcharset;
324   }
325   return strcpy (fcharset, "us-ascii"); /* __STRCPY_CHECKED__ */
326 }
327 
328 #ifndef HAVE_ICONV
329 
/* Stand-in used when HAVE_ICONV is not defined: opening always fails. */
iconv_t iconv_open (const char *tocode, const char *fromcode)
{
  (void) tocode;
  (void) fromcode;

  return (iconv_t) -1;
}
334 
/*
 * Stand-in used when HAVE_ICONV is not defined: does nothing with its
 * arguments and reports 0.
 */
size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
	      char **outbuf, size_t *outbytesleft)
{
  (void) cd;
  (void) inbuf;
  (void) inbytesleft;
  (void) outbuf;
  (void) outbytesleft;

  return 0;
}
340 
/* Stand-in used when HAVE_ICONV is not defined: nothing to release. */
int iconv_close (iconv_t cd)
{
  (void) cd;

  return 0;
}
345 
346 #endif /* !HAVE_ICONV */
347 
348 
349 /*
350  * Like iconv_open, but canonicalises the charsets, applies
351  * charset-hooks, recanonicalises, and finally applies iconv-hooks.
352  * Parameter flags=0 skips charset-hooks, while M_ICONV_HOOK_FROM
353  * applies them to fromcode. Callers should use flags=0 when fromcode
354  * can safely be considered true, either some constant, or some value
355  * provided by the user; M_ICONV_HOOK_FROM should be used only when
356  * fromcode is unsure, taken from a possibly wrong incoming MIME label,
357  * or such. Misusing M_ICONV_HOOK_FROM leads to unwanted interactions
358  * in some setups. Note: By design charset-hooks should never be, and
359  * are never, applied to tocode. Highlight note: The top-well-named
360  * M_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.
361  */
362 
mutt_iconv_open(const char * tocode,const char * fromcode,int flags)363 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
364 {
365   char tocode1[SHORT_STRING];
366   char fromcode1[SHORT_STRING];
367   char *tocode2, *fromcode2;
368   char *tmp;
369 
370   iconv_t cd;
371 
372   /* transform to MIME preferred charset names */
373   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
374   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
375 
376   /* maybe apply charset-hooks and recanonicalise fromcode,
377    * but only when caller asked us to sanitize a potentialy wrong
378    * charset name incoming from the wild exterior. */
379   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
380     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
381 
382   /* always apply iconv-hooks to suit system's iconv tastes */
383   tocode2 = mutt_iconv_hook (tocode1);
384   tocode2 = (tocode2) ? tocode2 : tocode1;
385   fromcode2 = mutt_iconv_hook (fromcode1);
386   fromcode2 = (fromcode2) ? fromcode2 : fromcode1;
387 
388   /* call system iconv with names it appreciates */
389   if ((cd = iconv_open (tocode2, fromcode2)) != (iconv_t) -1)
390     return cd;
391 
392   return (iconv_t) -1;
393 }
394 
395 
396 /*
397  * Like iconv, but keeps going even when the input is invalid
398  * If you're supplying inrepls, the source charset should be stateless;
399  * if you're supplying an outrepl, the target charset should be.
400  */
401 
mutt_iconv(iconv_t cd,ICONV_CONST char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,ICONV_CONST char ** inrepls,const char * outrepl)402 size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
403 		   char **outbuf, size_t *outbytesleft,
404 		   ICONV_CONST char **inrepls, const char *outrepl)
405 {
406   size_t ret = 0, ret1;
407   ICONV_CONST char *ib = *inbuf;
408   size_t ibl = *inbytesleft;
409   char *ob = *outbuf;
410   size_t obl = *outbytesleft;
411 
412   for (;;)
413   {
414     ret1 = iconv (cd, &ib, &ibl, &ob, &obl);
415     if (ret1 != (size_t)-1)
416       ret += ret1;
417     if (ibl && obl && errno == EILSEQ)
418     {
419       if (inrepls)
420       {
421 	/* Try replacing the input */
422 	ICONV_CONST char **t;
423 	for (t = inrepls; *t; t++)
424 	{
425 	  ICONV_CONST char *ib1 = *t;
426 	  size_t ibl1 = strlen (*t);
427 	  char *ob1 = ob;
428 	  size_t obl1 = obl;
429 	  iconv (cd, &ib1, &ibl1, &ob1, &obl1);
430 	  if (!ibl1)
431 	  {
432 	    ++ib, --ibl;
433 	    ob = ob1, obl = obl1;
434 	    ++ret;
435 	    break;
436 	  }
437 	}
438 	if (*t)
439 	  continue;
440       }
441       /* Replace the output */
442       if (!outrepl)
443 	outrepl = "?";
444       iconv (cd, 0, 0, &ob, &obl);
445       if (obl)
446       {
447 	int n = strlen (outrepl);
448 	if (n > obl)
449 	{
450 	  outrepl = "?";
451 	  n = 1;
452 	}
453 	memcpy (ob, outrepl, n);
454 	++ib, --ibl;
455 	ob += n, obl -= n;
456 	++ret;
457 	iconv (cd, 0, 0, 0, 0); /* for good measure */
458 	continue;
459       }
460     }
461     *inbuf = ib, *inbytesleft = ibl;
462     *outbuf = ob, *outbytesleft = obl;
463     return ret;
464   }
465 }
466 
467 
468 /*
469  * Convert a string
470  * Used in rfc2047.c, rfc2231.c, crypt-gpgme.c, mutt_idna.c, and more.
471  * Parameter flags is given as-is to mutt_iconv_open(). See there
472  * for its meaning and usage policy.
473  */
474 
mutt_convert_string(char ** ps,const char * from,const char * to,int flags)475 int mutt_convert_string (char **ps, const char *from, const char *to, int flags)
476 {
477   iconv_t cd;
478   ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
479   char *s = *ps;
480 
481   if (!s || !*s)
482     return 0;
483 
484   if (option (OPTSANITIZEJACHARS) && !ascii_strncasecmp (from, "iso-2022-jp", 11))
485     mutt_sanitize_ja_chars (s, mutt_strlen(s), 0);
486 
487   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t)-1)
488   {
489     int len;
490     ICONV_CONST char *ib;
491     char *buf, *ob;
492     size_t ibl, obl;
493     ICONV_CONST char **inrepls = 0;
494     char *outrepl = 0;
495 
496     if (mutt_is_utf8 (to))
497       outrepl = "\357\277\275";
498     else if (mutt_is_utf8 (from))
499       inrepls = repls;
500     else
501       outrepl = "?";
502 
503     len = strlen (s);
504     ib = s, ibl = len + 1;
505     obl = MB_LEN_MAX * ibl;
506     ob = buf = safe_malloc (obl + 1);
507 
508     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
509     iconv_close (cd);
510 
511     *ob = '\0';
512 
513     FREE (ps);		/* __FREE_CHECKED__ */
514     *ps = buf;
515 
516     mutt_str_adjust (ps);
517     return 0;
518   }
519   else
520     return -1;
521 }
522 
523 
524 /*
525  * FGETCONV stuff for converting a file while reading it
526  * Used in sendlib.c for converting from mutt's Charset
527  */
528 
/*
 * State of a converting reader, allocated by fgetconv_open() when a
 * conversion descriptor could be opened.  The first two members must
 * stay in step with struct fgetconv_not, which is accessed through a
 * pointer to this type.
 */
struct fgetconv_s
{
  FILE *file;		/* stream the raw bytes are read from */
  iconv_t cd;		/* conversion descriptor; (iconv_t)-1 means "no conversion" */
  char bufi[512];	/* raw input read from file */
  char bufo[512];	/* converted output waiting to be handed out */
  char *p;		/* next byte of bufo to return; 0 once fgetconv() has given up */
  char *ob;		/* end of the converted data in bufo */
  char *ib;		/* start of the not yet converted data in bufi */
  size_t ibl;		/* number of not yet converted bytes at ib */
  ICONV_CONST char **inrepls;	/* replacement list handed to mutt_iconv() */
};
541 
/*
 * Reduced state allocated by fgetconv_open() when no conversion
 * descriptor is available.  fgetconv() and fgetconv_close() reach it
 * through a struct fgetconv_s pointer and touch only these two
 * members, so they must match the start of struct fgetconv_s.
 */
struct fgetconv_not
{
  FILE *file;		/* stream that is read directly with fgetc() */
  iconv_t cd;		/* set to (iconv_t)-1 by fgetconv_open() */
};
547 
548 /*
549  * Parameter flags is given as-is to mutt_iconv_open(). See there
550  * for its meaning and usage policy.
551  */
fgetconv_open(FILE * file,const char * from,const char * to,int flags)552 FGETCONV *fgetconv_open (FILE *file, const char *from, const char *to, int flags)
553 {
554   struct fgetconv_s *fc;
555   iconv_t cd = (iconv_t)-1;
556   static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
557 
558   if (from && to)
559     cd = mutt_iconv_open (to, from, flags);
560 
561   if (cd != (iconv_t)-1)
562   {
563     fc = safe_malloc (sizeof (struct fgetconv_s));
564     fc->p = fc->ob = fc->bufo;
565     fc->ib = fc->bufi;
566     fc->ibl = 0;
567     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
568   }
569   else
570     fc = safe_malloc (sizeof (struct fgetconv_not));
571   fc->file = file;
572   fc->cd = cd;
573   return (FGETCONV *)fc;
574 }
575 
/*
 * fgets()-like line reader on top of fgetconv().
 *
 * Stores at most l-1 bytes from `_fc' in `buf', stopping after a
 * newline or at EOF, and always NUL-terminates what was stored.
 * Returns `buf' if at least one byte was read, NULL otherwise.
 * A NULL `buf' or a zero `l' leaves everything untouched and yields
 * NULL: there is no room even for the terminating NUL.
 */
char *fgetconvs (char *buf, size_t l, FGETCONV *_fc)
{
  size_t r = 0;
  int c;

  if (!buf || !l)
    return NULL;

  while (r + 1 < l)
  {
    c = fgetconv (_fc);
    if (c == EOF)
      break;
    buf[r++] = (char) c;
    if (c == '\n')
      break;
  }
  buf[r] = '\0';

  return r ? buf : NULL;
}
596 
/*
 * getc()-like reader: return the next converted byte from `_fc' as an
 * unsigned char value, or EOF.
 *
 * A NULL handle yields EOF.  A handle without conversion descriptor
 * reads straight from the file with fgetc().  Otherwise bytes are
 * served from the output buffer, which is refilled by converting the
 * input buffer, which in turn is refilled from the file.  Once no
 * progress is possible the handle is marked (p = 0) and every further
 * call returns EOF.
 */
int fgetconv (FGETCONV *_fc)
{
  struct fgetconv_s *fc = (struct fgetconv_s *)_fc;

  if (!fc)
    return EOF;
  if (fc->cd == (iconv_t)-1)
    return fgetc (fc->file);
  if (!fc->p)
    return EOF;
  if (fc->p < fc->ob)
    return (unsigned char)*(fc->p)++;

  /* Try to convert some more */
  fc->p = fc->ob = fc->bufo;
  if (fc->ibl)
  {
    size_t obl = sizeof (fc->bufo);
    iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
    if (fc->p < fc->ob)
      return (unsigned char)*(fc->p)++;
  }

  /* If we trusted iconv a bit more, we would at this point
   * ask why it had stopped converting ... */

  /* Try to read some more.  Give up if the input buffer is completely
   * full of data that would not convert, or if the pending data does
   * not reach the end of the input buffer. */
  if (fc->ibl == sizeof (fc->bufi) ||
      (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi)))
  {
    fc->p = 0;
    return EOF;
  }

  /* Move the unconverted tail to the front of the buffer.  Source and
   * destination lie in the same array and overlap whenever the tail is
   * longer than the part already consumed, so memmove() is required. */
  if (fc->ibl)
    memmove (fc->bufi, fc->ib, fc->ibl);
  fc->ib = fc->bufi;
  fc->ibl += fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);

  /* Try harder this time to convert some */
  if (fc->ibl)
  {
    size_t obl = sizeof (fc->bufo);
    mutt_iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
		fc->inrepls, 0);
    if (fc->p < fc->ob)
      return (unsigned char)*(fc->p)++;
  }

  /* Either the file has finished or one of the buffers is too small */
  fc->p = 0;
  return EOF;
}
649 
/*
 * Release a handle obtained from fgetconv_open(): close its conversion
 * descriptor, if it has one, and free the handle through FREE().
 * The underlying FILE is not closed here.
 *
 * A NULL `_fc' or a NULL *_fc is ignored, in line with fgetconv()
 * accepting a NULL handle.
 */
void fgetconv_close (FGETCONV **_fc)
{
  struct fgetconv_s *fc;

  if (!_fc || !*_fc)
    return;

  fc = (struct fgetconv_s *) *_fc;
  if (fc->cd != (iconv_t)-1)
    iconv_close (fc->cd);
  FREE (_fc);		/* __FREE_CHECKED__ */
}
658 
mutt_check_charset(const char * s,int strict)659 int mutt_check_charset (const char *s, int strict)
660 {
661   int i;
662   iconv_t cd;
663 
664   if (mutt_is_utf8 (s))
665     return 0;
666 
667   if (!strict)
668     for (i = 0; PreferredMIMENames[i].key; i++)
669     {
670       if (ascii_strcasecmp (PreferredMIMENames[i].key, s) == 0 ||
671 	  ascii_strcasecmp (PreferredMIMENames[i].pref, s) == 0)
672 	return 0;
673     }
674 
675   if ((cd = mutt_iconv_open (s, s, 0)) != (iconv_t)(-1))
676   {
677     iconv_close (cd);
678     return 0;
679   }
680 
681   return -1;
682 }
683 
684 /*
685  * mutt_sanitize_ja_chars()
686  *   Adapted by TAKIZAWA Takashi <taki@cyber.email.ne.jp>
687  *
688  * - It replaces undefined KANJI characters to GETA mark.
689  * - It replaces character of 'JIS X 0201 kana' to '?'.
690  * - If $charset is EUC-JP, it replaces third character 'J' of
691  *   escape sequence switching to 'JIS X 0201 latin' to 'B' indicating
692  *   'US-ASCII'.
693  * - If $charset is Shift_JIS, it replaces third character 'B' of
694  *   escape sequence switching to 'US-ASCII' to 'J' indicating
695  *   'JIS X 0201 latin'.
696  */
697 
698 #define ASCII 0
699 #define JISX0201LATIN 1
700 #define JISX0201KANA 2
701 #define JISX0208 3
702 #define OTHER_CS 4
703 
mutt_sanitize_ja_chars(char * s,size_t len,int keep_state)704 void mutt_sanitize_ja_chars(char *s, size_t len, int keep_state)
705 {
706   static int cs = ASCII;
707   static int kanji_cont = 0;
708   static int illegal_kanji = 0;
709   static int es = 0;
710   static char pes = '\0';
711   static char ascii_3rd_char = 'B';
712   static char jisx0201_3rd_char = 'J';
713 
714   char *p = s;
715   char *p1 = NULL;
716   unsigned char c;
717 
718   if (!keep_state || *p == 0x1b) /* consideration about mbstate's buffer */
719   {
720     if (!ascii_strcasecmp (Charset, "euc-jp"))
721       jisx0201_3rd_char = 'B';
722     else if (!ascii_strcasecmp (Charset, "shift_jis"))
723       ascii_3rd_char = 'J';
724     cs = ASCII;
725     kanji_cont = 0;
726     illegal_kanji = 0;
727     es = 0;
728     pes = '\0';
729   }
730 
731   for (;p - s < len;p++)
732   {
733     if (es == 0)
734     {
735       if (*p == 0x1b)
736 	es++;
737       else
738       {
739 	switch (cs)
740 	{
741 	case ASCII:
742 	case JISX0201LATIN:
743 	  break;
744 	case JISX0201KANA:
745 	  *p = '?';
746 	  break;
747 	case JISX0208:
748 	  /* replace ku-ten code from 9 to 15 and 85 or more to "GETA MARK" */
749 	  c = (unsigned char)*p;
750 	  if (! kanji_cont)
751 	  {
752 	    if ((size_t)(p - s + 1) == len)
753 	      return; /* the last character is a primary byte of KANJI */
754 	    if (c <= 0x20 || (c >= 0x29 && c <= 0x2f)
755 		|| (c >= 0x75 && c <= 0xa0))
756 	      illegal_kanji = 1;
757 	    kanji_cont = 1;
758 	    p1 = p;
759 	  }
760 	  else
761 	  {
762 	    if (c <= 0x20 || c >= 0x7f)
763 	      illegal_kanji = 1;
764 	    if (illegal_kanji && p1)
765 	      *p1 = 0x22, *p = 0x2e;
766 	    kanji_cont = 0;
767 	    illegal_kanji = 0;
768 	  }
769 	  break;
770 	}
771       }
772     }
773     else if (es == 1)
774     {
775       if (*p == '$' || (*p >= '(' && *p <= '/' && *p != ','))
776       {
777 	es++;
778 	pes = *p;
779       }
780       else
781       {
782 	es = 0;
783 	return; /* broken */
784       }
785     }
786     else if (es == 2)
787     {
788       if (pes == '(')
789       {
790 	switch (*p)
791 	{
792 	case 'B':
793 	  cs = ASCII, *p = ascii_3rd_char;
794 	  break;
795 	case 'J':
796 	  cs = JISX0201LATIN, *p = jisx0201_3rd_char;
797 	  break;
798 	case 'I':
799 	  /* ready to replace character to '?' */
800 	  cs = JISX0201KANA, *p = ascii_3rd_char;
801 	  break;
802 	default:
803 	  cs = OTHER_CS;
804 	}
805 	es = 0;
806       }
807       else if (pes == '$')
808       {
809 	switch (*p)
810 	{
811 	case '@': /* JIS X 0208-1978 */
812 	case 'B': /* JIS X 0208-1983 */
813 	  cs = JISX0208;
814 	  es = 0;
815 	  break;
816 	case 'A':
817 	  cs = OTHER_CS; /* GB 2312 */
818 	  es = 0;
819 	  break;
820 	case '(':
821 	case ')':
822 	case '*':
823 	case '+':
824 	case '-':
825 	case '.':
826 	case '/':
827 	  es++;
828 	  break;
829 	default:
830 	  es = 0;
831 	  return; /* broken */
832 	}
833       }
834       else
835       {
836 	cs = OTHER_CS;
837 	es = 0;
838       }
839     }
840     else /* es == 3 */
841     {
842       cs = OTHER_CS;
843       es = 0;
844     }
845   }
846 }
847 
/*
 * Copy up to `size' bytes from `in' to `out', passing every chunk
 * through mutt_sanitize_ja_chars() on the way.
 *
 * Copying stops early when nothing more can be read from `in'.
 * Returns 0 in that case and after a complete copy, -1 if a chunk
 * could not be written in full.
 */
int mutt_copy_bytes_sanitize_ja (FILE *in, FILE *out, size_t size)
{
  char block[2048];
  size_t want, got;

  /* start with a fresh sanitizer state */
  mutt_sanitize_ja_chars (NULL, 0, 0);

  for (; size > 0; size -= got)
  {
    want = size;
    if (want > sizeof (block))
      want = sizeof (block);

    got = fread (block, 1, want, in);
    if (got < 1)
      break;

    /* keep the state: an escape sequence may span two chunks */
    mutt_sanitize_ja_chars (block, got, 1);

    if (fwrite (block, 1, got, out) != got)
      return (-1);
  }

  return 0;
}
867 
868