1 /*
2  * This file is part of mpv.
3  *
4  * Based on code taken from libass (ISC license), which was originally part
5  * of MPlayer (GPL).
6  * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
7  *
8  * mpv is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * mpv is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <stdlib.h>
23 #include <errno.h>
24 #include <strings.h>
25 #include <assert.h>
26 
27 #include "config.h"
28 
29 #include "common/msg.h"
30 
31 #if HAVE_UCHARDET
32 #include <uchardet.h>
33 #endif
34 
35 #if HAVE_ICONV
36 #include <iconv.h>
37 #endif
38 
39 #include "charset_conv.h"
40 
// Returns true if user_cp is exactly "utf8" or "utf-8" (compared without
// regard to case). A NULL user_cp yields false.
bool mp_charset_is_utf8(const char *user_cp)
{
    if (!user_cp)
        return false;
    if (strcasecmp(user_cp, "utf8") == 0)
        return true;
    return strcasecmp(user_cp, "utf-8") == 0;
}
46 
mp_charset_is_utf16(const char * user_cp)47 bool mp_charset_is_utf16(const char *user_cp)
48 {
49     bstr s = bstr0(user_cp);
50     return bstr_case_startswith(s, bstr0("utf16")) ||
51            bstr_case_startswith(s, bstr0("utf-16"));
52 }
53 
54 static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
55 static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};
56 
ms_bom_guess(bstr buf)57 static const char *ms_bom_guess(bstr buf)
58 {
59     for (int n = 0; n < 3; n++) {
60         if (bstr_startswith0(buf, utf_bom[n]))
61             return utf_enc[n];
62     }
63     return NULL;
64 }
65 
#if HAVE_UCHARDET
// Feeds buf to libuchardet and returns the detected charset name, or NULL if
// detection fails, the detected name is empty, or iconv cannot convert from
// that charset to UTF-8.
//  talloc_ctx: parent for the returned string (a copy made with talloc_strdup)
//  log: destination for the verbose/warning messages
//  buf: data to analyze
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t det = uchardet_new();
    if (!det)
        return NULL;
    if (uchardet_handle_data(det, buf.start, buf.len) != 0) {
        uchardet_delete(det);
        return NULL;
    }
    uchardet_data_end(det);
    // Keep our own copy of the name, so the result stays usable after the
    // detector is deleted below.
    char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det));
    if (res && !res[0]) {
        // Empty name: nothing was detected. Release the copy instead of
        // leaving it allocated under talloc_ctx.
        talloc_free(res);
        res = NULL;
    }
    if (res) {
        mp_verbose(log, "libuchardet detected charset as %s\n", res);
        // Probe whether iconv knows this charset; the descriptor itself is
        // not needed afterwards.
        iconv_t icdsc = iconv_open("UTF-8", res);
        if (icdsc == (iconv_t)(-1)) {
            mp_warn(log, "Charset '%s' not supported by iconv.\n", res);
            // The name is unusable; free the copy rather than dropping it.
            talloc_free(res);
            res = NULL;
        } else {
            iconv_close(icdsc);
        }
    }
    uchardet_delete(det);
    return res;
}
#endif
94 
95 // Runs charset auto-detection on the input buffer, and returns the result.
96 // If auto-detection fails, NULL is returned.
97 // If user_cp doesn't refer to any known auto-detection (for example because
98 // it's a real iconv codepage), user_cp is returned without even looking at
99 // the buf data.
100 // The return value may (but doesn't have to) be allocated under talloc_ctx.
mp_charset_guess(void * talloc_ctx,struct mp_log * log,bstr buf,const char * user_cp,int flags)101 const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log,  bstr buf,
102                              const char *user_cp, int flags)
103 {
104     if (strcasecmp(user_cp, "enca") == 0 || strcasecmp(user_cp, "guess") == 0 ||
105         strcasecmp(user_cp, "uchardet") == 0 || strchr(user_cp, ':'))
106     {
107         mp_err(log, "This syntax for the --sub-codepage option was deprecated "
108                     "and has been removed.\n");
109         if (strncasecmp(user_cp, "utf8:", 5) == 0) {
110             user_cp = user_cp + 5;
111         } else {
112             user_cp = "";
113         }
114     }
115 
116     if (user_cp[0] == '+') {
117         mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
118         return user_cp + 1;
119     }
120 
121     const char *bom_cp = ms_bom_guess(buf);
122     if (bom_cp) {
123         mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp);
124         return bom_cp;
125     }
126 
127     int r = bstr_validate_utf8(buf);
128     if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) {
129         mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n");
130         return "utf-8";
131     }
132 
133     const char *res = NULL;
134     if (strcasecmp(user_cp, "auto") == 0) {
135 #if HAVE_UCHARDET
136         res = mp_uchardet(talloc_ctx, log, buf);
137 #endif
138         if (!res) {
139             mp_verbose(log, "Charset auto-detection failed.\n");
140             res = "UTF-8-BROKEN";
141         }
142     } else {
143         res = user_cp;
144     }
145 
146     mp_verbose(log, "Using charset '%s'.\n", res);
147     return res;
148 }
149 
150 // Use iconv to convert buf to UTF-8.
151 // Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
152 // obviously no conversion required (e.g. if cp is "UTF-8").
153 // Returns a newly allocated buffer if conversion is done and succeeds. The
154 // buffer will be terminated with 0 for convenience (the terminating 0 is not
155 // included in the returned length).
156 // Free the returned buffer with talloc_free().
157 //  buf: input data
158 //  cp: iconv codepage (or NULL)
159 //  flags: combination of MP_ICONV_* flags
160 //  returns: buf (no conversion), .start==NULL (error), or allocated buffer
mp_iconv_to_utf8(struct mp_log * log,bstr buf,const char * cp,int flags)161 bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
162 {
163 #if HAVE_ICONV
164     if (!cp || !cp[0] || mp_charset_is_utf8(cp))
165         return buf;
166 
167     if (strcasecmp(cp, "ASCII") == 0)
168         return buf;
169 
170     if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
171         return bstr_sanitize_utf8_latin1(NULL, buf);
172 
173     // Force CP949 over EUC-KR since iconv distinguishes them and
174     // EUC-KR causes error on CP949 encoded data
175     if (strcasecmp(cp, "EUC-KR") == 0)
176       cp = "CP949";
177 
178     iconv_t icdsc;
179     if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) {
180         if (flags & MP_ICONV_VERBOSE)
181             mp_err(log, "Error opening iconv with codepage '%s'\n", cp);
182         goto failure;
183     }
184 
185     size_t size = buf.len;
186     size_t osize = size;
187     size_t ileft = size;
188     size_t oleft = size - 1;
189 
190     char *outbuf = talloc_size(NULL, osize);
191     char *ip = buf.start;
192     char *op = outbuf;
193 
194     while (1) {
195         int clear = 0;
196         size_t rc;
197         if (ileft)
198             rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
199         else {
200             clear = 1; // clear the conversion state and leave
201             rc = iconv(icdsc, NULL, NULL, &op, &oleft);
202         }
203         if (rc == (size_t) (-1)) {
204             if (errno == E2BIG) {
205                 size_t offset = op - outbuf;
206                 outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
207                 op = outbuf + offset;
208                 osize += size;
209                 oleft += size;
210             } else {
211                 if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
212                     // This is intended for cases where the input buffer is cut
213                     // at a random byte position. If this happens in the middle
214                     // of the buffer, it should still be an error. We say it's
215                     // fine if the error is within 10 bytes of the end.
216                     if (ileft <= 10)
217                         break;
218                 }
219                 if (flags & MP_ICONV_VERBOSE) {
220                     mp_err(log, "Error recoding text with codepage '%s'\n", cp);
221                 }
222                 talloc_free(outbuf);
223                 iconv_close(icdsc);
224                 goto failure;
225             }
226         } else if (clear)
227             break;
228     }
229 
230     iconv_close(icdsc);
231 
232     outbuf[osize - oleft - 1] = 0;
233     return (bstr){outbuf, osize - oleft - 1};
234 #endif
235 
236 failure:
237     if (flags & MP_NO_LATIN1_FALLBACK) {
238         return buf;
239     } else {
240         return bstr_sanitize_utf8_latin1(NULL, buf);
241     }
242 }
243