1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "libpspp/encoding-guesser.h"
20
21 #include <errno.h>
22 #include <iconv.h>
23 #include <stdbool.h>
24 #include <stdio.h>
25 #include <stdint.h>
26 #include <string.h>
27 #include <unistr.h>
28
29 #include "libpspp/cast.h"
30 #include "libpspp/i18n.h"
31
32 #include "gl/localcharset.h"
33 #include "gl/c-strcase.h"
34
35 /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
36 of information about encoding detection.
37 */
38
/* Maps ENCODING, which must be in one of the forms described at the top of
   encoding-guesser.h, to the name of a concrete encoding.

   A null pointer, "auto", "auto,locale", and "locale" (all compared without
   regard to case) yield the locale's character set, as reported by
   locale_charset().  A string that starts with "auto," yields the text that
   follows the comma.  Any other string is returned unchanged.

   The result is thus ENCODING itself, a suffix of ENCODING, or the string
   obtained from locale_charset(). */
const char *
encoding_guess_parse_encoding (const char *encoding)
{
  static const char *const locale_aliases[] =
    {
      "auto",
      "auto,locale",
      "locale",
    };
  size_t i;

  if (encoding == NULL)
    return locale_charset ();

  for (i = 0; i < sizeof locale_aliases / sizeof locale_aliases[0]; i++)
    if (c_strcasecmp (encoding, locale_aliases[i]) == 0)
      return locale_charset ();

  /* "auto,ENCODING": the fallback encoding is whatever follows the comma. */
  return c_strncasecmp (encoding, "auto,", 5) == 0 ? encoding + 5 : encoding;
}
56
/* Reports whether ENCODING, which must be in one of the forms described at
   the top of encoding-guesser.h, requests encoding autodetection.

   That is the case for a null pointer and for any string that is exactly
   "auto" or that starts with "auto," (compared without regard to case).
   Returns false for everything else. */
bool
encoding_guess_encoding_is_auto (const char *encoding)
{
  if (encoding == NULL)
    return true;

  if (c_strncasecmp (encoding, "auto", 4) != 0)
    return false;

  /* The first four characters matched, so encoding[4] is within the string
     (it is the terminator at the very least). */
  return encoding[4] == '\0' || encoding[4] == ',';
}
67
/* Returns the 16-bit value stored in big-endian byte order (most significant
   byte first) in the two bytes starting at DATA. */
static uint16_t
get_be16 (const uint8_t *data)
{
  unsigned int high = data[0];
  unsigned int low = data[1];

  return high << 8 | low;
}
73
/* Returns the 16-bit value stored in little-endian byte order (least
   significant byte first) in the two bytes starting at DATA. */
static uint16_t
get_le16 (const uint8_t *data)
{
  unsigned int low = data[0];
  unsigned int high = data[1];

  return high << 8 | low;
}
79
/* Returns the 32-bit value stored in big-endian byte order (most significant
   byte first) in the four bytes starting at DATA.

   Each byte is widened to uint32_t before it is shifted.  Otherwise the byte
   would be promoted to (signed) int, and shifting a byte of 0x80 or more left
   by 24 bits would overflow int, which is undefined behavior. */
static uint32_t
get_be32 (const uint8_t *data)
{
  return ((uint32_t) data[0] << 24
          | (uint32_t) data[1] << 16
          | (uint32_t) data[2] << 8
          | (uint32_t) data[3]);
}
86
/* Returns the 32-bit value stored in little-endian byte order (least
   significant byte first) in the four bytes starting at DATA.

   Each byte is widened to uint32_t before it is shifted.  Otherwise the byte
   would be promoted to (signed) int, and shifting a byte of 0x80 or more left
   by 24 bits would overflow int, which is undefined behavior. */
static uint32_t
get_le32 (const uint8_t *data)
{
  return ((uint32_t) data[3] << 24
          | (uint32_t) data[2] << 16
          | (uint32_t) data[1] << 8
          | (uint32_t) data[0]);
}
92
93 static const char *
guess_utf16(const uint8_t * data,size_t n)94 guess_utf16 (const uint8_t *data, size_t n)
95 {
96 size_t even_nulls, odd_nulls;
97
98 if (n < ENCODING_GUESS_MIN && n % 2 != 0)
99 return NULL;
100
101 even_nulls = odd_nulls = 0;
102 while (n >= 2)
103 {
104 even_nulls += data[0] == 0;
105 odd_nulls += data[1] == 0;
106 if (data[0] == 0 && data[1] == 0)
107 return NULL;
108
109 data += 2;
110 n -= 2;
111 }
112
113 if (odd_nulls > even_nulls)
114 return "UTF-16LE";
115 else if (even_nulls > 0)
116 return "UTF-16BE";
117 else
118 return NULL;
119 }
120
121 static bool
is_utf32(const uint8_t * data,size_t n,uint32_t (* get_u32)(const uint8_t *))122 is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
123 {
124 if (n < ENCODING_GUESS_MIN && n % 4 != 0)
125 return false;
126
127 while (n >= 4)
128 {
129 uint32_t uc = get_u32 (data);
130
131 if (uc < 0x09 || uc > 0x10ffff)
132 return false;
133
134 data += 4;
135 n -= 4;
136 }
137
138 return true;
139 }
140
/* Returns the length of the longest prefix of the N bytes starting at S_ in
   which every byte is an ASCII text character, as determined by
   encoding_guess_is_ascii_text().  The result is N if all of the bytes
   qualify. */
size_t
encoding_guess_count_ascii (const void *s_, size_t n)
{
  const uint8_t *bytes = s_;
  size_t count = 0;

  while (count < n && encoding_guess_is_ascii_text (bytes[count]))
    count++;

  return count;
}
154
155 static bool
is_all_utf8_text(const void * s_,size_t n)156 is_all_utf8_text (const void *s_, size_t n)
157 {
158 const uint8_t *s = s_;
159 size_t ofs;
160
161 ofs = 0;
162 while (ofs < n)
163 {
164 uint8_t c = s[ofs];
165 if (c < 0x80)
166 {
167 if (!encoding_guess_is_ascii_text (c))
168 return false;
169 ofs++;
170 }
171 else
172 {
173 ucs4_t uc;
174 int mblen;
175
176 mblen = u8_mbtoucr (&uc, s + ofs, n - ofs);
177 if (mblen < 0)
178 return mblen == -2;
179
180 ofs += mblen;
181 }
182 }
183 return true;
184 }
185
/* Returns true if the N bytes at DATA begin with the UTF-8 byte-order mark
   (the bytes 0xef 0xbb 0xbf), false otherwise, including when N is less
   than 3. */
static bool
is_utf8_bom (const uint8_t *data, size_t n)
{
  static const uint8_t utf8_bom[3] = { 0xef, 0xbb, 0xbf };

  if (n < sizeof utf8_bom)
    return false;

  return memcmp (data, utf8_bom, sizeof utf8_bom) == 0;
}
191
192 static bool
is_utf16le_bom(const uint8_t * data,size_t n)193 is_utf16le_bom (const uint8_t *data, size_t n)
194 {
195 return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_le16 (data) == 0xfeff;
196 }
197
198 static bool
is_utf16be_bom(const uint8_t * data,size_t n)199 is_utf16be_bom (const uint8_t *data, size_t n)
200 {
201 return (n >= ENCODING_GUESS_MIN || n % 2 == 0) && get_be16 (data) == 0xfeff;
202 }
203
204 static bool
is_utf32le_bom(const uint8_t * data,size_t n)205 is_utf32le_bom (const uint8_t *data, size_t n)
206 {
207 return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_le32 (data) == 0xfeff;
208 }
209
210 static bool
is_utf32be_bom(const uint8_t * data,size_t n)211 is_utf32be_bom (const uint8_t *data, size_t n)
212 {
213 return (n >= ENCODING_GUESS_MIN || n % 4 == 0) && get_be32 (data) == 0xfeff;
214 }
215
216 /* Attempts to guess the encoding of a text file based on ENCODING, an encoding
217 name in one of the forms described at the top of encoding-guesser.h, and
218 DATA, which contains the first N bytes of the file. Returns the guessed
219 encoding, which might be ENCODING itself or a suffix of it or a statically
220 allocated string.
221
222 Encoding autodetection only takes place if ENCODING actually specifies
223 autodetection. See encoding-guesser.h for details.
224
225 UTF-8 cannot be distinguished from other ASCII-based encodings until a
226 non-ASCII text character is encountered. If ENCODING specifies
227 autodetection and this function returns "ASCII", then the client should
228 process the input until it encounters an non-ASCII character (as returned by
229 encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
230 to make a final encoding guess. See encoding-guesser.h for details.
231
232 N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
233 that. */
234 const char *
encoding_guess_head_encoding(const char * encoding,const void * data_,size_t n)235 encoding_guess_head_encoding (const char *encoding,
236 const void *data_, size_t n)
237 {
238 const uint8_t *data = data_;
239 const char *fallback_encoding;
240 const char *guess;
241
242 fallback_encoding = encoding_guess_parse_encoding (encoding);
243 if (!encoding_guess_encoding_is_auto (encoding))
244 return fallback_encoding;
245
246 if (n == 0)
247 return fallback_encoding;
248
249 if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n))
250 return "UTF-32";
251
252 if (n >= 4)
253 {
254 uint32_t x = get_be32 (data);
255 if (x == 0x84319533)
256 return "GB-18030";
257 else if (x == 0xdd736673)
258 return "UTF-EBCDIC";
259 }
260
261 if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n))
262 return "UTF-16";
263
264 if (is_utf8_bom (data, n))
265 return "UTF-8";
266
267 guess = guess_utf16 (data, n);
268 if (guess != NULL)
269 return guess;
270
271 if (is_utf32 (data, n, get_be32))
272 return "UTF-32BE";
273 if (is_utf32 (data, n, get_le32))
274 return "UTF-32LE";
275
276 /* We've tried all the "giveaways" that make the encoding obvious. That
277 rules out, incidentally, all the encodings with multibyte units
278 (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8
279 from some ASCII-based fallback encoding. */
280
281 /* If the fallback encoding isn't ASCII compatible, give up. */
282 if (!is_encoding_ascii_compatible (fallback_encoding))
283 return fallback_encoding;
284
285 /* If the data we have clearly is not UTF-8, give up. */
286 if (!encoding_guess_tail_is_utf8 (data, n))
287 {
288 /* If the fallback encoding is UTF-8, fall back on something else.*/
289 if (is_encoding_utf8 (fallback_encoding))
290 return "windows-1252";
291
292 return fallback_encoding;
293 }
294
295 return "ASCII";
296 }
297
/* Returns true if ENCODING is "utf-16" or "utf16", compared without regard to
   case, false otherwise.  Names that specify a byte order, such as
   "UTF-16LE", do not match. */
static bool
is_encoding_utf16 (const char *encoding)
{
  if (c_strcasecmp (encoding, "utf-16") == 0)
    return true;

  return c_strcasecmp (encoding, "utf16") == 0;
}
304
/* Returns true if ENCODING is "utf-32" or "utf32", compared without regard to
   case, false otherwise.  Names that specify a byte order, such as
   "UTF-32LE", do not match. */
static bool
is_encoding_utf32 (const char *encoding)
{
  if (c_strcasecmp (encoding, "utf-32") == 0)
    return true;

  return c_strcasecmp (encoding, "utf32") == 0;
}
311
/* Returns the length, in bytes, of the byte-order mark at the start of the N
   bytes in DATA_, provided that ENCODING names an encoding to which that mark
   belongs: 3 for a UTF-8 mark when ENCODING is UTF-8, 2 for a UTF-16 mark of
   either byte order when ENCODING is UTF-16, and 4 for a UTF-32 mark of
   either byte order when ENCODING is UTF-32.  Returns 0 if there is no mark
   or if the mark does not go with ENCODING.

   N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
   that. */
size_t
encoding_guess_bom_length (const char *encoding,
                           const void *data_, size_t n)
{
  const uint8_t *bytes = data_;

  if (is_utf8_bom (bytes, n) && is_encoding_utf8 (encoding))
    return 3;

  if ((is_utf16le_bom (bytes, n) || is_utf16be_bom (bytes, n))
      && is_encoding_utf16 (encoding))
    return 2;

  if ((is_utf32le_bom (bytes, n) || is_utf32be_bom (bytes, n))
      && is_encoding_utf32 (encoding))
    return 4;

  return 0;
}
331
/* Makes the final encoding guess from ENCODING and the N bytes of text
   starting at DATA.  DATA should start with the first non-ASCII text
   character (as determined by encoding_guess_is_ascii_text()) found in the
   input.

   Returns "UTF-8" unless encoding_guess_tail_is_utf8() reports that the data
   is definitely not UTF-8.  In that case the result is the fallback encoding
   for ENCODING, or "windows-1252" if that fallback is itself UTF-8.

   See encoding-guesser.h for the intended use of this function.

   N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
   that starting with the first non-ASCII text character. */
const char *
encoding_guess_tail_encoding (const char *encoding,
                              const void *data, size_t n)
{
  const char *fallback;

  if (encoding_guess_tail_is_utf8 (data, n))
    return "UTF-8";

  /* The data is not UTF-8, so a UTF-8 fallback cannot be right either. */
  fallback = encoding_guess_parse_encoding (encoding);
  return is_encoding_utf8 (fallback) ? "windows-1252" : fallback;
}
363
364 /* Returns an encoding guess based on ENCODING and the N bytes of text starting
365 at DATA. DATA should start with the first non-ASCII text character (as
366 determined by encoding_guess_is_ascii_text()) found in the input.
367
368 The return value is:
369
370 0, if the encoding is definitely not UTF-8 (because the input contains
371 byte sequences that are not valid in UTF-8).
372
373 1, if the encoding appears to be UTF-8 (because the input contains valid
374 UTF-8 multibyte sequences).
375
376 -1, if the input contains only ASCII characters. (This means that the
377 input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
378
379 See encoding-guesser.h for intended use of this function.
380
381 N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
382 that starting with the first non-ASCII text character. */
383 int
encoding_guess_tail_is_utf8(const void * data,size_t n)384 encoding_guess_tail_is_utf8 (const void *data, size_t n)
385 {
386 /* If all the bytes are in the ASCII range, it's just ASCII. */
387 if (encoding_guess_count_ascii (data, n) == n)
388 return -1;
389
390 return (n < ENCODING_GUESS_MIN
391 ? u8_check (data, n) == NULL
392 : is_all_utf8_text (data, n));
393 }
394
/* Guesses the encoding of a text file whose entire contents are the SIZE
   bytes in TEXT, based on ENCODING, an encoding name in one of the forms
   described at the top of encoding-guesser.h.  The result is ENCODING itself,
   a suffix of it, or a statically allocated string.

   The guess from encoding_guess_head_encoding() is used as is, unless it is
   "ASCII" and ENCODING requests autodetection, in which case
   encoding_guess_tail_encoding() makes the final decision from the same data.

   Encoding autodetection only takes place if ENCODING actually specifies
   autodetection.  See encoding-guesser.h for details. */
const char *
encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
{
  const char *head = encoding_guess_head_encoding (encoding, text, size);

  if (strcmp (head, "ASCII") != 0
      || !encoding_guess_encoding_is_auto (encoding))
    return head;

  return encoding_guess_tail_encoding (encoding, text, size);
}
414