1 /*
2 multibyte character set checks
3
4 Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of version 2 of the GNU General Public License as published
8 by the Free Software Foundation.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 more details.
14
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif /* HAVE_CONFIG_H */
22
23 #include <math.h>
24
25 #include "enca.h"
26 #include "internal.h"
27
28 /*
29 * See http://www.unicode.org/unicode/faq/utf_bom.html#25 for BOMs:
30 * 00 00 FE FF UTF-32, big-endian
31 * FF FE 00 00 UTF-32, little-endian
32 * FE FF UTF-16, big-endian
33 * FF FE UTF-16, little-endian
34 * EF BB BF UTF-8
35 */
36
37 /* Local prototypes. */
38 static int is_valid_utf8 (EncaAnalyserState *analyser);
39 static int looks_like_TeX (EncaAnalyserState *analyser);
40 static int is_valid_utf7 (EncaAnalyserState *analyser);
41 static int looks_like_hz (EncaAnalyserState *analyser);
42 static int looks_like_ucs2 (EncaAnalyserState *analyser);
43 static int looks_like_ucs4 (EncaAnalyserState *analyser);
44 static int looks_like_utf8 (EncaAnalyserState *analyser);
45 static size_t what_if_it_was_ucs4 (const unsigned char *buffer,
46 size_t size,
47 size_t min_chars,
48 EncaSurface *crlf_surf);
49 static void shuffle_byte_order (unsigned char *buffer,
50 size_t size,
51 EncaSurface permutation);
52
53 /* Multibyte test lists.
54 * These arrays must be NULL-terminated. */
55 EncaGuessFunc ENCA_MULTIBYTE_TESTS_ASCII[] = {
56 &is_valid_utf7,
57 &looks_like_TeX,
58 &looks_like_hz,
59 NULL
60 };
61
62 EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT[] = {
63 &is_valid_utf8,
64 NULL
65 };
66
67 EncaGuessFunc ENCA_MULTIBYTE_TESTS_BINARY[] = {
68 &looks_like_ucs4,
69 &looks_like_ucs2,
70 NULL
71 };
72
73 EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[] = {
74 &looks_like_utf8,
75 NULL
76 };
77
78 /**
79 * is_valid_utf8:
80 * @analyser: Analyser whose buffer is to be checked.
81 *
82 * Checks whether @analyser->buffer contains valid UTF-8.
83 *
84 * Directly modifies @analyser->result on success.
85 *
86 * Returns: Nonzero when @analyser->result was set, zero othewrise.
87 **/
88 static int
is_valid_utf8(EncaAnalyserState * analyser)89 is_valid_utf8(EncaAnalyserState *analyser)
90 {
91 static int utf8 = ENCA_CS_UNKNOWN; /* UTF-8 charset */
92 size_t size = analyser->size;
93 const unsigned char *buffer = analyser->buffer;
94 const size_t *const counts = analyser->counts;
95
96 /* Bonus added when we catch a byte order marker. */
97 size_t bom_bonus;
98
99 int remains_10xxxxxx = 0; /* how many next bytes have to be 10xxxxxx */
100 int utf8count = 0; /* number of UTF-8 encoded characters */
101 size_t i;
102 unsigned char b;
103
104 /* Bytes 0xfe and 0xff just cannot appear in utf-8 in any case. */
105 if (counts[0xfe] || counts[0xff])
106 return 0;
107
108 /* Initialize when we are called the first time. */
109 if (utf8 == ENCA_CS_UNKNOWN) {
110 utf8 = enca_name_to_charset("utf-8");
111 assert(utf8 != ENCA_CS_UNKNOWN);
112 }
113
114 /* Check BOM */
115 bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
116 if (size >= 3
117 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) {
118 utf8count += bom_bonus;
119 buffer += 3;
120 size -= 3;
121 }
122
123 /* Parse. */
124 for (i = 0; i < size; i++) {
125 b = buffer[i];
126 if (!remains_10xxxxxx) {
127 if ((b & 0x80) == 0) /* 7bit characters */
128 continue;
129 if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
130 remains_10xxxxxx = 1;
131 utf8count++;
132 continue;
133 }
134 if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
135 remains_10xxxxxx = 2;
136 utf8count++;
137 continue;
138 }
139 /* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
140 they are very rare, nevertheless we accept them. */
141 if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
142 remains_10xxxxxx = 3;
143 utf8count++;
144 continue;
145 }
146 if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
147 remains_10xxxxxx = 4;
148 utf8count++;
149 continue;
150 }
151 if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
152 remains_10xxxxxx = 5;
153 utf8count++;
154 continue;
155 }
156 /* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
157 return 0;
158 }
159 else {
160 /* Broken 10xxxxxx sequence? */
161 if ((b & 0xc0) != 0x80) {
162 return 0;
163 }
164 remains_10xxxxxx--;
165 }
166 }
167
168 /* Unfinished 10xxxxxx sequence. */
169 if (remains_10xxxxxx != 0 && analyser->options.termination_strictness > 0)
170 return 0;
171
172 if (utf8count < (int)analyser->options.min_chars)
173 return 0;
174
175 analyser->result.charset = utf8;
176 analyser->result.surface |= enca_eol_surface(buffer, size, counts);
177 return 1;
178 }
179
180 /**
181 * looks_like_TeX:
182 * @analyser: Analyser whose buffer is to be checked.
183 *
184 * Checks whether @analyser->buffer contains TeX-encoded 8bit characters.
185 *
186 * Directly modifies @analyser->result on success.
187 *
188 * Returns: Nonzero when @analyser->result was set, zero othewrise.
189 **/
190 static int
looks_like_TeX(EncaAnalyserState * analyser)191 looks_like_TeX(EncaAnalyserState *analyser)
192 {
193 /* TeX escape character, skip-characters, punctuation and alpha accents */
194
195 /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
196 static const unsigned char TEX_ACCPUNCT[] = {
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
203 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
205 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
213 };
214
215 /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
216 static const unsigned char TEX_ACCALPHA[] = {
217 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
224 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233 };
234
235 static const unsigned char TEX_ESCAPE = '\\';
236 static const unsigned char TEX_BRACE = '{';
237
238 static int TeX = ENCA_CS_UNKNOWN; /* TeX charset */
239
240 const unsigned char *const buffer = analyser->buffer;
241 const size_t size = analyser->size;
242 const size_t *const counts = analyser->counts;
243
244 size_t TeXaccents = 0; /* number of TeX accents */
245 const unsigned char *p;
246
247 /* When the file doesn't contain enough escape characters,
248 don't waste time scanning it. */
249 if (counts[TEX_ESCAPE] < analyser->options.min_chars)
250 return 0;
251
252 /* Initialize when we are called the first time. */
253 if (TeX == ENCA_CS_UNKNOWN) {
254 TeX = enca_name_to_charset("TeX");
255 assert(TeX != ENCA_CS_UNKNOWN);
256 }
257
258 /* [roughly] count TeX accents */
259 p = memchr(buffer, TEX_ESCAPE, size);
260 while (p != NULL && (size_t)(p-buffer) + 2 < size) {
261 if (*p == TEX_ESCAPE) {
262 p++;
263 if (*p == TEX_ESCAPE)
264 p++; /* catch \\ */
265 if (TEX_ACCPUNCT[*p]
266 || (TEX_ACCALPHA[*p]
267 && (*++p == TEX_BRACE || enca_isspace((char)*p)))) {
268 while ((size_t)(p-buffer) + 1 < size
269 && (*++p == TEX_BRACE || enca_isspace((char)*p)))
270 ;
271 if (enca_isalpha(*p)) TeXaccents++;
272 }
273 continue;
274 }
275 p = memchr(p, TEX_ESCAPE, size - (p - buffer));
276 }
277
278 if (TeXaccents < analyser->options.min_chars)
279 return 0;
280
281 analyser->result.charset = TeX;
282 analyser->result.surface |= enca_eol_surface(buffer, size, counts);
283 return 1;
284 }
285
286 /**
287 * is_valid_utf7:
288 * @analyser: Analyser whose buffer is to be checked.
289 *
290 * Checks whether @analyser->buffer contains valid UTF-7
291 *
292 * Directly modifies @analyser->result on success.
293 *
294 * Returns: Nonzero when @analyser->result was set, zero othewrise.
295 **/
296 static int
is_valid_utf7(EncaAnalyserState * analyser)297 is_valid_utf7(EncaAnalyserState *analyser)
298 {
299 /* UTF-7 special characters. */
300 static const unsigned char UTF7_ESCAPE = '+';
301 /* This is not a bug. `+-' is `+' in UTF-7. */
302 static const unsigned char UTF7_PLUS = '-';
303
304 /* Base64 base (or so-called set B), see RFC1521, RFC1642 */
305 /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
306 static const short int BASE64[] = {
307 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 0, 0, 0, 64,
310 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 0, 0, 0, 0, 0, 0,
311 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
312 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0,
313 0, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
314 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 0, 0, 0, 0, 0,
315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
316 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
317 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
318 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
323 };
324
325 static int utf7 = ENCA_CS_UNKNOWN; /* UTF-7 charset */
326
327 const unsigned char *const buffer = analyser->buffer;
328 const size_t size = analyser->size;
329 const size_t *const counts = analyser->counts;
330
331 size_t utf7count = 0; /* number of >7bit characters */
332 unsigned char *p,*q;
333
334 /* When the file doesn't contain enough UTF-7 shift characters,
335 don't waste time scanning it. */
336 if (counts[UTF7_ESCAPE] < analyser->options.min_chars)
337 return 0;
338
339 /* Initialize when we are called the first time. */
340 if (utf7 == ENCA_CS_UNKNOWN) {
341 utf7 = enca_name_to_charset("utf-7");
342 assert(utf7 != ENCA_CS_UNKNOWN);
343 }
344
345 p = memchr(buffer, UTF7_ESCAPE, size);
346 while (p != NULL && (size_t)(p-buffer) + 1 < size) {
347 p++;
348 if (*p == UTF7_PLUS) { /* +- */
349 /* Don't count +- since it's often used for 0x00b1 in plain 7bit ascii. */
350 /* utf7count++; */
351 } else {
352 for (q = p; (size_t)(q-buffer) < size && BASE64[*q]; q++)
353 ;
354 if ((size_t)(q-buffer) == size) {
355 p = q;
356 break;
357 }
358 /* check whether all padding bits are 0's (don't try to understand how) */
359 if (q-p == 0
360 || ((BASE64[*(q-1)]-1) & 0x3f>>(6 - 6*(q - p)%8)))
361 return 0;
362
363 utf7count += 6*(q - p)/16;
364 p = q;
365 }
366 p = memchr(p, UTF7_ESCAPE, size - (p - buffer));
367 }
368
369 /* p != NULL means unsinished sequence here. */
370 if (p != NULL && analyser->options.termination_strictness > 0)
371 return 0;
372
373 if (utf7count < analyser->options.min_chars)
374 return 0;
375
376 analyser->result.charset = utf7;
377 analyser->result.surface |= enca_eol_surface(buffer, size, counts);
378 return 1;
379 }
380
381 /**
382 * looks_like_ucs2:
383 * @analyser: Analyser whose buffer is to be checked.
384 *
385 * Checks whether @analyser->buffer contains UCS-2 encoded characters.
386 *
387 * Directly modifies @analyser->result on success.
388 *
389 * Returns: Nonzero when @analyser->result was set, zero othewrise.
390 **/
391 static int
looks_like_ucs2(EncaAnalyserState * analyser)392 looks_like_ucs2(EncaAnalyserState *analyser)
393 {
394 static int ucs2 = ENCA_CS_UNKNOWN; /* UCS-2 charset id */
395
396 const unsigned char *const buffer = analyser->buffer;
397 const size_t size = analyser->size;
398 const size_t *const counts = analyser->counts;
399
400 /* Bonus added when we catch a byte order marker. */
401 size_t bom_bonus;
402
403 size_t ucs2count = 0; /* something like number of good ucs-2 characters */
404 unsigned int byte_order = 0; /* default byte order is little endian
405 * (msb first) */
406 unsigned int byte_order_changes = 0; /* how many times byte_order changed */
407 size_t cr = 0; /* number of CR's */
408 size_t lf = 0; /* number of LF's */
409 int crlf_ok = 1; /* are all LF's preceeded by CR's? */
410 unsigned char b1, b2;
411 double r;
412 size_t i;
413
414 /* The number of bytes must be of course even */
415 if (size%2 != 0)
416 return 0;
417
418 bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
419
420 /* When the file doesn't contain enough zeros,
421 don't waste time scanning it. */
422 r = (2.0*(counts[0] + counts[1] + counts[2] + counts[3] + counts[4])
423 + bom_bonus)/size;
424 if (r < log(analyser->options.threshold + EPSILON))
425 return 0;
426
427 /* Initialize when we are called the first time. */
428 if (ucs2 == ENCA_CS_UNKNOWN) {
429 ucs2 = enca_name_to_charset("ucs-2");
430 assert(ucs2 != ENCA_CS_UNKNOWN);
431 }
432
433 /* Try to catch lsb even when it doesn't start with endian marker. */
434 if (buffer[1] == 0 && enca_isprint(buffer[0]))
435 byte_order = 1;
436
437 /* Scan buffer. */
438 for (i = 0; i < size; i += 2) {
439 b1 = buffer[i + byte_order];
440 b2 = buffer[i+1 - byte_order];
441 /* Byte order marker detection. */
442 if (b1 == 0xfe && b2 == 0xff) {
443 if (i == 0)
444 ucs2count += bom_bonus;
445 else
446 byte_order_changes++;
447 continue;
448 }
449 if (b1 == 0xff && b2 == 0xfe) {
450 byte_order = 1-byte_order;
451 if (i == 0)
452 ucs2count += bom_bonus;
453 else
454 byte_order_changes++;
455 continue;
456 }
457 /* Black magic.
458 * When almost any word can be UCS-2 character, we have to assume some
459 * are far more probable. */
460 if (b1 == 0) {
461 ucs2count += (enca_isprint(b2) || enca_isspace(b2)) ? 2 : 0;
462 /* check EOLs */
463 if (b2 == CR)
464 cr++;
465 if (b2 == LF) {
466 lf++;
467 if (i > 0
468 && (buffer[i-1-byte_order] != CR
469 || buffer[i-2+byte_order] != 0))
470 crlf_ok = 0;
471 }
472 }
473 else {
474 if (b1 <= 4)
475 ucs2count += 2;
476 }
477 }
478
479 /* Now we have to decide what we tell to the caller. */
480 r = (double)ucs2count/size;
481 if (r < log(analyser->options.threshold + EPSILON)
482 || ucs2count/2 < analyser->options.min_chars)
483 return 0;
484
485 analyser->result.charset = ucs2;
486
487 /* Byte order surface. */
488 if (byte_order_changes)
489 analyser->result.surface |= ENCA_SURFACE_PERM_MIX;
490 else
491 analyser->result.surface |= byte_order ? ENCA_SURFACE_PERM_21: 0;
492
493 /* EOL surface. */
494 if (cr == 0)
495 analyser->result.surface |= ENCA_SURFACE_EOL_LF;
496 else {
497 if (lf == 0)
498 analyser->result.surface |= ENCA_SURFACE_EOL_CR;
499 else {
500 analyser->result.surface |= crlf_ok
501 ? ENCA_SURFACE_EOL_CRLF
502 : ENCA_SURFACE_EOL_MIX;
503 }
504 }
505
506 return 1;
507 }
508
509 /**
510 * looks_like_ucs4:
511 * @analyser: Analyser whose buffer is to be checked.
512 *
513 * Checks whether @analyser->buffer contains UCS-4 encoded characters.
514 *
515 * Directly modifies @analyser->result on success.
516 *
517 * Returns: Nonzero when @analyser->result was set, zero othewrise.
518 **/
519 static int
looks_like_ucs4(EncaAnalyserState * analyser)520 looks_like_ucs4(EncaAnalyserState *analyser)
521 {
522 static const EncaSurface PERMS[] = {
523 ENCA_SURFACE_PERM_4321,
524 ENCA_SURFACE_PERM_21
525 };
526
527 static int ucs4 = ENCA_CS_UNKNOWN; /* UCS-4 charset id */
528
529 unsigned char *buffer = analyser->buffer;
530 const size_t size = analyser->size;
531 const size_t *const counts = analyser->counts;
532
533 ssize_t ucs4count = 0; /* ucs-4-icity */
534 size_t count_perm[4]; /* counts for various byteorders */
535 EncaSurface eol[4]; /* EOL types for various byteorders */
536 double r; /* rating */
537 size_t i, max;
538
539 /* The number of bytes must be of course multiple of 4. */
540 if (size%4 != 0)
541 return 0;
542
543 /* When the file doesn't contain enough zeros (and other small bytes),
544 don't waste time scanning it. */
545 r = (4.0*(counts[0] + counts[1] + counts[2] + counts[3] + counts[4])/3.0)
546 /size;
547 if (r < log(analyser->options.threshold + EPSILON))
548 return 0;
549
550 /* Initialize when we are called the first time. */
551 if (ucs4 == ENCA_CS_UNKNOWN) {
552 ucs4 = enca_name_to_charset("ucs-4");
553 assert(ucs4 != ENCA_CS_UNKNOWN);
554 }
555
556 /* Try all sensible unsigned charorders and find maximum.
557 At the end the buffer has the same byteorder as it had, but when
558 the buffer have to be considered const, work on copy. */
559 if (analyser->options.const_buffer) {
560 buffer = memcpy(enca_malloc(size), buffer, size);
561 }
562
563 max = 0;
564 for (i = 0; i < 4; i++) {
565 count_perm[i] = what_if_it_was_ucs4(buffer, size,
566 analyser->options.min_chars,
567 eol + i);
568 if (count_perm[i] > count_perm[max])
569 max = i;
570 shuffle_byte_order(buffer, size, PERMS[i%2]);
571 }
572
573 if (analyser->options.const_buffer)
574 enca_free(buffer);
575
576 /* Use quite a cruel selection to restrain other byteorders. */
577 ucs4count = 2*count_perm[max];
578 for (i = 0; i < 4; i++)
579 ucs4count -= count_perm[i];
580
581 /* Now we have to decide what we tell to the caller. */
582 r = (double)ucs4count/size;
583 if (r < log(analyser->options.threshold + EPSILON)
584 || ucs4count/4 < (int)analyser->options.min_chars)
585 return 0;
586
587 analyser->result.charset = ucs4;
588 /* Compute what permutation corresponds to max. */
589 for (i = 0; i < max; i++)
590 analyser->result.surface ^= PERMS[i%2];
591 analyser->result.surface |= eol[max];
592
593 return 1;
594 }
595
596 /**
597 * what_if_it_was_ucs4:
598 * @buffer: Buffer to be checked.
599 * @size: Size of @buffer.
600 * @min_chars: Minimal number of `nice' UCS-4 characters to succeede.
601 * @crlf_surf: Where detected EOL surface type should be stored.
602 *
603 * Checks whether @buffer contains little endian UCS-4 encoded characters.
604 *
605 * Assumes @buffer contains little endian UCS-4 and returns the number of
606 * `good' characters, and in case it's at least @min_chars, finds EOL surface
607 * type too.
608 *
609 * Returns: The number of `good' UCS-4 characters with some bonus for a good
610 * BOM.
611 **/
612 static size_t
what_if_it_was_ucs4(const unsigned char * buffer,size_t size,size_t min_chars,EncaSurface * crlf_surf)613 what_if_it_was_ucs4(const unsigned char *buffer,
614 size_t size,
615 size_t min_chars,
616 EncaSurface *crlf_surf)
617 {
618 /* Bonus added when we catch a byte order marker. */
619 size_t bom_bonus;
620
621 size_t count = 0; /* ucs-4-icity */
622 size_t cr = 0; /* number of CR's */
623 size_t lf = 0; /* number of LF's */
624 int crlf_ok = 1; /* are all LF's preceeded by CR's? */
625 size_t i;
626
627 /* check BOM */
628 bom_bonus = (size_t)(sqrt((double)size) + size/20.0);
629 if (size) {
630 if (buffer[0] == 0 && buffer[1] == 0
631 && buffer[2] == 0xfe && buffer[3] == 0xff) {
632 count += bom_bonus;
633 buffer += 4;
634 size -= 4;
635 }
636 }
637
638 for (i = 0; i < size; i += 4) {
639 /* Does it look like little endian ucs-4? */
640 if (buffer[i] == 0 && buffer[i+1] == 0) {
641 if (buffer[i+2] == 0)
642 count += enca_isprint(buffer[i+3]) || enca_isspace(buffer[i+3]) ? 4 : 0;
643 else {
644 if (buffer[i+2] < 5)
645 count += 4;
646 }
647 }
648 }
649
650 /* Detect EOL surface
651 * To be 100% portable, we do it the ugly way: by testing individual bytes. */
652 if (count/4 >= min_chars) {
653 for (i = 0; i < size; i += 4) {
654 if (buffer[i+3] == CR && buffer[i+2] == 0
655 && buffer[i+1] == 0 && buffer[i] == 0)
656 cr++;
657 if (buffer[i+3] == LF && buffer[i+2] == 0
658 && buffer[i+1] == 0 && buffer[i] == 0) {
659 lf++;
660 if (crlf_ok && i > 0
661 && (buffer[i-1] != CR || buffer[i-2] != 0
662 || buffer[i-3] != 0 || buffer[i-4] != 0))
663 crlf_ok = 0;
664 }
665 }
666 /* EOL surface result */
667 if (cr == 0)
668 *crlf_surf = ENCA_SURFACE_EOL_LF;
669 else {
670 if (lf == 0)
671 *crlf_surf = ENCA_SURFACE_EOL_CR;
672 else
673 *crlf_surf = crlf_ok ? ENCA_SURFACE_EOL_CRLF : ENCA_SURFACE_EOL_MIX;
674 }
675 }
676
677 return count;
678 }
679
680 /**
681 * shuffle_byte_order:
682 * @buffer: Buffer to be shuffled.
683 * @size: Size of @buffer.
684 * @permutation: Permutation type, possible values mean
685 * 0 no change
686 * ENCA_SURFACE_PERM_4321 4321
687 * ENCA_SURFACE_PERM_21 21 (== 2143)
688 * ENCA_SURFACE_PERM_21|ENCA_SURFACE_PERM_4321 3412
689 *
690 * Performs given permutation on @buffer.
691 **/
692 static void
shuffle_byte_order(unsigned char * buffer,size_t size,EncaSurface permutation)693 shuffle_byte_order(unsigned char *buffer,
694 size_t size,
695 EncaSurface permutation)
696 {
697 size_t i;
698 unsigned char b;
699
700 if (permutation & ENCA_SURFACE_PERM_4321) {
701 for (i = 0; i < size; i += 4) {
702 b = buffer[i];
703 buffer[i] = buffer[i+3];
704 buffer [i+3] = b;
705
706 b = buffer[i+1];
707 buffer[i+1] = buffer[i+2];
708 buffer[i+2] = b;
709 }
710 }
711
712 if (permutation & ENCA_SURFACE_PERM_21) {
713 for (i = 0; i < size; i += 2) {
714 b = buffer[i];
715 buffer[i] = buffer[i+1];
716 buffer [i+1] = b;
717 }
718 }
719 }
720
721 /**
722 * looks_like_utf8:
723 * @analyser: Analyser whose buffer is to be checked.
724 *
725 * Checks whether @analyser->buffer may contain UTF-8.
726 *
727 * This is a fault-tolerant version of is_valid_utf8, intended to be used after
728 * filtering, when a few stray 8bit characters may appear in the sample.
729 *
730 * Directly modifies @analyser->result on success.
731 *
732 * Returns: Nonzero when @analyser->result was set, zero othewrise.
733 **/
734 static int
looks_like_utf8(EncaAnalyserState * analyser)735 looks_like_utf8(EncaAnalyserState *analyser)
736 {
737 static int utf8 = ENCA_CS_UNKNOWN; /* UTF-8 charset */
738 size_t size = analyser->size;
739 const unsigned char *buffer = analyser->buffer;
740 const size_t *const counts = analyser->counts;
741
742 /* Bonus added when we catch a byte order marker. */
743 size_t bom_bonus;
744
745 int remains_10xxxxxx = 0; /* how many next bytes have to be 10xxxxxx */
746 int utf8count = 0; /* number of UTF-8 encoded characters */
747 int failures = 0; /* number of invalid sequences encountered */
748 size_t i;
749 unsigned char b;
750
751 /* Initialize when we are called the first time. */
752 if (utf8 == ENCA_CS_UNKNOWN) {
753 utf8 = enca_name_to_charset("utf-8");
754 assert(utf8 != ENCA_CS_UNKNOWN);
755 }
756
757 /* Check BOM */
758 bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
759 if (size >= 3
760 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) {
761 utf8count += bom_bonus;
762 buffer += 3;
763 size -= 3;
764 }
765
766 /* Parse. */
767 for (i = 0; i < size; i++) {
768 b = buffer[i];
769 if (!remains_10xxxxxx) {
770 if ((b & 0x80) == 0) /* 7bit characters */
771 continue;
772 if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
773 remains_10xxxxxx = 1;
774 utf8count++;
775 continue;
776 }
777 if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
778 remains_10xxxxxx = 2;
779 utf8count++;
780 continue;
781 }
782 /* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
783 they are very rare, nevertheless we accept them. */
784 if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
785 remains_10xxxxxx = 3;
786 utf8count++;
787 continue;
788 }
789 if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
790 remains_10xxxxxx = 4;
791 utf8count++;
792 continue;
793 }
794 if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
795 remains_10xxxxxx = 5;
796 utf8count++;
797 continue;
798 }
799 /* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
800 failures++;
801 remains_10xxxxxx = 0;
802 }
803 else {
804 /* Broken 10xxxxxx sequence? */
805 if ((b & 0xc0) != 0x80) {
806 failures++;
807 utf8count--;
808 remains_10xxxxxx = 0;
809 }
810 else
811 remains_10xxxxxx--;
812 }
813 }
814
815 /* Unfinished 10xxxxxx sequence. */
816 if (remains_10xxxxxx != 0 && analyser->options.termination_strictness > 0)
817 failures += 2;
818
819 /* Tolerate a small number of failures. */
820 if (failures > exp(-7*(analyser->options.threshold - 1.0))*utf8count/2.0)
821 return 0;
822
823 analyser->result.charset = utf8;
824 analyser->result.surface |= enca_eol_surface(buffer, size, counts);
825 if (failures > 0)
826 analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
827 return 1;
828 }
829
830 /**
831 * looks_like_hz:
832 * @analyser: An analyser.
833 *
834 * Checks whether @analyser buffer is HZ-encoded. See RFC 1843
835 *
836 * Directly modifies @analyser->result on success.
837 *
838 * Returns: Nonzero when @analyser->result was set, zero othewrise.
839 **/
840 static int
looks_like_hz(EncaAnalyserState * analyser)841 looks_like_hz(EncaAnalyserState *analyser)
842 {
843 unsigned char *buffer = analyser->buffer;
844 size_t size = analyser->size;
845 static int hz = ENCA_CS_UNKNOWN; /* HZ charset */
846 size_t hzcount = 0; /* number of qp encoded characters */
847 unsigned char *p = buffer;
848 const size_t *const counts = analyser->counts;
849
850 int escaped; /* true when we're in 8-bit mode */
851 unsigned int i;
852
853 /* Initialize when we are called the first time. */
854 if (hz == ENCA_CS_UNKNOWN) {
855 hz = enca_name_to_charset("hz");
856 assert(hz != ENCA_CS_UNKNOWN);
857 }
858
859 for (i = 0; i < analyser->ncharsets; i++)
860 if (analyser->charsets[i] == hz)
861 goto goahead;
862 return 0;
863
864 goahead:
865 /* When the file doesn't contain escape characters,
866 don't waste time scanning it. */
867 if (counts['{'] == 0
868 || counts['}'] == 0
869 || counts['~'] == 0)
870 return 0;
871
872 /* Move to first escaped-in */
873 /* FIXME: Things will be simpler if we have strnstr()? */
874 while ((size_t)(p - buffer) + 2 < size) {
875 p = memchr(p, '~', size - (p - buffer));
876 if (p == NULL)
877 return 0;
878 if (p[1] == '{') {
879 escaped = 1;
880 p += 2;
881 break;
882 } else if (p[1] == '\n') {
883 p += 2;
884 } else if (p[1] == '~') {
885 p += 2;
886 } else
887 p += 2;
888 }
889
890 /* Check if it's valid HZ and count hz encoded characters. */
891 while (p < buffer + size) {
892 if (*p == '~' && p < buffer + size - 1) {
893 switch (p[1]) {
894 case '~':
895 if (escaped) {
896 p++;
897 hzcount++;
898 } else {
899 p += 2;
900 }
901 break;
902 case '{':
903 if (!escaped) {
904 p += 2;
905 escaped = 1;
906 } else {
907 return 0;
908 }
909 break;
910 case '}':
911 if (escaped) {
912 escaped = 0;
913 p += 2;
914 } else {
915 return 0;
916 }
917 break;
918 case '\n':
919 if (escaped) {
920 return 0;
921 }
922 p += 2;
923 break;
924 default:
925 if (!escaped) {
926 return 0;
927 }
928 p++;
929 }
930 } else {
931 /* Spaces, CR or LF not allowed in escaped block */
932 if (escaped) {
933 if (*p < ' ') {
934 return 0;
935 }
936 hzcount++;
937 }
938 p++;
939 }
940 }
941
942 if (hzcount < analyser->options.min_chars)
943 return 0;
944
945 /* Unfinished escaped block here. */
946 if (escaped && analyser->options.termination_strictness > 0)
947 return 0;
948
949 analyser->result.charset = hz;
950 analyser->result.surface |= enca_eol_surface(buffer, size, counts);
951
952 return 1;
953 }
954