1 /*
2   multibyte character set checks
3 
4   Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5 
6   This program is free software; you can redistribute it and/or modify it
7   under the terms of version 2 of the GNU General Public License as published
8   by the Free Software Foundation.
9 
10   This program is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13   more details.
14 
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, write to the Free Software Foundation, Inc.,
17   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 #  include "config.h"
21 #endif /* HAVE_CONFIG_H */
22 
23 #include <math.h>
24 
25 #include "enca.h"
26 #include "internal.h"
27 
28 /*
29  * See http://www.unicode.org/unicode/faq/utf_bom.html#25 for BOMs:
30  * 00 00 FE FF      UTF-32, big-endian
31  * FF FE 00 00      UTF-32, little-endian
32  * FE FF            UTF-16, big-endian
33  * FF FE            UTF-16, little-endian
34  * EF BB BF         UTF-8
35  */
36 
37 /* Local prototypes. */
38 static int    is_valid_utf8       (EncaAnalyserState *analyser);
39 static int    looks_like_TeX      (EncaAnalyserState *analyser);
40 static int    is_valid_utf7       (EncaAnalyserState *analyser);
41 static int    looks_like_hz       (EncaAnalyserState *analyser);
42 static int    looks_like_ucs2     (EncaAnalyserState *analyser);
43 static int    looks_like_ucs4     (EncaAnalyserState *analyser);
44 static int    looks_like_utf8     (EncaAnalyserState *analyser);
45 static size_t what_if_it_was_ucs4 (const unsigned char *buffer,
46                                    size_t size,
47                                    size_t min_chars,
48                                    EncaSurface *crlf_surf);
49 static void   shuffle_byte_order  (unsigned char *buffer,
50                                    size_t size,
51                                    EncaSurface permutation);
52 
53 /* Multibyte test lists.
54  * These arrays must be NULL-terminated. */
55 EncaGuessFunc ENCA_MULTIBYTE_TESTS_ASCII[] = {
56   &is_valid_utf7,
57   &looks_like_TeX,
58   &looks_like_hz,
59   NULL
60 };
61 
62 EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT[] = {
63   &is_valid_utf8,
64   NULL
65 };
66 
67 EncaGuessFunc ENCA_MULTIBYTE_TESTS_BINARY[] = {
68   &looks_like_ucs4,
69   &looks_like_ucs2,
70   NULL
71 };
72 
73 EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[] = {
74   &looks_like_utf8,
75   NULL
76 };
77 
78 /**
79  * is_valid_utf8:
80  * @analyser: Analyser whose buffer is to be checked.
81  *
82  * Checks whether @analyser->buffer contains valid UTF-8.
83  *
84  * Directly modifies @analyser->result on success.
85  *
86  * Returns: Nonzero when @analyser->result was set, zero othewrise.
87  **/
88 static int
is_valid_utf8(EncaAnalyserState * analyser)89 is_valid_utf8(EncaAnalyserState *analyser)
90 {
91   static int utf8 = ENCA_CS_UNKNOWN; /* UTF-8 charset */
92   size_t size = analyser->size;
93   const unsigned char *buffer = analyser->buffer;
94   const size_t *const counts = analyser->counts;
95 
96   /* Bonus added when we catch a byte order marker. */
97   size_t bom_bonus;
98 
99   int remains_10xxxxxx = 0;       /* how many next bytes have to be 10xxxxxx */
100   int utf8count = 0;              /* number of UTF-8 encoded characters */
101   size_t i;
102   unsigned char b;
103 
104   /* Bytes 0xfe and 0xff just cannot appear in utf-8 in any case. */
105   if (counts[0xfe] || counts[0xff])
106     return 0;
107 
108   /* Initialize when we are called the first time. */
109   if (utf8 == ENCA_CS_UNKNOWN) {
110     utf8 = enca_name_to_charset("utf-8");
111     assert(utf8 != ENCA_CS_UNKNOWN);
112   }
113 
114   /* Check BOM */
115   bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
116   if (size >= 3
117       && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) {
118     utf8count += bom_bonus;
119     buffer += 3;
120     size -= 3;
121   }
122 
123   /* Parse. */
124   for (i = 0; i < size; i++) {
125     b = buffer[i];
126     if (!remains_10xxxxxx) {
127       if ((b & 0x80) == 0) /* 7bit characters */
128         continue;
129       if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
130         remains_10xxxxxx = 1;
131         utf8count++;
132         continue;
133       }
134       if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
135         remains_10xxxxxx = 2;
136         utf8count++;
137         continue;
138       }
139       /* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
140          they are very rare, nevertheless we accept them. */
141       if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
142         remains_10xxxxxx = 3;
143         utf8count++;
144         continue;
145       }
146       if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
147         remains_10xxxxxx = 4;
148         utf8count++;
149         continue;
150       }
151       if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
152         remains_10xxxxxx = 5;
153         utf8count++;
154         continue;
155       }
156       /* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
157       return 0;
158     }
159     else {
160       /* Broken 10xxxxxx sequence? */
161       if ((b & 0xc0) != 0x80) {
162         return 0;
163       }
164       remains_10xxxxxx--;
165     }
166   }
167 
168   /* Unfinished 10xxxxxx sequence. */
169   if (remains_10xxxxxx != 0 && analyser->options.termination_strictness > 0)
170     return 0;
171 
172   if (utf8count < (int)analyser->options.min_chars)
173     return 0;
174 
175   analyser->result.charset = utf8;
176   analyser->result.surface |= enca_eol_surface(buffer, size, counts);
177   return 1;
178 }
179 
180 /**
181  * looks_like_TeX:
182  * @analyser: Analyser whose buffer is to be checked.
183  *
184  * Checks whether @analyser->buffer contains TeX-encoded 8bit characters.
185  *
186  * Directly modifies @analyser->result on success.
187  *
188  * Returns: Nonzero when @analyser->result was set, zero othewrise.
189  **/
190 static int
looks_like_TeX(EncaAnalyserState * analyser)191 looks_like_TeX(EncaAnalyserState *analyser)
192 {
193   /* TeX escape character, skip-characters, punctuation and alpha accents */
194 
195   /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
196   static const unsigned char TEX_ACCPUNCT[] = {
197     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199     0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
200     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
201     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
203     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
205     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
213   };
214 
215   /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
216   static const unsigned char TEX_ACCALPHA[] = {
217     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
218     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221     0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
222     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
223     0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
224     0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233   };
234 
235   static const unsigned char TEX_ESCAPE = '\\';
236   static const unsigned char TEX_BRACE = '{';
237 
238   static int TeX = ENCA_CS_UNKNOWN; /* TeX charset */
239 
240   const unsigned char *const buffer = analyser->buffer;
241   const size_t size = analyser->size;
242   const size_t *const counts = analyser->counts;
243 
244   size_t TeXaccents = 0; /* number of TeX accents */
245   const unsigned char *p;
246 
247   /* When the file doesn't contain enough escape characters,
248      don't waste time scanning it. */
249   if (counts[TEX_ESCAPE] < analyser->options.min_chars)
250     return 0;
251 
252   /* Initialize when we are called the first time. */
253   if (TeX == ENCA_CS_UNKNOWN) {
254     TeX = enca_name_to_charset("TeX");
255     assert(TeX != ENCA_CS_UNKNOWN);
256   }
257 
258   /* [roughly] count TeX accents */
259   p = memchr(buffer, TEX_ESCAPE, size);
260   while (p != NULL && (size_t)(p-buffer) + 2 < size) {
261     if (*p == TEX_ESCAPE) {
262       p++;
263       if (*p == TEX_ESCAPE)
264         p++; /* catch \\ */
265       if (TEX_ACCPUNCT[*p]
266           || (TEX_ACCALPHA[*p]
267               && (*++p == TEX_BRACE || enca_isspace((char)*p)))) {
268         while ((size_t)(p-buffer) + 1 < size
269                && (*++p == TEX_BRACE || enca_isspace((char)*p)))
270           ;
271         if (enca_isalpha(*p)) TeXaccents++;
272       }
273       continue;
274     }
275     p = memchr(p, TEX_ESCAPE, size - (p - buffer));
276   }
277 
278   if (TeXaccents < analyser->options.min_chars)
279     return 0;
280 
281   analyser->result.charset = TeX;
282   analyser->result.surface |= enca_eol_surface(buffer, size, counts);
283   return 1;
284 }
285 
286 /**
287  * is_valid_utf7:
288  * @analyser: Analyser whose buffer is to be checked.
289  *
290  * Checks whether @analyser->buffer contains valid UTF-7
291  *
292  * Directly modifies @analyser->result on success.
293  *
294  * Returns: Nonzero when @analyser->result was set, zero othewrise.
295  **/
296 static int
is_valid_utf7(EncaAnalyserState * analyser)297 is_valid_utf7(EncaAnalyserState *analyser)
298 {
299   /* UTF-7 special characters. */
300   static const unsigned char UTF7_ESCAPE = '+';
301   /* This is not a bug. `+-' is `+' in UTF-7. */
302   static const unsigned char UTF7_PLUS = '-';
303 
304   /* Base64 base (or so-called set B), see RFC1521, RFC1642 */
305   /* THIS IS A GENERATED TABLE, see tools/expand_table.pl */
306   static const short int BASE64[] = {
307      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
308      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
309      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 63,  0,  0,  0, 64,
310     53, 54, 55, 56, 57, 58, 59, 60, 61, 62,  0,  0,  0,  0,  0,  0,
311      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
312     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,  0,  0,  0,  0,  0,
313      0, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
314     42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,  0,  0,  0,  0,  0,
315      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
316      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
317      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
318      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
319      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
320      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
321      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
322      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
323   };
324 
325   static int utf7 = ENCA_CS_UNKNOWN; /* UTF-7 charset */
326 
327   const unsigned char *const buffer = analyser->buffer;
328   const size_t size = analyser->size;
329   const size_t *const counts = analyser->counts;
330 
331   size_t utf7count = 0; /* number of >7bit characters */
332   unsigned char *p,*q;
333 
334   /* When the file doesn't contain enough UTF-7 shift characters,
335      don't waste time scanning it. */
336   if (counts[UTF7_ESCAPE] < analyser->options.min_chars)
337     return 0;
338 
339   /* Initialize when we are called the first time. */
340   if (utf7 == ENCA_CS_UNKNOWN) {
341     utf7 = enca_name_to_charset("utf-7");
342     assert(utf7 != ENCA_CS_UNKNOWN);
343   }
344 
345   p = memchr(buffer, UTF7_ESCAPE, size);
346   while (p != NULL && (size_t)(p-buffer) + 1 < size) {
347     p++;
348     if (*p == UTF7_PLUS) { /* +- */
349       /* Don't count +- since it's often used for 0x00b1 in plain 7bit ascii. */
350       /* utf7count++; */
351     } else {
352       for (q = p; (size_t)(q-buffer) < size && BASE64[*q]; q++)
353         ;
354       if ((size_t)(q-buffer) == size) {
355         p = q;
356         break;
357       }
358       /* check whether all padding bits are 0's (don't try to understand how) */
359       if (q-p == 0
360           || ((BASE64[*(q-1)]-1) & 0x3f>>(6 - 6*(q - p)%8)))
361         return 0;
362 
363       utf7count += 6*(q - p)/16;
364       p = q;
365     }
366     p = memchr(p, UTF7_ESCAPE, size - (p - buffer));
367   }
368 
369   /* p != NULL means unsinished sequence here. */
370   if (p != NULL && analyser->options.termination_strictness > 0)
371     return 0;
372 
373   if (utf7count < analyser->options.min_chars)
374     return 0;
375 
376   analyser->result.charset = utf7;
377   analyser->result.surface |= enca_eol_surface(buffer, size, counts);
378   return 1;
379 }
380 
381 /**
382  * looks_like_ucs2:
383  * @analyser: Analyser whose buffer is to be checked.
384  *
385  * Checks whether @analyser->buffer contains UCS-2 encoded characters.
386  *
387  * Directly modifies @analyser->result on success.
388  *
389  * Returns: Nonzero when @analyser->result was set, zero othewrise.
390  **/
391 static int
looks_like_ucs2(EncaAnalyserState * analyser)392 looks_like_ucs2(EncaAnalyserState *analyser)
393 {
394   static int ucs2 = ENCA_CS_UNKNOWN; /* UCS-2 charset id */
395 
396   const unsigned char *const buffer = analyser->buffer;
397   const size_t size = analyser->size;
398   const size_t *const counts = analyser->counts;
399 
400   /* Bonus added when we catch a byte order marker. */
401   size_t bom_bonus;
402 
403   size_t ucs2count = 0; /* something like number of good ucs-2 characters */
404   unsigned int byte_order = 0; /* default byte order is little endian
405                                 * (msb first) */
406   unsigned int byte_order_changes = 0; /* how many times byte_order changed */
407   size_t cr = 0; /* number of CR's */
408   size_t lf = 0; /* number of LF's */
409   int crlf_ok = 1; /* are all LF's preceeded by CR's? */
410   unsigned char b1, b2;
411   double r;
412   size_t i;
413 
414   /* The number of bytes must be of course even */
415   if (size%2 != 0)
416     return 0;
417 
418   bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
419 
420   /* When the file doesn't contain enough zeros,
421      don't waste time scanning it. */
422   r = (2.0*(counts[0] + counts[1] + counts[2] + counts[3] + counts[4])
423        + bom_bonus)/size;
424   if (r < log(analyser->options.threshold + EPSILON))
425     return 0;
426 
427   /* Initialize when we are called the first time. */
428   if (ucs2 == ENCA_CS_UNKNOWN) {
429     ucs2 = enca_name_to_charset("ucs-2");
430     assert(ucs2 != ENCA_CS_UNKNOWN);
431   }
432 
433   /* Try to catch lsb even when it doesn't start with endian marker. */
434   if (buffer[1] == 0 && enca_isprint(buffer[0]))
435     byte_order = 1;
436 
437   /* Scan buffer. */
438   for (i = 0; i < size; i += 2) {
439     b1 = buffer[i + byte_order];
440     b2 = buffer[i+1 - byte_order];
441     /* Byte order marker detection. */
442     if (b1 == 0xfe && b2 == 0xff) {
443       if (i == 0)
444         ucs2count += bom_bonus;
445       else
446         byte_order_changes++;
447       continue;
448     }
449     if (b1 == 0xff && b2 == 0xfe) {
450       byte_order = 1-byte_order;
451       if (i == 0)
452         ucs2count += bom_bonus;
453       else
454         byte_order_changes++;
455       continue;
456     }
457     /* Black magic.
458      * When almost any word can be UCS-2 character, we have to assume some
459      * are far more probable. */
460     if (b1 == 0) {
461       ucs2count += (enca_isprint(b2) || enca_isspace(b2)) ? 2 : 0;
462       /* check EOLs */
463       if (b2 == CR)
464         cr++;
465       if (b2 == LF) {
466         lf++;
467         if (i > 0
468             && (buffer[i-1-byte_order] != CR
469                 || buffer[i-2+byte_order] != 0))
470           crlf_ok = 0;
471       }
472     }
473     else {
474       if (b1 <= 4)
475         ucs2count += 2;
476     }
477   }
478 
479   /* Now we have to decide what we tell to the caller. */
480   r = (double)ucs2count/size;
481   if (r < log(analyser->options.threshold + EPSILON)
482       || ucs2count/2 < analyser->options.min_chars)
483     return 0;
484 
485   analyser->result.charset = ucs2;
486 
487   /* Byte order surface. */
488   if (byte_order_changes)
489     analyser->result.surface |= ENCA_SURFACE_PERM_MIX;
490   else
491     analyser->result.surface |= byte_order ? ENCA_SURFACE_PERM_21: 0;
492 
493   /* EOL surface. */
494   if (cr == 0)
495     analyser->result.surface |= ENCA_SURFACE_EOL_LF;
496   else {
497     if (lf == 0)
498       analyser->result.surface |= ENCA_SURFACE_EOL_CR;
499     else {
500       analyser->result.surface |= crlf_ok
501                                   ? ENCA_SURFACE_EOL_CRLF
502                                   : ENCA_SURFACE_EOL_MIX;
503     }
504   }
505 
506   return 1;
507 }
508 
509 /**
510  * looks_like_ucs4:
511  * @analyser: Analyser whose buffer is to be checked.
512  *
513  * Checks whether @analyser->buffer contains UCS-4 encoded characters.
514  *
515  * Directly modifies @analyser->result on success.
516  *
517  * Returns: Nonzero when @analyser->result was set, zero othewrise.
518  **/
519 static int
looks_like_ucs4(EncaAnalyserState * analyser)520 looks_like_ucs4(EncaAnalyserState *analyser)
521 {
522   static const EncaSurface PERMS[] = {
523     ENCA_SURFACE_PERM_4321,
524     ENCA_SURFACE_PERM_21
525   };
526 
527   static int ucs4 = ENCA_CS_UNKNOWN; /* UCS-4 charset id */
528 
529   unsigned char *buffer = analyser->buffer;
530   const size_t size = analyser->size;
531   const size_t *const counts = analyser->counts;
532 
533   ssize_t ucs4count = 0; /* ucs-4-icity */
534   size_t count_perm[4]; /* counts for various byteorders */
535   EncaSurface eol[4]; /* EOL types for various byteorders */
536   double r; /* rating */
537   size_t i, max;
538 
539   /* The number of bytes must be of course multiple of 4. */
540   if (size%4 != 0)
541     return 0;
542 
543   /* When the file doesn't contain enough zeros (and other small bytes),
544      don't waste time scanning it. */
545   r = (4.0*(counts[0] + counts[1] + counts[2] + counts[3] + counts[4])/3.0)
546       /size;
547   if (r < log(analyser->options.threshold + EPSILON))
548     return 0;
549 
550   /* Initialize when we are called the first time. */
551   if (ucs4 == ENCA_CS_UNKNOWN) {
552     ucs4 = enca_name_to_charset("ucs-4");
553     assert(ucs4 != ENCA_CS_UNKNOWN);
554   }
555 
556   /* Try all sensible unsigned charorders and find maximum.
557      At the end the buffer has the same byteorder as it had, but when
558      the buffer have to be considered const, work on copy. */
559   if (analyser->options.const_buffer) {
560     buffer = memcpy(enca_malloc(size), buffer, size);
561   }
562 
563   max = 0;
564   for (i = 0; i < 4; i++) {
565     count_perm[i] = what_if_it_was_ucs4(buffer, size,
566                                         analyser->options.min_chars,
567                                         eol + i);
568     if (count_perm[i] > count_perm[max])
569       max = i;
570     shuffle_byte_order(buffer, size, PERMS[i%2]);
571   }
572 
573   if (analyser->options.const_buffer)
574     enca_free(buffer);
575 
576   /* Use quite a cruel selection to restrain other byteorders. */
577   ucs4count = 2*count_perm[max];
578   for (i = 0; i < 4; i++)
579     ucs4count -= count_perm[i];
580 
581   /* Now we have to decide what we tell to the caller. */
582   r = (double)ucs4count/size;
583   if (r < log(analyser->options.threshold + EPSILON)
584       || ucs4count/4 < (int)analyser->options.min_chars)
585     return 0;
586 
587   analyser->result.charset = ucs4;
588   /* Compute what permutation corresponds to max. */
589   for (i = 0; i < max; i++)
590     analyser->result.surface ^= PERMS[i%2];
591   analyser->result.surface |= eol[max];
592 
593   return 1;
594 }
595 
596 /**
597  * what_if_it_was_ucs4:
598  * @buffer: Buffer to be checked.
599  * @size: Size of @buffer.
600  * @min_chars: Minimal number of `nice' UCS-4 characters to succeede.
601  * @crlf_surf: Where detected EOL surface type should be stored.
602  *
603  * Checks whether @buffer contains little endian UCS-4 encoded characters.
604  *
605  * Assumes @buffer contains little endian UCS-4 and returns the number of
606  * `good' characters, and in case it's at least @min_chars, finds EOL surface
607  * type too.
608  *
609  * Returns: The number of `good' UCS-4 characters with some bonus for a good
610  * BOM.
611  **/
612 static size_t
what_if_it_was_ucs4(const unsigned char * buffer,size_t size,size_t min_chars,EncaSurface * crlf_surf)613 what_if_it_was_ucs4(const unsigned char *buffer,
614                     size_t size,
615                     size_t min_chars,
616                     EncaSurface *crlf_surf)
617 {
618   /* Bonus added when we catch a byte order marker. */
619   size_t bom_bonus;
620 
621   size_t count = 0;   /* ucs-4-icity */
622   size_t cr = 0;      /* number of CR's */
623   size_t lf = 0;      /* number of LF's */
624   int crlf_ok = 1;    /* are all LF's preceeded by CR's? */
625   size_t i;
626 
627   /* check BOM */
628   bom_bonus = (size_t)(sqrt((double)size) + size/20.0);
629   if (size) {
630     if (buffer[0] == 0 && buffer[1] == 0
631         && buffer[2] == 0xfe && buffer[3] == 0xff) {
632       count += bom_bonus;
633       buffer += 4;
634       size -= 4;
635     }
636   }
637 
638   for (i = 0; i < size; i += 4) {
639     /* Does it look like little endian ucs-4? */
640     if (buffer[i] == 0 && buffer[i+1] == 0) {
641       if (buffer[i+2] == 0)
642         count += enca_isprint(buffer[i+3]) || enca_isspace(buffer[i+3]) ? 4 : 0;
643       else {
644         if (buffer[i+2] < 5)
645           count += 4;
646       }
647     }
648   }
649 
650   /* Detect EOL surface
651    * To be 100% portable, we do it the ugly way: by testing individual bytes. */
652   if (count/4 >= min_chars) {
653     for (i = 0; i < size; i += 4) {
654       if (buffer[i+3] == CR && buffer[i+2] == 0
655           && buffer[i+1] == 0 && buffer[i] == 0)
656         cr++;
657       if (buffer[i+3] == LF && buffer[i+2] == 0
658           && buffer[i+1] == 0 && buffer[i] == 0) {
659         lf++;
660         if (crlf_ok && i > 0
661             && (buffer[i-1] != CR || buffer[i-2] != 0
662                 || buffer[i-3] != 0 || buffer[i-4] != 0))
663           crlf_ok = 0;
664       }
665     }
666     /* EOL surface result */
667     if (cr == 0)
668       *crlf_surf = ENCA_SURFACE_EOL_LF;
669     else {
670       if (lf == 0)
671         *crlf_surf = ENCA_SURFACE_EOL_CR;
672       else
673         *crlf_surf = crlf_ok ? ENCA_SURFACE_EOL_CRLF : ENCA_SURFACE_EOL_MIX;
674     }
675   }
676 
677   return count;
678 }
679 
680 /**
681  * shuffle_byte_order:
682  * @buffer: Buffer to be shuffled.
683  * @size: Size of @buffer.
684  * @permutation: Permutation type, possible values mean
685  *               0                                                no change
686  *               ENCA_SURFACE_PERM_4321                           4321
687  *               ENCA_SURFACE_PERM_21                             21 (== 2143)
688  *               ENCA_SURFACE_PERM_21|ENCA_SURFACE_PERM_4321      3412
689  *
690  * Performs given permutation on @buffer.
691  **/
692 static void
shuffle_byte_order(unsigned char * buffer,size_t size,EncaSurface permutation)693 shuffle_byte_order(unsigned char *buffer,
694                    size_t size,
695                    EncaSurface permutation)
696 {
697   size_t i;
698   unsigned char b;
699 
700   if (permutation & ENCA_SURFACE_PERM_4321) {
701     for (i = 0; i < size; i += 4) {
702       b = buffer[i];
703       buffer[i] = buffer[i+3];
704       buffer [i+3] = b;
705 
706       b = buffer[i+1];
707       buffer[i+1] = buffer[i+2];
708       buffer[i+2] = b;
709     }
710   }
711 
712   if (permutation & ENCA_SURFACE_PERM_21) {
713     for (i = 0; i < size; i += 2) {
714       b = buffer[i];
715       buffer[i] = buffer[i+1];
716       buffer [i+1] = b;
717     }
718   }
719 }
720 
721 /**
722  * looks_like_utf8:
723  * @analyser: Analyser whose buffer is to be checked.
724  *
725  * Checks whether @analyser->buffer may contain UTF-8.
726  *
727  * This is a fault-tolerant version of is_valid_utf8, intended to be used after
728  * filtering, when a few stray 8bit characters may appear in the sample.
729  *
730  * Directly modifies @analyser->result on success.
731  *
732  * Returns: Nonzero when @analyser->result was set, zero othewrise.
733  **/
734 static int
looks_like_utf8(EncaAnalyserState * analyser)735 looks_like_utf8(EncaAnalyserState *analyser)
736 {
737   static int utf8 = ENCA_CS_UNKNOWN; /* UTF-8 charset */
738   size_t size = analyser->size;
739   const unsigned char *buffer = analyser->buffer;
740   const size_t *const counts = analyser->counts;
741 
742   /* Bonus added when we catch a byte order marker. */
743   size_t bom_bonus;
744 
745   int remains_10xxxxxx = 0;       /* how many next bytes have to be 10xxxxxx */
746   int utf8count = 0;              /* number of UTF-8 encoded characters */
747   int failures = 0;               /* number of invalid sequences encountered */
748   size_t i;
749   unsigned char b;
750 
751   /* Initialize when we are called the first time. */
752   if (utf8 == ENCA_CS_UNKNOWN) {
753     utf8 = enca_name_to_charset("utf-8");
754     assert(utf8 != ENCA_CS_UNKNOWN);
755   }
756 
757   /* Check BOM */
758   bom_bonus = (size_t)(sqrt((double)size) + size/10.0);
759   if (size >= 3
760       && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) {
761     utf8count += bom_bonus;
762     buffer += 3;
763     size -= 3;
764   }
765 
766   /* Parse. */
767   for (i = 0; i < size; i++) {
768     b = buffer[i];
769     if (!remains_10xxxxxx) {
770       if ((b & 0x80) == 0) /* 7bit characters */
771         continue;
772       if ((b & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx sequence */
773         remains_10xxxxxx = 1;
774         utf8count++;
775         continue;
776       }
777       if ((b & 0xf0) == 0xe0) { /* 1110xxxx 2 x 10xxxxxx sequence */
778         remains_10xxxxxx = 2;
779         utf8count++;
780         continue;
781       }
782       /* Following are valid 32-bit UCS characters, but not 16-bit Unicode,
783          they are very rare, nevertheless we accept them. */
784       if ((b & 0xf8) == 0xf0) { /* 1110xxxx 3 x 10xxxxxx sequence */
785         remains_10xxxxxx = 3;
786         utf8count++;
787         continue;
788       }
789       if ((b & 0xfc) == 0xf8) { /* 1110xxxx 4 x 10xxxxxx sequence */
790         remains_10xxxxxx = 4;
791         utf8count++;
792         continue;
793       }
794       if ((b & 0xfe) == 0xfc) { /* 1110xxxx 5 x 10xxxxxx sequence */
795         remains_10xxxxxx = 5;
796         utf8count++;
797         continue;
798       }
799       /* We can get here only when input is invalid: (b & 0xc0) == 0x80. */
800       failures++;
801       remains_10xxxxxx = 0;
802     }
803     else {
804       /* Broken 10xxxxxx sequence? */
805       if ((b & 0xc0) != 0x80) {
806         failures++;
807         utf8count--;
808         remains_10xxxxxx = 0;
809       }
810       else
811         remains_10xxxxxx--;
812     }
813   }
814 
815   /* Unfinished 10xxxxxx sequence. */
816   if (remains_10xxxxxx != 0 && analyser->options.termination_strictness > 0)
817     failures += 2;
818 
819   /* Tolerate a small number of failures. */
820   if (failures > exp(-7*(analyser->options.threshold - 1.0))*utf8count/2.0)
821     return 0;
822 
823   analyser->result.charset = utf8;
824   analyser->result.surface |= enca_eol_surface(buffer, size, counts);
825   if (failures > 0)
826     analyser->result.surface |= ENCA_SURFACE_EOL_BIN;
827   return 1;
828 }
829 
830 /**
831  * looks_like_hz:
832  * @analyser: An analyser.
833  *
834  * Checks whether @analyser buffer is HZ-encoded. See RFC 1843
835  *
836  * Directly modifies @analyser->result on success.
837  *
838  * Returns: Nonzero when @analyser->result was set, zero othewrise.
839  **/
840 static int
looks_like_hz(EncaAnalyserState * analyser)841 looks_like_hz(EncaAnalyserState *analyser)
842 {
843   unsigned char *buffer = analyser->buffer;
844   size_t size = analyser->size;
845   static int hz = ENCA_CS_UNKNOWN; /* HZ charset */
846   size_t hzcount = 0; /* number of qp encoded characters */
847   unsigned char *p = buffer;
848   const size_t *const counts = analyser->counts;
849 
850   int escaped; /* true when we're in 8-bit mode */
851   unsigned int i;
852 
853   /* Initialize when we are called the first time. */
854   if (hz == ENCA_CS_UNKNOWN) {
855     hz = enca_name_to_charset("hz");
856     assert(hz != ENCA_CS_UNKNOWN);
857   }
858 
859   for (i = 0; i < analyser->ncharsets; i++)
860    if (analyser->charsets[i] ==  hz)
861      goto goahead;
862   return 0;
863 
864 goahead:
865   /* When the file doesn't contain escape characters,
866      don't waste time scanning it. */
867   if (counts['{'] == 0
868     || counts['}'] == 0
869     || counts['~'] == 0)
870     return 0;
871 
872   /* Move to first escaped-in */
873   /* FIXME: Things will be simpler if we have strnstr()? */
874   while ((size_t)(p - buffer) + 2 < size) {
875      p = memchr(p, '~', size - (p - buffer));
876      if (p == NULL)
877        return 0;
878      if (p[1] == '{') {
879        escaped = 1;
880        p += 2;
881        break;
882      } else if (p[1] == '\n') {
883        p += 2;
884      } else if (p[1] == '~') {
885        p += 2;
886      } else
887        p += 2;
888   }
889 
890   /* Check if it's valid HZ and count hz encoded characters. */
891   while (p < buffer + size) {
892     if (*p == '~' && p < buffer + size - 1) {
893       switch (p[1]) {
894         case '~':
895           if (escaped) {
896             p++;
897             hzcount++;
898           } else {
899             p += 2;
900           }
901           break;
902         case '{':
903           if (!escaped) {
904             p += 2;
905             escaped = 1;
906           } else {
907             return 0;
908           }
909           break;
910         case '}':
911           if (escaped) {
912             escaped = 0;
913             p += 2;
914           } else {
915             return 0;
916           }
917           break;
918         case '\n':
919           if (escaped) {
920             return 0;
921           }
922           p += 2;
923           break;
924         default:
925           if (!escaped) {
926             return 0;
927           }
928           p++;
929       }
930     } else {
931       /* Spaces, CR or LF not allowed in escaped block */
932       if (escaped) {
933         if (*p < ' ') {
934           return 0;
935         }
936         hzcount++;
937       }
938       p++;
939     }
940   }
941 
942   if (hzcount < analyser->options.min_chars)
943     return 0;
944 
945   /* Unfinished escaped block here. */
946   if (escaped && analyser->options.termination_strictness > 0)
947     return 0;
948 
949   analyser->result.charset = hz;
950   analyser->result.surface |= enca_eol_surface(buffer, size, counts);
951 
952   return 1;
953 }
954