1 /*
2 ** Copyright (c) 2013 D. Richard Hipp
3 **
4 ** This program is free software; you can redistribute it and/or
5 ** modify it under the terms of the Simplified BSD License (also
6 ** known as the "2-Clause License" or "FreeBSD License".)
7 
8 ** This program is distributed in the hope that it will be useful,
9 ** but without any warranty; without even the implied warranty of
10 ** merchantability or fitness for a particular purpose.
11 **
12 ** Author contact information:
13 **   drh@hwaci.com
14 **   http://www.hwaci.com/drh/
15 **
16 *******************************************************************************
17 **
18 ** This file contains code used to try to guess if a particular file is
19 ** text or binary, what types of line endings it uses, is it UTF8 or
20 ** UTF16, etc.
21 */
22 #include "config.h"
23 #include "lookslike.h"
24 #include <assert.h>
25 
26 
27 #if INTERFACE
28 
/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  Every basic flag
** owns exactly one bit, so any combination of them can be OR-ed together.
*/
#define LOOK_NONE    ((int)0)       /* Nothing special was found. */
#define LOOK_NUL     ((int)(1<<0))  /* One or more NUL chars were found. */
#define LOOK_CR      ((int)(1<<1))  /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)(1<<2))  /* An unpaired CR char was found. */
#define LOOK_LF      ((int)(1<<3))  /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)(1<<4))  /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)(1<<5))  /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)(1<<6))  /* An over length line was found. */
#define LOOK_ODD     ((int)(1<<7))  /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)(1<<8))  /* Unable to perform full check. */
#define LOOK_INVALID ((int)(1<<9))  /* Invalid sequence was found. */

/* Composite masks built from the basic flags above. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */

/*
** Evaluates to non-zero when the specified blob contains data that MAY be
** binary in nature, and to zero otherwise.  The scan is performed by
** looks_like_utf8() and stops as soon as any LOOK_BINARY flag shows up.
*/
#define looks_like_binary(blob) \
    (LOOK_NONE != (looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY))
53 #endif /* INTERFACE */
54 
/*
** Valid ranges for the byte that follows a UTF-8 lead byte.  Each macro
** expands to a pair "first valid value, number of valid values"; a count
** of zero means that no following byte is ever accepted.
*/
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/* Eight identical table entries in a row (one group of eight lead bytes). */
#define US0A_X8  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
#define US2B_X8  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B
#define US3B_X8  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B

/*
** Lookup table holding one two-byte entry for each byte value 0x80..0xFF.
** The entry for byte value B starts at offset 2*(B-0x80), which is the
** same as the low eight bits of B<<1.
*/
static const unsigned char lb_tab[] = {
  /* 0x80..0xBF: no following byte accepted */
  US0A_X8, US0A_X8, US0A_X8, US0A_X8,
  US0A_X8, US0A_X8, US0A_X8, US0A_X8,
  /* 0xC0..0xC7: 0xC0 accepts only 0x80, 0xC1 accepts nothing */
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xC8..0xDF */
  US2B_X8, US2B_X8, US2B_X8,
  /* 0xE0..0xE7 */
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xE8..0xEF */
  US3B_X8,
  /* 0xF0..0xF7: 0xF5 and above accept nothing */
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  /* 0xF8..0xFF */
  US0A_X8
};
86 
87 /*
88 ** This function attempts to scan each logical line within the blob to
89 ** determine the type of content it appears to contain.  The return value
90 ** is a combination of one or more of the LOOK_XXX flags (see above):
91 **
92 ** !LOOK_BINARY -- The content appears to consist entirely of text; however,
93 **                 the encoding may not be UTF-8.
94 **
95 ** LOOK_BINARY -- The content appears to be binary because it contains one
96 **                or more embedded NUL characters or an extremely long line.
97 **                Since this function does not understand UTF-16, it may
98 **                falsely consider UTF-16 text to be binary.
99 **
100 ** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
101 ** may be present in the result as well; however, they should not impact the
102 ** determination of text versus binary content.
103 **
104 ************************************ WARNING **********************************
105 **
106 ** This function does not validate that the blob content is properly formed
107 ** UTF-8.  It assumes that all code points are the same size.  It does not
108 ** validate any code points.  It makes no attempt to detect if any [invalid]
109 ** switches between UTF-8 and other encodings occur.
110 **
111 ** The only code points that this function cares about are the NUL character,
112 ** carriage-return, and line-feed.
113 **
114 ** This function examines the contents of the blob until one of the flags
115 ** specified in "stopFlags" is set.
116 **
117 ************************************ WARNING **********************************
118 */
looks_like_utf8(const Blob * pContent,int stopFlags)119 int looks_like_utf8(const Blob *pContent, int stopFlags){
120   const char *z = blob_buffer(pContent);
121   unsigned int n = blob_size(pContent);
122   int j, c, flags = LOOK_NONE;  /* Assume UTF-8 text, prove otherwise */
123 
124   if( n==0 ) return flags;  /* Empty file -> text */
125   c = *z;
126   if( c==0 ){
127     flags |= LOOK_NUL;  /* NUL character in a file -> binary */
128   }else if( c=='\r' ){
129     flags |= LOOK_CR;
130     if( n<=1 || z[1]!='\n' ){
131       flags |= LOOK_LONE_CR;  /* Not enough chars or next char not LF */
132     }
133   }
134   j = (c!='\n');
135   if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF);  /* Found LF as first char */
136   while( !(flags&stopFlags) && --n>0 ){
137     int c2 = c;
138     c = *++z; ++j;
139     if( c==0 ){
140       flags |= LOOK_NUL;  /* NUL character in a file -> binary */
141     }else if( c=='\n' ){
142       flags |= LOOK_LF;
143       if( c2=='\r' ){
144         flags |= (LOOK_CR | LOOK_CRLF);  /* Found LF preceded by CR */
145       }else{
146         flags |= LOOK_LONE_LF;
147       }
148       if( j>LENGTH_MASK ){
149         flags |= LOOK_LONG;  /* Very long line -> binary */
150       }
151       j = 0;
152     }else if( c=='\r' ){
153       flags |= LOOK_CR;
154       if( n<=1 || z[1]!='\n' ){
155         flags |= LOOK_LONE_CR;  /* Not enough chars or next char not LF */
156       }
157     }
158   }
159   if( n ){
160     flags |= LOOK_SHORT;  /* The whole blob was not examined */
161   }
162   if( j>LENGTH_MASK ){
163     flags |= LOOK_LONG;  /* Very long line -> binary */
164   }
165   return flags;
166 }
167 
168 /*
169 ** Checks for proper UTF-8. It uses the method described in:
170 **   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
171 ** except for the "overlong form" of \u0000 which is not considered
172 ** invalid here: Some languages like Java and Tcl use it. This function
173 ** also considers valid the derivatives CESU-8 & WTF-8 (as described in
174 ** the same wikipedia article referenced previously). For UTF-8 characters
175 ** > 0x7f, the variable 'c' not necessary means the real lead byte.
176 ** It's number of higher 1-bits indicate the number of continuation
177 ** bytes that are expected to be followed. E.g. when 'c' has a value
178 ** in the range 0xc0..0xdf it means that after 'c' a single continuation
179 ** byte is expected. A value 0xe0..0xef means that after 'c' two more
180 ** continuation bytes are expected.
181 */
182 
invalid_utf8(const Blob * pContent)183 int invalid_utf8(
184   const Blob *pContent
185 ){
186   const unsigned char *z = (unsigned char *) blob_buffer(pContent);
187   unsigned int n = blob_size(pContent);
188   unsigned char c; /* lead byte to be handled. */
189 
190   if( n==0 ) return 0;  /* Empty file -> OK */
191   c = *z;
192   while( --n>0 ){
193     if( c>=0x80 ){
194       const unsigned char *def; /* pointer to range table*/
195 
196       c <<= 1; /* multiply by 2 and get rid of highest bit */
197       def = &lb_tab[c]; /* search fb's valid range in table */
198       if( (unsigned int)(*++z-def[0])>=def[1] ){
199         return LOOK_INVALID; /* Invalid UTF-8 */
200       }
201       c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
202     } else {
203       c = *++z;
204     }
205   }
206   return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
207 }
208 
/*
** The type used to hold one UTF-16 character.  A definition supplied from
** outside takes precedence over the defaults chosen here.
*/
#if !defined(WCHAR_T)
#  if defined(_WIN32)
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ \
    (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK \
    ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** UTF16_SWAP(ch) exchanges the two low-order bytes of ch; it is used by
** looks_like_utf16() to read content stored in the opposite byte order.
** UTF16_SWAP_IF(expr,ch) does the same, but only when expr is true.
*/
#define UTF16_SWAP(ch)         ((((ch) & 0xff) << 8) | (((ch) >> 8) & 0xff))
#define UTF16_SWAP_IF(expr,ch) (!(expr) ? (ch) : UTF16_SWAP((ch)))
234 
235 /*
236 ** This function attempts to scan each logical line within the blob to
237 ** determine the type of content it appears to contain.  The return value
238 ** is a combination of one or more of the LOOK_XXX flags (see above):
239 **
240 ** !LOOK_BINARY -- The content appears to consist entirely of text; however,
241 **                 the encoding may not be UTF-16.
242 **
243 ** LOOK_BINARY -- The content appears to be binary because it contains one
244 **                or more embedded NUL characters or an extremely long line.
245 **                Since this function does not understand UTF-8, it may
246 **                falsely consider UTF-8 text to be binary.
247 **
248 ** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
249 ** may be present in the result as well; however, they should not impact the
250 ** determination of text versus binary content.
251 **
252 ************************************ WARNING **********************************
253 **
254 ** This function does not validate that the blob content is properly formed
255 ** UTF-16.  It assumes that all code points are the same size.  It does not
256 ** validate any code points.  It makes no attempt to detect if any [invalid]
257 ** switches between the UTF-16be and UTF-16le encodings occur.
258 **
259 ** The only code points that this function cares about are the NUL character,
260 ** carriage-return, and line-feed.
261 **
262 ** This function examines the contents of the blob until one of the flags
263 ** specified in "stopFlags" is set.
264 **
265 ************************************ WARNING **********************************
266 */
looks_like_utf16(const Blob * pContent,int bReverse,int stopFlags)267 int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){
268   const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
269   unsigned int n = blob_size(pContent);
270   int j, c, flags = LOOK_NONE;  /* Assume UTF-16 text, prove otherwise */
271 
272   if( n%sizeof(WCHAR_T) ){
273     flags |= LOOK_ODD;  /* Odd number of bytes -> binary (UTF-8?) */
274   }
275   if( n<sizeof(WCHAR_T) ) return flags;  /* Zero or One byte -> binary (UTF-8?) */
276   c = *z;
277   if( bReverse ){
278     c = UTF16_SWAP(c);
279   }
280   if( c==0 ){
281     flags |= LOOK_NUL;  /* NUL character in a file -> binary */
282   }else if( c=='\r' ){
283     flags |= LOOK_CR;
284     if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
285       flags |= LOOK_LONE_CR;  /* Not enough chars or next char not LF */
286     }
287   }
288   j = (c!='\n');
289   if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF);  /* Found LF as first char */
290   while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){
291     int c2 = c;
292     c = *++z;
293     if( bReverse ){
294       c = UTF16_SWAP(c);
295     }
296     ++j;
297     if( c==0 ){
298       flags |= LOOK_NUL;  /* NUL character in a file -> binary */
299     }else if( c=='\n' ){
300       flags |= LOOK_LF;
301       if( c2=='\r' ){
302         flags |= (LOOK_CR | LOOK_CRLF);  /* Found LF preceded by CR */
303       }else{
304         flags |= LOOK_LONE_LF;
305       }
306       if( j>UTF16_LENGTH_MASK ){
307         flags |= LOOK_LONG;  /* Very long line -> binary */
308       }
309       j = 0;
310     }else if( c=='\r' ){
311       flags |= LOOK_CR;
312       if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
313         flags |= LOOK_LONE_CR;  /* Not enough chars or next char not LF */
314       }
315     }
316   }
317   if( n ){
318     flags |= LOOK_SHORT;  /* The whole blob was not examined */
319   }
320   if( j>UTF16_LENGTH_MASK ){
321     flags |= LOOK_LONG;  /* Very long line -> binary */
322   }
323   return flags;
324 }
325 
/*
** Return a pointer to a static array that starts with the three bytes of
** the UTF-8 byte-order-mark, followed by zero bytes.  If pnByte is not
** NULL, the length of the byte-order-mark (3) is stored in *pnByte.
*/
const unsigned char *get_utf8_bom(int *pnByte){
  static const unsigned char aUtf8Bom[] = {
    0xef, 0xbb, 0xbf,   /* the byte-order-mark itself */
    0x00, 0x00, 0x00    /* zero padding */
  };
  if( pnByte!=0 ){
    pnByte[0] = 3;
  }
  return &aUtf8Bom[0];
}
337 
338 /*
339 ** This function returns non-zero if the blob starts with a UTF-8
340 ** byte-order-mark (BOM).
341 */
starts_with_utf8_bom(const Blob * pContent,int * pnByte)342 int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
343   const char *z = blob_buffer(pContent);
344   int bomSize = 0;
345   const unsigned char *bom = get_utf8_bom(&bomSize);
346 
347   if( pnByte ) *pnByte = bomSize;
348   if( blob_size(pContent)<bomSize ) return 0;
349   return memcmp(z, bom, bomSize)==0;
350 }
351 
352 /*
353 ** This function returns non-zero if the blob starts with a UTF-16
354 ** byte-order-mark (BOM), either in the endianness of the machine
355 ** or in reversed byte order. The UTF-32 BOM is ruled out by checking
356 ** if the UTF-16 BOM is not immediately followed by (utf16) 0.
357 ** pnByte is only set when the function returns 1.
358 **
359 ** pbReverse is always set, even when no BOM is found. Without a BOM,
360 ** it is set to 1 on little-endian and 0 on big-endian platforms. See
361 ** clause D98 of conformance (section 3.10) of the Unicode standard.
362 */
starts_with_utf16_bom(const Blob * pContent,int * pnByte,int * pbReverse)363 int starts_with_utf16_bom(
364   const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
365   int *pnByte,          /* OUT: The number of bytes used for the BOM. */
366   int *pbReverse        /* OUT: Non-zero for BOM in reverse byte-order. */
367 ){
368   const unsigned char *z = (unsigned char *)blob_buffer(pContent);
369   int bomSize = sizeof(unsigned short);
370   int size = blob_size(pContent);
371   unsigned short i0;
372 
373   if( size<bomSize ) goto noBom;  /* No: cannot read BOM. */
374   if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom;
375   memcpy(&i0, z, sizeof(i0));
376   if( i0==0xfeff ){
377     if( pbReverse ) *pbReverse = 0;
378   }else if( i0==0xfffe ){
379     if( pbReverse ) *pbReverse = 1;
380   }else{
381     static const int one = 1;
382   noBom:
383     if( pbReverse ) *pbReverse = *(char *) &one;
384     return 0; /* No: UTF-16 byte-order-mark not found. */
385   }
386   if( pnByte ) *pnByte = bomSize;
387   return 1; /* Yes. */
388 }
389 
390 /*
391 ** Returns non-zero if the specified content could be valid UTF-16.
392 */
could_be_utf16(const Blob * pContent,int * pbReverse)393 int could_be_utf16(const Blob *pContent, int *pbReverse){
394   return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ?
395       starts_with_utf16_bom(pContent, 0, pbReverse) : 0;
396 }
397 
398 
399 /*
400 ** COMMAND: test-looks-like-utf
401 **
402 ** Usage:  %fossil test-looks-like-utf FILENAME
403 **
404 ** Options:
405 **    -n|--limit N     Repeat looks-like function N times, for
406 **                     performance measurement. Default = 1
407 **    --utf8           Ignoring BOM and file size, force UTF-8 checking
408 **    --utf16          Ignoring BOM and file size, force UTF-16 checking
409 **
410 ** FILENAME is the name of a file to check for textual content in the UTF-8
411 ** and/or UTF-16 encodings.
412 */
looks_like_utf_test_cmd(void)413 void looks_like_utf_test_cmd(void){
414   Blob blob;         /* the contents of the specified file */
415   int fUtf8 = 0;     /* return value of starts_with_utf8_bom() */
416   int fUtf16 = 0;    /* return value of starts_with_utf16_bom() */
417   int fUnicode = 0;  /* return value of could_be_utf16() */
418   int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
419   int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
420   int fForceUtf8 = find_option("utf8",0,0)!=0;
421   int fForceUtf16 = find_option("utf16",0,0)!=0;
422   const char *zCount = find_option("limit","n",1);
423   int nRepeat = 1;
424 
425   if( g.argc!=3 ) usage("FILENAME");
426   if( zCount ){
427     nRepeat = atoi(zCount);
428   }
429   blob_read_from_file(&blob, g.argv[2], ExtFILE);
430   while( --nRepeat >= 0 ){
431     fUtf8 = starts_with_utf8_bom(&blob, 0);
432     fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
433     if( fForceUtf8 ){
434       fUnicode = 0;
435     }else{
436       fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
437     }
438     if( fUnicode ){
439       lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
440     }else{
441       lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
442     }
443   }
444   fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
445   fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
446   fossil_print("Starts with UTF-16 BOM: %s\n",
447                fUtf16?(bRevUtf16?"reversed":"yes"):"no");
448   fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
449                (lookFlags&LOOK_BINARY)?"no":"yes");
450   fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
451   fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
452   fossil_print("Has flag LOOK_LONE_CR: %s\n",
453                (lookFlags&LOOK_LONE_CR)?"yes":"no");
454   fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
455   fossil_print("Has flag LOOK_LONE_LF: %s\n",
456                (lookFlags&LOOK_LONE_LF)?"yes":"no");
457   fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
458   fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");
459   fossil_print("Has flag LOOK_INVALID: %s\n",
460                (lookFlags&LOOK_INVALID)?"yes":"no");
461   fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
462   fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");
463   blob_reset(&blob);
464 }
465