1 /*
2  * isutf8.c - do the input files look like valid utf-8 byte streams?
3  *
4  * Copyright (C) 2005  Lars Wirzenius
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19  */
20 
21 
22 /*
23   Editor's Note:
24    This file was shamelessly stolen from the original author and subsequently butchered
25    by the perpetrator, so please don't blame Lars Wirzenius if you find any bugs!
26   - Jeff Pohlmeyer 2009-2013
27 */
28 
29 #include <assert.h>
30 #include <stdio.h>
31 #include <string.h>
32 
33 
34 
35 #define ISUTF8_VERSION "1.1"
36 
37 
/*
 * Code to indicate an invalid UTF8 character.
 *
 * Defined as an unsigned long constant so that it has the same type as the
 * value returned by decodeutf8(). An enumerator cannot be used portably
 * here: ISO C (prior to C23) requires enumerator values to be representable
 * as int, and 0xffffffff is not where int is 32 bits wide.
 */
#define INVALID_CHAR 0xFFFFFFFFUL
42 
43 
/*
 * Write the shortest UTF8 encoding of the value 'u' (at most 31 bits)
 * into 'buf'.
 *
 *   u         value to encode
 *   buf       destination; must have room for at least 'maxbytes' bytes
 *   maxbytes  capacity of 'buf'
 *
 * Returns the number of bytes stored (1 to 6), or -1 when 'u' does not
 * fit in 31 bits or its encoding would need more than 'maxbytes' bytes.
 * Nothing is written to 'buf' when -1 is returned.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
  /* limit[k] is the largest value that fits in a sequence of k+1 bytes. */
  static const unsigned long limit[] = {
    0x0000007FUL,
    0x000007FFUL,
    0x0000FFFFUL,
    0x001FFFFFUL,
    0x03FFFFFFUL,
    0x7FFFFFFFUL
  };
  enum { NLIMITS = sizeof(limit) / sizeof(limit[0]) };
  unsigned int len = 1;
  unsigned int pos;

  if (u > limit[NLIMITS - 1]) {
    return -1;
  }

  /* Find the shortest sequence length able to hold the value. */
  while (u > limit[len - 1]) {
    ++len;
  }
  assert(len <= NLIMITS);

  if (len > maxbytes) {
    return -1;
  }

  if (len == 1) {
    /* Seven-bit values are stored verbatim. */
    buf[0] = (unsigned char) u;
    return 1;
  }

  /* Fill the continuation bytes back to front, six payload bits each. */
  for (pos = len - 1; pos > 0; --pos) {
    buf[pos] = (unsigned char) (0x80 | (u & 0x3f));
    u >>= 6;
  }

  /* The lead byte starts with 'len' one bits; what is left of 'u' sits below them. */
  buf[0] = (unsigned char) (((0xFF00 >> len) & 0xFF) | u);
  return (int) len;
}
81 
82 
/*
 * Return the number of consecutive one bits at the top of a byte, i.e.
 * counting downward from bit 7 until the first zero bit (0 to 8).
 *
 * Only the low eight bits of 'c' are examined. The bits are tested with a
 * mask that moves downward rather than by shifting 'c' upward, so the
 * function never left-shifts a signed value (which would be undefined for
 * a negative argument).
 */
static int high_ones(int c) {
  int count = 0;
  int bit;
  for (bit = 0x80; bit != 0 && (c & bit) != 0; bit >>= 1) {
    ++count;
  }
  return count;
}
94 
95 
96 /*
97  * Decode a UTF8 character from an array of bytes. Return character code.
98  * Upon error, return INVALID_CHAR.
99  */
decodeutf8(unsigned char * buf,int nbytes)100 static unsigned long decodeutf8(unsigned char *buf, int nbytes)
101 {
102   unsigned long u;
103   int i, j;
104   if (nbytes <= 0) { return INVALID_CHAR; }
105   if (nbytes == 1) {
106     if (buf[0] >= 0x80) { return INVALID_CHAR; }
107     return buf[0];
108   }
109   i = high_ones(buf[0]);
110   if (i != nbytes) { return INVALID_CHAR; }
111   u = buf[0] & (0xff >> i);
112   for (j = 1; j < nbytes; ++j) {
113     if ((buf[j] & 0xC0) != 0x80) { return INVALID_CHAR; }
114     u = (u << 6) | (buf[j] & 0x3f);
115   }
116   /* UTF-8 can't contain codes 0xd800-0xdfff (UTF-16 surrogates) OR 0xfffe OR 0xffff */
117   if (u >= 0xD800 && u <= 0xDFFF) { return INVALID_CHAR; }
118   if (u == 0xFFFE || u == 0xFFFF) { return INVALID_CHAR; }
119   return u;
120 }
121 
122 
123 /*
124   Determine if the contents of an open file form a valid UTF8 byte stream.
125   Do this by collecting bytes for a character into a buffer and then decode
126   the bytes and re-encode them and compare that they are identical to the
127   original bytes. If any step fails, return 'H' for "high" (extended ASCII).
128   If EOF is reached, return 'U' for UTF-8, or 'T' for text if the file might also
129   be interpreted as seven-bit US-ASCII. At the same time, also check for control
130   characters: we will accept carriage-returns, line-feeds, form-feeds, audibles[1],
131   and horizontal or vertical tabs - Any other characters with a value less than 32
132   would probably indicate this is not a text file at all, so return 'B' for binary.
133   [1] Some autoconf configure scripts contain BEL chars as an awk field separator.
134  */
get_stream_encoding(FILE * file)135 static char get_stream_encoding(FILE *file) {
136   enum { MAX_UTF8_BYTES = 6 };
137   unsigned char buf[MAX_UTF8_BYTES];
138   unsigned char buf2[MAX_UTF8_BYTES];
139   int nbytes=0;
140   int nbytes2;
141   int c;
142   unsigned long code;
143   char result='T';
144   fseek(file,0,SEEK_SET);
145   if ((getc(file)==0xFF)&&(getc(file)==0xFE)) { return 'e'; } else { fseek(file,0,SEEK_SET); }
146   if ((getc(file)==0xFE)&&(getc(file)==0xFF)) { return 'E'; } else { fseek(file,0,SEEK_SET); }
147   if ((getc(file)==0xEF)&&(getc(file)==0xBB)&&(getc(file)==0xBF)) { return 'M'; } else { fseek(file,0,SEEK_SET); }
148   for (;;) {
149     c = getc(file);
150     if (c != EOF) {
151       if ( (c<32) && ( (c==0) || (!strchr("\n\t\r\f\v\a",c)) ) ) {
152         /* If the very last byte is [SUB] it's probably an ancient CP/M text file. */
153         if ((c=='\032') && (getc(file)==EOF)) return 'T';
154         /* Probably not a text file, so bail out now. */
155         return 'B';
156       }
157       if ((result=='T') && (c >= 0x80)) {
158         /* Can't be 7-bit, so it's either valid UTF-8, "extended" ASCII, or binary. */
159         result='U';
160       }
161     }
162     if (result!='H') {
163       if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
164         /* New char starts, deal with previous one. */
165         if (nbytes > 0) {
166           code = decodeutf8(buf, nbytes);
167           if (code == INVALID_CHAR) { result='H'; }
168           nbytes2 = encodeutf8(code, buf2, MAX_UTF8_BYTES);
169           if (nbytes != nbytes2 || memcmp(buf, buf2, nbytes) != 0) { result='H'; }
170         }
171         nbytes = 0;
172         /* If it's UTF8, start collecting again. */
173         if (c != EOF && c >= 0x80) { buf[nbytes++] = c; }
174       } else {
175          /* This is a continuation byte, append to buffer. */
176          if (nbytes == MAX_UTF8_BYTES) { result='H'; }
177          buf[nbytes++] = c;
178       }
179     }
180     if (c == EOF) { break; }
181   }
182   if (nbytes != 0) { return 'H'; }
183   return result;
184 }
185 
186 
187 
/*
Test the contents of FILENAME and return one of the following values:

  B: Binary file ( contains null bytes or control codes not normally found in text. )
  T: Plain US-ASCII text file ( with no extended characters. )
  H: High (extended ASCII) text file.
  U: UTF-8 encoded text file w/o BOM (content validated).
  M: UTF-8 BOM (content not validated)
  Z: Zero-length (empty) file.
  F: Failure, could not read the file.
  e: UTF-16LE BOM
  E: UTF-16BE BOM
Notes:
  A return value of 'T' could also be treated as valid UTF-8
  Unknown encodings might be incorrectly returned as 'B' or 'H' !!!
  'F' is returned when FILENAME is NULL, when the file cannot be opened,
  or when the first read fails with an error rather than end-of-file.

*/

char get_file_encoding(const char *filename)
{
  FILE *file;
  char rv;

  if (!filename) { return 'F'; }

  file = fopen(filename, "rb");
  if (!file) { return 'F'; }

  if (getc(file) == EOF) {
    /*
      EOF on the very first read means either a truly empty file or a
      read error (something that could be opened but not read); only the
      former is a zero-length file.
    */
    rv = ferror(file) ? 'F' : 'Z';
  } else {
    /* get_stream_encoding() rewinds the stream itself before scanning. */
    rv = get_stream_encoding(file);
  }
  fclose(file);
  return rv;
}
222 
223 
#ifdef TEST_FOR_GET_FILE_ENCODING
/*
 * Stand-alone test driver, compiled only when TEST_FOR_GET_FILE_ENCODING is
 * defined: print the encoding code of the file named by the first argument.
 * Exits with status 1 and a usage message when no file name is given.
 */
int main(int argc, char *argv[])
{
  if (argc < 2) {
    /* argv[1] would be NULL here; do not pass it on. */
    fprintf(stderr, "usage: %s FILENAME\n", (argc > 0 && argv[0]) ? argv[0] : "isutf8");
    return 1;
  }
  printf("%c\n", get_file_encoding(argv[1]));
  return 0;
}
#endif
227 
228