1 /*
2 * isutf8.c - do the input files look like valid utf-8 byte streams?
3 *
4 * Copyright (C) 2005 Lars Wirzenius
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21
22 /*
23 Editor's Note:
24 This file was shamelessly stolen from the original author and subsequently butchered
25 by the perpetrator, so please don't blame Lars Wirzenius if you find any bugs!
26 - Jeff Pohlmeyer 2009-2013
27 */
28
29 #include <assert.h>
30 #include <stdio.h>
31 #include <string.h>
32
33
34
#define ISUTF8_VERSION "1.1"


/*
 * Code to indicate an invalid UTF8 character; decodeutf8() returns it
 * when a byte sequence cannot be decoded.
 *
 * This is a macro rather than an enumeration constant because the value
 * 0xffffffff is not representable as an int, and ISO C (before C23)
 * requires every enumerator to fit in an int. The UL suffix gives it the
 * same type as decodeutf8()'s return value.
 */
#define INVALID_CHAR 0xffffffffUL
42
43
/*
 * Write the shortest UTF8 encoding of the value 'u' into 'buf' and
 * return how many bytes were written (1 to 6).
 *
 * Returns -1, leaving 'buf' untouched, when 'u' needs more than 31 bits
 * or when its encoding would be longer than 'maxbytes' bytes.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
    /* limits[k] is the largest value that fits in a (k+1)-byte sequence. */
    static const unsigned long limits[] = {
        0x0000007FUL,
        0x000007FFUL,
        0x0000FFFFUL,
        0x001FFFFFUL,
        0x03FFFFFFUL,
        0x7FFFFFFFUL
    };
    const unsigned int longest = sizeof(limits) / sizeof(limits[0]);
    unsigned int len = 1;
    unsigned int pos;

    /* Find the shortest sequence length able to hold the value. */
    while (len <= longest && u > limits[len - 1]) {
        ++len;
    }
    if (len > longest || len > maxbytes) {
        return -1;
    }

    if (len == 1) {
        /* Seven-bit values are stored as-is. */
        buf[0] = (unsigned char)u;
        return 1;
    }

    /* Fill the continuation bytes from the end, six payload bits each. */
    for (pos = len - 1; pos > 0; --pos) {
        buf[pos] = (unsigned char)(0x80 | (u & 0x3f));
        u >>= 6;
    }

    /* The lead byte has 'len' one bits at the top, then the leftover bits. */
    buf[0] = (unsigned char)((0xFF & ~(0xFF >> len)) | u);
    return (int)len;
}
81
82
/*
 * Count the consecutive one bits at the top of the low byte of 'c',
 * starting at bit 7 and stopping at the first zero bit. The result is
 * between 0 and 8.
 */
static int high_ones(int c)
{
    int count = 0;
    int bit;

    /* Walk a single-bit mask downwards instead of shifting the value up. */
    for (bit = 0x80; bit != 0 && (c & bit) != 0; bit >>= 1) {
        count++;
    }
    return count;
}
94
95
96 /*
97 * Decode a UTF8 character from an array of bytes. Return character code.
98 * Upon error, return INVALID_CHAR.
99 */
decodeutf8(unsigned char * buf,int nbytes)100 static unsigned long decodeutf8(unsigned char *buf, int nbytes)
101 {
102 unsigned long u;
103 int i, j;
104 if (nbytes <= 0) { return INVALID_CHAR; }
105 if (nbytes == 1) {
106 if (buf[0] >= 0x80) { return INVALID_CHAR; }
107 return buf[0];
108 }
109 i = high_ones(buf[0]);
110 if (i != nbytes) { return INVALID_CHAR; }
111 u = buf[0] & (0xff >> i);
112 for (j = 1; j < nbytes; ++j) {
113 if ((buf[j] & 0xC0) != 0x80) { return INVALID_CHAR; }
114 u = (u << 6) | (buf[j] & 0x3f);
115 }
116 /* UTF-8 can't contain codes 0xd800-0xdfff (UTF-16 surrogates) OR 0xfffe OR 0xffff */
117 if (u >= 0xD800 && u <= 0xDFFF) { return INVALID_CHAR; }
118 if (u == 0xFFFE || u == 0xFFFF) { return INVALID_CHAR; }
119 return u;
120 }
121
122
123 /*
124 Determine if the contents of an open file form a valid UTF8 byte stream.
125 Do this by collecting bytes for a character into a buffer and then decode
126 the bytes and re-encode them and compare that they are identical to the
127 original bytes. If any step fails, return 'H' for "high" (extended ASCII).
128 If EOF is reached, return 'U' for UTF-8, or 'T' for text if the file might also
129 be interpreted as seven-bit US-ASCII. At the same time, also check for control
130 characters: we will accept carriage-returns, line-feeds, form-feeds, audibles[1],
131 and horizontal or vertical tabs - Any other characters with a value less than 32
132 would probably indicate this is not a text file at all, so return 'B' for binary.
133 [1] Some autoconf configure scripts contain BEL chars as an awk field separator.
134 */
get_stream_encoding(FILE * file)135 static char get_stream_encoding(FILE *file) {
136 enum { MAX_UTF8_BYTES = 6 };
137 unsigned char buf[MAX_UTF8_BYTES];
138 unsigned char buf2[MAX_UTF8_BYTES];
139 int nbytes=0;
140 int nbytes2;
141 int c;
142 unsigned long code;
143 char result='T';
144 fseek(file,0,SEEK_SET);
145 if ((getc(file)==0xFF)&&(getc(file)==0xFE)) { return 'e'; } else { fseek(file,0,SEEK_SET); }
146 if ((getc(file)==0xFE)&&(getc(file)==0xFF)) { return 'E'; } else { fseek(file,0,SEEK_SET); }
147 if ((getc(file)==0xEF)&&(getc(file)==0xBB)&&(getc(file)==0xBF)) { return 'M'; } else { fseek(file,0,SEEK_SET); }
148 for (;;) {
149 c = getc(file);
150 if (c != EOF) {
151 if ( (c<32) && ( (c==0) || (!strchr("\n\t\r\f\v\a",c)) ) ) {
152 /* If the very last byte is [SUB] it's probably an ancient CP/M text file. */
153 if ((c=='\032') && (getc(file)==EOF)) return 'T';
154 /* Probably not a text file, so bail out now. */
155 return 'B';
156 }
157 if ((result=='T') && (c >= 0x80)) {
158 /* Can't be 7-bit, so it's either valid UTF-8, "extended" ASCII, or binary. */
159 result='U';
160 }
161 }
162 if (result!='H') {
163 if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
164 /* New char starts, deal with previous one. */
165 if (nbytes > 0) {
166 code = decodeutf8(buf, nbytes);
167 if (code == INVALID_CHAR) { result='H'; }
168 nbytes2 = encodeutf8(code, buf2, MAX_UTF8_BYTES);
169 if (nbytes != nbytes2 || memcmp(buf, buf2, nbytes) != 0) { result='H'; }
170 }
171 nbytes = 0;
172 /* If it's UTF8, start collecting again. */
173 if (c != EOF && c >= 0x80) { buf[nbytes++] = c; }
174 } else {
175 /* This is a continuation byte, append to buffer. */
176 if (nbytes == MAX_UTF8_BYTES) { result='H'; }
177 buf[nbytes++] = c;
178 }
179 }
180 if (c == EOF) { break; }
181 }
182 if (nbytes != 0) { return 'H'; }
183 return result;
184 }
185
186
187
/*
  Test the contents of FILENAME and return one of the following values:

    B: Binary file ( contains null bytes or control codes not normally found in text. )
    T: Plain US-ASCII text file ( with no extended characters. )
    H: High (extended ASCII) text file.
    U: UTF-8 encoded text file w/o BOM (content validated).
    M: UTF-8 BOM (content not validated)
    Z: Zero-length (empty) file.
    F: Failure, could not read the file.
    e: UTF-16LE BOM
    E: UTF-16BE BOM
  Notes:
    A return value of 'T' could also be treated as valid UTF-8
    Unknown encodings might be incorrectly returned as 'B' or 'H' !!!

  'F' is returned when FILENAME is NULL, when the file cannot be opened,
  and when the first read fails with an error (as opposed to reaching
  end-of-file, which yields 'Z').
*/

char get_file_encoding(const char*filename)
{
    FILE *file;
    char rv;

    if (!filename) { return 'F'; }
    file = fopen(filename, "rb");
    if (!file) { return 'F'; }

    if (getc(file) == EOF) {
        /* EOF on the first byte is either a genuinely empty file or a read error. */
        rv = ferror(file) ? 'F' : 'Z';
    } else {
        /* get_stream_encoding() rewinds the stream itself. */
        rv = get_stream_encoding(file);
    }
    fclose(file);
    return rv;
}
222
223
#ifdef TEST_FOR_GET_FILE_ENCODING
/*
 * Small test driver: print the encoding code of the file named by the
 * first command-line argument, followed by a newline. Exits with status 1
 * and a usage message when no file name is given.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s FILENAME\n",
                (argc > 0 && argv[0]) ? argv[0] : "isutf8");
        return 1;
    }
    printf("%c\n", get_file_encoding(argv[1]));
    return 0;
}
#endif
227
228