xref: /minix/external/bsd/file/dist/src/encoding.c (revision 0a6a1f1d)
1*0a6a1f1dSLionel Sambuc /*	$NetBSD: encoding.c,v 1.4 2015/01/02 21:15:32 christos Exp $	*/
2ef01931fSBen Gras 
3ef01931fSBen Gras /*
4ef01931fSBen Gras  * Copyright (c) Ian F. Darwin 1986-1995.
5ef01931fSBen Gras  * Software written by Ian F. Darwin and others;
6ef01931fSBen Gras  * maintained 1995-present by Christos Zoulas and others.
7ef01931fSBen Gras  *
8ef01931fSBen Gras  * Redistribution and use in source and binary forms, with or without
9ef01931fSBen Gras  * modification, are permitted provided that the following conditions
10ef01931fSBen Gras  * are met:
11ef01931fSBen Gras  * 1. Redistributions of source code must retain the above copyright
12ef01931fSBen Gras  *    notice immediately at the beginning of the file, without modification,
13ef01931fSBen Gras  *    this list of conditions, and the following disclaimer.
14ef01931fSBen Gras  * 2. Redistributions in binary form must reproduce the above copyright
15ef01931fSBen Gras  *    notice, this list of conditions and the following disclaimer in the
16ef01931fSBen Gras  *    documentation and/or other materials provided with the distribution.
17ef01931fSBen Gras  *
18ef01931fSBen Gras  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19ef01931fSBen Gras  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20ef01931fSBen Gras  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21ef01931fSBen Gras  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22ef01931fSBen Gras  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23ef01931fSBen Gras  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24ef01931fSBen Gras  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25ef01931fSBen Gras  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26ef01931fSBen Gras  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27ef01931fSBen Gras  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28ef01931fSBen Gras  * SUCH DAMAGE.
29ef01931fSBen Gras  */
30ef01931fSBen Gras /*
31ef01931fSBen Gras  * Encoding -- determine the character encoding of a text file.
32ef01931fSBen Gras  *
33ef01931fSBen Gras  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
34ef01931fSBen Gras  * international characters.
35ef01931fSBen Gras  */
36ef01931fSBen Gras 
37ef01931fSBen Gras #include "file.h"
38ef01931fSBen Gras 
39ef01931fSBen Gras #ifndef	lint
40ef01931fSBen Gras #if 0
41*0a6a1f1dSLionel Sambuc FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
42ef01931fSBen Gras #else
43*0a6a1f1dSLionel Sambuc __RCSID("$NetBSD: encoding.c,v 1.4 2015/01/02 21:15:32 christos Exp $");
44ef01931fSBen Gras #endif
45ef01931fSBen Gras #endif	/* lint */
46ef01931fSBen Gras 
47ef01931fSBen Gras #include "magic.h"
48ef01931fSBen Gras #include <string.h>
49ef01931fSBen Gras #include <memory.h>
50ef01931fSBen Gras #include <stdlib.h>
51ef01931fSBen Gras 
52ef01931fSBen Gras 
53ef01931fSBen Gras private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
54ef01931fSBen Gras private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
55ef01931fSBen Gras     size_t *);
56ef01931fSBen Gras private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
57ef01931fSBen Gras private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
58ef01931fSBen Gras private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
59ef01931fSBen Gras private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
60ef01931fSBen Gras 
61835f6802SDirk Vogt #ifdef DEBUG_ENCODING
62835f6802SDirk Vogt #define DPRINTF(a) printf a
63835f6802SDirk Vogt #else
64835f6802SDirk Vogt #define DPRINTF(a)
65835f6802SDirk Vogt #endif
66835f6802SDirk Vogt 
67ef01931fSBen Gras /*
68ef01931fSBen Gras  * Try to determine whether text is in some character code we can
69ef01931fSBen Gras  * identify.  Each of these tests, if it succeeds, will leave
70ef01931fSBen Gras  * the text converted into one-unichar-per-character Unicode in
71ef01931fSBen Gras  * ubuf, and the number of characters converted in ulen.
72ef01931fSBen Gras  */
73ef01931fSBen Gras protected int
file_encoding(struct magic_set * ms,const unsigned char * buf,size_t nbytes,unichar ** ubuf,size_t * ulen,const char ** code,const char ** code_mime,const char ** type)74ef01931fSBen Gras file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
75ef01931fSBen Gras {
76ef01931fSBen Gras 	size_t mlen;
77ef01931fSBen Gras 	int rv = 1, ucs_type;
78ef01931fSBen Gras 	unsigned char *nbuf = NULL;
79ef01931fSBen Gras 
8008ff44c4SLionel Sambuc 	*type = "text";
81*0a6a1f1dSLionel Sambuc 	*ulen = 0;
82*0a6a1f1dSLionel Sambuc 	*code = "unknown";
83*0a6a1f1dSLionel Sambuc 	*code_mime = "binary";
84*0a6a1f1dSLionel Sambuc 
85*0a6a1f1dSLionel Sambuc 	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
86*0a6a1f1dSLionel Sambuc 	if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
87ef01931fSBen Gras 		file_oomem(ms, mlen);
88ef01931fSBen Gras 		goto done;
89ef01931fSBen Gras 	}
90*0a6a1f1dSLionel Sambuc 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
91*0a6a1f1dSLionel Sambuc 	if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
92ef01931fSBen Gras 		file_oomem(ms, mlen);
93ef01931fSBen Gras 		goto done;
94ef01931fSBen Gras 	}
95ef01931fSBen Gras 
96ef01931fSBen Gras 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
97835f6802SDirk Vogt 		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
98ef01931fSBen Gras 		*code = "ASCII";
99ef01931fSBen Gras 		*code_mime = "us-ascii";
100ef01931fSBen Gras 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
101835f6802SDirk Vogt 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
102ef01931fSBen Gras 		*code = "UTF-8 Unicode (with BOM)";
103ef01931fSBen Gras 		*code_mime = "utf-8";
104ef01931fSBen Gras 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
105835f6802SDirk Vogt 		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
106ef01931fSBen Gras 		*code = "UTF-8 Unicode";
107ef01931fSBen Gras 		*code_mime = "utf-8";
108ef01931fSBen Gras 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
109ef01931fSBen Gras 		if (ucs_type == 1) {
110ef01931fSBen Gras 			*code = "Little-endian UTF-16 Unicode";
111ef01931fSBen Gras 			*code_mime = "utf-16le";
112ef01931fSBen Gras 		} else {
113ef01931fSBen Gras 			*code = "Big-endian UTF-16 Unicode";
114ef01931fSBen Gras 			*code_mime = "utf-16be";
115ef01931fSBen Gras 		}
116835f6802SDirk Vogt 		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
117ef01931fSBen Gras 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
118835f6802SDirk Vogt 		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
119ef01931fSBen Gras 		*code = "ISO-8859";
120ef01931fSBen Gras 		*code_mime = "iso-8859-1";
121ef01931fSBen Gras 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
122835f6802SDirk Vogt 		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
123ef01931fSBen Gras 		*code = "Non-ISO extended-ASCII";
124ef01931fSBen Gras 		*code_mime = "unknown-8bit";
125ef01931fSBen Gras 	} else {
126ef01931fSBen Gras 		from_ebcdic(buf, nbytes, nbuf);
127ef01931fSBen Gras 
128ef01931fSBen Gras 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
129835f6802SDirk Vogt 			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
130ef01931fSBen Gras 			*code = "EBCDIC";
131ef01931fSBen Gras 			*code_mime = "ebcdic";
132ef01931fSBen Gras 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
133835f6802SDirk Vogt 			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
134835f6802SDirk Vogt 			    *ulen));
135ef01931fSBen Gras 			*code = "International EBCDIC";
136ef01931fSBen Gras 			*code_mime = "ebcdic";
137ef01931fSBen Gras 		} else { /* Doesn't look like text at all */
138835f6802SDirk Vogt 			DPRINTF(("binary\n"));
139ef01931fSBen Gras 			rv = 0;
140ef01931fSBen Gras 			*type = "binary";
141ef01931fSBen Gras 		}
142ef01931fSBen Gras 	}
143ef01931fSBen Gras 
144ef01931fSBen Gras  done:
145ef01931fSBen Gras 	free(nbuf);
146ef01931fSBen Gras 
147ef01931fSBen Gras 	return rv;
148ef01931fSBen Gras }
149ef01931fSBen Gras 
150ef01931fSBen Gras /*
151ef01931fSBen Gras  * This table reflects a particular philosophy about what constitutes
152ef01931fSBen Gras  * "text," and there is room for disagreement about it.
153ef01931fSBen Gras  *
154ef01931fSBen Gras  * Version 3.31 of the file command considered a file to be ASCII if
155ef01931fSBen Gras  * each of its characters was approved by either the isascii() or
156ef01931fSBen Gras  * isalpha() function.  On most systems, this would mean that any
157ef01931fSBen Gras  * file consisting only of characters in the range 0x00 ... 0x7F
158ef01931fSBen Gras  * would be called ASCII text, but many systems might reasonably
159ef01931fSBen Gras  * consider some characters outside this range to be alphabetic,
160ef01931fSBen Gras  * so the file command would call such characters ASCII.  It might
161ef01931fSBen Gras  * have been more accurate to call this "considered textual on the
162ef01931fSBen Gras  * local system" than "ASCII."
163ef01931fSBen Gras  *
164ef01931fSBen Gras  * It considered a file to be "International language text" if each
165ef01931fSBen Gras  * of its characters was either an ASCII printing character (according
166ef01931fSBen Gras  * to the real ASCII standard, not the above test), a character in
167ef01931fSBen Gras  * the range 0x80 ... 0xFF, or one of the following control characters:
168ef01931fSBen Gras  * backspace, tab, line feed, vertical tab, form feed, carriage return,
169ef01931fSBen Gras  * escape.  No attempt was made to determine the language in which files
170ef01931fSBen Gras  * of this type were written.
171ef01931fSBen Gras  *
172ef01931fSBen Gras  *
173ef01931fSBen Gras  * The table below considers a file to be ASCII if all of its characters
174ef01931fSBen Gras  * are either ASCII printing characters (again, according to the X3.4
175ef01931fSBen Gras  * standard, not isascii()) or any of the following controls: bell,
176ef01931fSBen Gras  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
177ef01931fSBen Gras  *
178ef01931fSBen Gras  * I include bell because some programs (particularly shell scripts)
179ef01931fSBen Gras  * use it literally, even though it is rare in normal text.  I exclude
180ef01931fSBen Gras  * vertical tab because it never seems to be used in real text.  I also
181ef01931fSBen Gras  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
182ef01931fSBen Gras  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
183ef01931fSBen Gras  * character to.  It might be more appropriate to include it in the 8859
184ef01931fSBen Gras  * set instead of the ASCII set, but it's got to be included in *something*
185ef01931fSBen Gras  * we recognize or EBCDIC files aren't going to be considered textual.
186ef01931fSBen Gras  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
187ef01931fSBen Gras  * and Latin characters, so these should possibly be allowed.  But they
188ef01931fSBen Gras  * make a real mess on VT100-style displays if they're not paired properly,
189ef01931fSBen Gras  * so we are probably better off not calling them text.
190ef01931fSBen Gras  *
191ef01931fSBen Gras  * A file is considered to be ISO-8859 text if its characters are all
192ef01931fSBen Gras  * either ASCII, according to the above definition, or printing characters
193ef01931fSBen Gras  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
194ef01931fSBen Gras  *
195ef01931fSBen Gras  * Finally, a file is considered to be international text from some other
196ef01931fSBen Gras  * character code if its characters are all either ISO-8859 (according to
197ef01931fSBen Gras  * the above definition) or characters in the range 0x80 ... 0x9F, which
198ef01931fSBen Gras  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
199ef01931fSBen Gras  * consider to be printing characters.
200ef01931fSBen Gras  */
201ef01931fSBen Gras 
202ef01931fSBen Gras #define F 0   /* character never appears in text */
203ef01931fSBen Gras #define T 1   /* character appears in plain ASCII text */
204ef01931fSBen Gras #define I 2   /* character appears in ISO-8859 text */
205ef01931fSBen Gras #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
206ef01931fSBen Gras 
207ef01931fSBen Gras private char text_chars[256] = {
208ef01931fSBen Gras 	/*                  BEL BS HT LF    FF CR    */
209ef01931fSBen Gras 	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
210ef01931fSBen Gras 	/*                              ESC          */
211ef01931fSBen Gras 	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
212ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
213ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
214ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
215ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
216ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
217ef01931fSBen Gras 	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
218ef01931fSBen Gras 	/*            NEL                            */
219ef01931fSBen Gras 	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
220ef01931fSBen Gras 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
221ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
222ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
223ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
224ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
225ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
226ef01931fSBen Gras 	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
227ef01931fSBen Gras };
228ef01931fSBen Gras 
229ef01931fSBen Gras private int
looks_ascii(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)230ef01931fSBen Gras looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
231ef01931fSBen Gras     size_t *ulen)
232ef01931fSBen Gras {
233ef01931fSBen Gras 	size_t i;
234ef01931fSBen Gras 
235ef01931fSBen Gras 	*ulen = 0;
236ef01931fSBen Gras 
237ef01931fSBen Gras 	for (i = 0; i < nbytes; i++) {
238ef01931fSBen Gras 		int t = text_chars[buf[i]];
239ef01931fSBen Gras 
240ef01931fSBen Gras 		if (t != T)
241ef01931fSBen Gras 			return 0;
242ef01931fSBen Gras 
243ef01931fSBen Gras 		ubuf[(*ulen)++] = buf[i];
244ef01931fSBen Gras 	}
245ef01931fSBen Gras 
246ef01931fSBen Gras 	return 1;
247ef01931fSBen Gras }
248ef01931fSBen Gras 
249ef01931fSBen Gras private int
looks_latin1(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)250ef01931fSBen Gras looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
251ef01931fSBen Gras {
252ef01931fSBen Gras 	size_t i;
253ef01931fSBen Gras 
254ef01931fSBen Gras 	*ulen = 0;
255ef01931fSBen Gras 
256ef01931fSBen Gras 	for (i = 0; i < nbytes; i++) {
257ef01931fSBen Gras 		int t = text_chars[buf[i]];
258ef01931fSBen Gras 
259ef01931fSBen Gras 		if (t != T && t != I)
260ef01931fSBen Gras 			return 0;
261ef01931fSBen Gras 
262ef01931fSBen Gras 		ubuf[(*ulen)++] = buf[i];
263ef01931fSBen Gras 	}
264ef01931fSBen Gras 
265ef01931fSBen Gras 	return 1;
266ef01931fSBen Gras }
267ef01931fSBen Gras 
268ef01931fSBen Gras private int
looks_extended(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)269ef01931fSBen Gras looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
270ef01931fSBen Gras     size_t *ulen)
271ef01931fSBen Gras {
272ef01931fSBen Gras 	size_t i;
273ef01931fSBen Gras 
274ef01931fSBen Gras 	*ulen = 0;
275ef01931fSBen Gras 
276ef01931fSBen Gras 	for (i = 0; i < nbytes; i++) {
277ef01931fSBen Gras 		int t = text_chars[buf[i]];
278ef01931fSBen Gras 
279ef01931fSBen Gras 		if (t != T && t != I && t != X)
280ef01931fSBen Gras 			return 0;
281ef01931fSBen Gras 
282ef01931fSBen Gras 		ubuf[(*ulen)++] = buf[i];
283ef01931fSBen Gras 	}
284ef01931fSBen Gras 
285ef01931fSBen Gras 	return 1;
286ef01931fSBen Gras }
287ef01931fSBen Gras 
288ef01931fSBen Gras /*
289ef01931fSBen Gras  * Decide whether some text looks like UTF-8. Returns:
290ef01931fSBen Gras  *
291ef01931fSBen Gras  *     -1: invalid UTF-8
292ef01931fSBen Gras  *      0: uses odd control characters, so doesn't look like text
293ef01931fSBen Gras  *      1: 7-bit text
294ef01931fSBen Gras  *      2: definitely UTF-8 text (valid high-bit set bytes)
295ef01931fSBen Gras  *
296ef01931fSBen Gras  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
297ef01931fSBen Gras  * ubuf must be big enough!
298ef01931fSBen Gras  */
299ef01931fSBen Gras protected int
file_looks_utf8(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)300ef01931fSBen Gras file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
301ef01931fSBen Gras {
302ef01931fSBen Gras 	size_t i;
303ef01931fSBen Gras 	int n;
304ef01931fSBen Gras 	unichar c;
305ef01931fSBen Gras 	int gotone = 0, ctrl = 0;
306ef01931fSBen Gras 
307ef01931fSBen Gras 	if (ubuf)
308ef01931fSBen Gras 		*ulen = 0;
309ef01931fSBen Gras 
310ef01931fSBen Gras 	for (i = 0; i < nbytes; i++) {
311ef01931fSBen Gras 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
312ef01931fSBen Gras 			/*
313ef01931fSBen Gras 			 * Even if the whole file is valid UTF-8 sequences,
314ef01931fSBen Gras 			 * still reject it if it uses weird control characters.
315ef01931fSBen Gras 			 */
316ef01931fSBen Gras 
317ef01931fSBen Gras 			if (text_chars[buf[i]] != T)
318ef01931fSBen Gras 				ctrl = 1;
319ef01931fSBen Gras 
320ef01931fSBen Gras 			if (ubuf)
321ef01931fSBen Gras 				ubuf[(*ulen)++] = buf[i];
322ef01931fSBen Gras 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
323ef01931fSBen Gras 			return -1;
324ef01931fSBen Gras 		} else {			   /* 11xxxxxx begins UTF-8 */
325ef01931fSBen Gras 			int following;
326ef01931fSBen Gras 
327ef01931fSBen Gras 			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
328ef01931fSBen Gras 				c = buf[i] & 0x1f;
329ef01931fSBen Gras 				following = 1;
330ef01931fSBen Gras 			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
331ef01931fSBen Gras 				c = buf[i] & 0x0f;
332ef01931fSBen Gras 				following = 2;
333ef01931fSBen Gras 			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
334ef01931fSBen Gras 				c = buf[i] & 0x07;
335ef01931fSBen Gras 				following = 3;
336ef01931fSBen Gras 			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
337ef01931fSBen Gras 				c = buf[i] & 0x03;
338ef01931fSBen Gras 				following = 4;
339ef01931fSBen Gras 			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
340ef01931fSBen Gras 				c = buf[i] & 0x01;
341ef01931fSBen Gras 				following = 5;
342ef01931fSBen Gras 			} else
343ef01931fSBen Gras 				return -1;
344ef01931fSBen Gras 
345ef01931fSBen Gras 			for (n = 0; n < following; n++) {
346ef01931fSBen Gras 				i++;
347ef01931fSBen Gras 				if (i >= nbytes)
348ef01931fSBen Gras 					goto done;
349ef01931fSBen Gras 
350ef01931fSBen Gras 				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
351ef01931fSBen Gras 					return -1;
352ef01931fSBen Gras 
353ef01931fSBen Gras 				c = (c << 6) + (buf[i] & 0x3f);
354ef01931fSBen Gras 			}
355ef01931fSBen Gras 
356ef01931fSBen Gras 			if (ubuf)
357ef01931fSBen Gras 				ubuf[(*ulen)++] = c;
358ef01931fSBen Gras 			gotone = 1;
359ef01931fSBen Gras 		}
360ef01931fSBen Gras 	}
361ef01931fSBen Gras done:
362ef01931fSBen Gras 	return ctrl ? 0 : (gotone ? 2 : 1);
363ef01931fSBen Gras }
364ef01931fSBen Gras 
365ef01931fSBen Gras /*
366ef01931fSBen Gras  * Decide whether some text looks like UTF-8 with BOM. If there is no
367ef01931fSBen Gras  * BOM, return -1; otherwise return the result of looks_utf8 on the
368ef01931fSBen Gras  * rest of the text.
369ef01931fSBen Gras  */
370ef01931fSBen Gras private int
looks_utf8_with_BOM(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)371ef01931fSBen Gras looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
372ef01931fSBen Gras     size_t *ulen)
373ef01931fSBen Gras {
374ef01931fSBen Gras 	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
375ef01931fSBen Gras 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
376ef01931fSBen Gras 	else
377ef01931fSBen Gras 		return -1;
378ef01931fSBen Gras }
379ef01931fSBen Gras 
380ef01931fSBen Gras private int
looks_ucs16(const unsigned char * buf,size_t nbytes,unichar * ubuf,size_t * ulen)381ef01931fSBen Gras looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
382ef01931fSBen Gras     size_t *ulen)
383ef01931fSBen Gras {
384ef01931fSBen Gras 	int bigend;
385ef01931fSBen Gras 	size_t i;
386ef01931fSBen Gras 
387ef01931fSBen Gras 	if (nbytes < 2)
388ef01931fSBen Gras 		return 0;
389ef01931fSBen Gras 
390ef01931fSBen Gras 	if (buf[0] == 0xff && buf[1] == 0xfe)
391ef01931fSBen Gras 		bigend = 0;
392ef01931fSBen Gras 	else if (buf[0] == 0xfe && buf[1] == 0xff)
393ef01931fSBen Gras 		bigend = 1;
394ef01931fSBen Gras 	else
395ef01931fSBen Gras 		return 0;
396ef01931fSBen Gras 
397ef01931fSBen Gras 	*ulen = 0;
398ef01931fSBen Gras 
399ef01931fSBen Gras 	for (i = 2; i + 1 < nbytes; i += 2) {
400ef01931fSBen Gras 		/* XXX fix to properly handle chars > 65536 */
401ef01931fSBen Gras 
402ef01931fSBen Gras 		if (bigend)
403ef01931fSBen Gras 			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
404ef01931fSBen Gras 		else
405ef01931fSBen Gras 			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
406ef01931fSBen Gras 
407ef01931fSBen Gras 		if (ubuf[*ulen - 1] == 0xfffe)
408ef01931fSBen Gras 			return 0;
409ef01931fSBen Gras 		if (ubuf[*ulen - 1] < 128 &&
410ef01931fSBen Gras 		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
411ef01931fSBen Gras 			return 0;
412ef01931fSBen Gras 	}
413ef01931fSBen Gras 
414ef01931fSBen Gras 	return 1 + bigend;
415ef01931fSBen Gras }
416ef01931fSBen Gras 
417ef01931fSBen Gras #undef F
418ef01931fSBen Gras #undef T
419ef01931fSBen Gras #undef I
420ef01931fSBen Gras #undef X
421ef01931fSBen Gras 
422ef01931fSBen Gras /*
423ef01931fSBen Gras  * This table maps each EBCDIC character to an (8-bit extended) ASCII
424ef01931fSBen Gras  * character, as specified in the rationale for the dd(1) command in
425ef01931fSBen Gras  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
426ef01931fSBen Gras  *
427ef01931fSBen Gras  * Unfortunately it does not seem to correspond exactly to any of the
428ef01931fSBen Gras  * five variants of EBCDIC documented in IBM's _Enterprise Systems
429ef01931fSBen Gras  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
430ef01931fSBen Gras  * Edition, July, 1999, pp. I-1 - I-4.
431ef01931fSBen Gras  *
432ef01931fSBen Gras  * Fortunately, though, all versions of EBCDIC, including this one, agree
433ef01931fSBen Gras  * on most of the printing characters that also appear in (7-bit) ASCII.
434ef01931fSBen Gras  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
435ef01931fSBen Gras  *
436ef01931fSBen Gras  * Fortunately too, there is general agreement that codes 0x00 through
437ef01931fSBen Gras  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
438ef01931fSBen Gras  * remainder printing characters.
439ef01931fSBen Gras  *
440ef01931fSBen Gras  * This is sufficient to allow us to identify EBCDIC text and to distinguish
441ef01931fSBen Gras  * between old-style and internationalized examples of text.
442ef01931fSBen Gras  */
443ef01931fSBen Gras 
444ef01931fSBen Gras private unsigned char ebcdic_to_ascii[] = {
445ef01931fSBen Gras   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
446ef01931fSBen Gras  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
447ef01931fSBen Gras 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
448ef01931fSBen Gras 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
449ef01931fSBen Gras ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
450ef01931fSBen Gras '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
451ef01931fSBen Gras '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
452ef01931fSBen Gras 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
453ef01931fSBen Gras 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
454ef01931fSBen Gras 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
455ef01931fSBen Gras 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
456ef01931fSBen Gras 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
457ef01931fSBen Gras '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
458ef01931fSBen Gras '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
459ef01931fSBen Gras '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
460ef01931fSBen Gras '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
461ef01931fSBen Gras };
462ef01931fSBen Gras 
463ef01931fSBen Gras #ifdef notdef
464ef01931fSBen Gras /*
465ef01931fSBen Gras  * The following EBCDIC-to-ASCII table may relate more closely to reality,
466ef01931fSBen Gras  * or at least to modern reality.  It comes from
467ef01931fSBen Gras  *
468ef01931fSBen Gras  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
469ef01931fSBen Gras  *
470ef01931fSBen Gras  * and maps the characters of EBCDIC code page 1047 (the code used for
471ef01931fSBen Gras  * Unix-derived software on IBM's 390 systems) to the corresponding
472ef01931fSBen Gras  * characters from ISO 8859-1.
473ef01931fSBen Gras  *
474ef01931fSBen Gras  * If this table is used instead of the above one, some of the special
475ef01931fSBen Gras  * cases for the NEL character can be taken out of the code.
476ef01931fSBen Gras  */
477ef01931fSBen Gras 
478ef01931fSBen Gras private unsigned char ebcdic_1047_to_8859[] = {
479ef01931fSBen Gras 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
480ef01931fSBen Gras 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
481ef01931fSBen Gras 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
482ef01931fSBen Gras 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
483ef01931fSBen Gras 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
484ef01931fSBen Gras 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
485ef01931fSBen Gras 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
486ef01931fSBen Gras 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
487ef01931fSBen Gras 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
488ef01931fSBen Gras 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
489ef01931fSBen Gras 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
490ef01931fSBen Gras 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
491ef01931fSBen Gras 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
492ef01931fSBen Gras 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
493ef01931fSBen Gras 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
494ef01931fSBen Gras 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
495ef01931fSBen Gras };
496ef01931fSBen Gras #endif
497ef01931fSBen Gras 
498ef01931fSBen Gras /*
499ef01931fSBen Gras  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
500ef01931fSBen Gras  */
501ef01931fSBen Gras private void
from_ebcdic(const unsigned char * buf,size_t nbytes,unsigned char * out)502ef01931fSBen Gras from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
503ef01931fSBen Gras {
504ef01931fSBen Gras 	size_t i;
505ef01931fSBen Gras 
506ef01931fSBen Gras 	for (i = 0; i < nbytes; i++) {
507ef01931fSBen Gras 		out[i] = ebcdic_to_ascii[buf[i]];
508ef01931fSBen Gras 	}
509ef01931fSBen Gras }
510