xref: /openbsd/usr.bin/less/charset.c (revision 3d8817e4)
1 /*
2  * Copyright (C) 1984-2002  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information about less, or for information on how to
8  * contact the author, see the README file.
9  */
10 
11 
12 /*
13  * Functions to define the character set
14  * and do things specific to the character set.
15  */
16 
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #endif
22 
23 public int utf_mode = 0;
24 
25 #if !SMALL
26 /*
27  * Predefined character sets,
28  * selected by the LESSCHARSET environment variable.
29  */
30 struct charset {
31 	char *name;
32 	int *p_flag;
33 	char *desc;
34 } charsets[] = {
35 	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
36 	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
37 	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
38 	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
39 	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
40 	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
41 	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
42 	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
43 	{ NULL, NULL, NULL }
44 };
45 
/*
 * Alternate names for entries in charsets[].
 * icharset() replaces a matching "name" with "oname" before it
 * searches charsets[].
 */
struct cs_alias {
	char *name;	/* the alias */
	char *oname;	/* the charsets[] name it stands for */
};

/* The table is terminated by an entry whose name is NULL. */
struct cs_alias cs_aliases[] = {
	{ "latin1", "iso8859" },
	{ "latin9", "iso8859" },
	{ NULL,     NULL }
};
54 
55 #define	IS_BINARY_CHAR	01
56 #define	IS_CONTROL_CHAR	02
57 
58 static char chardef[256];
59 static char *binfmt = NULL;
60 public int binattr = AT_STANDOUT;
61 
62 
63 /*
64  * Define a charset, given a description string.
65  * The string consists of 256 letters,
66  * one for each character in the charset.
67  * If the string is shorter than 256 letters, missing letters
68  * are taken to be identical to the last one.
69  * A decimal number followed by a letter is taken to be a
70  * repetition of the letter.
71  *
72  * Each letter is one of:
73  *	. normal character
74  *	b binary character
75  *	c control character
76  */
77 	static void
78 ichardef(s)
79 	char *s;
80 {
81 	register char *cp;
82 	register int n;
83 	register char v;
84 
85 	n = 0;
86 	v = 0;
87 	cp = chardef;
88 	while (*s != '\0')
89 	{
90 		switch (*s++)
91 		{
92 		case '.':
93 			v = 0;
94 			break;
95 		case 'c':
96 			v = IS_CONTROL_CHAR;
97 			break;
98 		case 'b':
99 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
100 			break;
101 
102 		case '0': case '1': case '2': case '3': case '4':
103 		case '5': case '6': case '7': case '8': case '9':
104 			n = (10 * n) + (s[-1] - '0');
105 			continue;
106 
107 		default:
108 			error("invalid chardef", NULL_PARG);
109 			quit(QUIT_ERROR);
110 			/*NOTREACHED*/
111 		}
112 
113 		do
114 		{
115 			if (cp >= chardef + sizeof(chardef))
116 			{
117 				error("chardef longer than 256", NULL_PARG);
118 				quit(QUIT_ERROR);
119 				/*NOTREACHED*/
120 			}
121 			*cp++ = v;
122 		} while (--n > 0);
123 		n = 0;
124 	}
125 
126 	while (cp < chardef + sizeof(chardef))
127 		*cp++ = v;
128 }
129 
130 /*
131  * Define a charset, given a charset name.
132  * The valid charset names are listed in the "charsets" array.
133  */
134 	static int
135 icharset(name)
136 	register char *name;
137 {
138 	register struct charset *p;
139 	register struct cs_alias *a;
140 
141 	if (name == NULL || *name == '\0')
142 		return (0);
143 
144 	/* First see if the name is an alias. */
145 	for (a = cs_aliases;  a->name != NULL;  a++)
146 	{
147 		if (strcmp(name, a->name) == 0)
148 		{
149 			name = a->oname;
150 			break;
151 		}
152 	}
153 
154 	for (p = charsets;  p->name != NULL;  p++)
155 	{
156 		if (strcmp(name, p->name) == 0)
157 		{
158 			ichardef(p->desc);
159 			if (p->p_flag != NULL)
160 				*(p->p_flag) = 1;
161 			return (1);
162 		}
163 	}
164 
165 	error("invalid charset name", NULL_PARG);
166 	quit(QUIT_ERROR);
167 	/*NOTREACHED*/
168 	return (0);
169 }
170 
#if HAVE_LOCALE
/*
 * Define a charset from the locale: after setlocale(LC_ALL, ""),
 * classify each of the 256 character values with isprint/iscntrl.
 */
	static void
ilocale()
{
	register int ch;

	setlocale(LC_ALL, "");
	for (ch = 0;  ch < (int) sizeof(chardef);  ch++)
	{
		char flags;

		if (isprint(ch))
			flags = 0;
		else if (iscntrl(ch))
			flags = IS_CONTROL_CHAR;
		else
			/* Neither printable nor control: binary. */
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[ch] = flags;
	}
}
#endif
192 
193 /*
194  * Define the printing format for control chars.
195  */
196    	public void
197 setbinfmt(s)
198 	char *s;
199 {
200 	if (s == NULL || *s == '\0')
201 		s = "*s<%X>";
202 	/*
203 	 * Select the attributes if it starts with "*".
204 	 */
205 	if (*s == '*')
206 	{
207 		switch (s[1])
208 		{
209 		case 'd':  binattr = AT_BOLD;      break;
210 		case 'k':  binattr = AT_BLINK;     break;
211 		case 's':  binattr = AT_STANDOUT;  break;
212 		case 'u':  binattr = AT_UNDERLINE; break;
213 		default:   binattr = AT_NORMAL;    break;
214 		}
215 		s += 2;
216 	}
217 	binfmt = s;
218 }
219 
220 /*
221  * Initialize charset data structures.
222  */
223 	public void
224 init_charset()
225 {
226 	register char *s;
227 
228 	s = lgetenv("LESSBINFMT");
229 	setbinfmt(s);
230 
231 	/*
232 	 * See if environment variable LESSCHARSET is defined.
233 	 */
234 	s = lgetenv("LESSCHARSET");
235 	if (icharset(s))
236 		return;
237 	/*
238 	 * LESSCHARSET is not defined: try LESSCHARDEF.
239 	 */
240 	s = lgetenv("LESSCHARDEF");
241 	if (s != NULL && *s != '\0')
242 	{
243 		ichardef(s);
244 		return;
245 	}
246 
247 #if HAVE_STRSTR
248 	/*
249 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
250 	 */
251 	if ((s = lgetenv("LC_ALL")) != NULL ||
252 	    (s = lgetenv("LC_CTYPE")) != NULL ||
253 	    (s = lgetenv("LANG")) != NULL)
254 	{
255 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
256 			if (icharset("utf-8"))
257 				return;
258 	}
259 #endif
260 
261 #if HAVE_LOCALE
262 	/*
263 	 * Use setlocale.
264 	 */
265 	ilocale();
266 #else
267 #if MSDOS_COMPILER
268 	/*
269 	 * Default to "dos".
270 	 */
271 	(void) icharset("dos");
272 #else
273 	/*
274 	 * Default to "latin1".
275 	 */
276 	(void) icharset("latin1");
277 #endif
278 #endif
279 }
280 
281 /*
282  * Is a given character a "binary" character?
283  */
284 	public int
285 binary_char(c)
286 	unsigned char c;
287 {
288 	c &= 0377;
289 	return (chardef[c] & IS_BINARY_CHAR);
290 }
291 
292 /*
293  * Is a given character a "control" character?
294  */
295 	public int
296 control_char(c)
297 	int c;
298 {
299 	c &= 0377;
300 	return (chardef[c] & IS_CONTROL_CHAR);
301 }
302 
303 /*
304  * Return the printable form of a character.
305  * For example, in the "ascii" charset '\3' is printed as "^C".
306  */
307 	public char *
308 prchar(c)
309 	int c;
310 {
311 	static char buf[8];
312 
313 	c &= 0377;
314 	if (!control_char(c))
315 		snprintf(buf, sizeof(buf), "%c", c);
316 	else if (c == ESC)
317 		snprintf(buf, sizeof(buf), "ESC");
318 #if IS_EBCDIC_HOST
319 	else if (!binary_char(c) && c < 64)
320 		snprintf(buf, sizeof(buf), "^%c",
321 		/*
322 		 * This array roughly inverts CONTROL() #defined in less.h,
323 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
324  	 	 */
325 		"@ABC.I.?...KLMNO"
326 		"PQRS.JH.XY.."
327 		"\\]^_"
328 		"......W[.....EFG"
329 		"..V....D....TU.Z"[c]);
330 #else
331   	else if (c < 128 && !control_char(c ^ 0100))
332   		snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
333 #endif
334 	else
335 		snprintf(buf, sizeof(buf), binfmt, c);
336 	return (buf);
337 }
338 
339 #else /* SMALL */
340 
341 public int binattr = AT_STANDOUT;
342 
343 	public void
344 init_charset()
345 {
346 	return;
347 }
348 
349 /*
350  * Is a given character a "binary" character?
351  */
352 	public int
353 binary_char(c)
354 	unsigned char c;
355 {
356 	return (!isprint(c) && !isspace(c));
357 }
358 
359 /*
360  * Is a given character a "control" character?
361  */
362 	public int
363 control_char(c)
364 	int c;
365 {
366 	return (iscntrl(c));
367 }
368 
369 /*
370  * Return the printable form of a character.
371  * For example, in the "ascii" charset '\3' is printed as "^C".
372  */
373 	public char *
374 prchar(c)
375 	int c;
376 {
377 	static char buf[8];
378 
379 	c &= 0377;
380 	if (!iscntrl(c))
381 		snprintf(buf, sizeof(buf), "%c", c);
382 	else if (c == ESC)
383 		snprintf(buf, sizeof(buf), "ESC");
384   	else if (c < 128 && !iscntrl(c ^ 0100))
385   		snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
386 	else
387 		snprintf(buf, sizeof(buf), "*s<%X>", c);
388 	return (buf);
389 }
390 #endif /* SMALL */
391