xref: /dragonfly/usr.bin/localedef/ctype.c (revision 73610d44)
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <stddef.h>
39 #include <string.h>
40 #include <sys/types.h>
41 #include <wchar.h>
42 #include <ctype.h>
43 #include <wctype.h>
44 #include <unistd.h>
45 #include "localedef.h"
46 #include "parser.h"
47 #include "runefile.h"
48 #include "avl.h"
49 
/*
 * Bootstrapping aid: a host <ctype.h> older than 1 Sep 2015 does not
 * provide _CTYPE_N, so define it here when it is missing.
 */
#ifndef _CTYPE_N
#define	_CTYPE_N	0x00400000L
#endif

/*
 * Local shorthands for the _CTYPE_* class bits used in this file.
 * _E1 .. _E5 stand for the extended classes; _E3 is 0 and therefore
 * contributes no bit of its own when OR-ed into a class mask.
 */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
71 
72 static avl_tree_t	ctypes;
73 
74 static wchar_t		last_ctype;
75 
76 typedef struct ctype_node {
77 	wchar_t wc;
78 	int32_t	ctype;
79 	int32_t	toupper;
80 	int32_t	tolower;
81 	avl_node_t avl;
82 } ctype_node_t;
83 
84 typedef struct width_node {
85 	wchar_t start;
86 	wchar_t end;
87 	int8_t width;
88 	avl_node_t avl;
89 } width_node_t;
90 
91 static int
92 ctype_compare(const void *n1, const void *n2)
93 {
94 	const ctype_node_t *c1 = n1;
95 	const ctype_node_t *c2 = n2;
96 
97 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
98 }
99 
100 void
101 init_ctype(void)
102 {
103 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
104 	    offsetof(ctype_node_t, avl));
105 }
106 
107 
108 static void
109 add_ctype_impl(ctype_node_t *ctn)
110 {
111 	switch (last_kw) {
112 	case T_ISUPPER:
113 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
114 		break;
115 	case T_ISLOWER:
116 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
117 		break;
118 	case T_ISALPHA:
119 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
120 		break;
121 	case T_ISDIGIT:
122 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
123 		break;
124 	case T_ISSPACE:
125 		ctn->ctype |= _ISSPACE;
126 		break;
127 	case T_ISCNTRL:
128 		ctn->ctype |= _ISCNTRL;
129 		break;
130 	case T_ISGRAPH:
131 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
132 		break;
133 	case T_ISPRINT:
134 		ctn->ctype |= _ISPRINT;
135 		break;
136 	case T_ISPUNCT:
137 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
138 		break;
139 	case T_ISXDIGIT:
140 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
141 		break;
142 	case T_ISBLANK:
143 		ctn->ctype |= (_ISBLANK | _ISSPACE);
144 		break;
145 	case T_ISPHONOGRAM:
146 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
147 		break;
148 	case T_ISIDEOGRAM:
149 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
150 		break;
151 	case T_ISENGLISH:
152 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
153 		break;
154 	case T_ISNUMBER:
155 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
156 		break;
157 	case T_ISSPECIAL:
158 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
159 		break;
160 	case T_ISALNUM:
161 		/*
162 		 * We can't do anything with this.  The character
163 		 * should already be specified as a digit or alpha.
164 		 */
165 		break;
166 	default:
167 		errf("not a valid character class");
168 	}
169 }
170 
171 static ctype_node_t *
172 get_ctype(wchar_t wc)
173 {
174 	ctype_node_t	srch;
175 	ctype_node_t	*ctn;
176 	avl_index_t	where;
177 
178 	srch.wc = wc;
179 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
180 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
181 			errf("out of memory");
182 			return (NULL);
183 		}
184 		ctn->wc = wc;
185 
186 		avl_insert(&ctypes, ctn, where);
187 	}
188 	return (ctn);
189 }
190 
191 void
192 add_ctype(int val)
193 {
194 	ctype_node_t	*ctn;
195 
196 	if ((ctn = get_ctype(val)) == NULL) {
197 		INTERR;
198 		return;
199 	}
200 	add_ctype_impl(ctn);
201 	last_ctype = ctn->wc;
202 }
203 
204 void
205 add_ctype_range(int end)
206 {
207 	ctype_node_t	*ctn;
208 	wchar_t		cur;
209 
210 	if (end < last_ctype) {
211 		errf("malformed character range (%u ... %u))",
212 		    last_ctype, end);
213 		return;
214 	}
215 	for (cur = last_ctype + 1; cur <= end; cur++) {
216 		if ((ctn = get_ctype(cur)) == NULL) {
217 			INTERR;
218 			return;
219 		}
220 		add_ctype_impl(ctn);
221 	}
222 	last_ctype = end;
223 
224 }
225 
226 /*
227  * A word about widths: if the width mask is specified, then libc
228  * unconditionally honors it.  Otherwise, it assumes printable
229  * characters have width 1, and non-printable characters have width
230  * -1 (except for NULL which is special with with 0).  Hence, we have
231  * no need to inject defaults here -- the "default" unset value of 0
232  * indicates that libc should use its own logic in wcwidth as described.
233  */
234 void
235 add_width(int wc, int width)
236 {
237 	ctype_node_t	*ctn;
238 
239 	if ((ctn = get_ctype(wc)) == NULL) {
240 		INTERR;
241 		return;
242 	}
243 	ctn->ctype &= ~(_CTYPE_SWM);
244 	switch (width) {
245 	case 0:
246 		ctn->ctype |= _CTYPE_SW0;
247 		break;
248 	case 1:
249 		ctn->ctype |= _CTYPE_SW1;
250 		break;
251 	case 2:
252 		ctn->ctype |= _CTYPE_SW2;
253 		break;
254 	case 3:
255 		ctn->ctype |= _CTYPE_SW3;
256 		break;
257 	}
258 }
259 
/*
 * Record the same width for every character in [start, end].
 * An empty range (start > end) is a no-op.
 */
void
add_width_range(int start, int end, int width)
{
	if (start > end)
		return;

	/*
	 * Test for the last element before incrementing: a plain
	 * "start <= end; start++" loop would overflow (and never
	 * terminate) when end is INT_MAX.
	 */
	for (;;) {
		add_width(start, width);
		if (start == end)
			break;
		start++;
	}
}
267 
268 void
269 add_caseconv(int val, int wc)
270 {
271 	ctype_node_t	*ctn;
272 
273 	ctn = get_ctype(val);
274 	if (ctn == NULL) {
275 		INTERR;
276 		return;
277 	}
278 
279 	switch (last_kw) {
280 	case T_TOUPPER:
281 		ctn->toupper = wc;
282 		break;
283 	case T_TOLOWER:
284 		ctn->tolower = wc;
285 		break;
286 	default:
287 		INTERR;
288 		break;
289 	}
290 }
291 
292 void
293 dump_ctype(void)
294 {
295 	FILE		*f;
296 	_FileRuneLocale	rl;
297 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
298 	_FileRuneEntry	*ct = NULL;
299 	_FileRuneEntry	*lo = NULL;
300 	_FileRuneEntry	*up = NULL;
301 	wchar_t		wc;
302 
303 	(void) memset(&rl, 0, sizeof (rl));
304 	last_ct = NULL;
305 	last_lo = NULL;
306 	last_up = NULL;
307 
308 	if ((f = open_category()) == NULL)
309 		return;
310 
311 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
312 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
313 
314 	/*
315 	 * Initialize the identity map.
316 	 */
317 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
318 		rl.maplower[wc] = wc;
319 		rl.mapupper[wc] = wc;
320 	}
321 
322 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
323 		int conflict = 0;
324 
325 
326 		wc = ctn->wc;
327 
328 		/*
329 		 * POSIX requires certain portable characters have
330 		 * certain types.  Add them if they are missing.
331 		 */
332 		if ((wc >= 1) && (wc <= 127)) {
333 			if ((wc >= 'A') && (wc <= 'Z'))
334 				ctn->ctype |= _ISUPPER;
335 			if ((wc >= 'a') && (wc <= 'z'))
336 				ctn->ctype |= _ISLOWER;
337 			if ((wc >= '0') && (wc <= '9'))
338 				ctn->ctype |= _ISDIGIT;
339 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
340 				ctn->ctype |= _ISSPACE;
341 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
342 				ctn->ctype |= _ISXDIGIT;
343 			if (strchr(" \t", (char)wc))
344 				ctn->ctype |= _ISBLANK;
345 			if (wc == ' ')
346 				ctn->ctype |= _ISPRINT;
347 
348 			/*
349 			 * Technically these settings are only
350 			 * required for the C locale.  However, it
351 			 * turns out that because of the historical
352 			 * version of isprint(), we need them for all
353 			 * locales as well.  Note that these are not
354 			 * necessarily valid punctation characters in
355 			 * the current language, but ispunct() needs
356 			 * to return TRUE for them.
357 			 */
358 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
359 			    (char)wc))
360 				ctn->ctype |= _ISPUNCT;
361 		}
362 
363 		/*
364 		 * POSIX also requires that certain types imply
365 		 * others.  Add any inferred types here.
366 		 */
367 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
368 			ctn->ctype |= _ISALPHA;
369 		if (ctn->ctype & _ISDIGIT)
370 			ctn->ctype |= _ISXDIGIT;
371 		if (ctn->ctype & _ISBLANK)
372 			ctn->ctype |= _ISSPACE;
373 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
374 			ctn->ctype |= _ISGRAPH;
375 		if (ctn->ctype & _ISGRAPH)
376 			ctn->ctype |= _ISPRINT;
377 
378 		/*
379 		 * Finally, POSIX requires that certain combinations
380 		 * are invalid.  We don't flag this as a fatal error,
381 		 * but we will warn about.
382 		 */
383 		if ((ctn->ctype & _ISALPHA) &&
384 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
385 			conflict++;
386 		if ((ctn->ctype & _ISPUNCT) &
387 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
388 			conflict++;
389 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
390 			conflict++;
391 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
392 			conflict++;
393 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
394 			conflict++;
395 
396 		if (conflict) {
397 			warn("conflicting classes for character 0x%x (%x)",
398 			    wc, ctn->ctype);
399 		}
400 		/*
401 		 * Handle the lower 256 characters using the simple
402 		 * optimization.  Note that if we have not defined the
403 		 * upper/lower case, then we identity map it.
404 		 */
405 		if ((unsigned)wc < _CACHED_RUNES) {
406 			rl.runetype[wc] = ctn->ctype;
407 			if (ctn->tolower)
408 				rl.maplower[wc] = ctn->tolower;
409 			if (ctn->toupper)
410 				rl.mapupper[wc] = ctn->toupper;
411 			continue;
412 		}
413 
414 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
415 			ct[rl.runetype_ext_nranges-1].max = wc;
416 			last_ct = ctn;
417 		} else {
418 			rl.runetype_ext_nranges++;
419 			ct = realloc(ct,
420 			    sizeof (*ct) * rl.runetype_ext_nranges);
421 			ct[rl.runetype_ext_nranges - 1].min = wc;
422 			ct[rl.runetype_ext_nranges - 1].max = wc;
423 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
424 			last_ct = ctn;
425 		}
426 		if (ctn->tolower == 0) {
427 			last_lo = NULL;
428 		} else if ((last_lo != NULL) &&
429 		    (last_lo->tolower + 1 == ctn->tolower)) {
430 			lo[rl.maplower_ext_nranges-1].max = wc;
431 			last_lo = ctn;
432 		} else {
433 			rl.maplower_ext_nranges++;
434 			lo = realloc(lo,
435 			    sizeof (*lo) * rl.maplower_ext_nranges);
436 			lo[rl.maplower_ext_nranges - 1].min = wc;
437 			lo[rl.maplower_ext_nranges - 1].max = wc;
438 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
439 			last_lo = ctn;
440 		}
441 
442 		if (ctn->toupper == 0) {
443 			last_up = NULL;
444 		} else if ((last_up != NULL) &&
445 		    (last_up->toupper + 1 == ctn->toupper)) {
446 			up[rl.mapupper_ext_nranges-1].max = wc;
447 			last_up = ctn;
448 		} else {
449 			rl.mapupper_ext_nranges++;
450 			up = realloc(up,
451 			    sizeof (*up) * rl.mapupper_ext_nranges);
452 			up[rl.mapupper_ext_nranges - 1].min = wc;
453 			up[rl.mapupper_ext_nranges - 1].max = wc;
454 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
455 			last_up = ctn;
456 		}
457 	}
458 
459 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
460 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
461 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
462 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
463 		return;
464 	}
465 
466 	close_category(f);
467 }
468