xref: /dragonfly/usr.bin/localedef/ctype.c (revision 9348a738)
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35 
36 #include <sys/tree.h>
37 
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stddef.h>
41 #include <string.h>
42 #include <sys/types.h>
43 #include <wchar.h>
44 #include <ctype.h>
45 #include <wctype.h>
46 #include <unistd.h>
47 #include "localedef.h"
48 #include "parser.h"
49 #include "runefile.h"
50 
51 
/* Needed for bootstrapping, _CTYPE_N not available before 1 Sep 2015 */
#ifndef _CTYPE_N
#define _CTYPE_N       0x00400000L
#endif

/*
 * Short aliases for the _CTYPE_* class bits.  The _IS* names are what the
 * rest of this file ORs into a character's ctype mask.
 */
#define _ISUPPER	_CTYPE_U
#define _ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C
/*
 * Extension classes.  add_ctype_impl() uses _E1 for phonogram, _E2 for
 * ideogram, _E3 for english, _E4 for number and _E5 for special.  _E3 is
 * 0, so the english class adds no class bit of its own.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
73 
/*
 * Character most recently handled by add_ctype() or add_ctype_range();
 * add_ctype_range() starts its range at the character after this one.
 */
static wchar_t		last_ctype;
static int ctype_compare(const void *n1, const void *n2);

/*
 * Per-character record, kept in a red-black tree keyed on wc.
 */
typedef struct ctype_node {
	wchar_t wc;		/* the character; sole key used by ctype_compare */
	int32_t	ctype;		/* class bits, plus the width bits set by add_width */
	int32_t	toupper;	/* upper-case mapping; 0 is treated as "not set" */
	int32_t	tolower;	/* lower-case mapping; 0 is treated as "not set" */
	RB_ENTRY(ctype_node) entry;
} ctype_node_t;

/* Tree of all characters seen so far, ordered by ctype_compare. */
static RB_HEAD(ctypes, ctype_node) ctypes;
RB_PROTOTYPE_STATIC(ctypes, ctype_node, entry, ctype_compare);
RB_GENERATE(ctypes, ctype_node, entry, ctype_compare);
88 
89 static int
90 ctype_compare(const void *n1, const void *n2)
91 {
92 	const ctype_node_t *c1 = n1;
93 	const ctype_node_t *c2 = n2;
94 
95 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
96 }
97 
/*
 * Reset the ctypes tree to the empty state.
 */
void
init_ctype(void)
{
	RB_INIT(&ctypes);
}
103 
104 
105 static void
106 add_ctype_impl(ctype_node_t *ctn)
107 {
108 	switch (last_kw) {
109 	case T_ISUPPER:
110 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
111 		break;
112 	case T_ISLOWER:
113 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
114 		break;
115 	case T_ISALPHA:
116 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
117 		break;
118 	case T_ISDIGIT:
119 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
120 		break;
121 	case T_ISSPACE:
122 		ctn->ctype |= _ISSPACE;
123 		break;
124 	case T_ISCNTRL:
125 		ctn->ctype |= _ISCNTRL;
126 		break;
127 	case T_ISGRAPH:
128 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
129 		break;
130 	case T_ISPRINT:
131 		ctn->ctype |= _ISPRINT;
132 		break;
133 	case T_ISPUNCT:
134 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
135 		break;
136 	case T_ISXDIGIT:
137 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
138 		break;
139 	case T_ISBLANK:
140 		ctn->ctype |= (_ISBLANK | _ISSPACE);
141 		break;
142 	case T_ISPHONOGRAM:
143 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
144 		break;
145 	case T_ISIDEOGRAM:
146 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
147 		break;
148 	case T_ISENGLISH:
149 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
150 		break;
151 	case T_ISNUMBER:
152 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
153 		break;
154 	case T_ISSPECIAL:
155 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
156 		break;
157 	case T_ISALNUM:
158 		/*
159 		 * We can't do anything with this.  The character
160 		 * should already be specified as a digit or alpha.
161 		 */
162 		break;
163 	default:
164 		errf("not a valid character class");
165 	}
166 }
167 
168 static ctype_node_t *
169 get_ctype(wchar_t wc)
170 {
171 	ctype_node_t	srch;
172 	ctype_node_t	*ctn;
173 
174 	srch.wc = wc;
175 	if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
176 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
177 			errf("out of memory");
178 			return (NULL);
179 		}
180 		ctn->wc = wc;
181 
182 		RB_INSERT(ctypes, &ctypes, ctn);
183 	}
184 	return (ctn);
185 }
186 
187 void
188 add_ctype(int val)
189 {
190 	ctype_node_t	*ctn;
191 
192 	if ((ctn = get_ctype(val)) == NULL) {
193 		INTERR;
194 		return;
195 	}
196 	add_ctype_impl(ctn);
197 	last_ctype = ctn->wc;
198 }
199 
200 void
201 add_ctype_range(wchar_t end)
202 {
203 	ctype_node_t	*ctn;
204 	wchar_t		cur;
205 
206 	if (end < last_ctype) {
207 		errf("malformed character range (%u ... %u))",
208 		    last_ctype, end);
209 		return;
210 	}
211 	for (cur = last_ctype + 1; cur <= end; cur++) {
212 		if ((ctn = get_ctype(cur)) == NULL) {
213 			INTERR;
214 			return;
215 		}
216 		add_ctype_impl(ctn);
217 	}
218 	last_ctype = end;
219 
220 }
221 
/*
 * A word about widths: if the width mask is specified, then libc
 * unconditionally honors it.  Otherwise, it assumes printable
 * characters have width 1, and non-printable characters have width
 * -1 (except for NUL, which is special with width 0).  Hence, we have
 * no need to inject defaults here -- the "default" unset value of 0
 * indicates that libc should use its own logic in wcwidth as described.
 */
230 void
231 add_width(int wc, int width)
232 {
233 	ctype_node_t	*ctn;
234 
235 	if ((ctn = get_ctype(wc)) == NULL) {
236 		INTERR;
237 		return;
238 	}
239 	ctn->ctype &= ~(_CTYPE_SWM);
240 	switch (width) {
241 	case 0:
242 		ctn->ctype |= _CTYPE_SW0;
243 		break;
244 	case 1:
245 		ctn->ctype |= _CTYPE_SW1;
246 		break;
247 	case 2:
248 		ctn->ctype |= _CTYPE_SW2;
249 		break;
250 	case 3:
251 		ctn->ctype |= _CTYPE_SW3;
252 		break;
253 	}
254 }
255 
/*
 * Record the same display width for every character from start to end,
 * inclusive.  Does nothing when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
263 
264 void
265 add_caseconv(int val, int wc)
266 {
267 	ctype_node_t	*ctn;
268 
269 	ctn = get_ctype(val);
270 	if (ctn == NULL) {
271 		INTERR;
272 		return;
273 	}
274 
275 	switch (last_kw) {
276 	case T_TOUPPER:
277 		ctn->toupper = wc;
278 		break;
279 	case T_TOLOWER:
280 		ctn->tolower = wc;
281 		break;
282 	default:
283 		INTERR;
284 		break;
285 	}
286 }
287 
288 void
289 dump_ctype(void)
290 {
291 	FILE		*f;
292 	_FileRuneLocale	rl;
293 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
294 	_FileRuneEntry	*ct = NULL;
295 	_FileRuneEntry	*lo = NULL;
296 	_FileRuneEntry	*up = NULL;
297 	wchar_t		wc;
298 
299 	(void) memset(&rl, 0, sizeof (rl));
300 	last_ct = NULL;
301 	last_lo = NULL;
302 	last_up = NULL;
303 
304 	if ((f = open_category()) == NULL)
305 		return;
306 
307 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
308 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
309 
310 	/*
311 	 * Initialize the identity map.
312 	 */
313 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
314 		rl.maplower[wc] = wc;
315 		rl.mapupper[wc] = wc;
316 	}
317 
318 	RB_FOREACH(ctn, ctypes, &ctypes) {
319 		int conflict = 0;
320 
321 		wc = ctn->wc;
322 
323 		/*
324 		 * POSIX requires certain portable characters have
325 		 * certain types.  Add them if they are missing.
326 		 */
327 		if ((wc >= 1) && (wc <= 127)) {
328 			if ((wc >= 'A') && (wc <= 'Z'))
329 				ctn->ctype |= _ISUPPER;
330 			if ((wc >= 'a') && (wc <= 'z'))
331 				ctn->ctype |= _ISLOWER;
332 			if ((wc >= '0') && (wc <= '9'))
333 				ctn->ctype |= _ISDIGIT;
334 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
335 				ctn->ctype |= _ISSPACE;
336 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
337 				ctn->ctype |= _ISXDIGIT;
338 			if (strchr(" \t", (char)wc))
339 				ctn->ctype |= _ISBLANK;
340 			if (wc == ' ')
341 				ctn->ctype |= _ISPRINT;
342 
343 			/*
344 			 * Technically these settings are only
345 			 * required for the C locale.  However, it
346 			 * turns out that because of the historical
347 			 * version of isprint(), we need them for all
348 			 * locales as well.  Note that these are not
349 			 * necessarily valid punctation characters in
350 			 * the current language, but ispunct() needs
351 			 * to return TRUE for them.
352 			 */
353 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
354 			    (char)wc))
355 				ctn->ctype |= _ISPUNCT;
356 		}
357 
358 		/*
359 		 * POSIX also requires that certain types imply
360 		 * others.  Add any inferred types here.
361 		 */
362 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
363 			ctn->ctype |= _ISALPHA;
364 		if (ctn->ctype & _ISDIGIT)
365 			ctn->ctype |= _ISXDIGIT;
366 		if (ctn->ctype & _ISBLANK)
367 			ctn->ctype |= _ISSPACE;
368 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
369 			ctn->ctype |= _ISGRAPH;
370 		if (ctn->ctype & _ISGRAPH)
371 			ctn->ctype |= _ISPRINT;
372 
373 		/*
374 		 * Finally, POSIX requires that certain combinations
375 		 * are invalid.  We don't flag this as a fatal error,
376 		 * but we will warn about.
377 		 */
378 		if ((ctn->ctype & _ISALPHA) &&
379 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
380 			conflict++;
381 		if ((ctn->ctype & _ISPUNCT) &
382 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
383 			conflict++;
384 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
385 			conflict++;
386 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
387 			conflict++;
388 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
389 			conflict++;
390 
391 		if (conflict) {
392 			warn("conflicting classes for character 0x%x (%x)",
393 			    wc, ctn->ctype);
394 		}
395 		/*
396 		 * Handle the lower 256 characters using the simple
397 		 * optimization.  Note that if we have not defined the
398 		 * upper/lower case, then we identity map it.
399 		 */
400 		if ((unsigned)wc < _CACHED_RUNES) {
401 			rl.runetype[wc] = ctn->ctype;
402 			if (ctn->tolower)
403 				rl.maplower[wc] = ctn->tolower;
404 			if (ctn->toupper)
405 				rl.mapupper[wc] = ctn->toupper;
406 			continue;
407 		}
408 
409 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
410 		    (last_ct->wc + 1 == wc)) {
411 			ct[rl.runetype_ext_nranges-1].max = wc;
412 		} else {
413 			rl.runetype_ext_nranges++;
414 			ct = realloc(ct,
415 			    sizeof (*ct) * rl.runetype_ext_nranges);
416 			ct[rl.runetype_ext_nranges - 1].min = wc;
417 			ct[rl.runetype_ext_nranges - 1].max = wc;
418 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
419 		}
420 		last_ct = ctn;
421 		if (ctn->tolower == 0) {
422 			last_lo = NULL;
423 		} else if ((last_lo != NULL) &&
424 		    (last_lo->tolower + 1 == ctn->tolower)) {
425 			lo[rl.maplower_ext_nranges-1].max = wc;
426 			last_lo = ctn;
427 		} else {
428 			rl.maplower_ext_nranges++;
429 			lo = realloc(lo,
430 			    sizeof (*lo) * rl.maplower_ext_nranges);
431 			lo[rl.maplower_ext_nranges - 1].min = wc;
432 			lo[rl.maplower_ext_nranges - 1].max = wc;
433 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
434 			last_lo = ctn;
435 		}
436 
437 		if (ctn->toupper == 0) {
438 			last_up = NULL;
439 		} else if ((last_up != NULL) &&
440 		    (last_up->toupper + 1 == ctn->toupper)) {
441 			up[rl.mapupper_ext_nranges-1].max = wc;
442 			last_up = ctn;
443 		} else {
444 			rl.mapupper_ext_nranges++;
445 			up = realloc(up,
446 			    sizeof (*up) * rl.mapupper_ext_nranges);
447 			up[rl.mapupper_ext_nranges - 1].min = wc;
448 			up[rl.mapupper_ext_nranges - 1].max = wc;
449 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
450 			last_up = ctn;
451 		}
452 	}
453 
454 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
455 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
456 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
457 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
458 		return;
459 	}
460 
461 	close_category(f);
462 }
463