xref: /openbsd/usr.bin/lex/ccl.c (revision 20c29e2b)
1 /*	$OpenBSD: ccl.c,v 1.10 2024/11/09 18:03:44 op Exp $	*/
2 
3 /* ccl - routines for character classes */
4 
5 /*  Copyright (c) 1990 The Regents of the University of California. */
6 /*  All rights reserved. */
7 
8 /*  This code is derived from software contributed to Berkeley by */
9 /*  Vern Paxson. */
10 
11 /*  The United States Government has rights in this work pursuant */
12 /*  to contract no. DE-AC03-76SF00098 between the United States */
13  /* Department of Energy and the University of California. */
14 
15 /*  This file is part of flex. */
16 
17 /*  Redistribution and use in source and binary forms, with or without */
18 /*  modification, are permitted provided that the following conditions */
19 /*  are met: */
20 
21 /*  1. Redistributions of source code must retain the above copyright */
22 /*     notice, this list of conditions and the following disclaimer. */
23 /*  2. Redistributions in binary form must reproduce the above copyright */
24 /*     notice, this list of conditions and the following disclaimer in the */
25 /*     documentation and/or other materials provided with the distribution. */
26 
27 /*  Neither the name of the University nor the names of its contributors */
28 /*  may be used to endorse or promote products derived from this software */
29 /*  without specific prior written permission. */
30 
31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /*  PURPOSE. */
35 
36 #include "flexdef.h"
37 
38 /* return true if the chr is in the ccl. Takes negation into account. */
39 static bool
ccl_contains(const int cclp,const int ch)40 ccl_contains(const int cclp, const int ch)
41 {
42 	int ind, len, i;
43 
44 	len = ccllen[cclp];
45 	ind = cclmap[cclp];
46 
47 	for (i = 0; i < len; ++i)
48 		if (ccltbl[ind + i] == ch)
49 			return !cclng[cclp];
50 
51 	return cclng[cclp];
52 }
53 
54 
55 /* ccladd - add a single character to a ccl */
56 
57 void
ccladd(int cclp,int ch)58 ccladd(int cclp, int ch)
59 {
60 	int ind, len, newpos, i;
61 
62 	check_char(ch);
63 
64 	len = ccllen[cclp];
65 	ind = cclmap[cclp];
66 
67 	/* check to see if the character is already in the ccl */
68 
69 	for (i = 0; i < len; ++i)
70 		if (ccltbl[ind + i] == ch)
71 			return;
72 
73 	/* mark newlines */
74 	if (ch == nlch)
75 		ccl_has_nl[cclp] = true;
76 
77 	newpos = ind + len;
78 
79 	if (newpos >= current_max_ccl_tbl_size) {
80 		current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
81 
82 		++num_reallocs;
83 
84 		ccltbl = reallocate_Character_array(ccltbl,
85 		    current_max_ccl_tbl_size);
86 	}
87 	ccllen[cclp] = len + 1;
88 	ccltbl[newpos] = ch;
89 }
90 
91 /* dump_cclp - same thing as list_character_set, but for cclps.  */
92 
93 static void
dump_cclp(FILE * file,int cclp)94 dump_cclp(FILE * file, int cclp)
95 {
96 	int i;
97 
98 	putc('[', file);
99 
100 	for (i = 0; i < csize; ++i) {
101 		if (ccl_contains(cclp, i)) {
102 			int start_char = i;
103 
104 			putc(' ', file);
105 
106 			fputs(readable_form(i), file);
107 
108 			while (++i < csize && ccl_contains(cclp, i));
109 
110 			if (i - 1 > start_char)
111 				/* this was a run */
112 				fprintf(file, "-%s",
113 				    readable_form(i - 1));
114 
115 			putc(' ', file);
116 		}
117 	}
118 
119 	putc(']', file);
120 }
121 
122 
123 
124 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
125 int
ccl_set_diff(int a,int b)126 ccl_set_diff(int a, int b)
127 {
128 	int d, ch;
129 
130 	/* create new class  */
131 	d = cclinit();
132 
133 	/*
134 	 * In order to handle negation, we spin through all possible chars,
135 	 * adding each char in a that is not in b. (This could be O(n^2),
136 	 * but n is small and bounded.)
137 	 */
138 	for (ch = 0; ch < csize; ++ch)
139 		if (ccl_contains(a, ch) && !ccl_contains(b, ch))
140 			ccladd(d, ch);
141 
142 	/* debug */
143 	if (0) {
144 		fprintf(stderr, "ccl_set_diff (");
145 		fprintf(stderr, "\n    ");
146 		dump_cclp(stderr, a);
147 		fprintf(stderr, "\n    ");
148 		dump_cclp(stderr, b);
149 		fprintf(stderr, "\n    ");
150 		dump_cclp(stderr, d);
151 		fprintf(stderr, "\n)\n");
152 	}
153 	return d;
154 }
155 
156 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */
157 int
ccl_set_union(int a,int b)158 ccl_set_union(int a, int b)
159 {
160 	int d, i;
161 
162 	/* create new class  */
163 	d = cclinit();
164 
165 	/* Add all of a */
166 	for (i = 0; i < ccllen[a]; ++i)
167 		ccladd(d, ccltbl[cclmap[a] + i]);
168 
169 	/* Add all of b */
170 	for (i = 0; i < ccllen[b]; ++i)
171 		ccladd(d, ccltbl[cclmap[b] + i]);
172 
173 	/* debug */
174 	if (0) {
175 		fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
176 		fprintf(stderr, "\n    ");
177 		dump_cclp(stderr, a);
178 		fprintf(stderr, "\n    ");
179 		dump_cclp(stderr, b);
180 		fprintf(stderr, "\n    ");
181 		dump_cclp(stderr, d);
182 		fprintf(stderr, "\n)\n");
183 	}
184 	return d;
185 }
186 
187 
188 /* cclinit - return an empty ccl */
189 
190 int
cclinit(void)191 cclinit(void)
192 {
193 	if (++lastccl >= current_maxccls) {
194 		current_maxccls += MAX_CCLS_INCREMENT;
195 
196 		++num_reallocs;
197 
198 		cclmap =
199 		    reallocate_integer_array(cclmap, current_maxccls);
200 		ccllen =
201 		    reallocate_integer_array(ccllen, current_maxccls);
202 		cclng = reallocate_integer_array(cclng, current_maxccls);
203 		ccl_has_nl =
204 		    reallocate_bool_array(ccl_has_nl,
205 		    current_maxccls);
206 	}
207 	if (lastccl == 1)
208 		/* we're making the first ccl */
209 		cclmap[lastccl] = 0;
210 
211 	else
212 		/*
213 		 * The new pointer is just past the end of the last ccl.
214 		 * Since the cclmap points to the \first/ character of a ccl,
215 		 * adding the length of the ccl to the cclmap pointer will
216 		 * produce a cursor to the first free space.
217 		 */
218 		cclmap[lastccl] =
219 		    cclmap[lastccl - 1] + ccllen[lastccl - 1];
220 
221 	ccllen[lastccl] = 0;
222 	cclng[lastccl] = 0;	/* ccl's start out life un-negated */
223 	ccl_has_nl[lastccl] = false;
224 
225 	return lastccl;
226 }
227 
228 
229 /* cclnegate - negate the given ccl */
230 
231 void
cclnegate(int cclp)232 cclnegate(int cclp)
233 {
234 	cclng[cclp] = 1;
235 	ccl_has_nl[cclp] = !ccl_has_nl[cclp];
236 }
237 
238 
239 /* list_character_set - list the members of a set of characters in CCL form
240  *
241  * Writes to the given file a character-class representation of those
242  * characters present in the given CCL.  A character is present if it
243  * has a non-zero value in the cset array.
244  */
245 
246 void
list_character_set(FILE * file,int cset[])247 list_character_set(FILE *file, int cset[])
248 {
249 	int i;
250 
251 	putc('[', file);
252 
253 	for (i = 0; i < csize; ++i) {
254 		if (cset[i]) {
255 			int start_char = i;
256 
257 			putc(' ', file);
258 
259 			fputs(readable_form(i), file);
260 
261 			while (++i < csize && cset[i]);
262 
263 			if (i - 1 > start_char)
264 				/* this was a run */
265 				fprintf(file, "-%s",
266 				    readable_form(i - 1));
267 
268 			putc(' ', file);
269 		}
270 	}
271 
272 	putc(']', file);
273 }
274 
275 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive
276  * scanner.  Specifically, if a lowercase or uppercase character, x, is in the
277  * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
278  * be in the range. If not, then this range is ambiguous, and the function
279  * returns false.  For example, [@-_] spans [a-z] but not [A-Z].  Beware that
280  * [a-z] will be labeled ambiguous because it does not include [A-Z].
281  *
282  * @param c1 the lower end of the range
283  * @param c2 the upper end of the range
284  * @return true if [c1-c2] is not ambiguous for a caseless scanner.
285  */
286 bool
range_covers_case(int c1,int c2)287 range_covers_case(int c1, int c2)
288 {
289 	int i, o;
290 
291 	for (i = c1; i <= c2; i++) {
292 		if (has_case(i)) {
293 			o = reverse_case(i);
294 			if (o < c1 || c2 < o)
295 				return false;
296 		}
297 	}
298 	return true;
299 }
300 
301 /** Reverse the case of a character, if possible.
302  * @return c if case-reversal does not apply.
303  */
304 int
reverse_case(int c)305 reverse_case(int c)
306 {
307 	return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c);
308 }
309 
310 /** Return true if c is uppercase or lowercase. */
311 bool
has_case(int c)312 has_case(int c)
313 {
314 	return (isupper(c) || islower(c)) ? true : false;
315 }
316