xref: /dragonfly/usr.bin/tr/cset.c (revision f2c43266)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD: head/usr.bin/tr/cset.c 226363 2011-10-14 10:43:55Z ed $
27  */
28 /*
29  * "Set of characters" ADT implemented as a splay tree of extents, with
30  * a lookup table cache to simplify looking up the first bunch of
31  * characters (which are presumably more common than others).
32  */
33 
34 #include <assert.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <wchar.h>
38 #include <wctype.h>
39 #include "cset.h"
40 
41 static struct csnode *	cset_delete(struct csnode *, wchar_t);
42 static __inline int	cset_rangecmp(struct csnode *, wchar_t);
43 static struct csnode *	cset_splay(struct csnode *, wchar_t);
44 
45 /*
46  * cset_alloc --
47  *	Allocate a set of characters.
48  */
49 struct cset *
50 cset_alloc(void)
51 {
52 	struct cset *cs;
53 
54 	if ((cs = malloc(sizeof(*cs))) == NULL)
55 		return (NULL);
56 	cs->cs_root = NULL;
57 	cs->cs_classes = NULL;
58 	cs->cs_havecache = false;
59 	cs->cs_invert = false;
60 	return (cs);
61 }
62 
63 /*
64  * cset_add --
65  *	Add a character to the set.
66  */
67 bool
68 cset_add(struct cset *cs, wchar_t ch)
69 {
70 	struct csnode *csn, *ncsn;
71 	wchar_t oval;
72 
73 	cs->cs_havecache = false;
74 
75 	/*
76 	 * Inserting into empty tree; new item becomes the root.
77 	 */
78 	if (cs->cs_root == NULL) {
79 		csn = malloc(sizeof(*cs->cs_root));
80 		if (csn == NULL)
81 			return (false);
82 		csn->csn_left = csn->csn_right = NULL;
83 		csn->csn_min = csn->csn_max = ch;
84 		cs->cs_root = csn;
85 		return (true);
86 	}
87 
88 	/*
89 	 * Splay to check whether the item already exists, and otherwise,
90 	 * where we should put it.
91 	 */
92 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
93 
94 	/*
95 	 * Avoid adding duplicate nodes.
96 	 */
97 	if (cset_rangecmp(csn, ch) == 0)
98 		return (true);
99 
100 	/*
101 	 * Allocate a new node and make it the new root.
102 	 */
103 	ncsn = malloc(sizeof(*ncsn));
104 	if (ncsn == NULL)
105 		return (false);
106 	ncsn->csn_min = ncsn->csn_max = ch;
107 	if (cset_rangecmp(csn, ch) < 0) {
108 		ncsn->csn_left = csn->csn_left;
109 		ncsn->csn_right = csn;
110 		csn->csn_left = NULL;
111 	} else {
112 		ncsn->csn_right = csn->csn_right;
113 		ncsn->csn_left = csn;
114 		csn->csn_right = NULL;
115 	}
116 	cs->cs_root = ncsn;
117 
118 	/*
119 	 * Coalesce with left and right neighbours if possible.
120 	 */
121 	if (ncsn->csn_left != NULL) {
122 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
123 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
124 			oval = ncsn->csn_left->csn_min;
125 			ncsn->csn_left = cset_delete(ncsn->csn_left,
126 			    ncsn->csn_left->csn_min);
127 			ncsn->csn_min = oval;
128 		}
129 	}
130 	if (ncsn->csn_right != NULL) {
131 		ncsn->csn_right = cset_splay(ncsn->csn_right,
132 		    ncsn->csn_max + 1);
133 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
134 			oval = ncsn->csn_right->csn_max;
135 			ncsn->csn_right = cset_delete(ncsn->csn_right,
136 			    ncsn->csn_right->csn_min);
137 			ncsn->csn_max = oval;
138 		}
139 	}
140 
141 	return (true);
142 }
143 
144 /*
145  * cset_in_hard --
146  *	Determine whether a character is in the set without using
147  *	the cache.
148  */
149 bool
150 cset_in_hard(struct cset *cs, wchar_t ch)
151 {
152 	struct csclass *csc;
153 
154 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
155 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
156 			return (cs->cs_invert ^ true);
157 	if (cs->cs_root != NULL) {
158 		cs->cs_root = cset_splay(cs->cs_root, ch);
159 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
160 	}
161 	return (cs->cs_invert ^ false);
162 }
163 
164 /*
165  * cset_cache --
166  *	Update the cache.
167  */
168 void
169 cset_cache(struct cset *cs)
170 {
171 	wchar_t i;
172 
173 	for (i = 0; i < CS_CACHE_SIZE; i++)
174 		cs->cs_cache[i] = cset_in_hard(cs, i);
175 
176 	cs->cs_havecache = true;
177 }
178 
179 /*
180  * cset_invert --
181  *	Invert the character set.
182  */
183 void
184 cset_invert(struct cset *cs)
185 {
186 
187 	cs->cs_invert ^= true;
188 	cs->cs_havecache = false;
189 }
190 
191 /*
192  * cset_addclass --
193  *	Add a wctype()-style character class to the set, optionally
194  *	inverting it.
195  */
196 bool
197 cset_addclass(struct cset *cs, wctype_t type, bool invert)
198 {
199 	struct csclass *csc;
200 
201 	csc = malloc(sizeof(*csc));
202 	if (csc == NULL)
203 		return (false);
204 	csc->csc_type = type;
205 	csc->csc_invert = invert;
206 	csc->csc_next = cs->cs_classes;
207 	cs->cs_classes = csc;
208 	cs->cs_havecache = false;
209 	return (true);
210 }
211 
212 static __inline int
213 cset_rangecmp(struct csnode *t, wchar_t ch)
214 {
215 
216 	if (ch < t->csn_min)
217 		return (-1);
218 	if (ch > t->csn_max)
219 		return (1);
220 	return (0);
221 }
222 
223 static struct csnode *
224 cset_splay(struct csnode *t, wchar_t ch)
225 {
226 	struct csnode N, *l, *r, *y;
227 
228 	/*
229 	 * Based on public domain code from Sleator.
230 	 */
231 
232 	assert(t != NULL);
233 
234 	N.csn_left = N.csn_right = NULL;
235 	l = r = &N;
236 	for (;;) {
237 		if (cset_rangecmp(t, ch) < 0) {
238 			if (t->csn_left != NULL &&
239 			    cset_rangecmp(t->csn_left, ch) < 0) {
240 				y = t->csn_left;
241 				t->csn_left = y->csn_right;
242 				y->csn_right = t;
243 				t = y;
244 			}
245 			if (t->csn_left == NULL)
246 				break;
247 			r->csn_left = t;
248 			r = t;
249 			t = t->csn_left;
250 		} else if (cset_rangecmp(t, ch) > 0) {
251 			if (t->csn_right != NULL &&
252 			    cset_rangecmp(t->csn_right, ch) > 0) {
253 				y = t->csn_right;
254 				t->csn_right = y->csn_left;
255 				y->csn_left = t;
256 				t = y;
257 			}
258 			if (t->csn_right == NULL)
259 				break;
260 			l->csn_right = t;
261 			l = t;
262 			t = t->csn_right;
263 		} else
264 			break;
265 	}
266 	l->csn_right = t->csn_left;
267 	r->csn_left = t->csn_right;
268 	t->csn_left = N.csn_right;
269 	t->csn_right = N.csn_left;
270 	return (t);
271 }
272 
273 static struct csnode *
274 cset_delete(struct csnode *t, wchar_t ch)
275 {
276 	struct csnode *x;
277 
278 	assert(t != NULL);
279 	t = cset_splay(t, ch);
280 	assert(cset_rangecmp(t, ch) == 0);
281 	if (t->csn_left == NULL)
282 		x = t->csn_right;
283 	else {
284 		x = cset_splay(t->csn_left, ch);
285 		x->csn_right = t->csn_right;
286 	}
287 	free(t);
288 	return x;
289 }
290