1 /*-
2 * Copyright (c) 2004 Tim J. Robbins.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/usr.bin/tr/cset.c 226363 2011-10-14 10:43:55Z ed $
27 */
28 /*
29 * "Set of characters" ADT implemented as a splay tree of extents, with
30 * a lookup table cache to simplify looking up the first bunch of
31 * characters (which are presumably more common than others).
32 */
33
34 #include <assert.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <wchar.h>
38 #include <wctype.h>
39 #include "cset.h"
40
41 static struct csnode * cset_delete(struct csnode *, wchar_t);
42 static __inline int cset_rangecmp(struct csnode *, wchar_t);
43 static struct csnode * cset_splay(struct csnode *, wchar_t);
44
45 /*
46 * cset_alloc --
47 * Allocate a set of characters.
48 */
49 struct cset *
cset_alloc(void)50 cset_alloc(void)
51 {
52 struct cset *cs;
53
54 if ((cs = malloc(sizeof(*cs))) == NULL)
55 return (NULL);
56 cs->cs_root = NULL;
57 cs->cs_classes = NULL;
58 cs->cs_havecache = false;
59 cs->cs_invert = false;
60 return (cs);
61 }
62
63 /*
64 * cset_add --
65 * Add a character to the set.
66 */
67 bool
cset_add(struct cset * cs,wchar_t ch)68 cset_add(struct cset *cs, wchar_t ch)
69 {
70 struct csnode *csn, *ncsn;
71 wchar_t oval;
72
73 cs->cs_havecache = false;
74
75 /*
76 * Inserting into empty tree; new item becomes the root.
77 */
78 if (cs->cs_root == NULL) {
79 csn = malloc(sizeof(*cs->cs_root));
80 if (csn == NULL)
81 return (false);
82 csn->csn_left = csn->csn_right = NULL;
83 csn->csn_min = csn->csn_max = ch;
84 cs->cs_root = csn;
85 return (true);
86 }
87
88 /*
89 * Splay to check whether the item already exists, and otherwise,
90 * where we should put it.
91 */
92 csn = cs->cs_root = cset_splay(cs->cs_root, ch);
93
94 /*
95 * Avoid adding duplicate nodes.
96 */
97 if (cset_rangecmp(csn, ch) == 0)
98 return (true);
99
100 /*
101 * Allocate a new node and make it the new root.
102 */
103 ncsn = malloc(sizeof(*ncsn));
104 if (ncsn == NULL)
105 return (false);
106 ncsn->csn_min = ncsn->csn_max = ch;
107 if (cset_rangecmp(csn, ch) < 0) {
108 ncsn->csn_left = csn->csn_left;
109 ncsn->csn_right = csn;
110 csn->csn_left = NULL;
111 } else {
112 ncsn->csn_right = csn->csn_right;
113 ncsn->csn_left = csn;
114 csn->csn_right = NULL;
115 }
116 cs->cs_root = ncsn;
117
118 /*
119 * Coalesce with left and right neighbours if possible.
120 */
121 if (ncsn->csn_left != NULL) {
122 ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
123 if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
124 oval = ncsn->csn_left->csn_min;
125 ncsn->csn_left = cset_delete(ncsn->csn_left,
126 ncsn->csn_left->csn_min);
127 ncsn->csn_min = oval;
128 }
129 }
130 if (ncsn->csn_right != NULL) {
131 ncsn->csn_right = cset_splay(ncsn->csn_right,
132 ncsn->csn_max + 1);
133 if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
134 oval = ncsn->csn_right->csn_max;
135 ncsn->csn_right = cset_delete(ncsn->csn_right,
136 ncsn->csn_right->csn_min);
137 ncsn->csn_max = oval;
138 }
139 }
140
141 return (true);
142 }
143
144 /*
145 * cset_in_hard --
146 * Determine whether a character is in the set without using
147 * the cache.
148 */
149 bool
cset_in_hard(struct cset * cs,wchar_t ch)150 cset_in_hard(struct cset *cs, wchar_t ch)
151 {
152 struct csclass *csc;
153
154 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
155 if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
156 return (cs->cs_invert ^ true);
157 if (cs->cs_root != NULL) {
158 cs->cs_root = cset_splay(cs->cs_root, ch);
159 return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
160 }
161 return (cs->cs_invert ^ false);
162 }
163
164 /*
165 * cset_cache --
166 * Update the cache.
167 */
168 void
cset_cache(struct cset * cs)169 cset_cache(struct cset *cs)
170 {
171 wchar_t i;
172
173 for (i = 0; i < CS_CACHE_SIZE; i++)
174 cs->cs_cache[i] = cset_in_hard(cs, i);
175
176 cs->cs_havecache = true;
177 }
178
179 /*
180 * cset_invert --
181 * Invert the character set.
182 */
183 void
cset_invert(struct cset * cs)184 cset_invert(struct cset *cs)
185 {
186
187 cs->cs_invert ^= true;
188 cs->cs_havecache = false;
189 }
190
191 /*
192 * cset_addclass --
193 * Add a wctype()-style character class to the set, optionally
194 * inverting it.
195 */
196 bool
cset_addclass(struct cset * cs,wctype_t type,bool invert)197 cset_addclass(struct cset *cs, wctype_t type, bool invert)
198 {
199 struct csclass *csc;
200
201 csc = malloc(sizeof(*csc));
202 if (csc == NULL)
203 return (false);
204 csc->csc_type = type;
205 csc->csc_invert = invert;
206 csc->csc_next = cs->cs_classes;
207 cs->cs_classes = csc;
208 cs->cs_havecache = false;
209 return (true);
210 }
211
212 static __inline int
cset_rangecmp(struct csnode * t,wchar_t ch)213 cset_rangecmp(struct csnode *t, wchar_t ch)
214 {
215
216 if (ch < t->csn_min)
217 return (-1);
218 if (ch > t->csn_max)
219 return (1);
220 return (0);
221 }
222
223 static struct csnode *
cset_splay(struct csnode * t,wchar_t ch)224 cset_splay(struct csnode *t, wchar_t ch)
225 {
226 struct csnode N, *l, *r, *y;
227
228 /*
229 * Based on public domain code from Sleator.
230 */
231
232 assert(t != NULL);
233
234 N.csn_left = N.csn_right = NULL;
235 l = r = &N;
236 for (;;) {
237 if (cset_rangecmp(t, ch) < 0) {
238 if (t->csn_left != NULL &&
239 cset_rangecmp(t->csn_left, ch) < 0) {
240 y = t->csn_left;
241 t->csn_left = y->csn_right;
242 y->csn_right = t;
243 t = y;
244 }
245 if (t->csn_left == NULL)
246 break;
247 r->csn_left = t;
248 r = t;
249 t = t->csn_left;
250 } else if (cset_rangecmp(t, ch) > 0) {
251 if (t->csn_right != NULL &&
252 cset_rangecmp(t->csn_right, ch) > 0) {
253 y = t->csn_right;
254 t->csn_right = y->csn_left;
255 y->csn_left = t;
256 t = y;
257 }
258 if (t->csn_right == NULL)
259 break;
260 l->csn_right = t;
261 l = t;
262 t = t->csn_right;
263 } else
264 break;
265 }
266 l->csn_right = t->csn_left;
267 r->csn_left = t->csn_right;
268 t->csn_left = N.csn_right;
269 t->csn_right = N.csn_left;
270 return (t);
271 }
272
273 static struct csnode *
cset_delete(struct csnode * t,wchar_t ch)274 cset_delete(struct csnode *t, wchar_t ch)
275 {
276 struct csnode *x;
277
278 assert(t != NULL);
279 t = cset_splay(t, ch);
280 assert(cset_rangecmp(t, ch) == 0);
281 if (t->csn_left == NULL)
282 x = t->csn_right;
283 else {
284 x = cset_splay(t->csn_left, ch);
285 x->csn_right = t->csn_right;
286 }
287 free(t);
288 return x;
289 }
290