/* xref: /dragonfly/usr.bin/tr/cset.c (revision fbc9049b) */
/*-
 * Copyright (c) 2004 Tim J. Robbins.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/usr.bin/tr/cset.c 226363 2011-10-14 10:43:55Z ed $
 */
/*
 * "Set of characters" ADT implemented as a splay tree of extents, with
 * a lookup-table cache to speed up membership tests for the first
 * CS_CACHE_SIZE characters (which are presumably more common than others).
 */
33*fbc9049bSSascha Wildner 
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include "cset.h"
40*fbc9049bSSascha Wildner 
/* Internal splay-tree helpers; defined after the public cset_*() functions. */
static struct csnode *	cset_delete(struct csnode *t, wchar_t ch);
static __inline int	cset_rangecmp(struct csnode *t, wchar_t ch);
static struct csnode *	cset_splay(struct csnode *t, wchar_t ch);
44*fbc9049bSSascha Wildner 
45*fbc9049bSSascha Wildner /*
46*fbc9049bSSascha Wildner  * cset_alloc --
47*fbc9049bSSascha Wildner  *	Allocate a set of characters.
48*fbc9049bSSascha Wildner  */
49*fbc9049bSSascha Wildner struct cset *
cset_alloc(void)50*fbc9049bSSascha Wildner cset_alloc(void)
51*fbc9049bSSascha Wildner {
52*fbc9049bSSascha Wildner 	struct cset *cs;
53*fbc9049bSSascha Wildner 
54*fbc9049bSSascha Wildner 	if ((cs = malloc(sizeof(*cs))) == NULL)
55*fbc9049bSSascha Wildner 		return (NULL);
56*fbc9049bSSascha Wildner 	cs->cs_root = NULL;
57*fbc9049bSSascha Wildner 	cs->cs_classes = NULL;
58*fbc9049bSSascha Wildner 	cs->cs_havecache = false;
59*fbc9049bSSascha Wildner 	cs->cs_invert = false;
60*fbc9049bSSascha Wildner 	return (cs);
61*fbc9049bSSascha Wildner }
62*fbc9049bSSascha Wildner 
63*fbc9049bSSascha Wildner /*
64*fbc9049bSSascha Wildner  * cset_add --
65*fbc9049bSSascha Wildner  *	Add a character to the set.
66*fbc9049bSSascha Wildner  */
67*fbc9049bSSascha Wildner bool
cset_add(struct cset * cs,wchar_t ch)68*fbc9049bSSascha Wildner cset_add(struct cset *cs, wchar_t ch)
69*fbc9049bSSascha Wildner {
70*fbc9049bSSascha Wildner 	struct csnode *csn, *ncsn;
71*fbc9049bSSascha Wildner 	wchar_t oval;
72*fbc9049bSSascha Wildner 
73*fbc9049bSSascha Wildner 	cs->cs_havecache = false;
74*fbc9049bSSascha Wildner 
75*fbc9049bSSascha Wildner 	/*
76*fbc9049bSSascha Wildner 	 * Inserting into empty tree; new item becomes the root.
77*fbc9049bSSascha Wildner 	 */
78*fbc9049bSSascha Wildner 	if (cs->cs_root == NULL) {
79*fbc9049bSSascha Wildner 		csn = malloc(sizeof(*cs->cs_root));
80*fbc9049bSSascha Wildner 		if (csn == NULL)
81*fbc9049bSSascha Wildner 			return (false);
82*fbc9049bSSascha Wildner 		csn->csn_left = csn->csn_right = NULL;
83*fbc9049bSSascha Wildner 		csn->csn_min = csn->csn_max = ch;
84*fbc9049bSSascha Wildner 		cs->cs_root = csn;
85*fbc9049bSSascha Wildner 		return (true);
86*fbc9049bSSascha Wildner 	}
87*fbc9049bSSascha Wildner 
88*fbc9049bSSascha Wildner 	/*
89*fbc9049bSSascha Wildner 	 * Splay to check whether the item already exists, and otherwise,
90*fbc9049bSSascha Wildner 	 * where we should put it.
91*fbc9049bSSascha Wildner 	 */
92*fbc9049bSSascha Wildner 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
93*fbc9049bSSascha Wildner 
94*fbc9049bSSascha Wildner 	/*
95*fbc9049bSSascha Wildner 	 * Avoid adding duplicate nodes.
96*fbc9049bSSascha Wildner 	 */
97*fbc9049bSSascha Wildner 	if (cset_rangecmp(csn, ch) == 0)
98*fbc9049bSSascha Wildner 		return (true);
99*fbc9049bSSascha Wildner 
100*fbc9049bSSascha Wildner 	/*
101*fbc9049bSSascha Wildner 	 * Allocate a new node and make it the new root.
102*fbc9049bSSascha Wildner 	 */
103*fbc9049bSSascha Wildner 	ncsn = malloc(sizeof(*ncsn));
104*fbc9049bSSascha Wildner 	if (ncsn == NULL)
105*fbc9049bSSascha Wildner 		return (false);
106*fbc9049bSSascha Wildner 	ncsn->csn_min = ncsn->csn_max = ch;
107*fbc9049bSSascha Wildner 	if (cset_rangecmp(csn, ch) < 0) {
108*fbc9049bSSascha Wildner 		ncsn->csn_left = csn->csn_left;
109*fbc9049bSSascha Wildner 		ncsn->csn_right = csn;
110*fbc9049bSSascha Wildner 		csn->csn_left = NULL;
111*fbc9049bSSascha Wildner 	} else {
112*fbc9049bSSascha Wildner 		ncsn->csn_right = csn->csn_right;
113*fbc9049bSSascha Wildner 		ncsn->csn_left = csn;
114*fbc9049bSSascha Wildner 		csn->csn_right = NULL;
115*fbc9049bSSascha Wildner 	}
116*fbc9049bSSascha Wildner 	cs->cs_root = ncsn;
117*fbc9049bSSascha Wildner 
118*fbc9049bSSascha Wildner 	/*
119*fbc9049bSSascha Wildner 	 * Coalesce with left and right neighbours if possible.
120*fbc9049bSSascha Wildner 	 */
121*fbc9049bSSascha Wildner 	if (ncsn->csn_left != NULL) {
122*fbc9049bSSascha Wildner 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
123*fbc9049bSSascha Wildner 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
124*fbc9049bSSascha Wildner 			oval = ncsn->csn_left->csn_min;
125*fbc9049bSSascha Wildner 			ncsn->csn_left = cset_delete(ncsn->csn_left,
126*fbc9049bSSascha Wildner 			    ncsn->csn_left->csn_min);
127*fbc9049bSSascha Wildner 			ncsn->csn_min = oval;
128*fbc9049bSSascha Wildner 		}
129*fbc9049bSSascha Wildner 	}
130*fbc9049bSSascha Wildner 	if (ncsn->csn_right != NULL) {
131*fbc9049bSSascha Wildner 		ncsn->csn_right = cset_splay(ncsn->csn_right,
132*fbc9049bSSascha Wildner 		    ncsn->csn_max + 1);
133*fbc9049bSSascha Wildner 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
134*fbc9049bSSascha Wildner 			oval = ncsn->csn_right->csn_max;
135*fbc9049bSSascha Wildner 			ncsn->csn_right = cset_delete(ncsn->csn_right,
136*fbc9049bSSascha Wildner 			    ncsn->csn_right->csn_min);
137*fbc9049bSSascha Wildner 			ncsn->csn_max = oval;
138*fbc9049bSSascha Wildner 		}
139*fbc9049bSSascha Wildner 	}
140*fbc9049bSSascha Wildner 
141*fbc9049bSSascha Wildner 	return (true);
142*fbc9049bSSascha Wildner }
143*fbc9049bSSascha Wildner 
144*fbc9049bSSascha Wildner /*
145*fbc9049bSSascha Wildner  * cset_in_hard --
146*fbc9049bSSascha Wildner  *	Determine whether a character is in the set without using
147*fbc9049bSSascha Wildner  *	the cache.
148*fbc9049bSSascha Wildner  */
149*fbc9049bSSascha Wildner bool
cset_in_hard(struct cset * cs,wchar_t ch)150*fbc9049bSSascha Wildner cset_in_hard(struct cset *cs, wchar_t ch)
151*fbc9049bSSascha Wildner {
152*fbc9049bSSascha Wildner 	struct csclass *csc;
153*fbc9049bSSascha Wildner 
154*fbc9049bSSascha Wildner 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
155*fbc9049bSSascha Wildner 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
156*fbc9049bSSascha Wildner 			return (cs->cs_invert ^ true);
157*fbc9049bSSascha Wildner 	if (cs->cs_root != NULL) {
158*fbc9049bSSascha Wildner 		cs->cs_root = cset_splay(cs->cs_root, ch);
159*fbc9049bSSascha Wildner 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
160*fbc9049bSSascha Wildner 	}
161*fbc9049bSSascha Wildner 	return (cs->cs_invert ^ false);
162*fbc9049bSSascha Wildner }
163*fbc9049bSSascha Wildner 
164*fbc9049bSSascha Wildner /*
165*fbc9049bSSascha Wildner  * cset_cache --
166*fbc9049bSSascha Wildner  *	Update the cache.
167*fbc9049bSSascha Wildner  */
168*fbc9049bSSascha Wildner void
cset_cache(struct cset * cs)169*fbc9049bSSascha Wildner cset_cache(struct cset *cs)
170*fbc9049bSSascha Wildner {
171*fbc9049bSSascha Wildner 	wchar_t i;
172*fbc9049bSSascha Wildner 
173*fbc9049bSSascha Wildner 	for (i = 0; i < CS_CACHE_SIZE; i++)
174*fbc9049bSSascha Wildner 		cs->cs_cache[i] = cset_in_hard(cs, i);
175*fbc9049bSSascha Wildner 
176*fbc9049bSSascha Wildner 	cs->cs_havecache = true;
177*fbc9049bSSascha Wildner }
178*fbc9049bSSascha Wildner 
179*fbc9049bSSascha Wildner /*
180*fbc9049bSSascha Wildner  * cset_invert --
181*fbc9049bSSascha Wildner  *	Invert the character set.
182*fbc9049bSSascha Wildner  */
183*fbc9049bSSascha Wildner void
cset_invert(struct cset * cs)184*fbc9049bSSascha Wildner cset_invert(struct cset *cs)
185*fbc9049bSSascha Wildner {
186*fbc9049bSSascha Wildner 
187*fbc9049bSSascha Wildner 	cs->cs_invert ^= true;
188*fbc9049bSSascha Wildner 	cs->cs_havecache = false;
189*fbc9049bSSascha Wildner }
190*fbc9049bSSascha Wildner 
191*fbc9049bSSascha Wildner /*
192*fbc9049bSSascha Wildner  * cset_addclass --
193*fbc9049bSSascha Wildner  *	Add a wctype()-style character class to the set, optionally
194*fbc9049bSSascha Wildner  *	inverting it.
195*fbc9049bSSascha Wildner  */
196*fbc9049bSSascha Wildner bool
cset_addclass(struct cset * cs,wctype_t type,bool invert)197*fbc9049bSSascha Wildner cset_addclass(struct cset *cs, wctype_t type, bool invert)
198*fbc9049bSSascha Wildner {
199*fbc9049bSSascha Wildner 	struct csclass *csc;
200*fbc9049bSSascha Wildner 
201*fbc9049bSSascha Wildner 	csc = malloc(sizeof(*csc));
202*fbc9049bSSascha Wildner 	if (csc == NULL)
203*fbc9049bSSascha Wildner 		return (false);
204*fbc9049bSSascha Wildner 	csc->csc_type = type;
205*fbc9049bSSascha Wildner 	csc->csc_invert = invert;
206*fbc9049bSSascha Wildner 	csc->csc_next = cs->cs_classes;
207*fbc9049bSSascha Wildner 	cs->cs_classes = csc;
208*fbc9049bSSascha Wildner 	cs->cs_havecache = false;
209*fbc9049bSSascha Wildner 	return (true);
210*fbc9049bSSascha Wildner }
211*fbc9049bSSascha Wildner 
212*fbc9049bSSascha Wildner static __inline int
cset_rangecmp(struct csnode * t,wchar_t ch)213*fbc9049bSSascha Wildner cset_rangecmp(struct csnode *t, wchar_t ch)
214*fbc9049bSSascha Wildner {
215*fbc9049bSSascha Wildner 
216*fbc9049bSSascha Wildner 	if (ch < t->csn_min)
217*fbc9049bSSascha Wildner 		return (-1);
218*fbc9049bSSascha Wildner 	if (ch > t->csn_max)
219*fbc9049bSSascha Wildner 		return (1);
220*fbc9049bSSascha Wildner 	return (0);
221*fbc9049bSSascha Wildner }
222*fbc9049bSSascha Wildner 
223*fbc9049bSSascha Wildner static struct csnode *
cset_splay(struct csnode * t,wchar_t ch)224*fbc9049bSSascha Wildner cset_splay(struct csnode *t, wchar_t ch)
225*fbc9049bSSascha Wildner {
226*fbc9049bSSascha Wildner 	struct csnode N, *l, *r, *y;
227*fbc9049bSSascha Wildner 
228*fbc9049bSSascha Wildner 	/*
229*fbc9049bSSascha Wildner 	 * Based on public domain code from Sleator.
230*fbc9049bSSascha Wildner 	 */
231*fbc9049bSSascha Wildner 
232*fbc9049bSSascha Wildner 	assert(t != NULL);
233*fbc9049bSSascha Wildner 
234*fbc9049bSSascha Wildner 	N.csn_left = N.csn_right = NULL;
235*fbc9049bSSascha Wildner 	l = r = &N;
236*fbc9049bSSascha Wildner 	for (;;) {
237*fbc9049bSSascha Wildner 		if (cset_rangecmp(t, ch) < 0) {
238*fbc9049bSSascha Wildner 			if (t->csn_left != NULL &&
239*fbc9049bSSascha Wildner 			    cset_rangecmp(t->csn_left, ch) < 0) {
240*fbc9049bSSascha Wildner 				y = t->csn_left;
241*fbc9049bSSascha Wildner 				t->csn_left = y->csn_right;
242*fbc9049bSSascha Wildner 				y->csn_right = t;
243*fbc9049bSSascha Wildner 				t = y;
244*fbc9049bSSascha Wildner 			}
245*fbc9049bSSascha Wildner 			if (t->csn_left == NULL)
246*fbc9049bSSascha Wildner 				break;
247*fbc9049bSSascha Wildner 			r->csn_left = t;
248*fbc9049bSSascha Wildner 			r = t;
249*fbc9049bSSascha Wildner 			t = t->csn_left;
250*fbc9049bSSascha Wildner 		} else if (cset_rangecmp(t, ch) > 0) {
251*fbc9049bSSascha Wildner 			if (t->csn_right != NULL &&
252*fbc9049bSSascha Wildner 			    cset_rangecmp(t->csn_right, ch) > 0) {
253*fbc9049bSSascha Wildner 				y = t->csn_right;
254*fbc9049bSSascha Wildner 				t->csn_right = y->csn_left;
255*fbc9049bSSascha Wildner 				y->csn_left = t;
256*fbc9049bSSascha Wildner 				t = y;
257*fbc9049bSSascha Wildner 			}
258*fbc9049bSSascha Wildner 			if (t->csn_right == NULL)
259*fbc9049bSSascha Wildner 				break;
260*fbc9049bSSascha Wildner 			l->csn_right = t;
261*fbc9049bSSascha Wildner 			l = t;
262*fbc9049bSSascha Wildner 			t = t->csn_right;
263*fbc9049bSSascha Wildner 		} else
264*fbc9049bSSascha Wildner 			break;
265*fbc9049bSSascha Wildner 	}
266*fbc9049bSSascha Wildner 	l->csn_right = t->csn_left;
267*fbc9049bSSascha Wildner 	r->csn_left = t->csn_right;
268*fbc9049bSSascha Wildner 	t->csn_left = N.csn_right;
269*fbc9049bSSascha Wildner 	t->csn_right = N.csn_left;
270*fbc9049bSSascha Wildner 	return (t);
271*fbc9049bSSascha Wildner }
272*fbc9049bSSascha Wildner 
273*fbc9049bSSascha Wildner static struct csnode *
cset_delete(struct csnode * t,wchar_t ch)274*fbc9049bSSascha Wildner cset_delete(struct csnode *t, wchar_t ch)
275*fbc9049bSSascha Wildner {
276*fbc9049bSSascha Wildner 	struct csnode *x;
277*fbc9049bSSascha Wildner 
278*fbc9049bSSascha Wildner 	assert(t != NULL);
279*fbc9049bSSascha Wildner 	t = cset_splay(t, ch);
280*fbc9049bSSascha Wildner 	assert(cset_rangecmp(t, ch) == 0);
281*fbc9049bSSascha Wildner 	if (t->csn_left == NULL)
282*fbc9049bSSascha Wildner 		x = t->csn_right;
283*fbc9049bSSascha Wildner 	else {
284*fbc9049bSSascha Wildner 		x = cset_splay(t->csn_left, ch);
285*fbc9049bSSascha Wildner 		x->csn_right = t->csn_right;
286*fbc9049bSSascha Wildner 	}
287*fbc9049bSSascha Wildner 	free(t);
288*fbc9049bSSascha Wildner 	return x;
289*fbc9049bSSascha Wildner }
290