1 /*- 2 * Copyright (c) 2004 Tim J. Robbins. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: head/usr.bin/tr/cset.c 226363 2011-10-14 10:43:55Z ed $ 27 */ 28 /* 29 * "Set of characters" ADT implemented as a splay tree of extents, with 30 * a lookup table cache to simplify looking up the first bunch of 31 * characters (which are presumably more common than others). 32 */ 33 34 #include <assert.h> 35 #include <stdbool.h> 36 #include <stdlib.h> 37 #include <wchar.h> 38 #include <wctype.h> 39 #include "cset.h" 40 41 static struct csnode * cset_delete(struct csnode *, wchar_t); 42 static __inline int cset_rangecmp(struct csnode *, wchar_t); 43 static struct csnode * cset_splay(struct csnode *, wchar_t); 44 45 /* 46 * cset_alloc -- 47 * Allocate a set of characters. 48 */ 49 struct cset * 50 cset_alloc(void) 51 { 52 struct cset *cs; 53 54 if ((cs = malloc(sizeof(*cs))) == NULL) 55 return (NULL); 56 cs->cs_root = NULL; 57 cs->cs_classes = NULL; 58 cs->cs_havecache = false; 59 cs->cs_invert = false; 60 return (cs); 61 } 62 63 /* 64 * cset_add -- 65 * Add a character to the set. 66 */ 67 bool 68 cset_add(struct cset *cs, wchar_t ch) 69 { 70 struct csnode *csn, *ncsn; 71 wchar_t oval; 72 73 cs->cs_havecache = false; 74 75 /* 76 * Inserting into empty tree; new item becomes the root. 77 */ 78 if (cs->cs_root == NULL) { 79 csn = malloc(sizeof(*cs->cs_root)); 80 if (csn == NULL) 81 return (false); 82 csn->csn_left = csn->csn_right = NULL; 83 csn->csn_min = csn->csn_max = ch; 84 cs->cs_root = csn; 85 return (true); 86 } 87 88 /* 89 * Splay to check whether the item already exists, and otherwise, 90 * where we should put it. 91 */ 92 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 93 94 /* 95 * Avoid adding duplicate nodes. 96 */ 97 if (cset_rangecmp(csn, ch) == 0) 98 return (true); 99 100 /* 101 * Allocate a new node and make it the new root. 102 */ 103 ncsn = malloc(sizeof(*ncsn)); 104 if (ncsn == NULL) 105 return (false); 106 ncsn->csn_min = ncsn->csn_max = ch; 107 if (cset_rangecmp(csn, ch) < 0) { 108 ncsn->csn_left = csn->csn_left; 109 ncsn->csn_right = csn; 110 csn->csn_left = NULL; 111 } else { 112 ncsn->csn_right = csn->csn_right; 113 ncsn->csn_left = csn; 114 csn->csn_right = NULL; 115 } 116 cs->cs_root = ncsn; 117 118 /* 119 * Coalesce with left and right neighbours if possible. 120 */ 121 if (ncsn->csn_left != NULL) { 122 ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 123 if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 124 oval = ncsn->csn_left->csn_min; 125 ncsn->csn_left = cset_delete(ncsn->csn_left, 126 ncsn->csn_left->csn_min); 127 ncsn->csn_min = oval; 128 } 129 } 130 if (ncsn->csn_right != NULL) { 131 ncsn->csn_right = cset_splay(ncsn->csn_right, 132 ncsn->csn_max + 1); 133 if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 134 oval = ncsn->csn_right->csn_max; 135 ncsn->csn_right = cset_delete(ncsn->csn_right, 136 ncsn->csn_right->csn_min); 137 ncsn->csn_max = oval; 138 } 139 } 140 141 return (true); 142 } 143 144 /* 145 * cset_in_hard -- 146 * Determine whether a character is in the set without using 147 * the cache. 148 */ 149 bool 150 cset_in_hard(struct cset *cs, wchar_t ch) 151 { 152 struct csclass *csc; 153 154 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 155 if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0)) 156 return (cs->cs_invert ^ true); 157 if (cs->cs_root != NULL) { 158 cs->cs_root = cset_splay(cs->cs_root, ch); 159 return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0)); 160 } 161 return (cs->cs_invert ^ false); 162 } 163 164 /* 165 * cset_cache -- 166 * Update the cache. 167 */ 168 void 169 cset_cache(struct cset *cs) 170 { 171 wchar_t i; 172 173 for (i = 0; i < CS_CACHE_SIZE; i++) 174 cs->cs_cache[i] = cset_in_hard(cs, i); 175 176 cs->cs_havecache = true; 177 } 178 179 /* 180 * cset_invert -- 181 * Invert the character set. 182 */ 183 void 184 cset_invert(struct cset *cs) 185 { 186 187 cs->cs_invert ^= true; 188 cs->cs_havecache = false; 189 } 190 191 /* 192 * cset_addclass -- 193 * Add a wctype()-style character class to the set, optionally 194 * inverting it. 195 */ 196 bool 197 cset_addclass(struct cset *cs, wctype_t type, bool invert) 198 { 199 struct csclass *csc; 200 201 csc = malloc(sizeof(*csc)); 202 if (csc == NULL) 203 return (false); 204 csc->csc_type = type; 205 csc->csc_invert = invert; 206 csc->csc_next = cs->cs_classes; 207 cs->cs_classes = csc; 208 cs->cs_havecache = false; 209 return (true); 210 } 211 212 static __inline int 213 cset_rangecmp(struct csnode *t, wchar_t ch) 214 { 215 216 if (ch < t->csn_min) 217 return (-1); 218 if (ch > t->csn_max) 219 return (1); 220 return (0); 221 } 222 223 static struct csnode * 224 cset_splay(struct csnode *t, wchar_t ch) 225 { 226 struct csnode N, *l, *r, *y; 227 228 /* 229 * Based on public domain code from Sleator. 230 */ 231 232 assert(t != NULL); 233 234 N.csn_left = N.csn_right = NULL; 235 l = r = &N; 236 for (;;) { 237 if (cset_rangecmp(t, ch) < 0) { 238 if (t->csn_left != NULL && 239 cset_rangecmp(t->csn_left, ch) < 0) { 240 y = t->csn_left; 241 t->csn_left = y->csn_right; 242 y->csn_right = t; 243 t = y; 244 } 245 if (t->csn_left == NULL) 246 break; 247 r->csn_left = t; 248 r = t; 249 t = t->csn_left; 250 } else if (cset_rangecmp(t, ch) > 0) { 251 if (t->csn_right != NULL && 252 cset_rangecmp(t->csn_right, ch) > 0) { 253 y = t->csn_right; 254 t->csn_right = y->csn_left; 255 y->csn_left = t; 256 t = y; 257 } 258 if (t->csn_right == NULL) 259 break; 260 l->csn_right = t; 261 l = t; 262 t = t->csn_right; 263 } else 264 break; 265 } 266 l->csn_right = t->csn_left; 267 r->csn_left = t->csn_right; 268 t->csn_left = N.csn_right; 269 t->csn_right = N.csn_left; 270 return (t); 271 } 272 273 static struct csnode * 274 cset_delete(struct csnode *t, wchar_t ch) 275 { 276 struct csnode *x; 277 278 assert(t != NULL); 279 t = cset_splay(t, ch); 280 assert(cset_rangecmp(t, ch) == 0); 281 if (t->csn_left == NULL) 282 x = t->csn_right; 283 else { 284 x = cset_splay(t->csn_left, ch); 285 x->csn_right = t->csn_right; 286 } 287 free(t); 288 return x; 289 } 290