1 /* $NetBSD: str.c,v 1.12 2009/04/13 23:50:49 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #ifndef lint 34 #if 0 35 static char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; 36 #endif 37 __RCSID("$NetBSD: str.c,v 1.12 2009/04/13 23:50:49 lukem Exp $"); 38 #endif /* not lint */ 39 40 #include <sys/types.h> 41 42 #include <err.h> 43 #include <errno.h> 44 #include <stddef.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <string.h> 48 #include <ctype.h> 49 50 #include "extern.h" 51 52 static int backslash __P((STR *)); 53 static int bracket __P((STR *)); 54 static int c_class __P((const void *, const void *)); 55 static void genclass __P((STR *)); 56 static void genequiv __P((STR *)); 57 static int genrange __P((STR *)); 58 static void genseq __P((STR *)); 59 60 int 61 next(s) 62 STR *s; 63 { 64 int ch; 65 66 switch (s->state) { 67 case EOS: 68 return (0); 69 case INFINITE: 70 return (1); 71 case NORMAL: 72 switch (ch = *s->str) { 73 case '\0': 74 s->state = EOS; 75 return (0); 76 case '\\': 77 s->lastch = backslash(s); 78 break; 79 case '[': 80 if (bracket(s)) 81 return (next(s)); 82 /* FALLTHROUGH */ 83 default: 84 ++s->str; 85 s->lastch = ch; 86 break; 87 } 88 89 /* We can start a range at any time. */ 90 if (s->str[0] == '-' && genrange(s)) 91 return (next(s)); 92 return (1); 93 case RANGE: 94 if (s->cnt-- == 0) { 95 s->state = NORMAL; 96 return (next(s)); 97 } 98 ++s->lastch; 99 return (1); 100 case SEQUENCE: 101 if (s->cnt-- == 0) { 102 s->state = NORMAL; 103 return (next(s)); 104 } 105 return (1); 106 case SET: 107 if ((s->lastch = s->set[s->cnt++]) == OOBCH) { 108 s->state = NORMAL; 109 return (next(s)); 110 } 111 return (1); 112 } 113 /* NOTREACHED */ 114 return (0); 115 } 116 117 static int 118 bracket(s) 119 STR *s; 120 { 121 char *p; 122 123 switch (s->str[1]) { 124 case ':': /* "[:class:]" */ 125 if ((p = strstr(s->str + 2, ":]")) == NULL) 126 return (0); 127 *p = '\0'; 128 s->str += 2; 129 genclass(s); 130 s->str = p + 2; 131 return (1); 132 case '=': /* "[=equiv=]" */ 133 if ((p = strstr(s->str + 2, "=]")) == NULL) 134 return (0); 135 s->str += 2; 136 genequiv(s); 137 return (1); 138 default: /* "[\###*n]" or "[#*n]" */ 139 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 140 return (0); 141 if (p[0] != '*' || strchr(p, ']') == NULL) 142 return (0); 143 s->str += 1; 144 genseq(s); 145 return (1); 146 } 147 /* NOTREACHED */ 148 } 149 150 typedef struct { 151 const char *name; 152 int (*func) __P((int)); 153 int *set; 154 } CLASS; 155 156 static CLASS classes[] = { 157 { "alnum", isalnum, NULL, }, 158 { "alpha", isalpha, NULL, }, 159 { "blank", isblank, NULL, }, 160 { "cntrl", iscntrl, NULL, }, 161 { "digit", isdigit, NULL, }, 162 { "graph", isgraph, NULL, }, 163 { "lower", islower, NULL, }, 164 { "print", isprint, NULL, }, 165 { "punct", ispunct, NULL, }, 166 { "space", isspace, NULL, }, 167 { "upper", isupper, NULL, }, 168 { "xdigit", isxdigit, NULL, }, 169 }; 170 171 static void 172 genclass(s) 173 STR *s; 174 { 175 int cnt, (*func) __P((int)); 176 CLASS *cp, tmp; 177 int *p; 178 179 tmp.name = s->str; 180 if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 181 sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 182 errx(1, "unknown class %s", s->str); 183 184 if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 185 err(1, "malloc"); 186 memset(p, 0, (NCHARS + 1) * sizeof(int)); 187 for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 188 if ((func)(cnt)) 189 *p++ = cnt; 190 *p = OOBCH; 191 192 s->cnt = 0; 193 s->state = SET; 194 s->set = cp->set; 195 } 196 197 static int 198 c_class(a, b) 199 const void *a, *b; 200 { 201 return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name)); 202 } 203 204 /* 205 * English doesn't have any equivalence classes, so for now 206 * we just syntax check and grab the character. 207 */ 208 static void 209 genequiv(s) 210 STR *s; 211 { 212 if (*s->str == '\\') { 213 s->equiv[0] = backslash(s); 214 if (*s->str != '=') 215 errx(1, "misplaced equivalence equals sign"); 216 } else { 217 s->equiv[0] = s->str[0]; 218 if (s->str[1] != '=') 219 errx(1, "misplaced equivalence equals sign"); 220 } 221 s->str += 2; 222 s->cnt = 0; 223 s->state = SET; 224 s->set = s->equiv; 225 } 226 227 static int 228 genrange(s) 229 STR *s; 230 { 231 int stopval; 232 char *savestart; 233 234 savestart = s->str; 235 stopval = *++s->str == '\\' ? backslash(s) : *s->str++; 236 if (stopval < (u_char)s->lastch) { 237 s->str = savestart; 238 return (0); 239 } 240 s->cnt = stopval - s->lastch + 1; 241 s->state = RANGE; 242 --s->lastch; 243 return (1); 244 } 245 246 static void 247 genseq(s) 248 STR *s; 249 { 250 char *ep; 251 252 if (s->which == STRING1) 253 errx(1, "sequences only valid in string2"); 254 255 if (*s->str == '\\') 256 s->lastch = backslash(s); 257 else 258 s->lastch = *s->str++; 259 if (*s->str != '*') 260 errx(1, "misplaced sequence asterisk"); 261 262 switch (*++s->str) { 263 case '\\': 264 s->cnt = backslash(s); 265 break; 266 case ']': 267 s->cnt = 0; 268 ++s->str; 269 break; 270 default: 271 if (isdigit(*s->str)) { 272 s->cnt = strtol(s->str, &ep, 0); 273 if (*ep == ']') { 274 s->str = ep + 1; 275 break; 276 } 277 } 278 errx(1, "illegal sequence count"); 279 /* NOTREACHED */ 280 } 281 282 s->state = s->cnt ? SEQUENCE : INFINITE; 283 } 284 285 /* 286 * Translate \??? into a character. Up to 3 octal digits, if no digits either 287 * an escape code or a literal character. 288 */ 289 static int 290 backslash(s) 291 STR *s; 292 { 293 int ch, cnt, val; 294 295 for (cnt = val = 0;;) { 296 ch = *++s->str; 297 if (!isascii(ch) || !isdigit(ch)) 298 break; 299 val = val * 8 + ch - '0'; 300 if (++cnt == 3) { 301 ++s->str; 302 break; 303 } 304 } 305 if (cnt) 306 return (val); 307 if (ch != '\0') 308 ++s->str; 309 switch (ch) { 310 case 'a': /* escape characters */ 311 return ('\7'); 312 case 'b': 313 return ('\b'); 314 case 'f': 315 return ('\f'); 316 case 'n': 317 return ('\n'); 318 case 'r': 319 return ('\r'); 320 case 't': 321 return ('\t'); 322 case 'v': 323 return ('\13'); 324 case '\0': /* \" -> \ */ 325 s->state = EOS; 326 return ('\\'); 327 default: /* \x" -> x */ 328 return (ch); 329 } 330 } 331