1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * %sccs.include.redist.c% 6 */ 7 8 #ifndef lint 9 static char sccsid[] = "@(#)str.c 8.1 (Berkeley) 06/06/93"; 10 #endif /* not lint */ 11 12 #include <sys/cdefs.h> 13 #include <sys/types.h> 14 15 #include <errno.h> 16 #include <stddef.h> 17 #include <stdio.h> 18 #include <stdlib.h> 19 #include <string.h> 20 21 #include "extern.h" 22 23 static int backslash __P((STR *)); 24 static int bracket __P((STR *)); 25 static int c_class __P((const void *, const void *)); 26 static void genclass __P((STR *)); 27 static void genequiv __P((STR *)); 28 static int genrange __P((STR *)); 29 static void genseq __P((STR *)); 30 31 int 32 next(s) 33 register STR *s; 34 { 35 register int ch; 36 37 switch (s->state) { 38 case EOS: 39 return (0); 40 case INFINITE: 41 return (1); 42 case NORMAL: 43 switch (ch = *s->str) { 44 case '\0': 45 s->state = EOS; 46 return (0); 47 case '\\': 48 s->lastch = backslash(s); 49 break; 50 case '[': 51 if (bracket(s)) 52 return (next(s)); 53 /* FALLTHROUGH */ 54 default: 55 ++s->str; 56 s->lastch = ch; 57 break; 58 } 59 60 /* We can start a range at any time. */ 61 if (s->str[0] == '-' && genrange(s)) 62 return (next(s)); 63 return (1); 64 case RANGE: 65 if (s->cnt-- == 0) { 66 s->state = NORMAL; 67 return (next(s)); 68 } 69 ++s->lastch; 70 return (1); 71 case SEQUENCE: 72 if (s->cnt-- == 0) { 73 s->state = NORMAL; 74 return (next(s)); 75 } 76 return (1); 77 case SET: 78 if ((s->lastch = s->set[s->cnt++]) == OOBCH) { 79 s->state = NORMAL; 80 return (next(s)); 81 } 82 return (1); 83 } 84 /* NOTREACHED */ 85 } 86 87 static int 88 bracket(s) 89 register STR *s; 90 { 91 register char *p; 92 93 switch (s->str[1]) { 94 case ':': /* "[:class:]" */ 95 if ((p = strstr(s->str + 2, ":]")) == NULL) 96 return (0); 97 *p = '\0'; 98 s->str += 2; 99 genclass(s); 100 s->str = p + 2; 101 return (1); 102 case '=': /* "[=equiv=]" */ 103 if ((p = strstr(s->str + 2, "=]")) == NULL) 104 return (0); 105 s->str += 2; 106 genequiv(s); 107 return (1); 108 default: /* "[\###*n]" or "[#*n]" */ 109 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 110 return (0); 111 if (p[0] != '*' || index(p, ']') == NULL) 112 return (0); 113 s->str += 1; 114 genseq(s); 115 return (1); 116 } 117 /* NOTREACHED */ 118 } 119 120 int isalnum __P((int)), 121 isalpha __P((int)), 122 isblank __P((int)), 123 isspace __P((int)), 124 iscntrl __P((int)), 125 isdigit __P((int)), 126 isgraph __P((int)), 127 islower __P((int)), 128 isprint __P((int)), 129 ispunct __P((int)), 130 isupper __P((int)), 131 isxdigit __P((int)); 132 133 typedef struct { 134 char *name; 135 int (*func) __P((int)); 136 int *set; 137 } CLASS; 138 139 static CLASS classes[] = { 140 { "alnum", isalnum, }, 141 { "alpha", isalpha, }, 142 { "blank", isblank, }, 143 { "cntrl", iscntrl, }, 144 { "digit", isdigit, }, 145 { "graph", isgraph, }, 146 { "lower", islower, }, 147 { "print", isupper, }, 148 { "punct", ispunct, }, 149 { "space", isspace, }, 150 { "upper", isupper, }, 151 { "xdigit", isxdigit, }, 152 }; 153 154 static void 155 genclass(s) 156 STR *s; 157 { 158 register int cnt, (*func) __P((int)); 159 CLASS *cp, tmp; 160 int *p; 161 162 tmp.name = s->str; 163 if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / 164 sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) 165 err("unknown class %s", s->str); 166 167 if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) 168 err("%s", strerror(errno)); 169 bzero(p, (NCHARS + 1) * sizeof(int)); 170 for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) 171 if ((func)(cnt)) 172 *p++ = cnt; 173 *p = OOBCH; 174 175 s->cnt = 0; 176 s->state = SET; 177 s->set = cp->set; 178 } 179 180 static int 181 c_class(a, b) 182 const void *a, *b; 183 { 184 return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name)); 185 } 186 187 /* 188 * English doesn't have any equivalence classes, so for now 189 * we just syntax check and grab the character. 190 */ 191 static void 192 genequiv(s) 193 STR *s; 194 { 195 if (*s->str == '\\') { 196 s->equiv[0] = backslash(s); 197 if (*s->str != '=') 198 err("misplaced equivalence equals sign"); 199 } else { 200 s->equiv[0] = s->str[0]; 201 if (s->str[1] != '=') 202 err("misplaced equivalence equals sign"); 203 } 204 s->str += 2; 205 s->cnt = 0; 206 s->state = SET; 207 s->set = s->equiv; 208 } 209 210 static int 211 genrange(s) 212 STR *s; 213 { 214 int stopval; 215 char *savestart; 216 217 savestart = s->str; 218 stopval = *++s->str == '\\' ? backslash(s) : *s->str; 219 if (stopval < s->lastch) { 220 s->str = savestart; 221 return (0); 222 } 223 s->cnt = stopval - s->lastch + 1; 224 s->state = RANGE; 225 --s->lastch; 226 return (1); 227 } 228 229 static void 230 genseq(s) 231 STR *s; 232 { 233 char *ep; 234 235 if (s->which == STRING1) 236 err("sequences only valid in string2"); 237 238 if (*s->str == '\\') 239 s->lastch = backslash(s); 240 else 241 s->lastch = *s->str++; 242 if (*s->str != '*') 243 err("misplaced sequence asterisk"); 244 245 switch (*++s->str) { 246 case '\\': 247 s->cnt = backslash(s); 248 break; 249 case ']': 250 s->cnt = 0; 251 ++s->str; 252 break; 253 default: 254 if (isdigit(*s->str)) { 255 s->cnt = strtol(s->str, &ep, 0); 256 if (*ep == ']') { 257 s->str = ep + 1; 258 break; 259 } 260 } 261 err("illegal sequence count"); 262 /* NOTREACHED */ 263 } 264 265 s->state = s->cnt ? SEQUENCE : INFINITE; 266 } 267 268 /* Use the #defines isXXX() here, DON'T use them above. */ 269 #include <ctype.h> 270 271 /* 272 * Translate \??? into a character. Up to 3 octal digits, if no digits either 273 * an escape code or a literal character. 274 */ 275 static int 276 backslash(s) 277 register STR *s; 278 { 279 register int ch, cnt, val; 280 281 for (cnt = val = 0;;) { 282 ch = *++s->str; 283 if (!isascii(ch) || !isdigit(ch)) 284 break; 285 val = val * 8 + ch - '0'; 286 if (++cnt == 3) { 287 ++s->str; 288 break; 289 } 290 } 291 if (cnt) 292 return (val); 293 if (ch != '\0') 294 ++s->str; 295 switch (ch) { 296 case 'a': /* escape characters */ 297 return ('\7'); 298 case 'b': 299 return ('\b'); 300 case 'f': 301 return ('\f'); 302 case 'n': 303 return ('\n'); 304 case 'r': 305 return ('\r'); 306 case 't': 307 return ('\t'); 308 case 'v': 309 return ('\13'); 310 case '\0': /* \" -> \ */ 311 s->state = EOS; 312 return ('\\'); 313 default: /* \x" -> x */ 314 return (ch); 315 } 316 } 317