1 /*- 2 * Copyright (c) 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)str.c 8.2 (Berkeley) 4/28/95 30 * $FreeBSD: head/usr.bin/tr/str.c 229403 2012-01-03 18:51:58Z ed $ 31 */ 32 33 #include <sys/types.h> 34 35 #include <ctype.h> 36 #include <err.h> 37 #include <errno.h> 38 #include <stddef.h> 39 #include <stdio.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <wchar.h> 43 #include <wctype.h> 44 45 #include "extern.h" 46 47 static int backslash(STR *, int *); 48 static int bracket(STR *); 49 static void genclass(STR *); 50 static void genequiv(STR *); 51 static int genrange(STR *, int); 52 static void genseq(STR *); 53 54 wint_t 55 next(STR *s) 56 { 57 int is_octal; 58 wint_t ch; 59 wchar_t wch; 60 size_t clen; 61 62 switch (s->state) { 63 case EOS: 64 return (0); 65 case INFINITE: 66 return (1); 67 case NORMAL: 68 switch (*s->str) { 69 case '\0': 70 s->state = EOS; 71 return (0); 72 case '\\': 73 s->lastch = backslash(s, &is_octal); 74 break; 75 case '[': 76 if (bracket(s)) 77 return (next(s)); 78 /* FALLTHROUGH */ 79 default: 80 clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); 81 if (clen == (size_t)-1 || clen == (size_t)-2 || 82 clen == 0) 83 errc(1, EILSEQ, NULL); 84 is_octal = 0; 85 s->lastch = wch; 86 s->str += clen; 87 break; 88 } 89 90 /* We can start a range at any time. */ 91 if (s->str[0] == '-' && genrange(s, is_octal)) 92 return (next(s)); 93 return (1); 94 case RANGE: 95 if (s->cnt-- == 0) { 96 s->state = NORMAL; 97 return (next(s)); 98 } 99 ++s->lastch; 100 return (1); 101 case SEQUENCE: 102 if (s->cnt-- == 0) { 103 s->state = NORMAL; 104 return (next(s)); 105 } 106 return (1); 107 case CCLASS: 108 case CCLASS_UPPER: 109 case CCLASS_LOWER: 110 s->cnt++; 111 ch = nextwctype(s->lastch, s->cclass); 112 if (ch == -1) { 113 s->state = NORMAL; 114 return (next(s)); 115 } 116 s->lastch = ch; 117 return (1); 118 case SET: 119 if ((ch = s->set[s->cnt++]) == OOBCH) { 120 s->state = NORMAL; 121 return (next(s)); 122 } 123 s->lastch = ch; 124 return (1); 125 default: 126 return (0); 127 } 128 /* NOTREACHED */ 129 } 130 131 static int 132 bracket(STR *s) 133 { 134 char *p; 135 136 switch (s->str[1]) { 137 case ':': /* "[:class:]" */ 138 if ((p = strchr(s->str + 2, ']')) == NULL) 139 return (0); 140 if (*(p - 1) != ':' || p - s->str < 4) 141 goto repeat; 142 *(p - 1) = '\0'; 143 s->str += 2; 144 genclass(s); 145 s->str = p + 1; 146 return (1); 147 case '=': /* "[=equiv=]" */ 148 if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL) 149 return (0); 150 if (*(p - 1) != '=' || p - s->str < 4) 151 goto repeat; 152 s->str += 2; 153 genequiv(s); 154 return (1); 155 default: /* "[\###*n]" or "[#*n]" */ 156 repeat: 157 if ((p = strpbrk(s->str + 2, "*]")) == NULL) 158 return (0); 159 if (p[0] != '*' || strchr(p, ']') == NULL) 160 return (0); 161 s->str += 1; 162 genseq(s); 163 return (1); 164 } 165 /* NOTREACHED */ 166 } 167 168 static void 169 genclass(STR *s) 170 { 171 if ((s->cclass = wctype(s->str)) == 0) 172 errx(1, "unknown class %s", s->str); 173 s->cnt = 0; 174 s->lastch = -1; /* incremented before check in next() */ 175 if (strcmp(s->str, "upper") == 0) 176 s->state = CCLASS_UPPER; 177 else if (strcmp(s->str, "lower") == 0) 178 s->state = CCLASS_LOWER; 179 else 180 s->state = CCLASS; 181 } 182 183 static void 184 genequiv(STR *s) 185 { 186 int i, p, pri; 187 char src[2], dst[3]; 188 size_t clen; 189 wchar_t wc; 190 191 if (*s->str == '\\') { 192 s->equiv[0] = backslash(s, NULL); 193 if (*s->str != '=') 194 errx(1, "misplaced equivalence equals sign"); 195 s->str += 2; 196 } else { 197 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 198 if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) 199 errc(1, EILSEQ, NULL); 200 s->equiv[0] = wc; 201 if (s->str[clen] != '=') 202 errx(1, "misplaced equivalence equals sign"); 203 s->str += clen + 2; 204 } 205 206 /* 207 * Calculate the set of all characters in the same equivalence class 208 * as the specified character (they will have the same primary 209 * collation weights). 210 * XXX Knows too much about how strxfrm() is implemented. Assumes 211 * it fills the string with primary collation weight bytes. Only one- 212 * to-one mappings are supported. 213 * XXX Equivalence classes not supported in multibyte locales. 214 */ 215 src[0] = (char)s->equiv[0]; 216 src[1] = '\0'; 217 if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { 218 pri = (unsigned char)*dst; 219 for (p = 1, i = 1; i < NCHARS_SB; i++) { 220 *src = i; 221 if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && 222 pri == (unsigned char)*dst) 223 s->equiv[p++] = i; 224 } 225 s->equiv[p] = OOBCH; 226 } 227 228 s->cnt = 0; 229 s->state = SET; 230 s->set = s->equiv; 231 } 232 233 static int 234 genrange(STR *s, int was_octal) 235 { 236 int stopval, octal; 237 char *savestart; 238 int n, cnt, *p; 239 size_t clen; 240 wchar_t wc; 241 242 octal = 0; 243 savestart = s->str; 244 if (*++s->str == '\\') 245 stopval = backslash(s, &octal); 246 else { 247 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 248 if (clen == (size_t)-1 || clen == (size_t)-2) 249 errc(1, EILSEQ, NULL); 250 stopval = wc; 251 s->str += clen; 252 } 253 /* 254 * XXX Characters are not ordered according to collating sequence in 255 * multibyte locales. 256 */ 257 if (octal || was_octal || MB_CUR_MAX > 1) { 258 if (stopval < s->lastch) { 259 s->str = savestart; 260 return (0); 261 } 262 s->cnt = stopval - s->lastch + 1; 263 s->state = RANGE; 264 --s->lastch; 265 return (1); 266 } 267 if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { 268 s->str = savestart; 269 return (0); 270 } 271 if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) 272 err(1, "genrange() malloc"); 273 for (cnt = 0; cnt < NCHARS_SB; cnt++) 274 if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && 275 charcoll((const void *)&cnt, (const void *)&stopval) <= 0) 276 *p++ = cnt; 277 *p = OOBCH; 278 n = p - s->set; 279 280 s->cnt = 0; 281 s->state = SET; 282 if (n > 1) 283 mergesort(s->set, n, sizeof(*(s->set)), charcoll); 284 return (1); 285 } 286 287 static void 288 genseq(STR *s) 289 { 290 char *ep; 291 wchar_t wc; 292 size_t clen; 293 294 if (s->which == STRING1) 295 errx(1, "sequences only valid in string2"); 296 297 if (*s->str == '\\') 298 s->lastch = backslash(s, NULL); 299 else { 300 clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); 301 if (clen == (size_t)-1 || clen == (size_t)-2) 302 errc(1, EILSEQ, NULL); 303 s->lastch = wc; 304 s->str += clen; 305 } 306 if (*s->str != '*') 307 errx(1, "misplaced sequence asterisk"); 308 309 switch (*++s->str) { 310 case '\\': 311 s->cnt = backslash(s, NULL); 312 break; 313 case ']': 314 s->cnt = 0; 315 ++s->str; 316 break; 317 default: 318 if (isdigit((u_char)*s->str)) { 319 s->cnt = strtol(s->str, &ep, 0); 320 if (*ep == ']') { 321 s->str = ep + 1; 322 break; 323 } 324 } 325 errx(1, "illegal sequence count"); 326 /* NOTREACHED */ 327 } 328 329 s->state = s->cnt ? SEQUENCE : INFINITE; 330 } 331 332 /* 333 * Translate \??? into a character. Up to 3 octal digits, if no digits either 334 * an escape code or a literal character. 335 */ 336 static int 337 backslash(STR *s, int *is_octal) 338 { 339 int ch, cnt, val; 340 341 if (is_octal != NULL) 342 *is_octal = 0; 343 for (cnt = val = 0;;) { 344 ch = (u_char)*++s->str; 345 if (!isdigit(ch) || ch > '7') 346 break; 347 val = val * 8 + ch - '0'; 348 if (++cnt == 3) { 349 ++s->str; 350 break; 351 } 352 } 353 if (cnt) { 354 if (is_octal != NULL) 355 *is_octal = 1; 356 return (val); 357 } 358 if (ch != '\0') 359 ++s->str; 360 switch (ch) { 361 case 'a': /* escape characters */ 362 return ('\7'); 363 case 'b': 364 return ('\b'); 365 case 'f': 366 return ('\f'); 367 case 'n': 368 return ('\n'); 369 case 'r': 370 return ('\r'); 371 case 't': 372 return ('\t'); 373 case 'v': 374 return ('\13'); 375 case '\0': /* \" -> \ */ 376 s->state = EOS; 377 return ('\\'); 378 default: /* \x" -> x */ 379 return (ch); 380 } 381 } 382