1 /* $OpenBSD: tokenizer.c,v 1.21 2016/04/11 21:17:29 schwarze Exp $ */ 2 /* $NetBSD: tokenizer.c,v 1.28 2016/04/11 18:56:31 christos Exp $ */ 3 4 /*- 5 * Copyright (c) 1992, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Christos Zoulas of Cornell University. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include "config.h" 37 38 /* We build this file twice, once as NARROW, once as WIDE. */ 39 /* 40 * tokenize.c: Bourne shell like tokenizer 41 */ 42 #include <stdlib.h> 43 #include <string.h> 44 45 #include "histedit.h" 46 47 typedef enum { 48 Q_none, Q_single, Q_double, Q_one, Q_doubleone 49 } quote_t; 50 51 #define TOK_KEEP 1 52 #define TOK_EAT 2 53 54 #define WINCR 20 55 #define AINCR 10 56 57 #define IFS STR("\t \n") 58 59 #ifdef NARROWCHAR 60 #define Char char 61 #define FUN(prefix, rest) prefix ## _ ## rest 62 #define TYPE(type) type 63 #define STR(x) x 64 #define Strchr(s, c) strchr(s, c) 65 #define tok_strdup(s) strdup(s) 66 #else 67 #define Char wchar_t 68 #define FUN(prefix, rest) prefix ## _w ## rest 69 #define TYPE(type) type ## W 70 #define STR(x) L ## x 71 #define Strchr(s, c) wcschr(s, c) 72 #define tok_strdup(s) wcsdup(s) 73 #endif 74 75 struct TYPE(tokenizer) { 76 Char *ifs; /* In field separator */ 77 int argc, amax; /* Current and maximum number of args */ 78 Char **argv; /* Argument list */ 79 Char *wptr, *wmax; /* Space and limit on the word buffer */ 80 Char *wstart; /* Beginning of next word */ 81 Char *wspace; /* Space of word buffer */ 82 quote_t quote; /* Quoting state */ 83 int flags; /* flags; */ 84 }; 85 86 87 static void FUN(tok,finish)(TYPE(Tokenizer) *); 88 89 90 /* FUN(tok,finish)(): 91 * Finish a word in the tokenizer. 92 */ 93 static void 94 FUN(tok,finish)(TYPE(Tokenizer) *tok) 95 { 96 97 *tok->wptr = '\0'; 98 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 99 tok->argv[tok->argc++] = tok->wstart; 100 tok->argv[tok->argc] = NULL; 101 tok->wstart = ++tok->wptr; 102 } 103 tok->flags &= ~TOK_KEEP; 104 } 105 106 107 /* FUN(tok,init)(): 108 * Initialize the tokenizer 109 */ 110 TYPE(Tokenizer) * 111 FUN(tok,init)(const Char *ifs) 112 { 113 TYPE(Tokenizer) *tok = malloc(sizeof(TYPE(Tokenizer))); 114 115 if (tok == NULL) 116 return NULL; 117 tok->ifs = tok_strdup(ifs ? ifs : IFS); 118 if (tok->ifs == NULL) { 119 free(tok); 120 return NULL; 121 } 122 tok->argc = 0; 123 tok->amax = AINCR; 124 tok->argv = reallocarray(NULL, tok->amax, sizeof(*tok->argv)); 125 if (tok->argv == NULL) { 126 free(tok->ifs); 127 free(tok); 128 return NULL; 129 } 130 tok->argv[0] = NULL; 131 tok->wspace = reallocarray(NULL, WINCR, sizeof(*tok->wspace)); 132 if (tok->wspace == NULL) { 133 free(tok->argv); 134 free(tok->ifs); 135 free(tok); 136 return NULL; 137 } 138 tok->wmax = tok->wspace + WINCR; 139 tok->wstart = tok->wspace; 140 tok->wptr = tok->wspace; 141 tok->flags = 0; 142 tok->quote = Q_none; 143 144 return tok; 145 } 146 147 148 /* FUN(tok,reset)(): 149 * Reset the tokenizer 150 */ 151 void 152 FUN(tok,reset)(TYPE(Tokenizer) *tok) 153 { 154 155 tok->argc = 0; 156 tok->wstart = tok->wspace; 157 tok->wptr = tok->wspace; 158 tok->flags = 0; 159 tok->quote = Q_none; 160 } 161 162 163 /* FUN(tok,end)(): 164 * Clean up 165 */ 166 void 167 FUN(tok,end)(TYPE(Tokenizer) *tok) 168 { 169 170 free(tok->ifs); 171 free(tok->wspace); 172 free(tok->argv); 173 free(tok); 174 } 175 176 177 178 /* FUN(tok,line)(): 179 * Bourne shell (sh(1)) like tokenizing 180 * Arguments: 181 * tok current tokenizer state (setup with FUN(tok,init)()) 182 * line line to parse 183 * Returns: 184 * -1 Internal error 185 * 3 Quoted return 186 * 2 Unmatched double quote 187 * 1 Unmatched single quote 188 * 0 Ok 189 * Modifies (if return value is 0): 190 * argc number of arguments 191 * argv argument array 192 * cursorc if !NULL, argv element containing cursor 193 * cursorv if !NULL, offset in argv[cursorc] of cursor 194 */ 195 int 196 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line, 197 int *argc, const Char ***argv, int *cursorc, int *cursoro) 198 { 199 const Char *ptr; 200 int cc, co; 201 202 cc = co = -1; 203 ptr = line->buffer; 204 for (ptr = line->buffer; ;ptr++) { 205 if (ptr >= line->lastchar) 206 ptr = STR(""); 207 if (ptr == line->cursor) { 208 cc = tok->argc; 209 co = (int)(tok->wptr - tok->wstart); 210 } 211 switch (*ptr) { 212 case '\'': 213 tok->flags |= TOK_KEEP; 214 tok->flags &= ~TOK_EAT; 215 switch (tok->quote) { 216 case Q_none: 217 tok->quote = Q_single; /* Enter single quote 218 * mode */ 219 break; 220 221 case Q_single: /* Exit single quote mode */ 222 tok->quote = Q_none; 223 break; 224 225 case Q_one: /* Quote this ' */ 226 tok->quote = Q_none; 227 *tok->wptr++ = *ptr; 228 break; 229 230 case Q_double: /* Stay in double quote mode */ 231 *tok->wptr++ = *ptr; 232 break; 233 234 case Q_doubleone: /* Quote this ' */ 235 tok->quote = Q_double; 236 *tok->wptr++ = *ptr; 237 break; 238 239 default: 240 return -1; 241 } 242 break; 243 244 case '"': 245 tok->flags &= ~TOK_EAT; 246 tok->flags |= TOK_KEEP; 247 switch (tok->quote) { 248 case Q_none: /* Enter double quote mode */ 249 tok->quote = Q_double; 250 break; 251 252 case Q_double: /* Exit double quote mode */ 253 tok->quote = Q_none; 254 break; 255 256 case Q_one: /* Quote this " */ 257 tok->quote = Q_none; 258 *tok->wptr++ = *ptr; 259 break; 260 261 case Q_single: /* Stay in single quote mode */ 262 *tok->wptr++ = *ptr; 263 break; 264 265 case Q_doubleone: /* Quote this " */ 266 tok->quote = Q_double; 267 *tok->wptr++ = *ptr; 268 break; 269 270 default: 271 return -1; 272 } 273 break; 274 275 case '\\': 276 tok->flags |= TOK_KEEP; 277 tok->flags &= ~TOK_EAT; 278 switch (tok->quote) { 279 case Q_none: /* Quote next character */ 280 tok->quote = Q_one; 281 break; 282 283 case Q_double: /* Quote next character */ 284 tok->quote = Q_doubleone; 285 break; 286 287 case Q_one: /* Quote this, restore state */ 288 *tok->wptr++ = *ptr; 289 tok->quote = Q_none; 290 break; 291 292 case Q_single: /* Stay in single quote mode */ 293 *tok->wptr++ = *ptr; 294 break; 295 296 case Q_doubleone: /* Quote this \ */ 297 tok->quote = Q_double; 298 *tok->wptr++ = *ptr; 299 break; 300 301 default: 302 return -1; 303 } 304 break; 305 306 case '\n': 307 tok->flags &= ~TOK_EAT; 308 switch (tok->quote) { 309 case Q_none: 310 goto tok_line_outok; 311 312 case Q_single: 313 case Q_double: 314 *tok->wptr++ = *ptr; /* Add the return */ 315 break; 316 317 case Q_doubleone: /* Back to double, eat the '\n' */ 318 tok->flags |= TOK_EAT; 319 tok->quote = Q_double; 320 break; 321 322 case Q_one: /* No quote, more eat the '\n' */ 323 tok->flags |= TOK_EAT; 324 tok->quote = Q_none; 325 break; 326 327 default: 328 return 0; 329 } 330 break; 331 332 case '\0': 333 switch (tok->quote) { 334 case Q_none: 335 /* Finish word and return */ 336 if (tok->flags & TOK_EAT) { 337 tok->flags &= ~TOK_EAT; 338 return 3; 339 } 340 goto tok_line_outok; 341 342 case Q_single: 343 return 1; 344 345 case Q_double: 346 return 2; 347 348 case Q_doubleone: 349 tok->quote = Q_double; 350 *tok->wptr++ = *ptr; 351 break; 352 353 case Q_one: 354 tok->quote = Q_none; 355 *tok->wptr++ = *ptr; 356 break; 357 358 default: 359 return -1; 360 } 361 break; 362 363 default: 364 tok->flags &= ~TOK_EAT; 365 switch (tok->quote) { 366 case Q_none: 367 if (Strchr(tok->ifs, *ptr) != NULL) 368 FUN(tok,finish)(tok); 369 else 370 *tok->wptr++ = *ptr; 371 break; 372 373 case Q_single: 374 case Q_double: 375 *tok->wptr++ = *ptr; 376 break; 377 378 379 case Q_doubleone: 380 *tok->wptr++ = '\\'; 381 tok->quote = Q_double; 382 *tok->wptr++ = *ptr; 383 break; 384 385 case Q_one: 386 tok->quote = Q_none; 387 *tok->wptr++ = *ptr; 388 break; 389 390 default: 391 return -1; 392 393 } 394 break; 395 } 396 397 if (tok->wptr >= tok->wmax - 4) { 398 size_t size = tok->wmax - tok->wspace + WINCR; 399 Char *s = reallocarray(tok->wspace, size, sizeof(*s)); 400 if (s == NULL) 401 return -1; 402 403 if (s != tok->wspace) { 404 int i; 405 for (i = 0; i < tok->argc; i++) { 406 tok->argv[i] = 407 (tok->argv[i] - tok->wspace) + s; 408 } 409 tok->wptr = (tok->wptr - tok->wspace) + s; 410 tok->wstart = (tok->wstart - tok->wspace) + s; 411 tok->wspace = s; 412 } 413 tok->wmax = s + size; 414 } 415 if (tok->argc >= tok->amax - 4) { 416 Char **p; 417 tok->amax += AINCR; 418 p = reallocarray(tok->argv, tok->amax, sizeof(*p)); 419 if (p == NULL) { 420 tok->amax -= AINCR; 421 return -1; 422 } 423 tok->argv = p; 424 } 425 } 426 tok_line_outok: 427 if (cc == -1 && co == -1) { 428 cc = tok->argc; 429 co = (int)(tok->wptr - tok->wstart); 430 } 431 if (cursorc != NULL) 432 *cursorc = cc; 433 if (cursoro != NULL) 434 *cursoro = co; 435 FUN(tok,finish)(tok); 436 *argv = (const Char **)tok->argv; 437 *argc = tok->argc; 438 return 0; 439 } 440 441 /* FUN(tok,str)(): 442 * Simpler version of tok_line, taking a NUL terminated line 443 * and splitting into words, ignoring cursor state. 444 */ 445 int 446 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc, 447 const Char ***argv) 448 { 449 TYPE(LineInfo) li; 450 451 memset(&li, 0, sizeof(li)); 452 li.buffer = line; 453 li.cursor = li.lastchar = Strchr(line, '\0'); 454 return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL); 455 } 456