xref: /netbsd/lib/libedit/tokenizer.c (revision c4a72b64)
1 /*	$NetBSD: tokenizer.c,v 1.11 2002/10/27 20:24:29 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Christos Zoulas of Cornell University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include "config.h"
40 #if !defined(lint) && !defined(SCCSID)
41 #if 0
42 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
43 #else
44 __RCSID("$NetBSD: tokenizer.c,v 1.11 2002/10/27 20:24:29 christos Exp $");
45 #endif
46 #endif /* not lint && not SCCSID */
47 
/*
 * tokenizer.c: Bourne shell-like tokenizer
 */
51 #include <string.h>
52 #include <stdlib.h>
53 #include "tokenizer.h"
54 
/*
 * Quoting state of the tokenizer while it scans a line.
 * The enumerator order (and therefore the numeric values) is unchanged.
 */
typedef enum {
	Q_none,		/* Not inside any quoting			 */
	Q_single,	/* Inside single quotes				 */
	Q_double,	/* Inside double quotes				 */
	Q_one,		/* Backslash seen outside quotes		 */
	Q_doubleone	/* Backslash seen inside double quotes		 */
} quote_t;
58 
/* Default field separators, used when tok_init() is given a NULL ifs */
#define	IFS		"\t \n"

/*
 * Bits kept in tokenizer.flags:
 *	TOK_KEEP  a quote or backslash was seen, so tok_finish() records
 *		  the current word even if it is empty
 *	TOK_EAT   a backslash-newline pair was just consumed; if the line
 *		  ends right after it tok_line() returns 3
 */
#define	TOK_KEEP	1
#define	TOK_EAT		2

/*
 * Initial size and growth step of the word buffer (WINCR, in bytes)
 * and of the argument vector (AINCR, in entries)
 */
#define	WINCR		20
#define	AINCR		10

/* Allocator hooks; they map directly onto the C library allocator */
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
70 
71 
72 struct tokenizer {
73 	char	*ifs;		/* In field separator			 */
74 	int	 argc, amax;	/* Current and maximum number of args	 */
75 	char   **argv;		/* Argument list			 */
76 	char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
77 	char	*wstart;	/* Beginning of next word		 */
78 	char	*wspace;	/* Space of word buffer			 */
79 	quote_t	 quote;		/* Quoting state			 */
80 	int	 flags;		/* flags;				 */
81 };
82 
83 
84 private void tok_finish(Tokenizer *);
85 
86 
87 /* tok_finish():
88  *	Finish a word in the tokenizer.
89  */
90 private void
91 tok_finish(Tokenizer *tok)
92 {
93 
94 	*tok->wptr = '\0';
95 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
96 		tok->argv[tok->argc++] = tok->wstart;
97 		tok->argv[tok->argc] = NULL;
98 		tok->wstart = ++tok->wptr;
99 	}
100 	tok->flags &= ~TOK_KEEP;
101 }
102 
103 
104 /* tok_init():
105  *	Initialize the tokenizer
106  */
107 public Tokenizer *
108 tok_init(const char *ifs)
109 {
110 	Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
111 
112 	if (tok == NULL)
113 		return NULL;
114 	tok->ifs = strdup(ifs ? ifs : IFS);
115 	if (tok->ifs == NULL) {
116 		tok_free((ptr_t)tok);
117 		return NULL;
118 	}
119 	tok->argc = 0;
120 	tok->amax = AINCR;
121 	tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
122 	if (tok->argv == NULL) {
123 		tok_free((ptr_t)tok->ifs);
124 		tok_free((ptr_t)tok);
125 		return NULL;
126 	}
127 	tok->argv[0] = NULL;
128 	tok->wspace = (char *) tok_malloc(WINCR);
129 	if (tok->wspace == NULL) {
130 		tok_free((ptr_t)tok->argv);
131 		tok_free((ptr_t)tok->ifs);
132 		tok_free((ptr_t)tok);
133 		return NULL;
134 	}
135 	tok->wmax = tok->wspace + WINCR;
136 	tok->wstart = tok->wspace;
137 	tok->wptr = tok->wspace;
138 	tok->flags = 0;
139 	tok->quote = Q_none;
140 
141 	return (tok);
142 }
143 
144 
145 /* tok_reset():
146  *	Reset the tokenizer
147  */
148 public void
149 tok_reset(Tokenizer *tok)
150 {
151 
152 	tok->argc = 0;
153 	tok->wstart = tok->wspace;
154 	tok->wptr = tok->wspace;
155 	tok->flags = 0;
156 	tok->quote = Q_none;
157 }
158 
159 
160 /* tok_end():
161  *	Clean up
162  */
163 public void
164 tok_end(Tokenizer *tok)
165 {
166 
167 	tok_free((ptr_t) tok->ifs);
168 	tok_free((ptr_t) tok->wspace);
169 	tok_free((ptr_t) tok->argv);
170 	tok_free((ptr_t) tok);
171 }
172 
173 
174 
175 /* tok_line():
176  *	Bourne shell like tokenizing
177  *	Return:
178  *		-1: Internal error
179  *		 3: Quoted return
180  *		 2: Unmatched double quote
181  *		 1: Unmatched single quote
182  *		 0: Ok
183  */
184 public int
185 tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv)
186 {
187 	const char *ptr;
188 
189 	for (;;) {
190 		switch (*(ptr = line++)) {
191 		case '\'':
192 			tok->flags |= TOK_KEEP;
193 			tok->flags &= ~TOK_EAT;
194 			switch (tok->quote) {
195 			case Q_none:
196 				tok->quote = Q_single;	/* Enter single quote
197 							 * mode */
198 				break;
199 
200 			case Q_single:	/* Exit single quote mode */
201 				tok->quote = Q_none;
202 				break;
203 
204 			case Q_one:	/* Quote this ' */
205 				tok->quote = Q_none;
206 				*tok->wptr++ = *ptr;
207 				break;
208 
209 			case Q_double:	/* Stay in double quote mode */
210 				*tok->wptr++ = *ptr;
211 				break;
212 
213 			case Q_doubleone:	/* Quote this ' */
214 				tok->quote = Q_double;
215 				*tok->wptr++ = *ptr;
216 				break;
217 
218 			default:
219 				return (-1);
220 			}
221 			break;
222 
223 		case '"':
224 			tok->flags &= ~TOK_EAT;
225 			tok->flags |= TOK_KEEP;
226 			switch (tok->quote) {
227 			case Q_none:	/* Enter double quote mode */
228 				tok->quote = Q_double;
229 				break;
230 
231 			case Q_double:	/* Exit double quote mode */
232 				tok->quote = Q_none;
233 				break;
234 
235 			case Q_one:	/* Quote this " */
236 				tok->quote = Q_none;
237 				*tok->wptr++ = *ptr;
238 				break;
239 
240 			case Q_single:	/* Stay in single quote mode */
241 				*tok->wptr++ = *ptr;
242 				break;
243 
244 			case Q_doubleone:	/* Quote this " */
245 				tok->quote = Q_double;
246 				*tok->wptr++ = *ptr;
247 				break;
248 
249 			default:
250 				return (-1);
251 			}
252 			break;
253 
254 		case '\\':
255 			tok->flags |= TOK_KEEP;
256 			tok->flags &= ~TOK_EAT;
257 			switch (tok->quote) {
258 			case Q_none:	/* Quote next character */
259 				tok->quote = Q_one;
260 				break;
261 
262 			case Q_double:	/* Quote next character */
263 				tok->quote = Q_doubleone;
264 				break;
265 
266 			case Q_one:	/* Quote this, restore state */
267 				*tok->wptr++ = *ptr;
268 				tok->quote = Q_none;
269 				break;
270 
271 			case Q_single:	/* Stay in single quote mode */
272 				*tok->wptr++ = *ptr;
273 				break;
274 
275 			case Q_doubleone:	/* Quote this \ */
276 				tok->quote = Q_double;
277 				*tok->wptr++ = *ptr;
278 				break;
279 
280 			default:
281 				return (-1);
282 			}
283 			break;
284 
285 		case '\n':
286 			tok->flags &= ~TOK_EAT;
287 			switch (tok->quote) {
288 			case Q_none:
289 				tok_finish(tok);
290 				*argv = (const char **)tok->argv;
291 				*argc = tok->argc;
292 				return (0);
293 
294 			case Q_single:
295 			case Q_double:
296 				*tok->wptr++ = *ptr;	/* Add the return */
297 				break;
298 
299 			case Q_doubleone:   /* Back to double, eat the '\n' */
300 				tok->flags |= TOK_EAT;
301 				tok->quote = Q_double;
302 				break;
303 
304 			case Q_one:	/* No quote, more eat the '\n' */
305 				tok->flags |= TOK_EAT;
306 				tok->quote = Q_none;
307 				break;
308 
309 			default:
310 				return (0);
311 			}
312 			break;
313 
314 		case '\0':
315 			switch (tok->quote) {
316 			case Q_none:
317 				/* Finish word and return */
318 				if (tok->flags & TOK_EAT) {
319 					tok->flags &= ~TOK_EAT;
320 					return (3);
321 				}
322 				tok_finish(tok);
323 				*argv = (const char **)tok->argv;
324 				*argc = tok->argc;
325 				return (0);
326 
327 			case Q_single:
328 				return (1);
329 
330 			case Q_double:
331 				return (2);
332 
333 			case Q_doubleone:
334 				tok->quote = Q_double;
335 				*tok->wptr++ = *ptr;
336 				break;
337 
338 			case Q_one:
339 				tok->quote = Q_none;
340 				*tok->wptr++ = *ptr;
341 				break;
342 
343 			default:
344 				return (-1);
345 			}
346 			break;
347 
348 		default:
349 			tok->flags &= ~TOK_EAT;
350 			switch (tok->quote) {
351 			case Q_none:
352 				if (strchr(tok->ifs, *ptr) != NULL)
353 					tok_finish(tok);
354 				else
355 					*tok->wptr++ = *ptr;
356 				break;
357 
358 			case Q_single:
359 			case Q_double:
360 				*tok->wptr++ = *ptr;
361 				break;
362 
363 
364 			case Q_doubleone:
365 				*tok->wptr++ = '\\';
366 				tok->quote = Q_double;
367 				*tok->wptr++ = *ptr;
368 				break;
369 
370 			case Q_one:
371 				tok->quote = Q_none;
372 				*tok->wptr++ = *ptr;
373 				break;
374 
375 			default:
376 				return (-1);
377 
378 			}
379 			break;
380 		}
381 
382 		if (tok->wptr >= tok->wmax - 4) {
383 			size_t size = tok->wmax - tok->wspace + WINCR;
384 			char *s = (char *) tok_realloc(tok->wspace, size);
385 			if (s == NULL)
386 				return (-1);
387 
388 			if (s != tok->wspace) {
389 				int i;
390 				for (i = 0; i < tok->argc; i++) {
391 				    tok->argv[i] =
392 					(tok->argv[i] - tok->wspace) + s;
393 				}
394 				tok->wptr = (tok->wptr - tok->wspace) + s;
395 				tok->wstart = (tok->wstart - tok->wspace) + s;
396 				tok->wspace = s;
397 			}
398 			tok->wmax = s + size;
399 		}
400 		if (tok->argc >= tok->amax - 4) {
401 			char **p;
402 			tok->amax += AINCR;
403 			p = (char **) tok_realloc(tok->argv,
404 			    tok->amax * sizeof(char *));
405 			if (p == NULL)
406 				return (-1);
407 			tok->argv = p;
408 		}
409 	}
410 }
411