xref: /original-bsd/lib/libedit/tokenizer.c (revision c3e32dec)
1 /*-
2  * Copyright (c) 1992, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Christos Zoulas of Cornell University.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if !defined(lint) && !defined(SCCSID)
12 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 06/04/93";
13 #endif /* not lint && not SCCSID */
14 
15 /*
16  * tokenizer.c: Bourne shell like tokenizer
17  */
18 #include "sys.h"
19 #include <string.h>
20 #include <stdlib.h>
21 #include "tokenizer.h"
22 
/* Quoting state the scanner is in between two input characters. */
typedef enum {
    Q_none,		/* not inside any quote			*/
    Q_single,		/* inside '...'				*/
    Q_double,		/* inside "..."				*/
    Q_one,		/* backslash seen outside quotes	*/
    Q_doubleone		/* backslash seen inside "..."		*/
} quote_t;

/* Word separators used when tok_init() is handed a NULL ifs. */
#define IFS "\t \n"

/* Bits kept in the tokenizer's flags member. */
#define TOK_KEEP	1	/* emit the current word even when empty */
#define TOK_EAT		2	/* a backslash-newline was just swallowed */

/* Initial size and growth step of the word buffer (bytes). */
#define WINCR 20
/* Initial size and growth step of the argument vector (entries). */
#define AINCR 10

/* Allocation hooks; all three map straight onto the C library. */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
36 
37 
38 struct tokenizer {
39     char   *ifs;		/* In field separator			*/
40     int     argc, amax;		/* Current and maximum number of args	*/
41     char  **argv;		/* Argument list			*/
42     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
43     char   *wstart;		/* Beginning of next word		*/
44     char   *wspace;		/* Space of word buffer			*/
45     quote_t quote;		/* Quoting state			*/
46     int	    flags;		/* flags;				*/
47 };
48 
49 
50 private void tok_finish	__P((Tokenizer *));
51 
52 
53 /* tok_finish():
54  *	Finish a word in the tokenizer.
55  */
56 private void
57 tok_finish(tok)
58     Tokenizer *tok;
59 {
60     *tok->wptr = '\0';
61     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
62 	tok->argv[tok->argc++] = tok->wstart;
63 	tok->argv[tok->argc] = NULL;
64 	tok->wstart = ++tok->wptr;
65     }
66     tok->flags &= ~TOK_KEEP;
67 }
68 
69 
70 /* tok_init():
71  *	Initialize the tokenizer
72  */
73 public Tokenizer *
74 tok_init(ifs)
75     const char *ifs;
76 {
77     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
78 
79     tok->ifs     = strdup(ifs ? ifs : IFS);
80     tok->argc    = 0;
81     tok->amax    = AINCR;
82     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
83     tok->argv[0] = NULL;
84     tok->wspace  = (char *) tok_malloc(WINCR);
85     tok->wmax    = tok->wspace + WINCR;
86     tok->wstart  = tok->wspace;
87     tok->wptr    = tok->wspace;
88     tok->flags   = 0;
89     tok->quote   = Q_none;
90 
91     return tok;
92 }
93 
94 
95 /* tok_reset():
96  *	Reset the tokenizer
97  */
98 public void
99 tok_reset(tok)
100     Tokenizer *tok;
101 {
102     tok->argc  = 0;
103     tok->wstart = tok->wspace;
104     tok->wptr = tok->wspace;
105     tok->flags = 0;
106     tok->quote = Q_none;
107 }
108 
109 
110 /* tok_end():
111  *	Clean up
112  */
113 public void
114 tok_end(tok)
115     Tokenizer *tok;
116 {
117     tok_free((ptr_t) tok->ifs);
118     tok_free((ptr_t) tok->wspace);
119     tok_free((ptr_t) tok->argv);
120     tok_free((ptr_t) tok);
121 }
122 
123 
124 
125 /* tok_line():
126  *	Bourne shell like tokenizing
127  *	Return:
128  *		-1: Internal error
129  *		 3: Quoted return
130  *		 2: Unmatched double quote
131  *		 1: Unmatched single quote
132  *		 0: Ok
133  */
134 public int
135 tok_line(tok, line, argc, argv)
136     Tokenizer *tok;
137     const char* line;
138     int *argc;
139     char ***argv;
140 {
141     const char *ptr;
142 
143     while (1) {
144 	switch (*(ptr = line++)) {
145 	case '\'':
146 	    tok->flags |= TOK_KEEP;
147 	    tok->flags &= ~TOK_EAT;
148 	    switch (tok->quote) {
149 	    case Q_none:
150 		tok->quote = Q_single;	/* Enter single quote mode */
151 		break;
152 
153 	    case Q_single:		/* Exit single quote mode */
154 		tok->quote = Q_none;
155 		break;
156 
157 	    case Q_one:			/* Quote this ' */
158 		tok->quote = Q_none;
159 		*tok->wptr++ = *ptr;
160 		break;
161 
162 	    case Q_double:		/* Stay in double quote mode */
163 		*tok->wptr++ = *ptr;
164 		break;
165 
166 	    case Q_doubleone:		/* Quote this ' */
167 		tok->quote = Q_double;
168 		*tok->wptr++ = *ptr;
169 		break;
170 
171 	    default:
172 		return(-1);
173 	    }
174 	    break;
175 
176 	case '"':
177 	    tok->flags &= ~TOK_EAT;
178 	    tok->flags |= TOK_KEEP;
179 	    switch (tok->quote) {
180 	    case Q_none:		/* Enter double quote mode */
181 		tok->quote = Q_double;
182 		break;
183 
184 	    case Q_double:
185 		tok->quote = Q_none;	/* Exit double quote mode */
186 		break;
187 
188 	    case Q_one:			/* Quote this " */
189 		tok->quote = Q_none;
190 		*tok->wptr++ = *ptr;
191 		break;
192 
193 	    case Q_single:		/* Stay in single quote mode */
194 		*tok->wptr++ = *ptr;
195 		break;
196 
197 	    case Q_doubleone:		/* Quote this " */
198 		tok->quote = Q_double;
199 		*tok->wptr++ = *ptr;
200 		break;
201 
202 	    default:
203 		return(-1);
204 	    }
205 	    break;
206 
207 	case '\\':
208 	    tok->flags |= TOK_KEEP;
209 	    tok->flags &= ~TOK_EAT;
210 	    switch (tok->quote) {
211 	    case Q_none:		/* Quote next character */
212 		tok->quote = Q_one;
213 		break;
214 
215 	    case Q_double:
216 		tok->quote = Q_doubleone;/* Quote next character */
217 		break;
218 
219 	    case Q_one:
220 		*tok->wptr++ = *ptr;
221 		tok->quote = Q_none;	/* Quote this, restore state */
222 		break;
223 
224 	    case Q_single:		/* Stay in single quote mode */
225 		*tok->wptr++ = *ptr;
226 		break;
227 
228 	    case Q_doubleone:		/* Quote this \ */
229 		tok->quote = Q_double;
230 		*tok->wptr++ = *ptr;
231 		break;
232 
233 	    default:
234 		return(-1);
235 	    }
236 	    break;
237 
238 	case '\n':
239 	    tok->flags &= ~TOK_EAT;
240 	    switch (tok->quote) {
241 	    case Q_none:
242 		tok_finish(tok);
243 		*argv = tok->argv;
244 		*argc = tok->argc;
245 		return(0);
246 
247 	    case Q_single:
248 	    case Q_double:
249 		*tok->wptr++ = *ptr;	/* Add the return		*/
250 		break;
251 
252 	    case Q_doubleone:
253 		tok->flags |= TOK_EAT;
254 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
255 		break;
256 
257 	    case Q_one:
258 		tok->flags |= TOK_EAT;
259 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
260 		break;
261 
262 	    default:
263 		return(0);
264 	    }
265 	    break;
266 
267 	case '\0':
268 	    switch (tok->quote) {
269 	    case Q_none:
270 		/* Finish word and return */
271 		if (tok->flags & TOK_EAT) {
272 		    tok->flags &= ~TOK_EAT;
273 		    return 3;
274 		}
275 		tok_finish(tok);
276 		*argv = tok->argv;
277 		*argc = tok->argc;
278 		return(0);
279 
280 	    case Q_single:
281 		return(1);
282 
283 	    case Q_double:
284 		return(2);
285 
286 	    case Q_doubleone:
287 		tok->quote = Q_double;
288 		*tok->wptr++ = *ptr;
289 		break;
290 
291 	    case Q_one:
292 		tok->quote = Q_none;
293 		*tok->wptr++ = *ptr;
294 		break;
295 
296 	    default:
297 		return(-1);
298 	    }
299 	    break;
300 
301 	default:
302 	    tok->flags &= ~TOK_EAT;
303 	    switch (tok->quote) {
304 	    case Q_none:
305 		if (strchr(tok->ifs, *ptr) != NULL)
306 		    tok_finish(tok);
307 		else
308 		    *tok->wptr++ = *ptr;
309 		break;
310 
311 	    case Q_single:
312 	    case Q_double:
313 		*tok->wptr++ = *ptr;
314 		break;
315 
316 
317 	    case Q_doubleone:
318 		*tok->wptr++ = '\\';
319 		tok->quote = Q_double;
320 		*tok->wptr++ = *ptr;
321 		break;
322 
323 	    case Q_one:
324 		tok->quote = Q_none;
325 		*tok->wptr++ = *ptr;
326 		break;
327 
328 	    default:
329 		return(-1);
330 
331 	    }
332 	    break;
333 	}
334 
335 	if (tok->wptr >= tok->wmax - 4) {
336 	    size_t size = tok->wmax - tok->wspace + WINCR;
337 	    char *s = (char *) tok_realloc(tok->wspace, size);
338 	    /*SUPPRESS 22*/
339 	    int offs = s - tok->wspace;
340 
341 	    if (offs != 0) {
342 		int i;
343 		for (i = 0; i < tok->argc; i++)
344 		    tok->argv[i] = tok->argv[i] + offs;
345 		tok->wptr   = tok->wptr + offs;
346 		tok->wstart = tok->wstart + offs;
347 		tok->wmax   = s + size;
348 		tok->wspace = s;
349 	    }
350 	}
351 
352 	if (tok->argc >= tok->amax - 4) {
353 	    tok->amax += AINCR;
354 	    tok->argv = (char **) tok_realloc(tok->argv,
355 					      tok->amax * sizeof(char*));
356 	}
357 
358     }
359 }
360