1 //
2 //   lex.c
3 //
4 //   Oliver Frome  <olli@fromme.com>
5 //   @(#)$Id: lex.c,v 1.3 1998/11/02 05:03:33 olli Exp $
6 //
7 //   This module implements a tokenizer with quoting, command
8 //   substitution and parameter substitution.  It tries to
9 //   mimic the behaviour of /bin/sh as closely as possible.
10 //
11 
//   Revision identification string kept in the object file.
static const char cvsid[] =
    "@(#)$Id: lex.c,v 1.3 1998/11/02 05:03:33 olli Exp $";
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 
19 #include "lex.h"
20 
21 //
22 //   The following constants (bit fields) are used for
23 //   lexstate.quote_state (see below).
24 //
25 //   QUOTE_DOUBLE   is set if we're inside double quotes.
26 //   QUOTE_SINGLE   is set if we're inside single quotes.
27 //   QUOTE_BSLASH   is set if the next character is being
28 //                  escaped by a backslash.
29 //   QUOTE_BTICK    is set if we're inside backticks.
30 //   QUOTE_PARAM    is set if we're inside a parameter.
31 //   QUOTE_SUBST    is set if we're inside the result of a
32 //                  command substitution (backticks) or
33 //                  parameter substitution (dollar sign).
34 //   QUOTE_SPACE    is set if we encountered whitespace that
35 //                  is relevant for splitting/tokenizing.
36 //                  It is NOT set for quoted whitespace.
37 //
38 
//   Individual state bits for lexstate.quote_state.
#define QUOTE_DOUBLE	(1 << 0)
#define QUOTE_SINGLE	(1 << 1)
#define QUOTE_BSLASH	(1 << 2)
#define QUOTE_BTICK	(1 << 3)
#define QUOTE_PARAM	(1 << 4)
#define QUOTE_SUBST	(1 << 5)
#define QUOTE_SPACE	(1 << 6)

//   Union of the four quoting states; lex() tests it at the end of
//   a physical line to decide whether another line must be fetched.
#define QUOTE_QUOTE	(QUOTE_DOUBLE | QUOTE_SINGLE \
			| QUOTE_BSLASH | QUOTE_BTICK)
48 
49 int lineno;		// current line number
50 lex_inputfunc  lif;	// lex() input function, set with lex_setinput()
51 lex_outputfunc lof;	// lex() output function, set with lex_setoutput()
52 void *lifudata;		// user data for input function
53 void *lofudata;		// user data for output function
54 
55 void
lex_setinput(lex_inputfunc func,void * userdata)56 lex_setinput (lex_inputfunc func, void *userdata)
57 {
58 	lif = func;
59 	lifudata = userdata;
60 }
61 
62 void
lex_setoutput(lex_outputfunc func,void * userdata)63 lex_setoutput (lex_outputfunc func, void *userdata)
64 {
65 	lof = func;
66 	lofudata = userdata;
67 }
68 
//
//   Returns 1 if "c" is one of the token separators known to this
//   lexer (space, tab or newline), 0 otherwise.
//
static int
isspace (char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
74 
//
//   Returns 1 if "c" may begin a parameter name (an ASCII letter
//   or an underscore), 0 otherwise.
//
static int
isparamstart (char c)
{
	const int lower = (c >= 'a' && c <= 'z');
	const int upper = (c >= 'A' && c <= 'Z');

	return lower || upper || c == '_';
}
82 
//
//   Returns 1 if "c" may appear inside a parameter name, i.e. it
//   is a valid first character or a decimal digit, 0 otherwise.
//
static int
isparamchar (char c)
{
	const int digit = (c >= '0' && c <= '9');

	return digit || isparamstart(c);
}
90 
91 //
//   strappend(s, a) works similarly to strcat(s, a), but treats
//   "s" as a dynamically allocated string.
//
//   The return value is the new location of the resulting
//   string (or NULL if an error occurred); the value of "s" is
//   undefined in either case after calling strappend().
//
//   chrappend(s, a) works similarly but appends only a single
//   character.
101 //
102 
//   Appends the NUL-terminated string "a" to the heap string "s"
//   ("s" may be NULL, which is treated as an empty string).
//   Returns the possibly relocated result; on allocation failure
//   "s" is released and NULL is returned.
static char *
strappend (char *s, const char *a)
{
	size_t slen = s ? strlen(s) : 0;
	size_t alen = strlen(a);
	char *grown;

	grown = realloc(s, slen + alen + 1);
	if (!grown) {
		//   realloc() leaves the old block intact on failure.
		free (s);
		return NULL;
	}
	//   realloc() has already carried the old contents over, and
	//   "s" must not be touched any more because the block may
	//   have moved.  Only the new tail (with its NUL) is copied.
	memcpy (grown + slen, a, alen + 1);
	return grown;
}
120 
//   Appends the single character "a" to the heap string "s"
//   ("s" may be NULL, which is treated as an empty string).
//   Returns the possibly relocated result; on allocation failure
//   "s" is released and NULL is returned.
static char *
chrappend (char *s, char a)
{
	size_t slen = s ? strlen(s) : 0;
	char *grown;

	grown = realloc(s, slen + 2);
	if (!grown) {
		//   realloc() leaves the old block intact on failure.
		free (s);
		return NULL;
	}
	//   The old contents were preserved by realloc(); "s" itself
	//   may be stale by now and is not read again.
	grown[slen] = a;
	grown[slen + 1] = '\0';
	return grown;
}
138 
//
//   Releases the first "numtokens" strings of "tokens" and then
//   the array itself.  Nothing happens if the count is zero or
//   the array pointer is NULL.
//
static void
freetokens (int numtokens, char *tokens[])
{
	if (!numtokens || !tokens)
		return;
	//   Walk the list back to front.
	while (numtokens)
		free (tokens[--numtokens]);
	free (tokens);
}
148 
//
//   addtoken(numtokens, tokens, token) moves the pending word
//   "*token" to the end of the list "*tokens" and bumps
//   "*numtokens".  A NULL "*token" is stored as an empty string.
//   "*token" is always NULL afterwards.
//
//   Returns 1 on success.  On allocation failure the whole list
//   is released, "*tokens" is set to NULL, "*numtokens" to 0, and
//   0 is returned, so the caller is never left with stale
//   pointers.
//
static int
addtoken (int *numtokens, char **tokens[], char **token)
{
	char **grown;
	char *word;

	//   Take over the pending word instead of duplicating it.
	word = *token;
	*token = NULL;
	if (!word && !(word = calloc(1, 1)))
		goto fail;

	grown = realloc(*tokens, sizeof *grown * ((size_t) *numtokens + 1));
	if (!grown) {
		//   The old list is still valid and is freed below.
		free (word);
		goto fail;
	}
	grown[*numtokens] = word;
	*numtokens += 1;
	*tokens = grown;
	return 1;

fail:
	freetokens (*numtokens, *tokens);
	*tokens = NULL;
	*numtokens = 0;
	return 0;
}
173 
174 char *
readinput(FILE * inputfile)175 readinput (FILE *inputfile)
176 {
177 	char *line, *last;
178 	char buf[256];
179 
180 	line = NULL;
181 	for (;;) {
182 		if (!fgets(buf, 256, inputfile))
183 			break;
184 		if (!(line = strappend(line, buf)))
185 			return NULL;
186 		last = line + (strlen(line) - 1);
187 		if (*last == '\n') {
188 			*last = '\0';
189 			break;
190 		}
191 	}
192 	lineno++;
193 	return line;
194 }
195 
//   Tokenizer state for one logical line.
struct lexstate_t {
	char *token;		// word being assembled (NULL if none yet)
	char **tokens;		// completed words
	int numtokens;		// number of entries in "tokens"
	int quote_state;	// combination of the QUOTE_* bits
	char *cmdpar;		// text collected for `...` or $name
};

typedef struct lexstate_t lexstate;

//   lexchar() dispatches to these two; they are defined after it.
static int lexcommand (lexstate *ls, char *src);
static int lexparameter (lexstate *ls, char *src);
207 
208 static int
lexchar(lexstate * ls,char * src)209 lexchar (lexstate *ls, char *src)
210 {
211 	//   Inside single quotes, everything is quoted
212 	//   except for the terminating single quote.
213 	if ((ls->quote_state & QUOTE_SINGLE))
214 		if (*src == '\'') {
215 			ls->quote_state &= ~QUOTE_SINGLE;
216 			return 1;
217 		}
218 		else
219 			goto copychar;
220 
221 	//   Inside backticks (``), just collect characters
222 	//   into the command string, until we find an unquoted
223 	//   backtick that terminates the command string.
224 	if ((ls->quote_state & QUOTE_BTICK))
225 		return lexcommand(ls, src);
226 
227 	//   Inside backticks (``), just collect characters
228 	//   into the command string, until we find an unquoted
229 	//   backtick that terminates the command string.
230 	if ((ls->quote_state & QUOTE_PARAM))
231 		return lexparameter(ls, src);
232 
233 	//   If the previous character was an unquoted backslash,
234 	//   this character is quoted, no matter what it is.
235 	//   (Well, if it's a newline, we remove it.)
236 	if ((ls->quote_state & QUOTE_BSLASH)) {
237 		ls->quote_state &= ~QUOTE_BSLASH;
238 		if (*src == '\n')
239 			return 1;
240 		goto copychar;
241 	}
242 
243 	//   Input is split into tokens at sequences of
244 	//   unquoted whitespace.
245 	if (isspace(*src)) {
246 		if ((ls->quote_state & QUOTE_DOUBLE))
247 			goto copychar;
248 		else if ((ls->quote_state & QUOTE_SPACE))
249 			return 1;
250 		if (!(addtoken(&ls->numtokens, &ls->tokens, &ls->token)))
251 			return 0;
252 		ls->quote_state |= QUOTE_SPACE;
253 		return 1;
254 	}
255 
256 	//   At this point, if we're inside the result of a
257 	//   command substitution (resulted from backticks),
258 	//   everything is taken verbatim.
259 	//   Note that whitespace-splitting does take place,
260 	//   unless the whole thing is also in double quotes
261 	//   (see above).
262 	if ((ls->quote_state & QUOTE_SUBST)) {
263 		ls->quote_state &= ~QUOTE_SPACE;
264 		goto copychar;
265 	}
266 
267 	//   A word beginning with an unquoted '#' introduces
268 	//   a comment that extends to the end of the line.
269 	if (*src == '#' && (ls->quote_state & QUOTE_SPACE))
270 		return 2;
271 
272 	//   If a backslash (unquoted or within double quotes) is
273 	//   followed by a newline, both characters are removed.
274 	//   An unquoted backslash quotes the following character,
275 	//   no matter what it is.  However, within double quotes,
276 	//   a backslash quotes the following character only if it's
277 	//   "`", '"', '$' or another backslash; otherwise the
278 	//   backslash is taken literally.
279 	if (*src == '\\') {
280 		//   If a backslash (unquoted or within double quotes)
281 		//   is followed by a newline, both characters are
282 		//   removed.  Note that the QUOTE_SPACE state (set
283 		//   or not set) may not change at this point.
284 		if (!src[1] || src[1] == '\n') {
285 			ls->quote_state |= QUOTE_BSLASH;
286 			return 1;
287 		}
288 		if ((ls->quote_state & QUOTE_DOUBLE) && src[1] != '\\'
289 		    && src[1] != '`' && src[1] != '\"' && src[1] != '$')
290 			goto copychar;
291 		ls->quote_state &= ~QUOTE_SPACE;
292 		ls->quote_state |= QUOTE_BSLASH;
293 		return 1;
294 	}
295 
296 	//   Starting backtick for command substitution.  (The
297 	//   terminating backtick is handled at the beginning.)
298 	if (*src == '`') {
299 		ls->quote_state |= QUOTE_BTICK;
300 		if (ls->cmdpar)
301 			free (ls->cmdpar);
302 		ls->cmdpar = NULL;
303 		return 1;
304 	}
305 
306 	//   A dollar sign introduces a parameter substitution,
307 	//   unless followed by an invalid character.
308 	if (*src == '$' && (isparamstart(src[1]) || src[1] == '{')) {
309 		ls->quote_state |= QUOTE_PARAM;
310 		if (ls->cmdpar)
311 			free (ls->cmdpar);
312 		ls->cmdpar = NULL;
313 		return 1;
314 	}
315 
316 	//   At this point, we definitely have something which is not
317 	//   unquoted whitespace, thus reset the QUOTE_SPACE flag.
318 	ls->quote_state &= ~QUOTE_SPACE;
319 
320 	//   Starting or terminating double quotes.
321 	if (*src == '\"') {
322 		ls->quote_state ^= QUOTE_DOUBLE;
323 		return 1;
324 	}
325 
326 	//   Starting single quote.  (Terminating single quote
327 	//   is handled at the beginning.)
328 	if (*src == '\'') {
329 		if ((ls->quote_state & QUOTE_DOUBLE))
330 			goto copychar;
331 		ls->quote_state |= QUOTE_SINGLE;
332 		return 1;
333 	}
334 
335 	//   Append the current character literally to
336 	//   the current token.
337 	copychar:
338 	if (!(ls->token = chrappend(ls->token, *src)))
339 		return 0;
340 	return 1;
341 }
342 
343 static int
lexcommand(lexstate * ls,char * src)344 lexcommand (lexstate *ls, char *src)
345 {
346 	if ((ls->quote_state & QUOTE_BSLASH))
347 		ls->quote_state &= ~QUOTE_BSLASH;
348 	else if (*src == '\\') {
349 		if (!src[1] || src[1] == '$'
350 		    || src[1] == '`' || src[1] == '\\') {
351 			ls->quote_state |= QUOTE_BSLASH;
352 			return 1;
353 		}
354 		if ((ls->quote_state & QUOTE_DOUBLE) && src[1] == '\"')
355 			return 1;
356 	}
357 	else if (*src == '`') {
358 		FILE *pipe;
359 		char *pline, *cptr;
360 		int oldlineno, result;
361 
362 		ls->quote_state &= ~QUOTE_BTICK;
363 		if (!(pipe = popen(ls->cmdpar, "r"))) {
364 			fprintf (stderr, "Error in line %d:  "
365 			    "Can't execute command.\n", lineno);
366 			return 0;
367 		}
368 		oldlineno = lineno;
369 		ls->quote_state |= QUOTE_SUBST;
370 		result = 1;
371 		while (result && (pline = readinput(pipe))) {
372 			if (result == 2) {
373 				if (!lexchar(ls, "\n")) {
374 					result = 0;
375 					free (pline);
376 					break;
377 				}
378 			}
379 			else
380 				result = 2;
381 			for (cptr = pline; *cptr; cptr++)
382 				if (!lexchar(ls, cptr)) {
383 					result = 0;
384 					break;
385 				}
386 			free (pline);
387 		}
388 		ls->quote_state &= ~QUOTE_SUBST;
389 		lineno = oldlineno;
390 		pclose (pipe);
391 		free (ls->cmdpar);
392 		ls->cmdpar = NULL;
393 		return result ? 1 : 0;
394 	}
395 	if (!(ls->cmdpar = chrappend(ls->cmdpar, *src)))
396 		return 0;
397 	return 1;
398 }
399 
400 static int
lexparameter(lexstate * ls,char * src)401 lexparameter (lexstate *ls, char *src)
402 {
403 	if (*src == '}') {
404 		//
405 		//   If this is the closing brace, we're
406 		//   already done.  Just clear the flag
407 		//   and return.
408 		//
409 		ls->quote_state &= ~QUOTE_PARAM;
410 		return 1;
411 	}
412 	if (!(ls->cmdpar = chrappend(ls->cmdpar, *src)))
413 		return 0;
414 	//
415 	//   If the next character does not belong to the
416 	//   parameter anymore, then perform the actual
417 	//   parameter substitution.
418 	//
419 	if (!isparamchar(src[1])) {
420 		char *value;
421 		int result;
422 
423 		ls->quote_state &= ~QUOTE_PARAM;
424 		result = 1;
425 		if (*(value = ls->cmdpar) == '{')
426 			value++;
427 		if ((value = getenv(value))) {
428 			ls->quote_state |= QUOTE_SUBST;
429 			for (; *value; value++)
430 				if (!lexchar(ls, value)) {
431 					result = 0;
432 					break;
433 				}
434 			ls->quote_state &= ~QUOTE_SUBST;
435 		}
436 		//
437 		//   If the parameter is enclosed in braces,
438 		//   set the QUOTE_PARAM flag again, so we
439 		//   will swallow the '}' at the next call.
440 		//
441 		if (src[1] == '}' && *ls->cmdpar == '{')
442 			ls->quote_state |= QUOTE_PARAM;
443 		free (ls->cmdpar);
444 		ls->cmdpar = NULL;
445 		return result ? 1 : 0;
446 	}
447 	return 1;
448 }
449 
450 static int
lex_quotenewline(lexstate * ls,char ** line)451 lex_quotenewline (lexstate *ls, char **line)
452 {
453 	//   nl     (unquoted newline terminates the logical line)
454 	//   "nl    -->   nl
455 	//   'nl    -->   nl
456 	//   `nl    -->   nl
457 	//   \nl    -->   (nothing)
458 	//   "\nl   -->   (nothing)
459 	//   '\nl   (handled by the 'nl case, the \ is literal)
460 	//   `\nl   -->   (nothing)
461 
462 	if ((ls->quote_state & QUOTE_BTICK)
463 	    && !(ls->quote_state & QUOTE_BSLASH))
464 	{
465 		if (!(ls->cmdpar = chrappend(ls->cmdpar, '\n')))
466 			return 0;
467 	}
468 	else if ((ls->quote_state & QUOTE_SINGLE) ||
469 	    ((ls->quote_state & QUOTE_DOUBLE)
470 	    && !(ls->quote_state & QUOTE_BSLASH)))
471 	{
472 		if (!(ls->token = chrappend(ls->token, '\n')))
473 			return 0;
474 	}
475 	ls->quote_state &= ~QUOTE_BSLASH;
476 	*line = lif(lifudata);	//   fetch another physical line
477 	return 1;
478 }
479 
480 static int
lex_quotecheck(lexstate * ls)481 lex_quotecheck (lexstate *ls)
482 {
483 	if ((ls->quote_state & QUOTE_DOUBLE)) {
484 		fprintf (stderr,
485 		    "Error in line %d:  Open double quote.\n", lineno);
486 		return 0;
487 	}
488 	if ((ls->quote_state & QUOTE_SINGLE)) {
489 		fprintf (stderr,
490 		    "Error in line %d:  Open single quote.\n", lineno);
491 		return 0;
492 	}
493 	if ((ls->quote_state & QUOTE_BSLASH)) {
494 		fprintf (stderr,
495 		    "Error in line %d:  Open backslash quote.\n", lineno);
496 		return 0;
497 	}
498 	if ((ls->quote_state & QUOTE_BTICK)) {
499 		fprintf (stderr,
500 		    "Error in line %d:  Open command substitution.\n",
501 		    lineno);
502 		return 0;
503 	}
504 	return 1;
505 }
506 
507 int
lex(void)508 lex (void)
509 {
510 	char *line, *src;
511 	lexstate ls;
512 	int result = 1;
513 
514 	while ((line = lif(lifudata))) {
515 		ls.token = NULL;
516 		ls.tokens = NULL;
517 		ls.numtokens = 0;
518 		ls.quote_state = QUOTE_SPACE;
519 		ls.cmdpar = NULL;
520 		while (1) {
521 			for (src = line; *src; src++) {
522 				if ((result = lexchar(&ls, src)) < 1)
523 					return 0;
524 				if (result == 2)	// comment
525 					break;
526 			}
527 			free (line);
528 			if ((ls.quote_state & QUOTE_QUOTE)) {
529 				if (!lex_quotenewline(&ls, &line))
530 					return 0;
531 				if (line)
532 					continue;
533 			}
534 			break;
535 		}
536 		if (ls.token || !(ls.quote_state & QUOTE_SPACE))
537 			if (!(addtoken(&ls.numtokens, &ls.tokens, &ls.token)))
538 				return 0;
539 		result = lex_quotecheck(&ls);
540 		if (!lof(ls.numtokens, ls.tokens, lofudata)) {
541 			freetokens (ls.numtokens, ls.tokens);
542 			return 0;
543 		}
544 		freetokens (ls.numtokens, ls.tokens);
545 	}
546 	return result;
547 }
548 
549 #ifdef DEBUG_STANDALONE
550 
//
//   Debug output callback: prints every token of a logical line
//   with its index, followed by an empty line.  The user data
//   pointer is not used.  Always returns 1.
//
int
writeoutput (int tokenc, char *tokenv[], void *dummy)
{
	int idx = 0;

	(void) dummy;
	while (idx < tokenc) {
		printf ("#%d: \"%s\"\n", idx, tokenv[idx]);
		idx++;
	}
	printf ("\n");
	return 1;
}
561 
562 int
main(int argc,char * argv[])563 main (int argc, char *argv[])
564 {
565 	lineno = 0;
566 	lex_setinput ((lex_inputfunc) readinput, stdin);
567 	lex_setoutput ((lex_outputfunc) writeoutput, NULL);
568 	return lex() == 0;
569 }
570 
571 #endif
572 
573 /*
574 
575 Excerpt from Solaris 2.6 "man sh"
576 =================================
577 
578   Comments Lines
579      A word beginning with # causes that word and all the follow-
580      ing characters up to a newline to be ignored.
581 
582   Command Substitution
583      The shell reads commands from the string between  two  grave
584      accents  (` `)  and  the standard output from these commands
585      may be used as all or part of  a  word.   Trailing  newlines
586      from the standard output are removed.
587 
588      No interpretation is done on the string before the string is
589      read,  except to remove backslashes (\) used to escape other
590      characters.  Backslashes may  be  used  to  escape  a  grave
591      accent  (`)  or another backslash (\) and are removed before
592      the command string is read.  Escaping grave  accents  allows
593      nested  command  substitution.   If the command substitution
594      lies within a pair of double quotes (" ... `...` ...  "),  a
595      backslash  used  to  escape  a  double  quote  (\")  will be
596      removed; otherwise, it will be left intact.
597 
598      If a backslash is used to escape a newline character  (\new-
599      line),  both  the backslash and the newline are removed (see
600      the later section on  Quoting).   In  addition,  backslashes
601      used  to  escape  dollar  signs  (\$) are removed.  Since no
602      parameter substitution is done on the command string  before
603      it  is  read,  inserting a backslash to escape a dollar sign
604      has no effect.  Backslashes that  precede  characters  other
605      than  \, `, ", newline,  and $ are left intact when the com-
606      mand string is read.
607 
608   Parameter Substitution
609      The  character  $  is  used   to   introduce   substitutable
610      parameters.   [...]
611 
612      ${parameter}
613           The value, if any, of  the  parameter  is  substituted.
614           The braces are required only when parameter is followed
615           by a letter, digit, or underscore that  is  not  to  be
616           interpreted  as part of its name.  [...]
617 
618      The following parameters are automatically set by the shell.
619           [...]
620           $    The process number of this shell.
621 
622      [...]
623 
624   Blank Interpretation
625      After parameter and command  substitution,  the  results  of
626      substitution  are scanned for internal field separator char-
627      acters (those found in IFS) and split  into  distinct  argu-
628      ments  where such characters are found.  Explicit null argu-
629      ments ("" or '')  are  retained.   Implicit  null  arguments
630      (those  resulting  from  parameters that have no values) are
631      removed.
632 
633   Quoting
634      The following characters have a special meaning to the shell
635      and cause termination of a word unless quoted:
636 
637           ;  &  (  )  |  ^  <  >  newline  space  tab
638 
639      A character may be  quoted  (that  is,  made  to  stand  for
640      itself) by preceding it with a backslash (\) or inserting it
641      between a pair of quote marks ('' or "").   During  process-
642      ing,  the shell may quote certain characters to prevent them
643      from taking on a special meaning.  Backslashes used to quote
644      a single character are removed from the word before the com-
645      mand is executed.  The pair \newline is removed from a  word
646      before command and parameter substitution.
647 
648      All characters enclosed between a pair of single quote marks
649      (''),  except  a  single  quote,  are  quoted  by the shell.
650      Backslash has no special meaning inside  a  pair  of  single
651      quotes.   A single quote may be quoted inside a pair of dou-
652      ble quote marks (for example, "'"), but a single  quote  can
653      not be quoted inside a pair of single quotes.
654 
655      Inside a pair of double quote marks (""), parameter and com-
656      mand substitution occurs and the shell quotes the results to
657      avoid blank interpretation and file name generation.  If  $*
658      is within a pair of double quotes, the positional parameters
659      are substituted and quoted, separated by quoted spaces  ("$1
660      $2  ..."); however, if $@ is within a pair of double quotes,
661      the  positional  parameters  are  substituted  and   quoted,
662      separated by unquoted spaces ("$1" "$2" ... ).  \ quotes the
663      characters \, `, ", and $.  The  pair  \newline  is  removed
664      before  parameter  and command substitution.  If a backslash
665      precedes characters other than \, `, ", $, and newline, then
666      the backslash itself is quoted by the shell.
667 
668 */
669 
670 //--
671