1 //
2 // lex.c
3 //
4 // Oliver Frome <olli@fromme.com>
5 // @(#)$Id: lex.c,v 1.3 1998/11/02 05:03:33 olli Exp $
6 //
7 // This module implements a tokenizer with quoting, command
8 // substitution and parameter substitution. It tries to
9 // mimic the behaviour of /bin/sh as closely as possible.
10 //
11
// Version identification string compiled into this module: an SCCS
// "@(#)" marker followed by the RCS Id keyword of this file.
static const char cvsid[] =
    "@(#)$Id: lex.c,v 1.3 1998/11/02 05:03:33 olli Exp $";
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18
19 #include "lex.h"
20
21 //
22 // The following constants (bit fields) are used for
23 // lexstate.quote_state (see below).
24 //
25 // QUOTE_DOUBLE is set if we're inside double quotes.
26 // QUOTE_SINGLE is set if we're inside single quotes.
27 // QUOTE_BSLASH is set if the next character is being
28 // escaped by a backslash.
29 // QUOTE_BTICK is set if we're inside backticks.
30 // QUOTE_PARAM is set if we're inside a parameter.
31 // QUOTE_SUBST is set if we're inside the result of a
32 // command substitution (backticks) or
33 // parameter substitution (dollar sign).
34 // QUOTE_SPACE is set if we encountered whitespace that
35 // is relevant for splitting/tokenizing.
36 // It is NOT set for quoted whitespace.
37 //
38
// Bit flags stored in lexstate.quote_state.
#define QUOTE_DOUBLE  (1 << 0)   // inside double quotes
#define QUOTE_SINGLE  (1 << 1)   // inside single quotes
#define QUOTE_BSLASH  (1 << 2)   // next character is backslash-escaped
#define QUOTE_BTICK   (1 << 3)   // inside backticks
#define QUOTE_PARAM   (1 << 4)   // inside a parameter name
#define QUOTE_SUBST   (1 << 5)   // inside the result of a substitution
#define QUOTE_SPACE   (1 << 6)   // last seen: unquoted, splitting whitespace

// Any quoting construct that can keep a logical line open.
#define QUOTE_QUOTE \
    (QUOTE_DOUBLE | QUOTE_SINGLE | QUOTE_BSLASH | QUOTE_BTICK)
48
49 int lineno; // current line number
50 lex_inputfunc lif; // lex() input function, set with lex_setinput()
51 lex_outputfunc lof; // lex() output function, set with lex_setoutput()
52 void *lifudata; // user data for input function
53 void *lofudata; // user data for output function
54
55 void
lex_setinput(lex_inputfunc func,void * userdata)56 lex_setinput (lex_inputfunc func, void *userdata)
57 {
58 lif = func;
59 lifudata = userdata;
60 }
61
62 void
lex_setoutput(lex_outputfunc func,void * userdata)63 lex_setoutput (lex_outputfunc func, void *userdata)
64 {
65 lof = func;
66 lofudata = userdata;
67 }
68
//
// Returns 1 if c is a blank, a tab or a newline, otherwise 0.
//
// NOTE: this file-local function has the same name as the
// <ctype.h> function; that header must not be included here.
//
static int
isspace (char c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
        return 1;
    default:
        return 0;
    }
}
74
//
// Returns 1 if c may begin a parameter name (an ASCII letter or
// an underscore), otherwise 0.
//
static int
isparamstart (char c)
{
    const int lower = (c >= 'a' && c <= 'z');
    const int upper = (c >= 'A' && c <= 'Z');

    return lower || upper || c == '_';
}
82
//
// Returns 1 if c may appear inside a parameter name (anything
// accepted by isparamstart(), or a decimal digit), otherwise 0.
//
static int
isparamchar (char c)
{
    const int digit = (c >= '0' && c <= '9');

    return (isparamstart(c) || digit) ? 1 : 0;
}
90
91 //
92 // strappend(s, a) works similar to strcat(s, a), but treats
93 // "s" as a dynamically allocated string.
94 //
95 // The return value is the new location of the resulting
96 // string (or NULL if an error occurred), the value of "s" is
97 // undefined in either case after calling strappend().
98 //
99 // chrappend(s, a) works similar but appends only a single
100 // character.
101 //
102
//
// strappend(s, a) appends the string "a" to the heap-allocated
// string "s" ("s" may be NULL, which is treated as empty).
//
// Returns the resulting string, or NULL if memory could not be
// allocated; in that case "s" has been freed.  Either way the
// caller must not use the old value of "s" afterwards.
//
static char *
strappend (char *s, const char *a)
{
    size_t sl = s ? strlen(s) : 0;
    size_t al = strlen(a);
    char *grown = realloc(s, sl + al + 1);

    if (!grown) {
        // realloc() leaves the old block alone on failure.
        free(s);
        return NULL;
    }
    // realloc() has already carried over the old contents; "s" must
    // not be read any more.  Copy "a" including its terminating NUL.
    memcpy(grown + sl, a, al + 1);
    return grown;
}
120
//
// chrappend(s, a) appends the single character "a" to the
// heap-allocated string "s" ("s" may be NULL, which is treated
// as empty).
//
// Returns the resulting string, or NULL if memory could not be
// allocated; in that case "s" has been freed.  Either way the
// caller must not use the old value of "s" afterwards.
//
static char *
chrappend (char *s, char a)
{
    size_t sl = s ? strlen(s) : 0;
    char *grown = realloc(s, sl + 2);

    if (!grown) {
        // realloc() leaves the old block alone on failure.
        free(s);
        return NULL;
    }
    // The old contents are already in place; "s" must not be
    // read any more.
    grown[sl] = a;
    grown[sl + 1] = '\0';
    return grown;
}
138
//
// freetokens(numtokens, tokens) releases the first "numtokens"
// strings of the array "tokens" and then the array itself.
// A NULL array is ignored; a count of zero (or less) releases
// only the array.
//
static void
freetokens (int numtokens, char *tokens[])
{
    if (!tokens)
        return;
    while (numtokens > 0)
        free(tokens[--numtokens]);
    free(tokens);
}
148
//
// addtoken(numtokens, tokens, token) appends the pending string
// *token to the list *tokens and increments *numtokens.  The list
// takes over ownership of the string; a NULL *token is stored as
// an empty string.  *token is always NULL afterwards.
//
// Returns 1 on success.  If memory runs out, the pending token
// and the whole token list are freed, *tokens is reset to NULL,
// *numtokens to 0, and 0 is returned.
//
static int
addtoken (int *numtokens, char **tokens[], char **token)
{
    char **grown = NULL;
    char *entry;

    // Hand the pending token over instead of copying it.
    entry = *token ? *token : calloc(1, 1);
    *token = NULL;

    if (entry)
        grown = realloc(*tokens,
            sizeof *grown * ((size_t) *numtokens + 1));
    if (!grown) {
        // realloc() left the old list untouched, so it is still
        // ours to release.  Do not leave stale values behind.
        free(entry);
        freetokens(*numtokens, *tokens);
        *tokens = NULL;
        *numtokens = 0;
        return 0;
    }
    grown[*numtokens] = entry;
    *numtokens += 1;
    *tokens = grown;
    return 1;
}
173
174 char *
readinput(FILE * inputfile)175 readinput (FILE *inputfile)
176 {
177 char *line, *last;
178 char buf[256];
179
180 line = NULL;
181 for (;;) {
182 if (!fgets(buf, 256, inputfile))
183 break;
184 if (!(line = strappend(line, buf)))
185 return NULL;
186 last = line + (strlen(line) - 1);
187 if (*last == '\n') {
188 *last = '\0';
189 break;
190 }
191 }
192 lineno++;
193 return line;
194 }
195
// State of the tokenizer while it works on one logical line.
typedef struct lexstate_t lexstate;

struct lexstate_t {
    char *token;        // token currently being assembled
    char **tokens;      // completed tokens
    int numtokens;      // number of entries in "tokens"
    int quote_state;    // combination of QUOTE_* flags
    char *cmdpar;       // text collected for a parameter or backticks
};

// lexchar() and these two handlers call each other.
static int lexcommand (lexstate *ls, char *src);
static int lexparameter (lexstate *ls, char *src);
207
208 static int
lexchar(lexstate * ls,char * src)209 lexchar (lexstate *ls, char *src)
210 {
211 // Inside single quotes, everything is quoted
212 // except for the terminating single quote.
213 if ((ls->quote_state & QUOTE_SINGLE))
214 if (*src == '\'') {
215 ls->quote_state &= ~QUOTE_SINGLE;
216 return 1;
217 }
218 else
219 goto copychar;
220
221 // Inside backticks (``), just collect characters
222 // into the command string, until we find an unquoted
223 // backtick that terminates the command string.
224 if ((ls->quote_state & QUOTE_BTICK))
225 return lexcommand(ls, src);
226
227 // Inside backticks (``), just collect characters
228 // into the command string, until we find an unquoted
229 // backtick that terminates the command string.
230 if ((ls->quote_state & QUOTE_PARAM))
231 return lexparameter(ls, src);
232
233 // If the previous character was an unquoted backslash,
234 // this character is quoted, no matter what it is.
235 // (Well, if it's a newline, we remove it.)
236 if ((ls->quote_state & QUOTE_BSLASH)) {
237 ls->quote_state &= ~QUOTE_BSLASH;
238 if (*src == '\n')
239 return 1;
240 goto copychar;
241 }
242
243 // Input is split into tokens at sequences of
244 // unquoted whitespace.
245 if (isspace(*src)) {
246 if ((ls->quote_state & QUOTE_DOUBLE))
247 goto copychar;
248 else if ((ls->quote_state & QUOTE_SPACE))
249 return 1;
250 if (!(addtoken(&ls->numtokens, &ls->tokens, &ls->token)))
251 return 0;
252 ls->quote_state |= QUOTE_SPACE;
253 return 1;
254 }
255
256 // At this point, if we're inside the result of a
257 // command substitution (resulted from backticks),
258 // everything is taken verbatim.
259 // Note that whitespace-splitting does take place,
260 // unless the whole thing is also in double quotes
261 // (see above).
262 if ((ls->quote_state & QUOTE_SUBST)) {
263 ls->quote_state &= ~QUOTE_SPACE;
264 goto copychar;
265 }
266
267 // A word beginning with an unquoted '#' introduces
268 // a comment that extends to the end of the line.
269 if (*src == '#' && (ls->quote_state & QUOTE_SPACE))
270 return 2;
271
272 // If a backslash (unquoted or within double quotes) is
273 // followed by a newline, both characters are removed.
274 // An unquoted backslash quotes the following character,
275 // no matter what it is. However, within double quotes,
276 // a backslash quotes the following character only if it's
277 // "`", '"', '$' or another backslash; otherwise the
278 // backslash is taken literally.
279 if (*src == '\\') {
280 // If a backslash (unquoted or within double quotes)
281 // is followed by a newline, both characters are
282 // removed. Note that the QUOTE_SPACE state (set
283 // or not set) may not change at this point.
284 if (!src[1] || src[1] == '\n') {
285 ls->quote_state |= QUOTE_BSLASH;
286 return 1;
287 }
288 if ((ls->quote_state & QUOTE_DOUBLE) && src[1] != '\\'
289 && src[1] != '`' && src[1] != '\"' && src[1] != '$')
290 goto copychar;
291 ls->quote_state &= ~QUOTE_SPACE;
292 ls->quote_state |= QUOTE_BSLASH;
293 return 1;
294 }
295
296 // Starting backtick for command substitution. (The
297 // terminating backtick is handled at the beginning.)
298 if (*src == '`') {
299 ls->quote_state |= QUOTE_BTICK;
300 if (ls->cmdpar)
301 free (ls->cmdpar);
302 ls->cmdpar = NULL;
303 return 1;
304 }
305
306 // A dollar sign introduces a parameter substitution,
307 // unless followed by an invalid character.
308 if (*src == '$' && (isparamstart(src[1]) || src[1] == '{')) {
309 ls->quote_state |= QUOTE_PARAM;
310 if (ls->cmdpar)
311 free (ls->cmdpar);
312 ls->cmdpar = NULL;
313 return 1;
314 }
315
316 // At this point, we definitely have something which is not
317 // unquoted whitespace, thus reset the QUOTE_SPACE flag.
318 ls->quote_state &= ~QUOTE_SPACE;
319
320 // Starting or terminating double quotes.
321 if (*src == '\"') {
322 ls->quote_state ^= QUOTE_DOUBLE;
323 return 1;
324 }
325
326 // Starting single quote. (Terminating single quote
327 // is handled at the beginning.)
328 if (*src == '\'') {
329 if ((ls->quote_state & QUOTE_DOUBLE))
330 goto copychar;
331 ls->quote_state |= QUOTE_SINGLE;
332 return 1;
333 }
334
335 // Append the current character literally to
336 // the current token.
337 copychar:
338 if (!(ls->token = chrappend(ls->token, *src)))
339 return 0;
340 return 1;
341 }
342
343 static int
lexcommand(lexstate * ls,char * src)344 lexcommand (lexstate *ls, char *src)
345 {
346 if ((ls->quote_state & QUOTE_BSLASH))
347 ls->quote_state &= ~QUOTE_BSLASH;
348 else if (*src == '\\') {
349 if (!src[1] || src[1] == '$'
350 || src[1] == '`' || src[1] == '\\') {
351 ls->quote_state |= QUOTE_BSLASH;
352 return 1;
353 }
354 if ((ls->quote_state & QUOTE_DOUBLE) && src[1] == '\"')
355 return 1;
356 }
357 else if (*src == '`') {
358 FILE *pipe;
359 char *pline, *cptr;
360 int oldlineno, result;
361
362 ls->quote_state &= ~QUOTE_BTICK;
363 if (!(pipe = popen(ls->cmdpar, "r"))) {
364 fprintf (stderr, "Error in line %d: "
365 "Can't execute command.\n", lineno);
366 return 0;
367 }
368 oldlineno = lineno;
369 ls->quote_state |= QUOTE_SUBST;
370 result = 1;
371 while (result && (pline = readinput(pipe))) {
372 if (result == 2) {
373 if (!lexchar(ls, "\n")) {
374 result = 0;
375 free (pline);
376 break;
377 }
378 }
379 else
380 result = 2;
381 for (cptr = pline; *cptr; cptr++)
382 if (!lexchar(ls, cptr)) {
383 result = 0;
384 break;
385 }
386 free (pline);
387 }
388 ls->quote_state &= ~QUOTE_SUBST;
389 lineno = oldlineno;
390 pclose (pipe);
391 free (ls->cmdpar);
392 ls->cmdpar = NULL;
393 return result ? 1 : 0;
394 }
395 if (!(ls->cmdpar = chrappend(ls->cmdpar, *src)))
396 return 0;
397 return 1;
398 }
399
400 static int
lexparameter(lexstate * ls,char * src)401 lexparameter (lexstate *ls, char *src)
402 {
403 if (*src == '}') {
404 //
405 // If this is the closing brace, we're
406 // already done. Just clear the flag
407 // and return.
408 //
409 ls->quote_state &= ~QUOTE_PARAM;
410 return 1;
411 }
412 if (!(ls->cmdpar = chrappend(ls->cmdpar, *src)))
413 return 0;
414 //
415 // If the next character does not belong to the
416 // parameter anymore, then perform the actual
417 // parameter substitution.
418 //
419 if (!isparamchar(src[1])) {
420 char *value;
421 int result;
422
423 ls->quote_state &= ~QUOTE_PARAM;
424 result = 1;
425 if (*(value = ls->cmdpar) == '{')
426 value++;
427 if ((value = getenv(value))) {
428 ls->quote_state |= QUOTE_SUBST;
429 for (; *value; value++)
430 if (!lexchar(ls, value)) {
431 result = 0;
432 break;
433 }
434 ls->quote_state &= ~QUOTE_SUBST;
435 }
436 //
437 // If the parameter is enclosed in braces,
438 // set the QUOTE_PARAM flag again, so we
439 // will swallow the '}' at the next call.
440 //
441 if (src[1] == '}' && *ls->cmdpar == '{')
442 ls->quote_state |= QUOTE_PARAM;
443 free (ls->cmdpar);
444 ls->cmdpar = NULL;
445 return result ? 1 : 0;
446 }
447 return 1;
448 }
449
450 static int
lex_quotenewline(lexstate * ls,char ** line)451 lex_quotenewline (lexstate *ls, char **line)
452 {
453 // nl (unquoted newline terminates the logical line)
454 // "nl --> nl
455 // 'nl --> nl
456 // `nl --> nl
457 // \nl --> (nothing)
458 // "\nl --> (nothing)
459 // '\nl (handled by the 'nl case, the \ is literal)
460 // `\nl --> (nothing)
461
462 if ((ls->quote_state & QUOTE_BTICK)
463 && !(ls->quote_state & QUOTE_BSLASH))
464 {
465 if (!(ls->cmdpar = chrappend(ls->cmdpar, '\n')))
466 return 0;
467 }
468 else if ((ls->quote_state & QUOTE_SINGLE) ||
469 ((ls->quote_state & QUOTE_DOUBLE)
470 && !(ls->quote_state & QUOTE_BSLASH)))
471 {
472 if (!(ls->token = chrappend(ls->token, '\n')))
473 return 0;
474 }
475 ls->quote_state &= ~QUOTE_BSLASH;
476 *line = lif(lifudata); // fetch another physical line
477 return 1;
478 }
479
480 static int
lex_quotecheck(lexstate * ls)481 lex_quotecheck (lexstate *ls)
482 {
483 if ((ls->quote_state & QUOTE_DOUBLE)) {
484 fprintf (stderr,
485 "Error in line %d: Open double quote.\n", lineno);
486 return 0;
487 }
488 if ((ls->quote_state & QUOTE_SINGLE)) {
489 fprintf (stderr,
490 "Error in line %d: Open single quote.\n", lineno);
491 return 0;
492 }
493 if ((ls->quote_state & QUOTE_BSLASH)) {
494 fprintf (stderr,
495 "Error in line %d: Open backslash quote.\n", lineno);
496 return 0;
497 }
498 if ((ls->quote_state & QUOTE_BTICK)) {
499 fprintf (stderr,
500 "Error in line %d: Open command substitution.\n",
501 lineno);
502 return 0;
503 }
504 return 1;
505 }
506
507 int
lex(void)508 lex (void)
509 {
510 char *line, *src;
511 lexstate ls;
512 int result = 1;
513
514 while ((line = lif(lifudata))) {
515 ls.token = NULL;
516 ls.tokens = NULL;
517 ls.numtokens = 0;
518 ls.quote_state = QUOTE_SPACE;
519 ls.cmdpar = NULL;
520 while (1) {
521 for (src = line; *src; src++) {
522 if ((result = lexchar(&ls, src)) < 1)
523 return 0;
524 if (result == 2) // comment
525 break;
526 }
527 free (line);
528 if ((ls.quote_state & QUOTE_QUOTE)) {
529 if (!lex_quotenewline(&ls, &line))
530 return 0;
531 if (line)
532 continue;
533 }
534 break;
535 }
536 if (ls.token || !(ls.quote_state & QUOTE_SPACE))
537 if (!(addtoken(&ls.numtokens, &ls.tokens, &ls.token)))
538 return 0;
539 result = lex_quotecheck(&ls);
540 if (!lof(ls.numtokens, ls.tokens, lofudata)) {
541 freetokens (ls.numtokens, ls.tokens);
542 return 0;
543 }
544 freetokens (ls.numtokens, ls.tokens);
545 }
546 return result;
547 }
548
549 #ifdef DEBUG_STANDALONE
550
//
// writeoutput(tokenc, tokenv, dummy) is the output function of
// the standalone debugging build: it prints each token on a
// numbered line of its own, followed by an empty line.
// "dummy" is not used.  Always returns 1.
//
int
writeoutput (int tokenc, char *tokenv[], void *dummy)
{
    int idx = 0;

    (void) dummy;
    while (idx < tokenc) {
        printf("#%d: \"%s\"\n", idx, tokenv[idx]);
        idx++;
    }
    printf("\n");
    return 1;
}
561
562 int
main(int argc,char * argv[])563 main (int argc, char *argv[])
564 {
565 lineno = 0;
566 lex_setinput ((lex_inputfunc) readinput, stdin);
567 lex_setoutput ((lex_outputfunc) writeoutput, NULL);
568 return lex() == 0;
569 }
570
571 #endif
572
573 /*
574
575 Excerpt from Solaris 2.6 "man sh"
576 =================================
577
578 Comments Lines
579 A word beginning with # causes that word and all the follow-
580 ing characters up to a newline to be ignored.
581
582 Command Substitution
583 The shell reads commands from the string between two grave
584 accents (` `) and the standard output from these commands
585 may be used as all or part of a word. Trailing newlines
586 from the standard output are removed.
587
588 No interpretation is done on the string before the string is
589 read, except to remove backslashes (\) used to escape other
590 characters. Backslashes may be used to escape a grave
591 accent (`) or another backslash (\) and are removed before
592 the command string is read. Escaping grave accents allows
593 nested command substitution. If the command substitution
594 lies within a pair of double quotes (" ... `...` ... "), a
595 backslash used to escape a double quote (\") will be
596 removed; otherwise, it will be left intact.
597
598 If a backslash is used to escape a newline character (\new-
599 line), both the backslash and the newline are removed (see
600 the later section on Quoting). In addition, backslashes
601 used to escape dollar signs (\$) are removed. Since no
602 parameter substitution is done on the command string before
603 it is read, inserting a backslash to escape a dollar sign
604 has no effect. Backslashes that precede characters other
605 than \, `, ", newline, and $ are left intact when the com-
606 mand string is read.
607
608 Parameter Substitution
609 The character $ is used to introduce substitutable
610 parameters. [...]
611
612 ${parameter}
613 The value, if any, of the parameter is substituted.
614 The braces are required only when parameter is followed
615 by a letter, digit, or underscore that is not to be
616 interpreted as part of its name. [...]
617
618 The following parameters are automatically set by the shell.
619 [...]
620 $ The process number of this shell.
621
622 [...]
623
624 Blank Interpretation
625 After parameter and command substitution, the results of
626 substitution are scanned for internal field separator char-
627 acters (those found in IFS) and split into distinct argu-
628 ments where such characters are found. Explicit null argu-
629 ments ("" or '') are retained. Implicit null arguments
630 (those resulting from parameters that have no values) are
631 removed.
632
633 Quoting
634 The following characters have a special meaning to the shell
635 and cause termination of a word unless quoted:
636
637 ; & ( ) | ^ < > newline space tab
638
639 A character may be quoted (that is, made to stand for
640 itself) by preceding it with a backslash (\) or inserting it
641 between a pair of quote marks ('' or ""). During process-
642 ing, the shell may quote certain characters to prevent them
643 from taking on a special meaning. Backslashes used to quote
644 a single character are removed from the word before the com-
645 mand is executed. The pair \newline is removed from a word
646 before command and parameter substitution.
647
648 All characters enclosed between a pair of single quote marks
649 (''), except a single quote, are quoted by the shell.
650 Backslash has no special meaning inside a pair of single
651 quotes. A single quote may be quoted inside a pair of dou-
652 ble quote marks (for example, "'"), but a single quote can
653 not be quoted inside a pair of single quotes.
654
655 Inside a pair of double quote marks (""), parameter and com-
656 mand substitution occurs and the shell quotes the results to
657 avoid blank interpretation and file name generation. If $*
658 is within a pair of double quotes, the positional parameters
659 are substituted and quoted, separated by quoted spaces ("$1
660 $2 ..."); however, if $@ is within a pair of double quotes,
661 the positional parameters are substituted and quoted,
662 separated by unquoted spaces ("$1" "$2" ... ). \ quotes the
663 characters \, `, ", and $. The pair \newline is removed
664 before parameter and command substitution. If a backslash
665 precedes characters other than \, `, ", $, and newline, then
666 the backslash itself is quoted by the shell.
667
668 */
669
670 //--
671