1 /*
2  * $Id: rubyfilt.c,v 1.86 2020/01/17 23:37:07 tom Exp $
3  *
4  * Filter to add vile "attribution" sequences to ruby scripts.  This began as a
5  * translation into C of an earlier version written for LEX/FLEX.
6  *
7  * Although the documentation says it has simpler syntax, Ruby borrows from
8  * Perl the worst of its syntax, i.e., regular expressions which can be split
9  * across lines, have embedded comments.
10  *
11  * TODO: %x is equivalent of back-quoted string
12  * TODO: %r is equivalent of regular expression
13  * TODO: %-quoting may accept a space as delimiter
14  * TODO: embed quotes using backslashes
15  * TODO: embed quotes by using double-quotes inside single
16  * TODO: embed quotes by using single-quotes inside double
17  * TODO: make var_embedded() display nested expressions
18  */
19 
20 #include <filters.h>
21 
/*
 * Declare the "ruby" filter through DefineOptFilter (defined outside this
 * file).  The second argument lists the option letters: 'e' is tested by
 * is_ERB() and, in DEBUG builds, 'd' is tested by DPRINTF().
 */
#ifdef DEBUG
DefineOptFilter(ruby, "de");
#else
DefineOptFilter(ruby, "e");
#endif
27 
/*
 * Identifier character classes: isIdent0() matches a character which may
 * begin an identifier, isIdent() one which may continue it.  The argument is
 * fully parenthesized so that any expression may be passed safely.
 */
#define isIdent(c)   (isalnum(CharOf(c)) || (c) == '_')
#define isIdent0(c)  (isalpha(CharOf(c)) || (c) == '_')

/*
 * Bounds checks against the_last:
 *	MORE(s)      is true while s is before the_last
 *	ATLEAST(s,n) is true when more than n bytes remain, i.e., s[n] is
 *		     still before the_last
 */
#define MORE(s)	     ((s) < the_last)
#define ATLEAST(s,n) ((the_last - (s)) > (n))

/*
 * Debug trace, printed only when the 'd' option is set.  The parameter is a
 * complete, parenthesized printf argument list.
 * NOTE(review): the DEBUG form is a bare "if", so avoid using it as the body
 * of an unbraced if/else.
 */
#ifdef DEBUG
#define DPRINTF(params) if(FltOptions('d'))printf params
#else
#define DPRINTF(params)		/*nothing */
#endif

/*
 * Record the token type in 'var' and trace it.  This expands to two
 * statements and reads a variable named 'ok' from the caller's scope, so it
 * must be used inside braces.
 */
#define Parsed(var,tok) var = tok; \
	DPRINTF(("...%s: %d\n", tokenType(var), ok))
42 
/*
 * Parser states; the names are also reported by stateName() in DEBUG builds.
 */
typedef enum {
    eALIAS,
    eCLASS,
    eCODE,
    eDEF,
    eHERE,
    ePOD,
    eEND
} States;
52 
/*
 * Token types recognized by the lexical functions; the names are also
 * reported by tokenType() in DEBUG builds.
 */
typedef enum {
    tNULL,
    tBLANK,
    tCHAR,
    tCOMMENT,
    tERB,
    tHERE,
    tKEYWORD,
    tNUMBER,
    tOPERATOR,
    tREGEXP,
    tSTRING,
    tVARIABLE
} TType;
67 
/*
 * Attribute strings handed to flt_puts() and the flt_bfr_*() functions by
 * the put_XXX() functions below.  They are assigned outside this part of the
 * file.
 */
static char *Action_attr;
static char *Comment_attr;
static char *Error_attr;
static char *Ident_attr;
static char *Ident2_attr;
static char *Keyword_attr;
static char *String_attr;
static char *Number_attr;
static char *Type_attr;

/* forward references, needed because the lexical functions call each other */
static int line_size(char *);
static int var_embedded(char *);
80 
/*
 * The in-memory copy of the input file.  The MORE() and ATLEAST() macros
 * treat the_last as the limit of the readable data.
 */
static char *the_file;
static char *the_last;
static size_t the_size;
87 
/*
 * Stack of here-document tags.  make_here_tag() appends to the end of the
 * list and free_here_tag() removes the first entry.
 */
#define HERE_TAGS struct _here_tags
HERE_TAGS {
    HERE_TAGS *next;
    char *value;		/* tag text; released by free_here_tag() */
    int strip;			/* set by begin_HERE() for the "<<-" form */
    int quote;			/* set by begin_HERE() for a single-quoted tag */
};

/* first pending here-document tag, or null if there is none */
static HERE_TAGS *here_tags;
100 
101 #ifdef DEBUG
102 static char *
stateName(States state)103 stateName(States state)
104 {
105     char *result = "?";
106 
107     switch (state) {
108     case eALIAS:
109 	result = "ALIAS";
110 	break;
111     case eCLASS:
112 	result = "CLASS";
113 	break;
114     case eCODE:
115 	result = "CODE";
116 	break;
117     case eDEF:
118 	result = "DEF";
119 	break;
120     case eHERE:
121 	result = "HERE";
122 	break;
123     case ePOD:
124 	result = "POD";
125 	break;
126     case eEND:
127 	result = "END";
128 	break;
129     }
130     return result;
131 }
132 
133 static char *
tokenType(TType type)134 tokenType(TType type)
135 {
136     char *result = "?";
137 
138     switch (type) {
139     case tNULL:
140 	result = "No Token";
141 	break;
142     case tBLANK:
143 	result = "BLANK";
144 	break;
145     case tCHAR:
146 	result = "CHAR";
147 	break;
148     case tCOMMENT:
149 	result = "COMMENT";
150 	break;
151     case tERB:
152 	result = "ERB";
153 	break;
154     case tHERE:
155 	result = "HERE";
156 	break;
157     case tKEYWORD:
158 	result = "KEYWORD";
159 	break;
160     case tNUMBER:
161 	result = "NUMBER";
162 	break;
163     case tOPERATOR:
164 	result = "OPERATOR";
165 	break;
166     case tREGEXP:
167 	result = "REGEXP";
168 	break;
169     case tSTRING:
170 	result = "STRING";
171 	break;
172     case tVARIABLE:
173 	result = "VARIABLE";
174 	break;
175     }
176     return result;
177 }
178 #endif
179 
180 /******************************************************************************
181  * Lexical functions that match a particular token type                       *
182  ******************************************************************************/
/*
 * Count the run of blanks (per isBlank) starting at 's'.
 * returns: the number of blanks, or zero
 */
static int
is_BLANK(char *s)
{
    char *p = s;

    while (MORE(p) && isBlank(*p))
        ++p;
    return (int) (p - s);
}
193 
194 static int
is_ESCAPED(char * s)195 is_ESCAPED(char *s)
196 {
197     int found = 0;
198     if (*s == BACKSLASH) {
199 	found = 2;
200     }
201     return found;
202 }
203 
204 static int
is_STRINGS(char * s,int * err,int left_delim,int right_delim,int single)205 is_STRINGS(char *s, int *err, int left_delim, int right_delim, int single)
206 {
207     char *base = s;
208     int found = 0;
209     int escape = 0;
210     int level = 0;
211     int len;
212 
213     *err = 0;
214     if (*s == left_delim) {	/* should always be true */
215 	s++;
216 	if (left_delim != right_delim)
217 	    ++level;
218 
219 	for (;;) {
220 	    if (!MORE(s)) {
221 		*err = 1;	/* unterminated string */
222 		break;
223 	    }
224 	    if (!escape && (*s == BACKSLASH)) {
225 		escape = 1;
226 	    } else if (escape) {
227 		escape = 0;
228 	    } else if (!single && (len = var_embedded(s)) != 0) {
229 		s += len - 1;
230 	    } else {
231 		if (left_delim != right_delim) {
232 		    if (*s == left_delim) {
233 			++level;
234 		    } else if (*s == right_delim) {
235 			if (--level > 0) {
236 			    ++s;
237 			    continue;
238 			}
239 			/* otherwise, fallthru to the other right_delim check */
240 		    }
241 		}
242 		if (*s == right_delim) {
243 		    s++;
244 		    break;
245 		}
246 	    }
247 	    s++;
248 	}
249 	found = (int) (s - base);
250 	DPRINTF(("...found(%d)\n", found));
251     }
252     return found;
253 }
254 
255 /*
256  * pattern: ({SSTRING}|{DSTRING}|{KEYWORD}|"`"{KEYWORD}"`")
257  */
258 static int
is_QIDENT(char * s)259 is_QIDENT(char *s)
260 {
261     char *base = s;
262     int ch;
263     int delim = 0;
264     int leading = 1;
265 
266     while (MORE(s)) {
267 	ch = CharOf(*s);
268 	if (leading && isIdent0(ch)) {
269 	    leading = 0;
270 	    s++;
271 	} else if (!leading && isIdent(ch)) {
272 	    s++;
273 	} else if (ch == BQUOTE
274 		   || ch == SQUOTE
275 		   || ch == DQUOTE) {
276 	    s++;
277 	    if (delim) {
278 		if (delim == ch)
279 		    break;
280 	    } else {
281 		delim = ch;
282 	    }
283 	} else if (!delim) {
284 	    break;
285 	} else {
286 	    ++s;
287 	}
288     }
289     return (int) (s - base);
290 }
291 
/*
 * pattern: {KEYWORD}
 * returns: length of the identifier starting at 's', or zero if 's' does
 * not begin with a letter or underscore
 */
static int
is_KEYWORD(char *s)
{
    int len = 0;

    if (!isIdent0(CharOf(s[0])))
        return 0;

    while (ATLEAST(s, len) && isIdent(s[len]))
        ++len;
    return len;
}
308 
/*
 * Context-specific keyword: Match a method-style keyword,
 * ie. a KEYWORD with a possible suffix of !, ?, or =.
 * pattern: {KEYWORD}[!?=]?
 *
 * s       - points to the possible keyword
 * use_eql - nonzero if a trailing '=' is accepted as part of the name.
 *           When zero, '=' is left for the caller, so that a symbol
 *           followed by "=>" does not absorb the '='.
 *
 * returns: length of the token, or zero
 */
static int
is_MKEYWORD(char *s, int use_eql)
{
    int found = 0;

    if ((found = is_KEYWORD(s)) != 0) {
        if (ATLEAST(s, found)
            && (s[found] == '!'
                || s[found] == '?'      /* FIXME - should look for '.' */
                || (use_eql && s[found] == '='))) {
            ++found;
        }
    }
    return found;
}
330 
/*
 * pattern: [$]([-_.\/,"\\=~\$?&`'+*;!@<>:]|{CONST}|{VAR})
 * returns: length of the token including the '$', or zero.
 *
 * After the '$' this accepts one of the listed punctuation characters, a run
 * of digits, or an identifier.
 */
static int
is_GLOBAL(char *s)
{
    int len = 0;

    if (*s != '$')
        return 0;
    ++s;
    if (!MORE(s))
        return 0;

    if (*s != '\0' && strchr("-_./,\"\\=~$?&`'+*;!@<>:", *s)) {
        len = 1;
    } else if (isdigit(CharOf(*s))) {
        while (ATLEAST(s, len) && isdigit(CharOf(s[len])))
            ++len;
    } else {
        len = is_KEYWORD(s);
    }
    return (len != 0) ? (len + 1) : 0;
}
355 
/*
 * pattern: [@][@]?{KEYWORD}
 * returns: length of the token including the '@' prefix, or zero.
 */
static int
is_INSTANCE(char *s)
{
    char *p = s;
    int len;

    if (*p != '@')
        return 0;
    if (!MORE(++p))
        return 0;

    if (*p == '@')              /* second '@' is optional */
        ++p;
    len = is_KEYWORD(p);
    return (len != 0) ? (len + (int) (p - s)) : 0;
}
374 
/*
 * pattern: ({INSTANCE}|{GLOBAL})
 * returns: length of the token, or zero.
 */
static int
is_VARIABLE(char *s)
{
    switch (*s) {
    case '$':
        return is_GLOBAL(s);
    case '@':
        return is_INSTANCE(s);
    default:
        return 0;
    }
}
391 
392 /*
393  * pattern: \?(\\M-a)?(\\C-a)?
394  */
395 static int
is_CHAR(char * s,int * err)396 is_CHAR(char *s, int *err)
397 {
398     int found = 0;
399 
400     if (*s == '?' && ATLEAST(s, 5)) {
401 	if (*++s == BACKSLASH) {
402 	    ++s;
403 	    if (((*s == 'M') || (*s == 'C'))
404 		&& (s[1] == '-')) {
405 		*err = 0;
406 		found = 5;
407 		if (*s == 'M' && ATLEAST(s, 5)
408 		    && s[2] == BACKSLASH
409 		    && s[3] == 'C'
410 		    && s[4] == '-'
411 		    && isgraph(CharOf(s[5]))) {
412 		    s += 5;
413 		    found += 3;
414 		} else if (*s == 'C') {
415 		    s += 2;
416 		}
417 	    } else {
418 		found = 3;
419 	    }
420 	} else {
421 	    found = 2;
422 	}
423 	if (found) {
424 	    /* s now points to the character, but it may be escaped */
425 	    if (!isgraph(CharOf(*s))) {
426 		found = 0;
427 	    } else if (*s == BACKSLASH) {
428 		++found;
429 	    }
430 	}
431     }
432     return found;
433 }
434 
/*
 * lex patterns:
 *	SIGN		[-+]
 *	DECIMAL		[0-9_]+
 *	OCTAL		0[0-7_]+
 *	HEXADECIMAL	0x[0-9a-fA-F_]+
 *	REAL		[-+]?([0-9_]*\.[0-9][0-9_]*)([eE][+-]?[0-9_]+)?
 *	NUMBER		{SIGN}?({DECIMAL}|{OCTAL}|{HEXADECIMAL})[L]?|{REAL}
 *
 * But note that ruby allows a function immediately after a number separated
 * by a ".", e.g.,
 *	1.upto(n)
 * so a '.' followed by a letter or another '.' ends the number.
 *
 * err - set to 1 if the token looks like a malformed number, else 0
 *
 * returns: length of the token, or zero if no digit was found
 */
static int
is_NUMBER(char *s, int *err)
{
    char *start = s;
    int phase = 0;              /* 1 after a '.', 2 after an 'e' or 'E' */
    int has_digit = 0;
    int radix = 0;              /* 0 until known; 11 marks a leading 'v' */
    int seen_dot = 0;

    *err = 0;
    for (; MORE(s); s++) {
        int ch = CharOf(*s);

        if ((s == start) && (ch == '+' || ch == '-')) {
            /* leading sign */ ;
        } else if ((s == start) && (ch == 'v')) {
            radix = 11;
        } else if (radix == 11) {
            /* digits, underscores, and dots which follow a digit */
            if (ch != '_') {
                if (isdigit(ch)) {
                    has_digit = 1;
                } else if (ch != '.' || !isdigit(CharOf(s[-1]))) {
                    break;
                }
            }
        } else if (ch == '.') {
            if (ATLEAST(s, 1)
                && (s[1] == '.' || isalpha(CharOf(s[1])))) {
                break;          /* range operator or method call */
            }
            if (radix == 8)
                radix = 10;
            if (seen_dot || (radix != 0 && radix != 10) || (phase > 1)) {
                *err = 1;
            }
            seen_dot = 1;
            radix = 10;
            phase = 1;
        } else if (ch == '_') {
            /* digit separator */ ;
        } else if (radix == 0) {
            if (ch == '0') {
                if (!ATLEAST(s, 1)) {
                    radix = 10;
                } else if (s[1] == 'b') {
                    radix = 2;
                    s++;
                } else if (s[1] == 'x') {
                    radix = 16;
                    s++;
                } else {
                    radix = 8;
                }
                has_digit = 1;
            } else if (isdigit(ch)) {
                radix = 10;
                has_digit = 1;
            } else {
                break;
            }
        } else if (ch == 'e' || ch == 'E') {
            if ((phase > 1 && radix < 16) || !has_digit) {
                *err = 1;
            }
            phase = 2;
        } else if (((phase || (radix == 10)) && isdigit(ch))
                   || ((radix == 16) && isxdigit(ch))
                   || ((radix == 8) && (ch >= '0' && ch < '8'))
                   || ((radix == 2) && (ch >= '0' && ch < '2'))) {
            has_digit = 1;
        } else {
            /* absorb trailing alphanumerics as part of a bad number */
            if (has_digit) {
                while (MORE(s) && isalnum(CharOf(*s))) {
                    *err = 1;
                    ++s;
                }
            }
            break;
        }
    }

    return has_digit ? (int) (s - start) : 0;
}
539 
/*
 * pattern: ^([#].*[\n])+
 *
 * Match a comment which may be preceded by blanks.  Following lines are
 * included while the character immediately after the newline is another '#'.
 *
 * returns: length of the comment including the leading blanks but not the
 * final newline, or zero
 */
static int
is_COMMENT(char *s)
{
    char *base = s;
    char *t = is_BLANK(s) + s;

    /* the blanks may run to the end of the buffer: check before reading */
    if (MORE(t) && *t++ == '#') {
        while (MORE(t)) {
            if (*t == '\n') {
                if (!ATLEAST(t, 1)
                    || t[1] != '#')
                    break;
            }
            t++;
        }
        s = t;
    }
    return (int) (s - base);
}
563 
564 /******************************************************************************
565  ******************************************************************************/
566 
/*
 * Match a given 'marker' keyword, optionally checking if it is alone on the
 * line.  Documentation implies otherwise, but parser does check for a blank
 * after the keyword.
 *
 * s      - text to test
 * marker - keyword which must appear at 's'
 * only   - nonzero to require a line-end (per isreturn) right after the
 *          marker, rather than any whitespace
 *
 * returns: nonzero on a match
 */
static int
end_marker(char *s, const char *marker, int only)
{
    int len = (int) strlen(marker);

    if (!ATLEAST(s, len))
        return 0;
    if (strncmp(s, marker, (size_t) len) != 0)
        return 0;
    if (!isspace(CharOf(s[len])))
        return 0;
    return (!only || isreturn(s[len]));
}
582 
/*
 * pattern: "^=begin.*"
 * returns: nonzero if 's' is at "=begin" followed by whitespace
 */
static int
begin_POD(char *s)
{
    static const char marker[] = "=begin";

    return end_marker(s, marker, 0);
}
591 
/*
 * pattern: "^=end.*"
 * returns: nonzero if 's' is at "=end" followed by whitespace
 */
static int
end_POD(char *s)
{
    static const char marker[] = "=end";

    return end_marker(s, marker, 0);
}
600 
601 /*
602  * Ruby allows more than one here-document to begin on a line.  They "stack",
603  * and are processed in succession.
604  */
605 static void
make_here_tag(char * value,int quote,int strip)606 make_here_tag(char *value, int quote, int strip)
607 {
608     size_t size = 0;
609     HERE_TAGS *data = type_alloc(HERE_TAGS, (char *) 0, (size_t) 1, &size);
610 
611     if (data != 0) {
612 	HERE_TAGS *p = here_tags;
613 	HERE_TAGS *q = 0;
614 
615 	while (p != 0) {
616 	    q = p;
617 	    p = p->next;
618 	}
619 	if (q != 0)
620 	    q->next = data;
621 	else
622 	    here_tags = data;
623 	data->next = p;
624 	data->value = value;
625 	data->quote = quote;
626 	data->strip = strip;
627 	DPRINTF(("make_here_tag(%s) %squoted %sstripped\n", value,
628 		 quote ? "" : "un",
629 		 strip ? "" : "un"));
630     }
631 }
632 
633 static char *
free_here_tag(void)634 free_here_tag(void)
635 {
636     HERE_TAGS *next = here_tags->next;
637     char *result = next ? next->value : 0;
638 
639     free(here_tags->value);
640     free(here_tags);
641     here_tags = next;
642 
643     return result;
644 }
645 
646 /*
647  * Workaround for ruby bug: ignoring their documentation, which notes that you
648  * must separate a string from the "<<" operator to distinguish it from a here
649  * document tag, some of the source code happens to use " " or "\n" without an
650  * intervening space.
651  */
652 static int
valid_HERE(char * s)653 valid_HERE(char *s)
654 {
655     int ok = is_QIDENT(s);
656     int delim;
657     if (ok) {
658 	switch (*s) {
659 	default:
660 	    delim = 0;
661 	    break;
662 	case DQUOTE:
663 	    /* FALLTHRU */
664 	case SQUOTE:
665 	    delim = *s;
666 	    break;
667 	}
668 	if (delim) {
669 	    switch (ok) {
670 	    case 3:
671 		if (s[0] == s[2] && s[1] == ' ')
672 		    ok = 0;
673 		break;
674 	    case 4:
675 		if (s[0] == s[3] && s[1] == '\\' && s[2] == 'n')
676 		    ok = 0;
677 		break;
678 	    }
679 	}
680     }
681     return ok;
682 }
683 
684 /*
685  * Mark the beginning of a here-document.
686  */
687 static int
begin_HERE(char * s)688 begin_HERE(char *s)
689 {
690     char *base = s;
691     char *first;
692     char *marker = 0;
693     int ok;
694     int strip = 0;
695 
696     if (ATLEAST(s, 3)
697 	&& s[0] == '<'
698 	&& s[1] == '<'
699 	&& !isBlank(s[2])) {
700 	s += 2;
701 	if (*s == '-') {
702 	    strip = 1;
703 	    ++s;
704 	}
705 	first = s;
706 	if ((ok = valid_HERE(s)) != 0) {
707 	    size_t temp = 0;
708 	    int delim = (*s == SQUOTE || *s == DQUOTE) ? *s : 0;
709 	    int quote = (*s == SQUOTE);
710 
711 	    s += ok;
712 
713 	    if ((marker = do_alloc((char *) 0, (size_t) (ok + 1), &temp)) != 0) {
714 		char *d = marker;
715 		if (delim) {
716 		    ++first;
717 		}
718 		while (first != s) {
719 		    if (delim) {
720 			if (*first == delim)
721 			    break;
722 			*d++ = *first;
723 		    } else {
724 			if (isIdent(*first))
725 			    *d++ = *first;
726 		    }
727 		    first++;
728 		}
729 		*d = 0;
730 		make_here_tag(marker, quote, strip);
731 	    }
732 	}
733     }
734     return (marker ? (int) (s - base) : 0);
735 }
736 
/*
 * Skip whitespace (including newlines), passing whatever was skipped to
 * flt_puts() with an empty attribute.
 *
 * returns: pointer to the first character which is not whitespace
 */
static char *
skip_BLANKS(char *s)
{
    char *end = s;

    while (MORE(end) && isspace(CharOf(*end)))
        ++end;

    if (end != s) {
        flt_puts(s, (int) (end - s), "");
    }
    return end;
}
753 
/*
 * Check for ERB (embedded ruby).  Ruby on Rails uses templates which fit into
 * the special case of ruby embedded in ruby.
 *
 * Nothing is matched unless the 'e' filter option is set.  For an ERB
 * comment ("<%#") the result extends through the closing "%>", or to within
 * two bytes of the end of the buffer if there is none.
 *
 * returns: length of the ERB marker or comment, or zero
 */
static int
is_ERB(char *s)
{
#define OPS(s) { s, sizeof(s) - 1 }
    static const struct {
        const char *ops;
        int len;
    } table[] = {
        /* 3 */
        OPS("<%="),
        OPS("<%#"),
        OPS("<%-"),
        OPS("-%>"),
        /* 2 */
        OPS("<%"),
        OPS("%>"),
    };
    unsigned n;
    int found = 0;

    if (!FltOptions('e') || !ispunct(CharOf(*s)))
        return 0;

    /* longer markers come first in the table, so the first hit is the best */
    for (n = 0; n < TABLESIZE(table); ++n) {
        if (ATLEAST(s, table[n].len)
            && table[n].ops[0] == *s
            && !memcmp(s, table[n].ops, (size_t) table[n].len)) {
            found = table[n].len;
            break;
        }
    }

    /* special-case comments */
    if (found == 3 && s[2] == '#') {
        for (s += found; ATLEAST(s, 2); ++s, ++found) {
            if (!memcmp(s, "%>", 2)) {
                found += 2;
                break;
            }
        }
    }
    return found;
}
802 
/*
 * Return the number of characters if an operator is found, otherwise zero.
 * The table lists longer operators first so that the longest one matches.
 */
static int
is_OPERATOR(char *s)
{
#define OPS(s) { s, sizeof(s) - 1 }
    static const struct {
        const char *ops;
        int len;
    } table[] = {
        /* 3 */
        OPS("&&="),
        OPS("**="),
        OPS("..."),
        OPS("<<="),
        OPS("<=>"),
        OPS("==="),
        OPS(">>="),
        OPS("[]="),
        OPS("||="),
        /* 2 */
        OPS("!="),
        OPS("!~"),
        OPS("%="),
        OPS("&&"),
        OPS("&="),
        OPS("**"),
        OPS("*="),
        OPS("+="),
        OPS("-="),
        OPS(".."),
        OPS("/="),
        OPS("::"),
        OPS("<<"),
        OPS("<="),
        OPS("=="),
        OPS("=>"),
        OPS("=~"),
        OPS(">="),
        OPS(">>"),
        OPS("[]"),
        OPS("^="),
        OPS("|="),
        OPS("||"),
        /* 1 */
        OPS("!"),
        OPS("&"),
        OPS("("),
        OPS(")"),
        OPS("*"),
        OPS("+"),
        OPS(","),
        OPS("-"),
        OPS(";"),
        OPS("="),
        OPS("["),
        OPS("]"),
        OPS("^"),
        OPS("{"),
        OPS("|"),
        OPS("}"),
        OPS("~"),
    };
    unsigned n;

    if (!ispunct(CharOf(*s)))
        return 0;

    for (n = 0; n < TABLESIZE(table); ++n) {
        if (ATLEAST(s, table[n].len)
            && table[n].ops[0] == *s
            && !memcmp(s, table[n].ops, (size_t) table[n].len)) {
            return table[n].len;
        }
    }
    return 0;
}
882 
883 /*
884  * Do the real work for is_Regexp().  Documentation is vague, but the parser
885  * appears to use the same quoting type for each nesting level.
886  */
887 static int
is_REGEXP(char * s,int left_delim,int right_delim)888 is_REGEXP(char *s, int left_delim, int right_delim)
889 {
890     char *base = s;
891     int found = 0;
892     int len;
893     int level = 0;
894     int range = 0;
895     int block = (left_delim != L_BLOCK);
896 
897     while (MORE(s)) {
898 	if (left_delim != right_delim) {
899 	    if (*s == left_delim) {
900 		++level;
901 	    } else if (*s == right_delim) {
902 		if (--level > 0) {
903 		    ++s;
904 		    continue;
905 		}
906 		/* otherwise, fallthru to the other right_delim check */
907 	    }
908 	}
909 	if ((len = is_ESCAPED(s)) != 0) {
910 	    s += len;
911 	} else if (block && (*s == R_BLOCK) && range) {
912 	    range = 0;
913 	    ++s;
914 	} else if (block && (*s == L_BLOCK) && !range) {
915 	    range = 1;
916 	    ++s;
917 	} else if ((len = var_embedded(s)) != 0) {
918 	    s += len;
919 	} else if (range) {
920 	    ++s;
921 	} else if (s != base && *s == right_delim) {
922 	    ++s;
923 	    while (MORE(s) && isalpha(CharOf(*s))) {
924 		++s;
925 	    }
926 	    found = (int) (s - base);
927 	    break;
928 	} else {
929 	    ++s;
930 	}
931     }
932     return found;
933 }
934 
935 #define valid_delimiter(c) (isgraph(CharOf(c)) && !isalnum(CharOf(c)))
936 
937 static int
balanced_delimiter(char * s)938 balanced_delimiter(char *s)
939 {
940     int result;
941 
942     switch (*s) {
943     case L_CURLY:
944 	result = R_CURLY;
945 	break;
946     case L_PAREN:
947 	result = R_PAREN;
948 	break;
949     case L_BLOCK:
950 	result = R_BLOCK;
951 	break;
952     case L_ANGLE:
953 	result = R_ANGLE;
954 	break;
955     default:
956 	if (valid_delimiter(*s))
957 	    result = *s;
958 	else
959 	    result = 0;
960 	break;
961     }
962     return result;
963 }
964 
/*
 * For a token beginning with '%', find where the quoted text starts.
 *
 * s      - the token
 * length - its length
 * left   - set to the offset just past the opening delimiter (or to the
 *          offset of the first non-alphanumeric character if that is not a
 *          valid delimiter); zero if the token does not begin with '%' or
 *          has no such character
 * right  - set to 1 if a valid opening delimiter was found, else 0
 *
 * returns: the value stored in *left
 */
static int
is_Balanced(char *s, int length, int *left, int *right)
{
    int n;

    *left = 0;
    *right = 0;
    if (*s != '%')
        return *left;

    for (n = 1; n < length; ++n) {
        if (isalnum(CharOf(s[n])))
            continue;           /* skip the modifier letters */
        if (balanced_delimiter(s + n) == 0) {
            *left = n;
        } else {
            *left = n + 1;
            *right = 1;
        }
        break;
    }
    return *left;
}
994 
/*
 * Check for a regular expression, returning its length.
 *
 * Both the "/.../" and "%r" forms are recognized.  *delim is set to the
 * closing delimiter when either form is tried, and is left untouched
 * otherwise.
 *
 * NOTE(review): for the "%r" form the result is 2 rather than zero when
 * is_REGEXP() finds no closing delimiter - confirm that callers expect this.
 */
static int
is_Regexp(char *s, int *delim)
{
    if (*s == '/') {
        *delim = balanced_delimiter(s);
        return is_REGEXP(s, *s, *delim);
    }
    if (ATLEAST(s, 4)
        && s[0] == '%'
        && s[1] == 'r'
        && valid_delimiter(s[2])) {
        char *body = s + 2;

        *delim = balanced_delimiter(body);
        return 2 + is_REGEXP(body, *body, *delim);
    }
    return 0;
}
1016 
1017 /*
1018  * Parse both the unquoted and quoted symbol types.
1019  * This gets treated as a string literal anyway, so
1020  * we only call it from is_String.
1021  */
1022 static int
is_Symbol(char * s,int * delim,int * err)1023 is_Symbol(char *s, int *delim, int *err)
1024 {
1025     int found = 0;
1026 
1027     if (*s++ == ':') {
1028 	switch (*s) {
1029 	case SQUOTE:
1030 	    if ((found = is_STRINGS(s, err, *s, *s, 1)) != 0)
1031 		*delim = SQUOTE;
1032 	    break;
1033 	case DQUOTE:
1034 	    if ((found = is_STRINGS(s, err, *s, *s, 0)) != 0)
1035 		*delim = DQUOTE;
1036 	    break;
1037 	case BQUOTE:
1038 	    found = 1;
1039 	    break;
1040 	default:
1041 	    found = is_MKEYWORD(s, 0);
1042 	    break;
1043 	}
1044     }
1045 
1046     if (found != 0)
1047 	found += 1;
1048 
1049     return found;
1050 }
1051 
1052 /*
1053  * Parse the various types of quoted strings.
1054  */
1055 static int
is_String(char * s,int * delim,int * err)1056 is_String(char *s, int *delim, int *err)
1057 {
1058     int found = 0;
1059 
1060     *delim = 0;
1061     if (ATLEAST(s, 2) && (found = is_Symbol(s, delim, err)) == 0) {
1062 	switch (*s) {
1063 	case '%':
1064 	    if (ATLEAST(s, 4)) {
1065 		int modifier = 0;
1066 		int single = 0;
1067 		char *base = s;
1068 
1069 		++s;
1070 		if (isalpha(CharOf(*s))) {
1071 		    modifier = *s++;
1072 		    switch (modifier) {
1073 		    case 'q':
1074 		    case 'w':	/* FIXME - sword */
1075 			single = 1;
1076 			break;
1077 		    case 'x':
1078 		    case 'Q':
1079 		    case 'W':	/* FIXME - dword */
1080 			break;
1081 		    case 'I':	/* FIXME - dword */
1082 		    case 'i':	/* FIXME - sword */
1083 		    case 'r':	/* FIXME - regexp */
1084 		    case 's':	/* FIXME - symbol */
1085 			break;
1086 		    }
1087 		}
1088 		if (valid_delimiter(*s)) {
1089 		    found = is_STRINGS(s,
1090 				       err,
1091 				       *s,
1092 				       balanced_delimiter(s),
1093 				       1);
1094 		}
1095 		if (found != 0) {
1096 		    found += (int) (s - 1 - base);
1097 		    *delim = single ? SQUOTE : DQUOTE;
1098 		}
1099 	    }
1100 	    break;
1101 	case SQUOTE:
1102 	    if ((found = is_STRINGS(s, err, *s, *s, 1)) != 0)
1103 		*delim = SQUOTE;
1104 	    break;
1105 	case BQUOTE:
1106 	case DQUOTE:
1107 	    if ((found = is_STRINGS(s, err, *s, *s, 0)) != 0)
1108 		*delim = DQUOTE;
1109 	    break;
1110 	case BACKSLASH:
1111 	    found = is_ESCAPED(s);
1112 	    *delim = SQUOTE;
1113 	    break;
1114 	}
1115     }
1116     return found;
1117 }
1118 
/*
 * returns: the number of characters from 's' up to, but not including, the
 * next newline or the end of the buffer
 */
static int
line_size(char *s)
{
    char *p = s;

    while (MORE(p) && *p != '\n')
        ++p;
    return (int) (p - s);
}
1131 
/*
 * Pass the character at 's' to flt_putc(), unless 's' is at the end of the
 * buffer.  Callers use this for the newline which ends a line.
 *
 * returns: pointer past the character written, or 's' if nothing was written
 */
static char *
put_newline(char *s)
{
    if (!MORE(s))
        return s;

    flt_putc(*s);
    return s + 1;
}
1139 
1140 /*
1141  * pattern: #\{{EXPR}*\}
1142  * returns: length of the token
1143  */
1144 static int
var_embedded(char * s)1145 var_embedded(char *s)
1146 {
1147     char *base = s;
1148     int delim;
1149     int level = 0;
1150     int had_op = 1;
1151     int ignore;
1152     int ok;
1153 
1154     if (*s == '#' && MORE(++s)) {
1155 	if (*s == L_CURLY) {
1156 	    ++level;
1157 	    ++s;
1158 	    while (MORE(s)) {
1159 		if ((*s == '%' || had_op)
1160 		    && (ok = is_Regexp(s, &delim)) != 0) {
1161 		    had_op = 0;
1162 		    s += ok;
1163 		} else if ((ok = is_String(s, &delim, &ignore)) != 0) {
1164 		    had_op = 0;
1165 		    s += ok;
1166 		} else if ((ok = is_CHAR(s, &ignore)) != 0
1167 			   && (ok != 2 || (s[1] != L_CURLY && s[1] != R_CURLY))) {
1168 		    had_op = 0;
1169 		    s += ok;
1170 		} else if ((ok = is_NUMBER(s, &ignore)) != 0) {
1171 		    had_op = 0;
1172 		    s += ok;
1173 		} else if ((ok = is_KEYWORD(s)) != 0) {
1174 		    had_op = 0;
1175 		    s += ok;
1176 		} else if ((ok = is_VARIABLE(s)) != 0) {
1177 		    had_op = 0;
1178 		    s += ok;
1179 		} else if ((ok = is_OPERATOR(s)) != 0) {
1180 		    had_op = 1;
1181 		    if (*s == L_CURLY) {
1182 			++level;
1183 		    } else if (*s == R_CURLY) {
1184 			if (--level <= 0) {
1185 			    ++s;
1186 			    break;
1187 			}
1188 		    }
1189 		    s += ok;
1190 		} else {
1191 		    ++s;
1192 		}
1193 	    }
1194 	} else {
1195 	    if ((ok = is_VARIABLE(s)) != 0) {
1196 		++ok;
1197 	    } else {
1198 		s = base;
1199 	    }
1200 	}
1201     }
1202     return (int) (s - base);
1203 }
1204 
1205 /*
1206  * FIXME: var_embedded() can recognize a multi-line embedded variable, but
1207  * this function is called for each line.  Probably should redo the output
1208  * so this is called at the end of the here-document, etc.
1209  */
1210 static char *
put_embedded(char * s,int len,char * attr)1211 put_embedded(char *s, int len, char *attr)
1212 {
1213     int id;
1214     int j, k;
1215 
1216     for (j = k = 0; j < len; j++) {
1217 	if ((j == 0 || (s[j - 1] != BACKSLASH))
1218 	    && (id = var_embedded(s + j)) != 0
1219 	    && (id + j) < len) {
1220 	    if (j != k)
1221 		flt_puts(s + k, j - k, attr);
1222 	    flt_puts(s + j, id, Ident2_attr);
1223 	    k = j + id;
1224 	    j = k - 1;
1225 	}
1226     }
1227     if (k < len)
1228 	flt_puts(s + k, len - k, attr);
1229     return s + len;
1230 }
1231 
/*
 * Write the remainder of the line with the given attribute.  If not quoted,
 * highlight identifiers which are embedded in the line.
 *
 * returns: pointer past the line and its newline
 */
static char *
put_remainder(char *s, char *attr, int quoted)
{
    int length = line_size(s);

    if (!quoted)
        return put_newline(put_embedded(s, length, attr));

    flt_puts(s, length, attr);
    return put_newline(s + length);
}
1249 
1250 static char *
put_COMMENT(char * s,int ok)1251 put_COMMENT(char *s, int ok)
1252 {
1253     int skip = (int) (skip_BLANKS(s) - s);
1254     ok -= skip;
1255     s += skip;
1256     flt_puts(s, ok, Comment_attr);
1257     return s + ok;
1258 }
1259 
1260 static char *
put_ERB(char * s,int ok,int * had_op)1261 put_ERB(char *s, int ok, int *had_op)
1262 {
1263     if (ok > 3) {
1264 	flt_puts(s, ok, Comment_attr);
1265     } else {
1266 	flt_puts(s, ok, Action_attr);
1267     }
1268     *had_op = 1;
1269     return s + ok;
1270 }
1271 
1272 static char *
put_KEYWORD(char * s,int ok,int * had_op)1273 put_KEYWORD(char *s, int ok, int *had_op)
1274 {
1275     const char *attr = 0;
1276     char save = s[ok];
1277 
1278     s[ok] = '\0';
1279     attr = get_keyword_attr(s);
1280     s[ok] = save;
1281     if (isEmpty(attr) && isupper(CharOf(s[0])))
1282 	attr = Type_attr;
1283     flt_puts(s, ok, attr);
1284     *had_op = (attr == Keyword_attr);
1285     return s + ok;
1286 }
1287 
/*
 * Write an operator of length 'ok' with an empty attribute.
 *
 * had_op - set to 1 if the operator begins with one of "[(|&=~!,;";
 *          otherwise it is left untouched
 *
 * returns: pointer past the operator
 */
static char *
put_OPERATOR(char *s, int ok, int *had_op)
{
    char *next = s + ok;

    flt_puts(s, ok, "");
    if (strchr("[(|&=~!,;", *s) != 0) {
        *had_op = 1;
    }
    return next;
}
1296 
1297 static char *
put_VARIABLE(char * s,int ok)1298 put_VARIABLE(char *s, int ok)
1299 {
1300     const char *attr = 0;
1301     char save = s[ok];
1302 
1303     s[ok] = '\0';
1304     attr = get_keyword_attr(s);
1305     s[ok] = save;
1306     flt_puts(s, ok, (attr != 0 && *attr != '\0') ? attr : Ident2_attr);
1307     return s + ok;
1308 }
1309 
1310 static char *
put_REGEXP(char * s,int length,int delim)1311 put_REGEXP(char *s, int length, int delim)
1312 {
1313     char *base = s;
1314     char *first = s;
1315     char *last;
1316     int len;
1317     int range = 0;
1318     int on_end = 0;
1319     int block = (delim != R_BLOCK);
1320     int level = 0;
1321     int extended = 0;
1322 
1323     for (last = s + length - 1; last != s && isalpha(CharOf(*last)); --last) {
1324 	if (*last == 'x') {
1325 	    extended = 1;
1326 	    break;
1327 	}
1328     }
1329     if (*s == '%') {
1330 	flt_puts(s, 3, Keyword_attr);
1331 	s += 3;
1332 	first = s;
1333 	on_end = 1;
1334     }
1335     flt_bfr_begin(String_attr);
1336     while (s < base + length) {
1337 	if ((len = is_ESCAPED(s)) != 0) {
1338 	    flt_bfr_append(s, len);
1339 	    s += len;
1340 	} else if (block && (*s == R_BLOCK) && range) {
1341 	    range = 0;
1342 	    flt_bfr_append(s++, 1);
1343 	} else if (block && (*s == L_BLOCK) && !range) {
1344 	    range = 1;
1345 	    flt_bfr_append(s++, 1);
1346 	} else if ((len = var_embedded(s)) != 0) {
1347 	    flt_bfr_embed(s, len, Ident2_attr);
1348 	    s += len;
1349 	} else if (range) {
1350 	    flt_bfr_append(s++, 1);
1351 	} else if (*s == L_PAREN && delim != R_PAREN) {
1352 	    ++level;
1353 	    flt_bfr_append(s++, 1);
1354 	} else if (*s == R_PAREN && delim != R_PAREN) {
1355 	    --level;
1356 	    flt_bfr_append(s++, 1);
1357 	} else if (extended && (len = is_BLANK(s)) != 0) {
1358 	    last = s;
1359 	    flt_bfr_embed(last, len, "");
1360 	    s += len;
1361 	} else if (extended && (*s == '#')) {
1362 	    last = s;
1363 	    while (MORE(s) && !isreturn(CharOf(*s))) {
1364 		++s;
1365 	    }
1366 	    flt_bfr_embed(last, (int) (s - last), Comment_attr);
1367 	} else if (s != first && level == 0 && *s == delim) {
1368 	    if (!on_end)
1369 		flt_bfr_append(s++, 1);
1370 	    last = s;
1371 	    if (on_end)
1372 		++s;
1373 	    while (MORE(s) && isalpha(CharOf(*s))) {
1374 		++s;
1375 	    }
1376 	    flt_bfr_embed(last, (int) (s - last), Keyword_attr);
1377 	    break;
1378 	} else {
1379 	    flt_bfr_append(s, 1);
1380 	    ++s;
1381 	}
1382     }
1383     flt_bfr_finish();
1384     return s;
1385 }
1386 
1387 static char *
put_String(char * s,int ok,int delim,int err,int * had_op)1388 put_String(char *s, int ok, int delim, int err, int *had_op)
1389 {
1390     int on_left = 0;
1391     int on_end = 0;
1392     int embed = (delim == DQUOTE);
1393 
1394     *had_op = 0;
1395     if (is_Balanced(s, ok, &on_left, &on_end)) {
1396 	flt_puts(s, on_left, Keyword_attr);
1397 	s += on_left;
1398 	ok -= on_left;
1399     }
1400     if (embed) {
1401 	if (err) {
1402 	    flt_error("unexpected quote");
1403 	    s = put_embedded(s, ok, Error_attr);
1404 	} else {
1405 	    s = put_embedded(s, ok, String_attr);
1406 	}
1407     } else {
1408 	if (err) {
1409 	    flt_error("unterminated string");
1410 	    flt_puts(s, ok, Error_attr);
1411 	} else {
1412 	    flt_puts(s, ok, String_attr);
1413 	}
1414 	s += ok;
1415     }
1416     if (on_end) {
1417 	flt_puts(s, 1, Keyword_attr);
1418 	++s;
1419     }
1420     return s;
1421 }
1422 
1423 /******************************************************************************
1424  ******************************************************************************/
1425 
1426 static void
init_filter(int before GCC_UNUSED)1427 init_filter(int before GCC_UNUSED)
1428 {
1429     (void) before;
1430 }
1431 
1432 static void
do_filter(FILE * input GCC_UNUSED)1433 do_filter(FILE *input GCC_UNUSED)
1434 {
1435     static size_t used;
1436     static char *line;
1437 
1438     size_t actual = 0;
1439     size_t request = 0;
1440     States state = eCODE;
1441     TType this_tok = tNULL;
1442     TType last_tok = tNULL;
1443     char *s;
1444     char *marker = 0;
1445     int in_line = -1;
1446     int ok;
1447     int err;
1448     int delim;
1449     int had_op = 1;		/* true to allow regex */
1450 
1451     (void) input;
1452 
1453     Action_attr = class_attr(NAME_ACTION);
1454     Comment_attr = class_attr(NAME_COMMENT);
1455     Error_attr = class_attr(NAME_ERROR);
1456     Ident_attr = class_attr(NAME_IDENT);
1457     Ident2_attr = class_attr(NAME_IDENT2);
1458     Keyword_attr = class_attr(NAME_KEYWORD);
1459     Number_attr = class_attr(NAME_NUMBER);
1460     String_attr = class_attr(NAME_LITERAL);
1461     Type_attr = class_attr(NAME_TYPES);
1462 
1463     /*
1464      * Read the whole file into a single string, in-memory.  Rather than
1465      * spend time working around the various continuation-line types _and_
1466      * the regular expression "syntax", let's just concentrate on the latter.
1467      */
1468     the_size = 0;
1469     the_file = 0;
1470     while (flt_gets(&line, &used) != NULL) {
1471 	size_t len = strlen(line);	/* FIXME: nulls? */
1472 	if (len != 0 && line[len - 1] == '\r')	/* FIXME: move this to readline */
1473 	    line[--len] = '\0';
1474 	if ((request = the_size + len + 1) > actual)
1475 	    request = 1024 + (request * 2);
1476 	the_file = do_alloc(the_file, request, &actual);
1477 	if (the_file == 0)
1478 	    break;
1479 	memcpy(the_file + the_size, line, len + 1);
1480 	the_size += len;
1481     }
1482 
1483     if (the_file != 0) {
1484 	the_last = the_file + the_size;
1485 
1486 	s = the_file;
1487 	while (MORE(s)) {
1488 	    if (*s == '\n') {
1489 		if (marker != 0)
1490 		    state = eHERE;
1491 		in_line = -1;
1492 		if (state == eCODE)
1493 		    had_op = 1;
1494 	    } else {
1495 		in_line++;
1496 	    }
1497 	    DPRINTF(("(%s(%.*s) line:%d op:%d)\n",
1498 		     stateName(state), is_KEYWORD(s) + 1, s, in_line, had_op));
1499 	    switch (state) {
1500 		/*
1501 		 * alias method-name method-name
1502 		 * alias global-variable-name global-variable-name
1503 		 */
1504 	    case eALIAS:
1505 	    case eDEF:
1506 		if ((ok = is_COMMENT(s)) != 0) {
1507 		    Parsed(this_tok, tCOMMENT);
1508 		    s = put_COMMENT(s, ok);
1509 		} else if ((ok = is_BLANK(s)) != 0) {
1510 		    Parsed(this_tok, tBLANK);
1511 		    flt_puts(s, ok, "");
1512 		    s += ok;
1513 		} else if ((ok = is_MKEYWORD(s, 0)) != 0) {
1514 		    Parsed(this_tok, tKEYWORD);
1515 		    s = put_KEYWORD(s, ok, &had_op);
1516 		    state = eCODE;
1517 		} else if ((ok = is_ERB(s)) != 0) {
1518 		    Parsed(this_tok, tERB);
1519 		    s = put_ERB(s, ok, &had_op);
1520 		    state = eCODE;
1521 		} else if ((ok = is_OPERATOR(s)) != 0) {
1522 		    Parsed(this_tok, tOPERATOR);
1523 		    s = put_OPERATOR(s, ok, &had_op);
1524 		    state = eCODE;
1525 		} else {
1526 		    flt_putc(*s++);
1527 		    state = eCODE;
1528 		}
1529 		had_op = 1;	/* kludge - in case a "def" precedes regex */
1530 		break;
1531 		/*
1532 		 * Class definitions use '<' specially, like a reverse arrow:
1533 		 *      class ClassName < SuperClass
1534 		 *      class << Object
1535 		 *
1536 		 * The first case is not a syntax problem, but the
1537 		 * singleton-class definition conflicts with here-documents.
1538 		 */
1539 	    case eCLASS:
1540 		if ((ok = is_COMMENT(s)) != 0) {
1541 		    Parsed(this_tok, tCOMMENT);
1542 		    s = put_COMMENT(s, ok);
1543 		} else if ((ok = is_BLANK(s)) != 0) {
1544 		    Parsed(this_tok, tBLANK);
1545 		    flt_puts(s, ok, "");
1546 		    s += ok;
1547 		} else if ((ok = is_KEYWORD(s)) != 0) {
1548 		    Parsed(this_tok, tKEYWORD);
1549 		    s = put_KEYWORD(s, ok, &had_op);
1550 		    state = eCODE;
1551 		} else if ((ok = is_ERB(s)) != 0) {
1552 		    Parsed(this_tok, tERB);
1553 		    s = put_ERB(s, ok, &had_op);
1554 		    state = eCODE;
1555 		} else if ((ok = is_OPERATOR(s)) != 0) {
1556 		    Parsed(this_tok, tOPERATOR);
1557 		    s = put_OPERATOR(s, ok, &had_op);
1558 		    state = eCODE;
1559 		} else {
1560 		    flt_putc(*s++);
1561 		    state = eCODE;
1562 		}
1563 		had_op = 1;	/* kludge - in case a "class" precedes regex */
1564 		break;
1565 	    case eCODE:
1566 		if ((last_tok == tKEYWORD || last_tok == tOPERATOR)
1567 		    && (ok = begin_HERE(s)) != 0) {
1568 		    Parsed(this_tok, tHERE);
1569 		    flt_puts(s, ok, String_attr);
1570 		    s += ok;
1571 		    marker = here_tags->value;
1572 		} else if ((in_line < 0 && begin_POD(s + 1))
1573 			   || ((s == the_file && begin_POD(s)))) {
1574 		    DPRINTF(("...POD\n"));
1575 		    state = ePOD;
1576 		    flt_putc(*s++);	/* write the newline */
1577 		    s = put_remainder(s, Comment_attr, 1);
1578 		} else if ((ok = is_COMMENT(s)) != 0) {
1579 		    Parsed(this_tok, tCOMMENT);
1580 		    s = put_COMMENT(s, ok);
1581 		} else if ((ok = is_BLANK(s)) != 0) {
1582 		    Parsed(this_tok, tBLANK);
1583 		    flt_puts(s, ok, "");
1584 		    s += ok;
1585 		} else if ((*s == '%' || had_op)
1586 			   && (ok = is_Regexp(s, &delim)) != 0) {
1587 		    Parsed(this_tok, tREGEXP);
1588 		    s = put_REGEXP(s, ok, delim);
1589 		} else if ((ok = is_CHAR(s, &err)) != 0) {
1590 		    Parsed(this_tok, tCHAR);
1591 		    had_op = 0;
1592 		    if (err) {
1593 			flt_error("not a number: %.*s", ok, s);
1594 			flt_puts(s, ok, Error_attr);
1595 		    } else {
1596 			flt_puts(s, ok, Number_attr);
1597 		    }
1598 		    s += ok;
1599 		} else if ((ok = is_NUMBER(s, &err)) != 0) {
1600 		    Parsed(this_tok, tNUMBER);
1601 		    had_op = 0;
1602 		    if (err) {
1603 			flt_error("not a number: %.*s", ok, s);
1604 			flt_puts(s, ok, Error_attr);
1605 		    } else {
1606 			flt_puts(s, ok, Number_attr);
1607 		    }
1608 		    s += ok;
1609 		} else if ((ok = is_MKEYWORD(s, 1)) != 0) {
1610 		    Parsed(this_tok, tKEYWORD);
1611 		    if (ok == 5 && !strncmp(s, "alias", (size_t) ok))
1612 			state = eALIAS;
1613 		    else if (ok == 5 && !strncmp(s, "class", (size_t) ok))
1614 			state = eCLASS;
1615 		    else if (ok == 3 && !strncmp(s, "def", (size_t) ok))
1616 			state = eDEF;
1617 		    else if (ok == 7 && !strncmp(s, "__END__", (size_t) ok))
1618 			state = eEND;
1619 		    s = put_KEYWORD(s, ok, &had_op);
1620 		} else if ((ok = is_VARIABLE(s)) != 0) {
1621 		    Parsed(this_tok, tVARIABLE);
1622 		    s = put_VARIABLE(s, ok);
1623 		    had_op = 0;
1624 		} else if (ATLEAST(s, (ok = 2))
1625 			   && !strncmp(s, "?\"", (size_t) ok)) {
1626 		    Parsed(this_tok, tVARIABLE);
1627 		    s = put_VARIABLE(s, ok);	/* csv.rb uses it, undocumented */
1628 		    had_op = 0;
1629 		} else if ((ok = is_ERB(s)) != 0) {
1630 		    Parsed(this_tok, tERB);
1631 		    s = put_ERB(s, ok, &had_op);
1632 		    state = eCODE;
1633 		} else if ((ok = is_String(s, &delim, &err)) != 0) {
1634 		    Parsed(this_tok, tSTRING);
1635 		    s = put_String(s, ok, delim, err, &had_op);
1636 		} else if ((ok = is_OPERATOR(s)) != 0) {
1637 		    Parsed(this_tok, tOPERATOR);
1638 		    s = put_OPERATOR(s, ok, &had_op);
1639 		} else {
1640 		    if (!isspace(CharOf(*s)))
1641 			had_op = 0;
1642 		    flt_putc(*s++);
1643 		}
1644 		break;
1645 	    case eHERE:
1646 		if (here_tags == 0) {
1647 		    state = eCODE;
1648 		} else if (end_marker(s + (here_tags->strip
1649 					   ? is_BLANK(s)
1650 					   : 0),
1651 				      marker, 1)) {
1652 		    if ((marker = free_here_tag()) == 0)
1653 			state = eCODE;
1654 		}
1655 		s = put_remainder(s, String_attr,
1656 				  (here_tags
1657 				   ? here_tags->quote
1658 				   : 0));
1659 		break;
1660 	    case ePOD:
1661 		if (end_POD(s))
1662 		    state = eCODE;
1663 		s = put_remainder(s, Comment_attr, 1);
1664 		break;
1665 	    case eEND:
1666 		s = put_remainder(s, Comment_attr, 1);
1667 		break;
1668 	    }
1669 
1670 	    switch (this_tok) {
1671 	    case tNULL:
1672 	    case tBLANK:
1673 	    case tCOMMENT:
1674 		continue;
1675 	    case tCHAR:
1676 	    case tERB:
1677 	    case tHERE:
1678 	    case tNUMBER:
1679 	    case tREGEXP:
1680 	    case tSTRING:
1681 		last_tok = this_tok;
1682 		break;
1683 	    case tKEYWORD:
1684 		last_tok = this_tok;
1685 		break;
1686 	    case tOPERATOR:
1687 		last_tok = this_tok;
1688 		break;
1689 	    case tVARIABLE:
1690 		last_tok = this_tok;
1691 		break;
1692 	    }
1693 	}
1694 	free(the_file);
1695     }
1696     while (here_tags != 0) {
1697 	flt_error("expected tag:%s", here_tags->value);
1698 	(void) free_here_tag();
1699     }
1700 }
1701 
#if NO_LEAKS
/*
 * Cleanup hook, compiled only when NO_LEAKS is enabled.  It is intentionally
 * empty: this filter performs no cleanup here.
 */
static void
free_filter(void)
{
    /* nothing to do */
}
#endif
1708