1#!/usr/bin/awk -f
2
3###############################################################################
4#
5#  FILE:	y2l
6#  DESCRIPTION: Yacc-to-LaTeX grammar processor
7#
8#  Copyright (c) 1994-2000 by Kris Van Hees, Belgium.  All rights reserved.
9#  See the "Artistic License" included in this package (file: "Artistic") for
10#  terms and conditions.
11#
12#  $Id: y2l,v 1.1.1.1 2000/03/07 16:40:33 aedil Exp $
13#
14###############################################################################
15
16#
17# Reserved variables:
18#	_debug		Debug level
19#	_exit		Exiting due to an error
20#	_line		Current line being processed
21#	_spclen		Length of string holding padding spaces
22#	_spcstr		String to hold padding spaces
23#
24
25###############################################################################
26# Support functions
27###############################################################################
28
29#
30#  NAME:	SPACES()
31#  DESCRIPTION: return a given number of spaces
32#
33function SPACES(snum) {
34    if (!_spclen) {				# uninitialized (first use)
35	_spcstr = " ";
36	_spclen = 1;
37    }
38
39    while (_spclen < snum) {			# extend the string of spaces
40	_spcstr = _spcstr _spcstr;		# double the spaces
41	_spclen *= 2;				# update the string length
42    }
43
44     return substr(_spcstr, 1, snum);		# requested number of spaces
45}
46
47#
48#  NAME:	DBG()
49#  DESCRIPTION: write debug output to standard error (if debugging is enabled)
50#
51function DBG(dlvl, dmsg) {
52    if (!_debug)				# debugging is disabled
53	return;
54
55    print "DBG:" SPACES(dlvl * 2 + 1) dmsg			>"/dev/stderr";
56}
57
58#
59#  NAME:	error()
60#  DESCRIPTION: write an error message to standard error
61#
62function error(emsg) {
63    print "ERROR: " emsg					>"/dev/stderr";
64
65    if (_line)
66	print "       Line is (on or about) " _line		>"/dev/stderr";
67
68    _exit = 1;					# mark program as exiting
69    exit;
70}
71
72#
73#  NAME:	exiting()
74#  DESCRIPTION: return whether the program is exiting
75#
76function exiting() {
77    return _exit;				# exit status
78}
79
80#
81#  NAME:	exists()
82#  DESCRIPTION: return whether a file exists
83#
84function exists(fname) {
85    return !system("ls " fname " >/dev/null 2>&1");
86}
87
88###############################################################################
89# Yacc-to-LaTeX initialization
90###############################################################################
91
92#
93#  NAME:	init()
94#  DESCRIPTION: initialization
95#
96function init() {
97    PERC_PERC			= 257;
98    PERC_LACC			= 258;
99    PERC_RACC			= 259;
100    PERC_LEFT			= 260;
101    PERC_ASSC			= 261;
102    PERC_RGHT			= 262;
103    PERC_STRT			= 263;
104    PERC_TOKN			= 264;
105    PERC_TYPE			= 265;
106    PERC_UNIO			= 266;
107    PERC_PREC			= 267;
108
109    CHAR			= 268;
110    IDENT			= 269;
111    STRING			= 270;
112    TYPE			= 271;
113
114    EOF				= -1;
115    BAD				= -2;
116
117    reserved["%%"]		= PERC_PERC;
118    reserved["%{"]		= PERC_LACC;
119    reserved["%}"]		= PERC_RACC;
120    reserved["%left"]		= PERC_LEFT;
121    reserved["%nonassoc"]	= PERC_ASSC;
122    reserved["%right"]		= PERC_RGHT;
123    reserved["%start"]		= PERC_STRT;
124    reserved["%token"]		= PERC_TOKN;
125    reserved["%type"]		= PERC_TYPE;
126    reserved["%union"]		= PERC_UNIO;
127    reserved["%prec"]		= PERC_PREC;
128}
129
130###############################################################################
131# Yacc-to-LaTeX processor functions
132###############################################################################
133
134#
135#  NAME:	token()
136#  DESCRIPTION: verify whether something is a valid token
137#
138function token(tokname) {
139    return tokname ~ /^[_\.A-Za-z][_\.A-Za-z0-9]*$/;
140}
141
142#
143#  NAME:	read()
144#  DESCRIPTION: read another (non-blank) line from the input file
145#
146function read() {
147    while (getline < grammar_file == 1) {
148	_line++;
149
150	if ($1 != "")
151	    return;
152    }
153
154    return grammar_eof = 1;
155}
156
157#
158#  NAME:	skip_ws()
159#  DESCRIPTION: skip a continuous block of whitespace
160#
161function skip_ws() {
162    in_comment = 0;
163
164    while (!grammar_eof) {
165	sub(/^[ \t]+/, "");			# remove leading whitespace
166
167	if ($1 != "") {
168	    if (/^\/\//) {			# C++ comment
169		$0 = "";
170	    } else if (in_comment) {		# in a comment
171		if (match($0, /\*\//)) {	# end of a comment
172		    $0 = substr($0, RSTART + RLENGTH);
173		    in_comment = 0;
174		} else
175		    $0 = "";
176	    } else if (/^\/\*"[^"]+"\*\//) {	# special marker
177		sub(/\/\*"/, "\"");
178		sub(/"\*\//, "\"");
179
180		return;
181	    } else if (/^\/\*/) {		# regular comment
182		$0 = substr($0, 3);
183		in_comment = 1;
184	    } else
185		return;
186
187	    sub(/[ \t]+$/, "");			# remove trailing whitespace
188	}
189
190	if ($1 == "")
191	    read();
192    }
193}
194
195#
196#  NAME:	lex()
197#  DESCRIPTION: read the next token from the input
198#
199function lex() {
200    if (tok_prev) {
201	tok = tok_prev;
202	tok_str = tok_pstr;
203	tok_prev = 0;
204
205	return tok;
206    }
207
208    skip_ws();
209
210    if (grammar_eof)
211	return EOF;
212
213    if (/^%/)
214	if (match($0, /^%(%|{|}|left|nonassoc|right|start|token|type|union)/)) {
215	    tok_str = substr($0, 1, RLENGTH);
216	    $0 = substr($0, RLENGTH + 1);
217
218	    return reserved[tok_str];
219	} else if (match($0, /^%[_A-Za-z][_A-Za-z0-9]*/)) {
220	    tok_str = substr($0, 1, RLENGTH);
221	    $0 = substr($0, RLENGTH + 1);
222
223	    return BAD;
224	}
225
226    if (match($0, /^[_\.A-Za-z][_\.A-zA-z0-9]*/)) {
227	tok_str = substr($0, 1, RLENGTH);
228	$0 = substr($0, RLENGTH + 1);
229
230	return IDENT;
231    }
232
233    if (match($0, /^<[_\.A-Za-z][_\.A-zA-z0-9]*>/)) {
234	tok_str = substr($0, 1, RLENGTH);
235	$0 = substr($0, RLENGTH + 1);
236
237	return TYPE;
238    }
239
240    if (/^'/)
241	if (match($0, /^'(.|\\([tnarfbv\\'"]|[0-7][0-7]*))'/)) {
242	    tok_str = "@C" (consts++);
243	    transtab[tok_str] = "\"" substr($0, 2, RLENGTH - 2) "\"";
244	    $0 = substr($0, RLENGTH + 1);
245
246	    return CHAR;
247	} else
248	    error("Excuse me, but this character constant is a bit weird.");
249
250    if (/^"/)
251	if (match($0, /([^\\]|[ \t]+)(\\\\)*"/)) {
252	    tok_str = "@S" (strings++);
253	    transtab[tok_str] = substr($0, 1, RSTART + RLENGTH - 1);
254	    $0 = substr($0, RSTART + RLENGTH);
255
256	    return STRING;
257	} else
258	    error("Newlines in strings are interesting, but not allowed.");
259
260    tok_str = substr($0, 1, 1);
261    $0 = substr($0, 2);
262
263    return tok_str;
264}
265
266#
267#  NAME:	unlex()
268#  DESCRIPTION: return a token to the input stream
269#
270function unlex(tok) {
271    tok_prev = tok;
272    tok_pstr = tok_str;
273}
274
275#
276#  NAME:	skip_definition()
277#  DESCRIPTION: skip the contents of a %{ ... %} block
278#
279function skip_definition() {
280    do {
281	skip = lex();
282    } while (skip != PERC_RACC && skip != EOF);
283}
284
285#
286#  NAME:	decl_token()
287#  DESCRIPTION: read a token declaration
288#
289function decl_token() {
290    first = 1;
291
292    do {
293	tok = lex();
294
295	if (tok == ",") {
296	    symbol = 0;
297	} else if (tok == CHAR) {
298	    DBG(1, transtab[tok_str] ": No need to remember this token.");
299	} else if (tok == IDENT) {
300	    if (_tkpat && tok_str !~ _tkpat) {
301		if (transtab[tok_str])
302		    DBG(2, "WARNING: Redefining '" tok_str "'.");
303
304		transtab[tok_str] = "\"" tolower(tok_str) "\"";
305		DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
306	    }
307
308	    symbol = tok_str;
309	} else if (tok == NUMBER) {
310	    if (!symbol)
311		error("How about giving a token first?");
312
313	    symbol = 0;
314	} else if (tok == STRING) {
315	    if (!symbol)
316		error("How about giving a token first?");
317
318	    str = transtab[tok_str];
319	    transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
320	    DBG(1, SPACES(length(symbol) + 2)				      \
321		   "Defined as " transtab[symbol] ".");
322
323	    symbol = 0;
324	} else if (tok == TYPE) {
325	    if (!first)
326		error("This is no place for a type name.");
327	} else {
328	    unlex(tok);
329	    return;
330	}
331
332	first = 0;
333    } while (tok != EOF);
334}
335
336#
337#  NAME:	decl_start()
338#  DESCRIPTION: read a start rule declaration
339#
340function decl_start() {
341    if (grammar_start)
342	error("Hm, you want the grammar to start with two rules?");
343    else if (lex() == IDENT) {
344	grammar_start = tok_str;
345	DBG(2, "The grammar start rule is '" grammar_start "'.");
346    } else
347	error("How about a nice juicy identifier?");
348}
349
350#
351#  NAME:	decl_type()
352#  DESCRIPTION: read a type declaration
353#
354function decl_type() {
355    if (lex() != TYPE)
356	error("So where is the typename?");
357
358    do {
359	tok = lex();
360
361	if (tok == ",") {
362	    symbol = 0;
363	} else if (tok == CHAR) {
364	    error("Bison etc may accept literals in a %type declaration, "    \
365		  "but the Unix 7th\n       Ed manual clearly indicates "     \
366		  "that it is NOT legal.  And I think that the\n       Bell " \
367		  "Labs guys know what they are talking about; but anyway, "  \
368		  "do you\n       really spend the time reading this long "   \
369		  "error message?");
370	} else if (tok == IDENT) {
371	    if (_tkpat && tok_str !~ _tkpat) {
372		if (transtab[tok_str])
373		    DBG(2, "WARNING: Redefining '" tok_str "'.");
374
375		transtab[tok_str] = "\"" tolower(tok_str) "\"";
376		DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
377	    }
378
379	    symbol = tok_str;
380	} else if (tok == NUMBER) {
381	    if (!symbol)
382		error("How about giving a token first?");
383
384	    symbol = 0;
385	} else if (tok == STRING) {
386	    if (!symbol)
387		error("How about giving a token first?");
388
389	    str = transtab[tok_str];
390	    transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
391	    DBG(1, SPACES(length(symbol) + 2)				      \
392		   "Defined as " transtab[symbol] ".");
393
394	    symbol = 0;
395	} else {
396	    unlex(tok);
397	    return;
398	}
399    } while (tok != EOF);
400}
401
402#
403#  NAME:	decl_union()
404#  DESCRIPTION: read a union declaration
405#
406function decl_union() {
407    if (grammar_union)
408	error("How about sticking to one single union declaration?");
409
410    grammar_union = 1;
411    DBG(2, "Extended types have been registered with a union declaration.");
412
413    do {
414	tok = lex();
415
416	if (tok == "{")
417	    block++;
418	else if (tok == "}") {
419	    if (!block)
420		error("Why close an unopened block?");
421	    if (--block <= 0)
422		return;
423	} else if (tok == EOF)
424	    error("The file ends before the union is finished.  How rude!");
425    } while (1);
426}
427
428#
429#  NAME:	read_declarations()
430#  DESCRIPTION: read the yacc declarations
431#
432function read_declarations() {
433    do {
434	tok = lex();				# next token
435
436	if (tok == PERC_PERC || tok == EOF)	# end of the declarations
437	    return;
438	if (tok == PERC_LACC) {			# definition block
439	    skip_definition();
440	} else if (tok == PERC_LEFT) {		# left associative declaration
441	    decl_token();
442	} else if (tok == PERC_ASSC) {		# non-associative declaration
443	    decl_token();
444	} else if (tok == PERC_RGHT) {		# right associative declaration
445	    decl_token();
446	} else if (tok == PERC_STRT) {		# start rule declaration
447	    decl_start();
448	} else if (tok == PERC_TOKN) {		# token declaration(s)
449	    decl_token();
450	} else if (tok == PERC_TYPE) {		# type declaration
451	    decl_type();
452	} else if (tok == PERC_UNIO) {		# union declaration
453	    decl_union();
454	} else
455	    DBG(2, "WARNING: Ignoring the unknown token '" tok_str "'.");
456    } while (1);
457}
458
459#
460#  NAME:	skip_action()
461#  DESCRIPTION: skip the contents of an action block
462#
463function skip_action() {
464    block = 1;
465
466    do {
467	tok = lex();
468
469	if (tok == "{")
470	    block++;
471	else if (tok == "}") {
472	    if (!block)
473		error("Why close an unopened block?");
474	    if (--block <= 0)
475		return;
476	} else if (tok == EOF)
477	    error("The file ends before the action is finished.  How rude!");
478    } while (tok != EOF);
479}
480
481#
482#  NAME:	read_grammar()
483#  DESCRIPTION: read the yacc grammar
484#
485function read_grammar() {
486    tok = lex();
487
488    do {
489	if (tok == PERC_PERC || tok == EOF)	# end of the grammar
490	    return;
491
492	if (tok == IDENT) {			# rule begins here
493	    if (!(rule_idx = rule_ref[tok_str])) {
494		rule_idx = ++rule_cnt;		# new rule
495		rule_lhs[rule_idx] = tok_str;
496		rule_ref[tok_str] = rule_idx;
497		rule_sub = ++rule_len[rule_idx];
498
499		if (!grammar_start)
500		    grammar_start = tok_str;
501	    }
502
503	    if (lex() != ":")
504		error("The LHS and RHS would like to be separated by a colon.");
505	} else if (tok == "|") {		# alternative RHS for a rule
506	    if (!rule_cnt)
507		error("The grammar shouldn't start with a |.");
508
509	    rule_sub = ++rule_len[rule_idx];
510	} else
511	    error("You could at least give a valid token.");
512
513	rule_rhs[rule_cnt] = "";		# empty RHS
514
515	do {					# read the RHS
516	    tok = lex();
517
518	    if (tok == PERC_PREC) {		# %prec
519		tok = lex();
520
521		if (tok != IDENT && tok != CHAR)
522		    error("What precedence are you talking about?");
523
524		tok = lex();			# continue with the rest
525	    }
526
527	    if (tok == IDENT) {			# might be a new rule
528		old_str = tok_str;
529		nxt = lex();			# look ahead
530		unlex(nxt);
531		tok_str = old_str;
532
533		if (nxt == ":")			# yup, a new rule started
534		    break;
535
536		rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub]   \
537					       " " tok_str;
538	    } else if (tok == CHAR) {
539		if (tok_str ~ /^@/)
540		    tok_str = transtab[tok_str];
541
542		rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub]   \
543					       " " tok_str;
544	    } else if (tok == "{") {		# an action block
545		skip_action();
546	    } else				# can't be part of a rule
547		break;
548	} while (1);
549
550	sub(/^ /, "", rule_rhs[rule_idx, rule_sub]);
551
552	if (rule_rhs[rule_idx, rule_sub] ~				      \
553	    /(^|[^_\.A-Za-z0-9])error($|[^_\.A-Za-z0-9])/) {
554	    DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub]	      \
555		   " [IGNORED]");
556
557	    rule_rhs[rule_idx, rule_sub] = "";
558	    rule_len[rule_idx]--;
559	} else
560	    DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub]);
561
562	if (tok == ";")
563	    tok = lex();
564    } while (1);
565}
566
567#
568#  NAME:	is_optional()
569#  DESCRIPTION: check whether the given non-terminal is optional
570#
571function is_optional(idx) {
572    if ((len = rule_len[idx]) <= 1)
573	return 0;
574
575    #
576    # One or more empty rules for a non-terminal indicate that the other  rules
577    # are in fact optional.  There shouldn't be multiple empty rules, but it is
578    # is technically possible.
579    #
580    for (rule_sub = 1; rule_sub <= len; rule_sub++)
581	if (rule_rhs[idx, rule_sub] == "") {
582	    while (++rule_sub <= len)
583		rule_rhs[idx, rule_sub - 1] = rule_rhs[idx, rule_sub];
584
585	    rule_len[idx]--;
586	}
587
588    return rule_len[idx] < len;
589}
590
591#
592#  NAME:	get_prefix()
593#  DESCRIPTION: check whether the given non-terminal has a common prefix
594#
595function get_prefix(idx) {
596    if ((len = rule_len[idx]) <= 1)
597	return 0;
598
599    #
600    # Split up the first rule into tokens.  These will be compared with all the
601    # other rules for this non-terminal.
602    #
603    gp_last = split(rule_rhs[idx, 1], arr);
604    gp_pref = gp_last;
605
606    #
607    # Look for the longest common prefix.
608    #
609    for (rule_sub = 2; rule_sub <= len; rule_sub++) {
610	$0 = rule_rhs[idx, rule_sub];
611	gp_tokc = NF;
612
613	if (gp_tokc < gp_pref)
614	    gp_pref = gp_tokc;
615
616	for (j = 1; j <= gp_pref; j++)
617	    if (arr[j] != $j) {
618		if (!(gp_pref = j - 1))
619		    return 0;
620
621		break;
622	    }
623    }
624
625    #
626    # Construct the prefix string.
627    #
628    gp_pstr = arr[1];
629    for (j = 2; j <= gp_pref; j++)
630	gp_pstr = gp_pstr " " arr[j];
631
632    #
633    # Remove the common prefix from all rules for this non-terminal.
634    #
635    for (rule_sub = 1; rule_sub <= len; rule_sub++) {
636	$0 = rule_rhs[idx, rule_sub];
637
638	for (j = 1; j <= gp_pref; j++)
639	    $j = "";
640
641	sub(/^ +/, "");
642	rule_rhs[idx, rule_sub] = $0;
643    }
644
645    return gp_pstr;
646}
647
648#
649#  NAME:	get_suffix()
650#  DESCRIPTION: check whether the given non-terminal has a common suffix
651#
652function get_suffix(idx) {
653    if ((len = rule_len[idx]) <= 1)
654	return 0;
655
656    #
657    # Split up the first rule into tokens.  These will be compared with all the
658    # other rules for this non-terminal.
659    #
660    gs_last = split(rule_rhs[idx, 1], arr);
661    gs_suff = gs_last;
662
663    #
664    # Look for the longest common suffix.
665    #
666    for (rule_sub = 2; rule_sub <= len; rule_sub++) {
667	$0 = rule_rhs[idx, rule_sub];
668	gs_tokc = NF;
669
670	if (gs_tokc < gs_suff)
671	    gs_suff = gs_tokc;
672
673	for (j = 0; j < gs_suff; j++)
674	    if (arr[gs_last - j] != $(gs_tokc - j)) {
675		if (!(gs_suff = j))
676		    return 0;
677
678		break;
679	    }
680    }
681
682    #
683    # Construct the suffix string.
684    #
685    gs_sstr = arr[gs_last];
686    for (j = 1; j < gs_suff; j++)
687	gs_sstr = arr[gs_last - j] " " gs_sstr;
688
689    #
690    # Remove the common suffix from all rules for this non-terminal.
691    #
692    for (rule_sub = 1; rule_sub <= len; rule_sub++) {
693	$0 = rule_rhs[idx, rule_sub];
694
695	for (j = 0; j < gs_suff; j++)
696	    $(NF - j) = "";
697
698	sub(/ +$/, "");
699	rule_rhs[idx, rule_sub] = $0;
700    }
701
702    return gs_sstr;
703}
704
705#
706#  NAME:	optimize1()
707#  DESCRIPTION: first pass of the optimizer
708#
709function optimize1() {
710    DBG(0, "Optimization pass 1...");
711
712    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
713	#
714	# Non-terminals with only a single rule can't be optimized here.
715	#
716	if ((len = rule_len[rule_idx]) <= 1)
717	    continue;
718
719	#
720	# First record whether the entire non-terminal might be optional.
721	#
722	rule_opt[rule_idx] = is_optional(rule_idx);
723
724	#
725	# The actual optimization takes place in this endless loop.  It will in
726	# fact end when an iteration does not yield an optional ruleset.   This
727	# is a proper and correct stop criteria, since a  non-optional  ruleset
728	# cannot have any common prefix or suffix.  If it did, those would have
729	# been added to the already present prefix or suffix.
730	#
731	pref = "";
732	suff = "";
733	do {
734	    if (pstr = get_prefix(rule_idx))
735		pref = pref " " pstr;
736	    else if (sstr = get_suffix(rule_idx))
737		suff = sstr " " suff;
738
739	    if (is_optional(rule_idx)) {
740		pref = pref " [";
741		suff = "] " suff;
742	    } else {
743		if (pstr || sstr) {
744		    pref = pref " (";
745		    suff = ") " suff;
746		}
747
748		break;
749	    }
750	} while (1);
751
752	#
753	# Compose the single composite rule for this non-terminal, if a  common
754	# prefix or suffix was found.
755	#
756	if (pref != "" || suff != "") {
757	    sub(/^ /, "", pref);
758	    sub(/ $/, "", suff);
759
760	    DBG(2, "Rules for '" rule_lhs[rule_idx] "' have:");
761
762	    if (pref != "")
763		DBG(3, "Prefix '" pref "'");
764	    if (suff != "")
765		DBG(3, "Suffix '" suff "'");
766
767	    len = rule_len[rule_idx];
768	    pref = pref " " rule_rhs[rule_idx, 1];
769
770	    for (rule_sub = 2; rule_sub <= len; rule_sub++)
771		pref = pref " | " rule_rhs[rule_idx, rule_sub];
772
773	    rule_rhs[rule_idx, 1] = pref " " suff;
774	    rule_len[rule_idx] = 1;
775
776	    DBG(3, "Combined rule '" rule_rhs[rule_idx, 1] "'");
777
778	    if (rule_opt[rule_idx])
779		DBG(3, "(The non-terminal is optional.)");
780	}
781    }
782}
783
784#
785#  NAME:	optimize2()
786#  DESCRIPTION: second pass of the optimizer
787#
788function optimize2() {
789    DBG(0, "Optimization pass 2...");
790
791    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
792	$0 = rule_rhs[rule_idx, 1];
793
794	if ((len = rule_len[rule_idx]) > 1)
795	    for (rule_sub = 2; rule_sub <= len; rule_sub++)
796		$0 = $0 " | " rule_rhs[rule_idx, rule_sub];
797
798	if ($1 != "[" && rule_opt[rule_idx]) {
799	    $0 = "[ " $0 " ]";
800	    rule_opt[rule_idx] = 0;
801	}
802
803	if ($1 == "[")
804	    if ($2 == rule_lhs[rule_idx]) {
805		for (i = NF; i > 0; i--)
806		    if ($i == "]")
807			break;
808
809		if ((NF - i) < 3) {
810		    $1 = "";
811		    subst = "{";
812		    $i = "}";
813
814		    while (++i <= NF)
815			subst = subst " " $i;
816
817		    $2 = subst;
818
819		    sub(/^ +/, "");
820		}
821	    }
822
823	if ($NF == "]")
824	    if ($(NF - 1) == rule_lhs[rule_idx]) {
825		for (i = 1; i <= NF; i++)
826		    if ($i == "[")
827			break;
828
829		if (i < 3) {
830		    $i = "{";
831		    subst = "}";
832		    $NF = "";
833
834		    while (--i > 0)
835			subst = $i " " subst;
836
837		    $(NF - 1) = subst;
838
839		    sub(/ +$/, "");
840		}
841	    }
842
843	if (rule_opt[rule_idx]) {
844	    $0 = "[ " $0 " ]";
845	    rule_opt[rule_idx] = 0;
846	}
847
848	rule_rhs[rule_idx, 1] = $0;
849	rule_len[rule_idx] = 1;
850    }
851}
852
853#
854#  NAME:	latexify()
855#  DESCRIPTION: make substitutions to ensure that the string is LaTeX ok
856#
857function latexify(txt) {
858    gsub(/[{&%}]/, "$\\\\&$", txt);
859    gsub(/[!|[\]=+\-*\/<>]/, "$&$", txt);
860    gsub(/\$\$/, "\\!", txt);
861    gsub(/"[^"]+"/, "``{\\bf &}''", txt);
862    gsub(/_/, "\\_", txt);
863    gsub(/\^/, "\\verb+^+", txt);
864    gsub(/"/, "", txt);
865
866    return txt;
867}
868
869#
870#  NAME:	break_string()
871#  DESCRIPTION: possibly break up a string in multiple lines
872#
873function break_string(str) {
874    match(str, /^([_A-Za-z][_A-Za-z0-9]* +| +)(::=|\|)/);
875    bs_len = RLENGTH;
876    bs_str = substr(str, 1, RLENGTH);
877    bs_pln = RLENGTH;
878    bs_pre = SPACES(RLENGTH);
879
880    $0 = substr(str, RLENGTH + 2);
881    for (i = 1; i <= NF; i++) {
882	if (bs_len + 1 + length($i) > 79) {
883	    bs_str = bs_str "\n" bs_pre;
884	    bs_len = bs_pln;
885	}
886
887	bs_str = bs_str " " $i;
888	bs_len += 1 + length($i);
889   }
890
891    return bs_str;
892}
893
894#
895#  NAME:	output()
896#  DESCRIPTION: write out the rewritten rules
897#
898function output(LaTeX) {
899    rule_use[grammar_start] = 1;
900
901    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
902	len = rule_len[rule_idx];
903
904	if (rule_opt[rule_idx]) {
905	    rule_rhs[rule_idx, 1] = "[ "  rule_rhs[rule_idx, 1];
906	    rule_rhs[rule_idx, len] = rule_rhs[rule_idx, len] " ]";
907	}
908
909	str = LaTeX ? latexify(rule_lhs[rule_idx])			      \
910		    : rule_lhs[rule_idx];
911
912	if (length(str) > rule_max) {
913	    rule_max = length(str);
914	    rule_mls = str;
915	}
916
917	for (rule_sub = 1; rule_sub <= len; rule_sub++) {
918	    $0 = rule_rhs[rule_idx, rule_sub];
919
920	    for (j = 1; j <= NF; j++) {
921		#
922		# A couple of notes here...  Non-terminals that merely  have  a
923		# single terminal as definition are substituted  anywhere  they
924		# are used.  And some special casing is needed  for  situations
925		# where someone actually defines the non-terminals  as  tokens.
926		# Those should definitely not be quotated.
927		#
928		if ((idx = rule_ref[$j]) &&
929		    rule_len[$j] == 1 && rule_rhs[idx, 1] ~ /^[^ ]+$/)
930		    $j = rule_rhs[idx, 1];
931		else if ((str = transtab[$j]) && !rule_ref[$j])
932		    $j = str;
933
934		rule_use[$j] = 1;
935	    }
936
937	    rule_rhs[rule_idx, rule_sub] = $0;
938	}
939    }
940
941    if (LaTeX)
942	#
943	# Some LaTeX magic...  We calculate the available size for the  RHS  of
944	# the rules.  This will be provided as argument to the  minipages  used
945	# for each independent rule.
946	#
947	# The formula is fairly easy:
948	#	textwidth - width(LHS) - width("::=") - 6 * tabcolsep
949	#
950	print "\\newlength\\rulelhs\n"					      \
951	      "\\newlength\\rulemid\n"					      \
952	      "\\newlength\\rulerhs\n"					      \
953	      "\\settowidth\\rulelhs{" rule_mls "}\n"			      \
954	      "\\settowidth\\rulemid{::=}\n"				      \
955	      "\\setlength\\rulerhs{\\textwidth}\n"			      \
956	      "\\addtolength\\rulerhs{-\\rulelhs}\n"			      \
957	      "\\addtolength\\rulerhs{-\\rulemid}\n"			      \
958	      "\\addtolength\\rulerhs{-6\\tabcolsep}\n\n"		      \
959	      "\\begin{longtable}{lrl}";
960
961    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
962	if (!rule_use[rule_lhs[rule_idx]])
963	    continue;
964
965	len = rule_len[rule_idx];
966
967	for (rule_sub = 1; rule_sub <= len; rule_sub++) {
968	    if (LaTeX) {
969		str = latexify(rule_lhs[rule_idx]);
970		out = str SPACES(rule_max - length(str)) " & ::= &\n"	      \
971		      "  \\begin{minipage}[t]{\\rulerhs}\n"		      \
972		      "    \\raggedright\n    ";
973	    } else {
974		str = rule_lhs[rule_idx];
975		out = str SPACES(rule_max - length(str)) " ::= ";
976	    }
977
978	    rhs = rule_rhs[rule_idx, rule_sub];
979	    lvl = 0;
980
981	    while (match(rhs, /(^[\(\{\[] | [\(\{\[\|\]\}\)] | [\]\}\)]$)/)) {
982		o_beg = RSTART;
983		o_len = RLENGTH;
984
985		if (substr(rhs, o_beg) ~ /(^[\(\{\[] | [\(\{\[] )/) {
986		    lvl++;
987
988		    str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
989				: substr(rhs, 1, o_beg + o_len - 1);
990
991		    out = out str;
992		    rhs = substr(rhs, o_beg + o_len);
993		} else if (substr(rhs, o_beg) ~ /( [\]\}\)] | [\]\}\)]$)/) {
994		    lvl--;
995
996		    str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
997				: substr(rhs, 1, o_beg + o_len - 1);
998
999		    out = out str;
1000		    rhs = substr(rhs, o_beg + o_len);
1001		} else if (substr(rhs, o_beg + 1, 1) == "|") {
1002		    if (lvl) {
1003			out = out substr(rhs, 1, o_beg + o_len - 1);
1004			rhs = substr(rhs, o_beg + o_len);
1005		    } else {
1006			if (LaTeX) {
1007			    str = latexify(substr(rhs, 1, o_beg - 1));
1008
1009			    print out str				      \
1010				  SPACES(64 - rule_max - length(str)) "\n"    \
1011				  "  \\end{minipage}" SPACES(60) " \\\\";
1012
1013			    out = SPACES(rule_max) " & $|$ &\n"		      \
1014				  "  \\begin{minipage}[t]{\\rulerhs}\n"	      \
1015				  "    \\raggedright\n    ";
1016			} else {
1017			    print break_string(out substr(rhs, 1, o_beg - 1));
1018
1019			    out = SPACES(rule_max + 1) "  | ";
1020			}
1021
1022			rhs = substr(rhs, o_beg + o_len);
1023		    }
1024		}
1025	    }
1026
1027	    if (LaTeX) {
1028		str = latexify(rhs);
1029
1030		print out str SPACES(76 - length(out) - length(str)) "\n"     \
1031		      "  \\end{minipage}" SPACES(60) " \\\\";
1032	    } else
1033		print break_string(out rhs);
1034	}
1035    }
1036
1037    if (LaTeX)
1038	print "\\end{longtable}";
1039}
1040
1041#
1042#  NAME:	process()
1043#  DESCRIPTION: process a given yacc grammar definition file
1044#
1045function process(grammar_file) {
1046    if (!exists(grammar_file))
1047	error("So where exactly is this file?");
1048
1049    DBG(0, "Reading grammar from '" grammar_file "'.");
1050
1051    read_declarations();
1052    read_grammar();
1053
1054    if (_optim >= 1)
1055	optimize1();
1056    if (_optim >= 2)
1057	optimize2();
1058
1059    output(!_plain);
1060}
1061
1062#
1063#  NAME:	load_tokens()
1064#  DESCRIPTION: load a list of literal tokens from a file
1065#
1066function load_tokens(file) {
1067    while (getline < file == 1)
1068	transtab[$1] = "\"" $2 "\"";
1069
1070    close(file);
1071}
1072
1073#
1074#  NAME:	give_help()
1075#  DISCRIPTION: offer some help to the poor user
1076#
1077function give_help() {
1078    print "Usage: y2l -- [options] yacc-file\n"				      \
1079	  "Options:\n"							      \
1080	  "    -d\n"							      \
1081	  "\tWrite debugging output to the standard error stream.  One  can\n"\
1082	  "\tsupply a numeric argument to this option to set an indentation\n"\
1083	  "\tlevel for the messages.\n"					      \
1084	  "    -h\n"							      \
1085	  "\tDisplay this help information.\n"				      \
1086	  "    -O, -O[012]\n"u						      \
1087	  "\tOptimize the grammar to get more typical EBNF.  If no argument\n"\
1088	  "\tis provided, the highest optimization level is selected.\n"      \
1089	  "    -p\n"							      \
1090	  "\tGive basic ASCII output rather than LaTeX output.\n"	      \
1091	  "    -t/regexp/, -tfile\n"					      \
1092	  "\tIn the first form, the regular expression is used  to  decide\n" \
1093	  "\twhether a terminal in the grammar is a real token or rather a\n" \
1094	  "\tliteral.  The second variant specifies a file which  contains\n" \
1095	  "\tlines listing a token and the literal it represents.\n"	      \
1096	  "    -v\n"							      \
1097	  "\tPrint out version information.";
1098    exit;
1099}
1100
1101#
1102#  All processing is done from the BEGIN block.  The  one  and  only  mandatory
1103#  argument to the program is passed on to the grammar processor.
1104#
1105BEGIN {
1106    _debug = 0;					# no debugging
1107    _optim = 0;					# no optimization
1108    _plain = 0;					# no plain output
1109    _tkpat = 0;					# no token pattern
1110
1111    _bannr = "Yacc-to-LaTeX grammar processor v1.0";
1112
1113    for (i = 1; i < ARGC; i++)			# loop over all arguments
1114	if (ARGV[i] ~ /^-d$/)			# debugging
1115	    _debug = 1;
1116	else if (ARGV[i] ~ /^-h$/)		# help the user
1117	    give_help();
1118	else if (ARGV[i] ~ /^-O([012]|$)/)	# optimization level
1119	    if (length(ARGV[i]) > 2)
1120		_optim = substr(ARGV[i], 3, 1);
1121	    else
1122		_optim = 1;			# default optimization level
1123	else if (ARGV[i] ~ /^-p$/)		# non-LaTeX output
1124	    _plain = 1;
1125	else if (ARGV[i] ~ /^-t/)		# regexp or file for tokens
1126	    if (ARGV[i] ~ /^-t(\/\^|\/)[_\.A-Za-z][_\.A-zA-z0-9]*(\/|\$\/)$/) {
1127		l = length(ARGV[i]);
1128		_tkpat = substr(ARGV[i], 4, l - 4);
1129	    } else if (exists(file = substr(ARGV[i], 3)))
1130		load_tokens(file);
1131	    else
1132		error("So where is the token regexp pattern or file?");
1133	else if (ARGV[i] ~ /^-v$/)		# version
1134	    print _bannr					>"/dev/stderr";
1135	else if (ARGV[i] ~ /^-/)		# unknown option
1136	    error("Am I supposed to do something with '" ARGV[i] "'?");
1137	else if (grammar_file)			# more than one grammar
1138	    error("How about just processing one grammar at a time?");
1139	else
1140	    grammar_file = ARGV[i];		# name of the grammar file
1141
1142    if (!grammar_file)				# no grammar file provided
1143	error("Any particular grammar you have in mind?");
1144
1145    init();					# initialization
1146
1147    process(grammar_file)			# process the given grammar
1148
1149    exit;
1150}
1151