#!/usr/bin/awk -f

###############################################################################
#
# FILE: y2l
# DESCRIPTION: Yacc-to-LaTeX grammar processor
#
# Copyright (c) 1994-2000 by Kris Van Hees, Belgium.  All rights reserved.
# See the "Artistic License" included in this package (file: "Artistic") for
# terms and conditions.
#
# $Id: y2l,v 1.1.1.1 2000/03/07 16:40:33 aedil Exp $
#
###############################################################################

#
# Reserved variables:
#	_debug		Debug level
#	_exit		Exiting due to an error
#	_line		Current line being processed
#	_spclen		Length of string holding padding spaces
#	_spcstr		String to hold padding spaces
#

###############################################################################
# Support functions
###############################################################################

#
# NAME: SPACES()
# DESCRIPTION: return a given number of spaces
#
function SPACES(snum) {
    if (!_spclen) {				# uninitialized (first use)
	_spcstr = " ";
	_spclen = 1;
    }

    while (_spclen < snum) {			# extend the string of spaces
	_spcstr = _spcstr _spcstr;		# double the spaces
	_spclen *= 2;				# update the string length
    }

    return substr(_spcstr, 1, snum);		# requested number of spaces
}

#
# NAME: DBG()
# DESCRIPTION: write debug output to standard error (if debugging is enabled)
#
function DBG(dlvl, dmsg) {
    if (!_debug)				# debugging is disabled
	return;

    print "DBG:" SPACES(dlvl * 2 + 1) dmsg >"/dev/stderr";
}

#
# NAME: error()
# DESCRIPTION: write an error message to standard error
#
function error(emsg) {
    print "ERROR: " emsg >"/dev/stderr";

    if (_line)
	print "       Line is (on or about) " _line >"/dev/stderr";

    _exit = 1;					# mark program as exiting
    exit;
}

#
# NAME: exiting()
# DESCRIPTION: return whether the program is exiting
#
function exiting() {
    return _exit;				# exit status
}

#
# NAME: exists()
# DESCRIPTION: return whether a file exists
#
function exists(fname) {
    return !system("ls " fname " >/dev/null 2>&1");
}

###############################################################################
# Yacc-to-LaTeX initialization
###############################################################################

#
# NAME: init()
# DESCRIPTION: initialization of the token codes and the reserved word table
#
function init() {
    PERC_PERC = 257;
    PERC_LACC = 258;
    PERC_RACC = 259;
    PERC_LEFT = 260;
    PERC_ASSC = 261;
    PERC_RGHT = 262;
    PERC_STRT = 263;
    PERC_TOKN = 264;
    PERC_TYPE = 265;
    PERC_UNIO = 266;
    PERC_PREC = 267;

    CHAR = 268;
    IDENT = 269;
    STRING = 270;
    TYPE = 271;
    NUMBER = 272;				# FIX: was referenced but never
						# defined (nor produced by lex)

    EOF = -1;
    BAD = -2;

    reserved["%%"] = PERC_PERC;
    reserved["%{"] = PERC_LACC;
    reserved["%}"] = PERC_RACC;
    reserved["%left"] = PERC_LEFT;
    reserved["%nonassoc"] = PERC_ASSC;
    reserved["%right"] = PERC_RGHT;
    reserved["%start"] = PERC_STRT;
    reserved["%token"] = PERC_TOKN;
    reserved["%type"] = PERC_TYPE;
    reserved["%union"] = PERC_UNIO;
    reserved["%prec"] = PERC_PREC;
}

###############################################################################
# Yacc-to-LaTeX processor functions
###############################################################################

#
# NAME: token()
# DESCRIPTION: verify whether something is a valid token
#
function token(tokname) {
    return tokname ~ /^[_\.A-Za-z][_\.A-Za-z0-9]*$/;
}

#
# NAME: read()
# DESCRIPTION: read another (non-blank) line from the input file
#
function read() {
    while (getline < grammar_file == 1) {
	_line++;

	if ($1 != "")
	    return;
    }

    return grammar_eof = 1;			# end of the grammar file
}

#
# NAME: skip_ws()
# DESCRIPTION: skip a continuous block of whitespace (and comments)
#
function skip_ws() {
    in_comment = 0;

    while (!grammar_eof) {
	sub(/^[ \t]+/, "");			# remove leading whitespace

	if ($1 != "") {
	    #
	    # FIX: the in_comment test must come first; a '//' inside a
	    # C-style comment must not cause the closing '*/' on the same
	    # line to be missed.
	    #
	    if (in_comment) {			# in a comment
		if (match($0, /\*\//)) {	# end of a comment
		    $0 = substr($0, RSTART + RLENGTH);
		    in_comment = 0;
		} else
		    $0 = "";
	    } else if (/^\/\//) {		# C++ comment
		$0 = "";
	    } else if (/^\/\*"[^"]+"\*\//) {	# special marker
		sub(/\/\*"/, "\"");
		sub(/"\*\//, "\"");

		return;
	    } else if (/^\/\*/) {		# regular comment
		$0 = substr($0, 3);
		in_comment = 1;
	    } else
		return;

	    sub(/[ \t]+$/, "");			# remove trailing whitespace
	}

	if ($1 == "")
	    read();
    }
}

#
# NAME: lex()
# DESCRIPTION: read the next token from the input
#
function lex() {
    if (tok_prev) {				# a token was pushed back
	tok = tok_prev;
	tok_str = tok_pstr;
	tok_prev = 0;

	return tok;
    }

    skip_ws();

    if (grammar_eof)
	return EOF;

    if (/^%/) {
	# FIX: 'prec' was missing from the reserved word pattern, so %prec
	# was returned as BAD and never matched PERC_PREC in read_grammar().
	if (match($0,
		  /^%(%|{|}|prec|left|nonassoc|right|start|token|type|union)/)) {
	    tok_str = substr($0, 1, RLENGTH);
	    $0 = substr($0, RLENGTH + 1);

	    return reserved[tok_str];
	} else if (match($0, /^%[_A-Za-z][_A-Za-z0-9]*/)) {
	    tok_str = substr($0, 1, RLENGTH);
	    $0 = substr($0, RLENGTH + 1);

	    return BAD;
	}
    }

    # FIX: '[_\.A-zA-z0-9]' (here and in the TYPE rule below) was a typo for
    # '[_\.A-Za-z0-9]'; the 'A-z' range wrongly matched '[', ']', '^', etc.
    if (match($0, /^[_\.A-Za-z][_\.A-Za-z0-9]*/)) {
	tok_str = substr($0, 1, RLENGTH);
	$0 = substr($0, RLENGTH + 1);

	return IDENT;
    }

    if (match($0, /^<[_\.A-Za-z][_\.A-Za-z0-9]*>/)) {
	tok_str = substr($0, 1, RLENGTH);
	$0 = substr($0, RLENGTH + 1);

	return TYPE;
    }

    # FIX: recognize numeric literals (used in e.g. '%token NAME 300'); the
    # NUMBER branches in decl_token()/decl_type() were unreachable without it.
    if (match($0, /^[0-9]+/)) {
	tok_str = substr($0, 1, RLENGTH);
	$0 = substr($0, RLENGTH + 1);

	return NUMBER;
    }

    if (/^'/) {
	if (match($0, /^'(.|\\([tnarfbv\\'"]|[0-7][0-7]*))'/)) {
	    tok_str = "@C" (consts++);
	    transtab[tok_str] = "\"" substr($0, 2, RLENGTH - 2) "\"";
	    $0 = substr($0, RLENGTH + 1);

	    return CHAR;
	} else
	    error("Excuse me, but this character constant is a bit weird.");
    }

    if (/^"/) {
	if (match($0, /([^\\]|[ \t]+)(\\\\)*"/)) {
	    tok_str = "@S" (strings++);
	    transtab[tok_str] = substr($0, 1, RSTART + RLENGTH - 1);
	    $0 = substr($0, RSTART + RLENGTH);

	    return STRING;
	} else
	    error("Newlines in strings are interesting, but not allowed.");
    }

    tok_str = substr($0, 1, 1);			# single character token
    $0 = substr($0, 2);

    return tok_str;
}

#
# NAME: unlex()
# DESCRIPTION: return a token to the input stream
#
function unlex(tok) {
    tok_prev = tok;
    tok_pstr = tok_str;
}

#
# NAME: skip_definition()
# DESCRIPTION: skip the contents of a %{ ... %} block
#
function skip_definition() {
    do {
	skip = lex();
    } while (skip != PERC_RACC && skip != EOF);
}

#
# NAME: decl_token()
# DESCRIPTION: read a token declaration
#
function decl_token() {
    first = 1;

    do {
	tok = lex();

	if (tok == ",") {
	    symbol = 0;
	} else if (tok == CHAR) {
	    DBG(1, transtab[tok_str] ": No need to remember this token.");
	} else if (tok == IDENT) {
	    if (_tkpat && tok_str !~ _tkpat) {
		if (transtab[tok_str])
		    DBG(2, "WARNING: Redefining '" tok_str "'.");

		transtab[tok_str] = "\"" tolower(tok_str) "\"";
		DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
	    }

	    symbol = tok_str;
	} else if (tok == NUMBER) {		# explicit token number
	    if (!symbol)
		error("How about giving a token first?");

	    symbol = 0;
	} else if (tok == STRING) {		# literal for the token
	    if (!symbol)
		error("How about giving a token first?");

	    str = transtab[tok_str];
	    transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
	    DBG(1, SPACES(length(symbol) + 2) \
		   "Defined as " transtab[symbol] ".");

	    symbol = 0;
	} else if (tok == TYPE) {
	    if (!first)
		error("This is no place for a type name.");
	} else {
	    unlex(tok);
	    return;
	}

	first = 0;
    } while (tok != EOF);
}

#
# NAME: decl_start()
# DESCRIPTION: read a start rule declaration
#
function decl_start() {
    if (grammar_start)
	error("Hm, you want the grammar to start with two rules?");
    else if (lex() == IDENT) {
	grammar_start = tok_str;
	DBG(2, "The grammar start rule is '" grammar_start "'.");
    } else
	error("How about a nice juicy identifier?");
}

#
# NAME: decl_type()
# DESCRIPTION: read a type declaration
#
function decl_type() {
    if (lex() != TYPE)
	error("So where is the typename?");

    do {
	tok = lex();

	if (tok == ",") {
	    symbol = 0;
	} else if (tok == CHAR) {
	    error("Bison etc may accept literals in a %type declaration, " \
		  "but the Unix 7th\n       Ed manual clearly indicates " \
		  "that it is NOT legal. And I think that the\n       Bell " \
		  "Labs guys know what they are talking about; but anyway, " \
		  "do you\n       really spend the time reading this long " \
		  "error message?");
	} else if (tok == IDENT) {
	    if (_tkpat && tok_str !~ _tkpat) {
		if (transtab[tok_str])
		    DBG(2, "WARNING: Redefining '" tok_str "'.");

		transtab[tok_str] = "\"" tolower(tok_str) "\"";
		DBG(1, tok_str ": Defined as " transtab[tok_str] ".");
	    }

	    symbol = tok_str;
	} else if (tok == NUMBER) {
	    if (!symbol)
		error("How about giving a token first?");

	    symbol = 0;
	} else if (tok == STRING) {
	    if (!symbol)
		error("How about giving a token first?");

	    str = transtab[tok_str];
	    transtab[symbol] = "\"" substr(str, 2, length(str) - 2) "\"";
	    DBG(1, SPACES(length(symbol) + 2) \
		   "Defined as " transtab[symbol] ".");

	    symbol = 0;
	} else {
	    unlex(tok);
	    return;
	}
    } while (tok != EOF);
}

#
# NAME: decl_union()
# DESCRIPTION: read a union declaration
#
function decl_union() {
    if (grammar_union)
	error("How about sticking to one single union declaration?");

    grammar_union = 1;
    DBG(2, "Extended types have been registered with a union declaration.");

    do {
	tok = lex();

	if (tok == "{")
	    block++;
	else if (tok == "}") {
	    if (!block)
		error("Why close an unopened block?");
	    if (--block <= 0)
		return;
	} else if (tok == EOF)
	    error("The file ends before the union is finished. How rude!");
    } while (1);
}

#
# NAME: read_declarations()
# DESCRIPTION: read the yacc declarations
#
function read_declarations() {
    do {
	tok = lex();				# next token

	if (tok == PERC_PERC || tok == EOF)	# end of the declarations
	    return;
	if (tok == PERC_LACC) {			# definition block
	    skip_definition();
	} else if (tok == PERC_LEFT) {		# left associative declaration
	    decl_token();
	} else if (tok == PERC_ASSC) {		# non-associative declaration
	    decl_token();
	} else if (tok == PERC_RGHT) {		# right associative declaration
	    decl_token();
	} else if (tok == PERC_STRT) {		# start rule declaration
	    decl_start();
	} else if (tok == PERC_TOKN) {		# token declaration(s)
	    decl_token();
	} else if (tok == PERC_TYPE) {		# type declaration
	    decl_type();
	} else if (tok == PERC_UNIO) {		# union declaration
	    decl_union();
	} else
	    DBG(2, "WARNING: Ignoring the unknown token '" tok_str "'.");
    } while (1);
}

#
# NAME: skip_action()
# DESCRIPTION: skip the contents of an action block
#
function skip_action() {
    block = 1;

    do {
	tok = lex();

	if (tok == "{")
	    block++;
	else if (tok == "}") {
	    if (!block)
		error("Why close an unopened block?");
	    if (--block <= 0)
		return;
	} else if (tok == EOF)
	    error("The file ends before the action is finished. How rude!");
    } while (tok != EOF);
}

#
# NAME: read_grammar()
# DESCRIPTION: read the yacc grammar
#
function read_grammar() {
    tok = lex();

    do {
	if (tok == PERC_PERC || tok == EOF)	# end of the grammar
	    return;

	if (tok == IDENT) {			# rule begins here
	    if (!(rule_idx = rule_ref[tok_str])) {
		rule_idx = ++rule_cnt;		# new rule
		rule_lhs[rule_idx] = tok_str;
		rule_ref[tok_str] = rule_idx;

		if (!grammar_start)
		    grammar_start = tok_str;
	    }

	    #
	    # FIX: allocating the sub-rule slot must also happen when an
	    # already known rule is re-opened; it used to be done only for
	    # new rules, clobbering a stale rule_sub otherwise.
	    #
	    rule_sub = ++rule_len[rule_idx];

	    if (lex() != ":")
		error("The LHS and RHS would like to be separated by a colon.");
	} else if (tok == "|") {		# alternative RHS for a rule
	    if (!rule_cnt)
		error("The grammar shouldn't start with a |.");

	    rule_sub = ++rule_len[rule_idx];
	} else
	    error("You could at least give a valid token.");

	# FIX: was 'rule_rhs[rule_cnt] = ""', a stray single-subscript entry.
	rule_rhs[rule_idx, rule_sub] = "";	# empty RHS

	do {					# read the RHS
	    tok = lex();

	    if (tok == PERC_PREC) {		# %prec
		tok = lex();

		if (tok != IDENT && tok != CHAR)
		    error("What precedence are you talking about?");

		tok = lex();			# continue with the rest
	    }

	    if (tok == IDENT) {			# might be a new rule
		old_str = tok_str;
		nxt = lex();			# look ahead
		unlex(nxt);
		tok_str = old_str;

		if (nxt == ":")			# yup, a new rule started
		    break;

		rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
					       " " tok_str;
	    } else if (tok == CHAR) {
		if (tok_str ~ /^@/)
		    tok_str = transtab[tok_str];

		rule_rhs[rule_idx, rule_sub] = rule_rhs[rule_idx, rule_sub] \
					       " " tok_str;
	    } else if (tok == "{") {		# an action block
		skip_action();
	    } else				# can't be part of a rule
		break;
	} while (1);

	sub(/^ /, "", rule_rhs[rule_idx, rule_sub]);

	if (rule_rhs[rule_idx, rule_sub] ~ \
	    /(^|[^_\.A-Za-z0-9])error($|[^_\.A-Za-z0-9])/) {
	    DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub] \
		   " [IGNORED]");

	    rule_rhs[rule_idx, rule_sub] = "";
	    rule_len[rule_idx]--;
	} else
	    DBG(1, rule_lhs[rule_idx] ": " rule_rhs[rule_idx, rule_sub]);

	if (tok == ";")
	    tok = lex();
    } while (1);
}

#
# NAME: is_optional()
# DESCRIPTION: check whether the given non-terminal is optional
#
function is_optional(idx) {
    if ((len = rule_len[idx]) <= 1)
	return 0;

    #
    # One or more empty rules for a non-terminal indicate that the other rules
    # are in fact optional.  There shouldn't be multiple empty rules, but it
    # is technically possible.
    #
    for (rule_sub = 1; rule_sub <= len; rule_sub++)
	if (rule_rhs[idx, rule_sub] == "") {
	    while (++rule_sub <= len)
		rule_rhs[idx, rule_sub - 1] = rule_rhs[idx, rule_sub];

	    rule_len[idx]--;
	}

    return rule_len[idx] < len;
}

#
# NAME: get_prefix()
# DESCRIPTION: check whether the given non-terminal has a common prefix
#
function get_prefix(idx) {
    if ((len = rule_len[idx]) <= 1)
	return 0;

    #
    # Split up the first rule into tokens.  These will be compared with all
    # the other rules for this non-terminal.
    #
    gp_last = split(rule_rhs[idx, 1], arr);
    gp_pref = gp_last;

    #
    # Look for the longest common prefix.
    #
    for (rule_sub = 2; rule_sub <= len; rule_sub++) {
	$0 = rule_rhs[idx, rule_sub];
	gp_tokc = NF;

	if (gp_tokc < gp_pref)
	    gp_pref = gp_tokc;

	for (j = 1; j <= gp_pref; j++)
	    if (arr[j] != $j) {
		if (!(gp_pref = j - 1))
		    return 0;

		break;
	    }
    }

    #
    # Construct the prefix string.
    #
    gp_pstr = arr[1];
    for (j = 2; j <= gp_pref; j++)
	gp_pstr = gp_pstr " " arr[j];

    #
    # Remove the common prefix from all rules for this non-terminal.
    #
    for (rule_sub = 1; rule_sub <= len; rule_sub++) {
	$0 = rule_rhs[idx, rule_sub];

	for (j = 1; j <= gp_pref; j++)
	    $j = "";

	sub(/^ +/, "");
	rule_rhs[idx, rule_sub] = $0;
    }

    return gp_pstr;
}

#
# NAME: get_suffix()
# DESCRIPTION: check whether the given non-terminal has a common suffix
#
function get_suffix(idx) {
    if ((len = rule_len[idx]) <= 1)
	return 0;

    #
    # Split up the first rule into tokens.  These will be compared with all
    # the other rules for this non-terminal.
    #
    gs_last = split(rule_rhs[idx, 1], arr);
    gs_suff = gs_last;

    #
    # Look for the longest common suffix.
    #
    for (rule_sub = 2; rule_sub <= len; rule_sub++) {
	$0 = rule_rhs[idx, rule_sub];
	gs_tokc = NF;

	if (gs_tokc < gs_suff)
	    gs_suff = gs_tokc;

	for (j = 0; j < gs_suff; j++)
	    if (arr[gs_last - j] != $(gs_tokc - j)) {
		if (!(gs_suff = j))
		    return 0;

		break;
	    }
    }

    #
    # Construct the suffix string.
    #
    gs_sstr = arr[gs_last];
    for (j = 1; j < gs_suff; j++)
	gs_sstr = arr[gs_last - j] " " gs_sstr;

    #
    # Remove the common suffix from all rules for this non-terminal.
    #
    for (rule_sub = 1; rule_sub <= len; rule_sub++) {
	$0 = rule_rhs[idx, rule_sub];

	for (j = 0; j < gs_suff; j++)
	    $(NF - j) = "";

	sub(/ +$/, "");
	rule_rhs[idx, rule_sub] = $0;
    }

    return gs_sstr;
}

#
# NAME: optimize1()
# DESCRIPTION: first pass of the optimizer
#
function optimize1() {
    DBG(0, "Optimization pass 1...");

    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
	#
	# Non-terminals with only a single rule can't be optimized here.
	#
	if ((len = rule_len[rule_idx]) <= 1)
	    continue;

	#
	# First record whether the entire non-terminal might be optional.
	#
	rule_opt[rule_idx] = is_optional(rule_idx);

	#
	# The actual optimization takes place in this endless loop.  It will
	# in fact end when an iteration does not yield an optional ruleset.
	# This is a proper and correct stop criteria, since a non-optional
	# ruleset cannot have any common prefix or suffix.  If it did, those
	# would have been added to the already present prefix or suffix.
	#
	pref = "";
	suff = "";
	do {
	    if (pstr = get_prefix(rule_idx))
		pref = pref " " pstr;
	    else if (sstr = get_suffix(rule_idx))
		suff = sstr " " suff;

	    if (is_optional(rule_idx)) {
		pref = pref " [";
		suff = "] " suff;
	    } else {
		if (pstr || sstr) {
		    pref = pref " (";
		    suff = ") " suff;
		}

		break;
	    }
	} while (1);

	#
	# Compose the single composite rule for this non-terminal, if a
	# common prefix or suffix was found.
	#
	if (pref != "" || suff != "") {
	    sub(/^ /, "", pref);
	    sub(/ $/, "", suff);

	    DBG(2, "Rules for '" rule_lhs[rule_idx] "' have:");

	    if (pref != "")
		DBG(3, "Prefix '" pref "'");
	    if (suff != "")
		DBG(3, "Suffix '" suff "'");

	    len = rule_len[rule_idx];
	    pref = pref " " rule_rhs[rule_idx, 1];

	    for (rule_sub = 2; rule_sub <= len; rule_sub++)
		pref = pref " | " rule_rhs[rule_idx, rule_sub];

	    rule_rhs[rule_idx, 1] = pref " " suff;
	    rule_len[rule_idx] = 1;

	    DBG(3, "Combined rule '" rule_rhs[rule_idx, 1] "'");

	    if (rule_opt[rule_idx])
		DBG(3, "(The non-terminal is optional.)");
	}
    }
}

#
# NAME: optimize2()
# DESCRIPTION: second pass of the optimizer
#
function optimize2() {
    DBG(0, "Optimization pass 2...");

    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
	$0 = rule_rhs[rule_idx, 1];

	if ((len = rule_len[rule_idx]) > 1)
	    for (rule_sub = 2; rule_sub <= len; rule_sub++)
		$0 = $0 " | " rule_rhs[rule_idx, rule_sub];

	if ($1 != "[" && rule_opt[rule_idx]) {
	    $0 = "[ " $0 " ]";
	    rule_opt[rule_idx] = 0;
	}

	#
	# Left recursion of the form 'x : [ x ... ]' is rewritten as a
	# repetition '{ ... }'.
	#
	if ($1 == "[")
	    if ($2 == rule_lhs[rule_idx]) {
		for (i = NF; i > 0; i--)
		    if ($i == "]")
			break;

		if ((NF - i) < 3) {
		    $1 = "";
		    subst = "{";
		    $i = "}";

		    while (++i <= NF)
			subst = subst " " $i;

		    $2 = subst;

		    sub(/^ +/, "");
		}
	    }

	#
	# Right recursion of the form 'x : [ ... x ]' is rewritten as a
	# repetition '{ ... }'.
	#
	if ($NF == "]")
	    if ($(NF - 1) == rule_lhs[rule_idx]) {
		for (i = 1; i <= NF; i++)
		    if ($i == "[")
			break;

		if (i < 3) {
		    $i = "{";
		    subst = "}";
		    $NF = "";

		    while (--i > 0)
			subst = $i " " subst;

		    $(NF - 1) = subst;

		    sub(/ +$/, "");
		}
	    }

	if (rule_opt[rule_idx]) {
	    $0 = "[ " $0 " ]";
	    rule_opt[rule_idx] = 0;
	}

	rule_rhs[rule_idx, 1] = $0;
	rule_len[rule_idx] = 1;
    }
}

#
# NAME: latexify()
# DESCRIPTION: make substitutions to ensure that the string is LaTeX ok
#
function latexify(txt) {
    gsub(/[{&%}]/, "$\\\\&$", txt);
    gsub(/[!|[\]=+\-*\/<>]/, "$&$", txt);
    gsub(/\$\$/, "\\!", txt);
    gsub(/"[^"]+"/, "``{\\bf &}''", txt);
    gsub(/_/, "\\_", txt);
    gsub(/\^/, "\\verb+^+", txt);
    gsub(/"/, "", txt);

    return txt;
}

#
# NAME: break_string()
# DESCRIPTION: possibly break up a string in multiple lines
#
function break_string(str) {
    match(str, /^([_A-Za-z][_A-Za-z0-9]* +| +)(::=|\|)/);
    bs_len = RLENGTH;
    bs_str = substr(str, 1, RLENGTH);
    bs_pln = RLENGTH;
    bs_pre = SPACES(RLENGTH);

    $0 = substr(str, RLENGTH + 2);
    for (i = 1; i <= NF; i++) {
	if (bs_len + 1 + length($i) > 79) {	# wrap before 80 columns
	    bs_str = bs_str "\n" bs_pre;
	    bs_len = bs_pln;
	}

	bs_str = bs_str " " $i;
	bs_len += 1 + length($i);
    }

    return bs_str;
}

#
# NAME: output()
# DESCRIPTION: write out the rewritten rules
#
function output(LaTeX) {
    rule_use[grammar_start] = 1;

    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
	len = rule_len[rule_idx];

	if (rule_opt[rule_idx]) {
	    rule_rhs[rule_idx, 1] = "[ " rule_rhs[rule_idx, 1];
	    rule_rhs[rule_idx, len] = rule_rhs[rule_idx, len] " ]";
	}

	str = LaTeX ? latexify(rule_lhs[rule_idx]) \
		    : rule_lhs[rule_idx];

	if (length(str) > rule_max) {
	    rule_max = length(str);
	    rule_mls = str;
	}

	for (rule_sub = 1; rule_sub <= len; rule_sub++) {
	    $0 = rule_rhs[rule_idx, rule_sub];

	    for (j = 1; j <= NF; j++) {
		#
		# A couple of notes here...  Non-terminals that merely have a
		# single terminal as definition are substituted anywhere they
		# are used.  And some special casing is needed for situations
		# where someone actually defines the non-terminals as tokens.
		# Those should definitely not be quotated.
		#
		# FIX: rule_len is indexed by rule number, not by name; the
		# test used to read 'rule_len[$j]' and could never succeed.
		#
		if ((idx = rule_ref[$j]) &&
		    rule_len[idx] == 1 && rule_rhs[idx, 1] ~ /^[^ ]+$/)
		    $j = rule_rhs[idx, 1];
		else if ((str = transtab[$j]) && !rule_ref[$j])
		    $j = str;

		rule_use[$j] = 1;
	    }

	    rule_rhs[rule_idx, rule_sub] = $0;
	}
    }

    if (LaTeX)
	#
	# Some LaTeX magic...  We calculate the available size for the RHS of
	# the rules.  This will be provided as argument to the minipages used
	# for each independent rule.
	#
	# The formula is fairly easy:
	#	textwidth - width(LHS) - width("::=") - 6 * tabcolsep
	#
	print "\\newlength\\rulelhs\n" \
	      "\\newlength\\rulemid\n" \
	      "\\newlength\\rulerhs\n" \
	      "\\settowidth\\rulelhs{" rule_mls "}\n" \
	      "\\settowidth\\rulemid{::=}\n" \
	      "\\setlength\\rulerhs{\\textwidth}\n" \
	      "\\addtolength\\rulerhs{-\\rulelhs}\n" \
	      "\\addtolength\\rulerhs{-\\rulemid}\n" \
	      "\\addtolength\\rulerhs{-6\\tabcolsep}\n\n" \
	      "\\begin{longtable}{lrl}";

    for (rule_idx = 1; rule_idx <= rule_cnt; rule_idx++) {
	if (!rule_use[rule_lhs[rule_idx]])	# unreferenced non-terminal
	    continue;

	len = rule_len[rule_idx];

	for (rule_sub = 1; rule_sub <= len; rule_sub++) {
	    if (LaTeX) {
		str = latexify(rule_lhs[rule_idx]);
		out = str SPACES(rule_max - length(str)) " & ::= &\n" \
		      "    \\begin{minipage}[t]{\\rulerhs}\n" \
		      "      \\raggedright\n      ";
	    } else {
		str = rule_lhs[rule_idx];
		out = str SPACES(rule_max - length(str)) " ::= ";
	    }

	    rhs = rule_rhs[rule_idx, rule_sub];
	    lvl = 0;

	    while (match(rhs, /(^[\(\{\[] | [\(\{\[\|\]\}\)] | [\]\}\)]$)/)) {
		o_beg = RSTART;
		o_len = RLENGTH;

		if (substr(rhs, o_beg) ~ /(^[\(\{\[] | [\(\{\[] )/) {
		    lvl++;

		    str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
				: substr(rhs, 1, o_beg + o_len - 1);

		    out = out str;
		    rhs = substr(rhs, o_beg + o_len);
		} else if (substr(rhs, o_beg) ~ /( [\]\}\)] | [\]\}\)]$)/) {
		    lvl--;

		    str = LaTeX ? latexify(substr(rhs, 1, o_beg + o_len - 1)) \
				: substr(rhs, 1, o_beg + o_len - 1);

		    out = out str;
		    rhs = substr(rhs, o_beg + o_len);
		} else if (substr(rhs, o_beg + 1, 1) == "|") {
		    if (lvl) {
			out = out substr(rhs, 1, o_beg + o_len - 1);
			rhs = substr(rhs, o_beg + o_len);
		    } else {
			if (LaTeX) {
			    str = latexify(substr(rhs, 1, o_beg - 1));

			    print out str \
				  SPACES(64 - rule_max - length(str)) "\n" \
				  "    \\end{minipage}" SPACES(60) " \\\\";

			    out = SPACES(rule_max) " & $|$ &\n" \
				  "    \\begin{minipage}[t]{\\rulerhs}\n" \
				  "      \\raggedright\n      ";
			} else {
			    print break_string(out substr(rhs, 1, o_beg - 1));

			    out = SPACES(rule_max + 1) " |  ";
			}

			rhs = substr(rhs, o_beg + o_len);
		    }
		}
	    }

	    if (LaTeX) {
		str = latexify(rhs);

		print out str SPACES(76 - length(out) - length(str)) "\n" \
		      "    \\end{minipage}" SPACES(60) " \\\\";
	    } else
		print break_string(out rhs);
	}
    }

    if (LaTeX)
	print "\\end{longtable}";
}

#
# NAME: process()
# DESCRIPTION: process a given yacc grammar definition file
#
function process(grammar_file) {
    if (!exists(grammar_file))
	error("So where exactly is this file?");

    DBG(0, "Reading grammar from '" grammar_file "'.");

    read_declarations();
    read_grammar();

    if (_optim >= 1)
	optimize1();
    if (_optim >= 2)
	optimize2();

    output(!_plain);
}

#
# NAME: load_tokens()
# DESCRIPTION: load a list of literal tokens from a file
#
function load_tokens(file) {
    while (getline < file == 1)
	transtab[$1] = "\"" $2 "\"";

    close(file);
}

#
# NAME: give_help()
# DESCRIPTION: offer some help to the poor user
#
function give_help() {
    # FIX: a stray 'u' after the '-O' line used to be a syntax error.
    print "Usage: y2l -- [options] yacc-file\n" \
	  "Options:\n" \
	  "  -d\n" \
	  "\tWrite debugging output to the standard error stream. One can\n"\
	  "\tsupply a numeric argument to this option to set an indentation\n"\
	  "\tlevel for the messages.\n" \
	  "  -h\n" \
	  "\tDisplay this help information.\n" \
	  "  -O, -O[012]\n" \
	  "\tOptimize the grammar to get more typical EBNF. If no argument\n"\
	  "\tis provided, the highest optimization level is selected.\n" \
	  "  -p\n" \
	  "\tGive basic ASCII output rather than LaTeX output.\n" \
	  "  -t/regexp/, -tfile\n" \
	  "\tIn the first form, the regular expression is used to decide\n" \
	  "\twhether a terminal in the grammar is a real token or rather a\n" \
	  "\tliteral. The second variant specifies a file which contains\n" \
	  "\tlines listing a token and the literal it represents.\n" \
	  "  -v\n" \
	  "\tPrint out version information.";
    exit;
}

#
# All processing is done from the BEGIN block.  The one and only mandatory
# argument to the program is passed on to the grammar processor.
#
BEGIN {
    _debug = 0;					# no debugging
    _optim = 0;					# no optimization
    _plain = 0;					# no plain output
    _tkpat = 0;					# no token pattern

    _bannr = "Yacc-to-LaTeX grammar processor v1.0";

    for (i = 1; i < ARGC; i++)			# loop over all arguments
	if (ARGV[i] ~ /^-d[0-9]*$/) {		# debugging (optional level,
						# as documented in the help)
	    if (length(ARGV[i]) > 2)
		_debug = substr(ARGV[i], 3);
	    else
		_debug = 1;
	} else if (ARGV[i] ~ /^-h$/)		# help the user
	    give_help();
	else if (ARGV[i] ~ /^-O([012]|$)/) {	# optimization level
	    if (length(ARGV[i]) > 2)
		_optim = substr(ARGV[i], 3, 1);
	    else
		_optim = 1;			# default optimization level
	} else if (ARGV[i] ~ /^-p$/)		# non-LaTeX output
	    _plain = 1;
	else if (ARGV[i] ~ /^-t/) {		# regexp or file for tokens
	    if (ARGV[i] ~ \
		/^-t(\/\^|\/)[_\.A-Za-z][_\.A-Za-z0-9]*(\/|\$\/)$/) {
		l = length(ARGV[i]);
		_tkpat = substr(ARGV[i], 4, l - 4);
	    } else if (exists(file = substr(ARGV[i], 3)))
		load_tokens(file);
	    else
		error("So where is the token regexp pattern or file?");
	} else if (ARGV[i] ~ /^-v$/)		# version
	    print _bannr >"/dev/stderr";
	else if (ARGV[i] ~ /^-/)		# unknown option
	    error("Am I supposed to do something with '" ARGV[i] "'?");
	else if (grammar_file)			# more than one grammar
	    error("How about just processing one grammar at a time?");
	else
	    grammar_file = ARGV[i];		# name of the grammar file

    if (!grammar_file)				# no grammar file provided
	error("Any particular grammar you have in mind?");

    init();					# initialization

    process(grammar_file);			# process the given grammar

    exit;
}