/*
    Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
    Alexander Hinds & Terence Parr
    Magelang Institute, Ltd.
    Send comments to:  parrt@jguru.com

    7/4/00  Fixed bug in COMMENT_DATA that wouldn't let - appear in comment
            Made COMMENT scarf WS after comment

    v1.2    Fixed a bug APARAM->APARM in APPLET tag.

    v1.1    Terence Parr (updated to 2.6.0)

    Fixed CCYTE->CCITE
    Fixed def of COMMENT_DATA so it scarfs stuff correctly.
    Also, fixed refs to (PCDATA)? -> (PCDATA)* because a comment
    between PCDATA returns 2 PCDATA--ya need the loop not optional.

    v1.0    Terence John Parr (version 2.5.0 of ANTLR required)

    Fixed how whitespace is handled, removing some ambiguities; some
    because of ANTLR lexical filtering in 2.5.0.

    Changed (PCDATA)* loops to (PCDATA)? in general since PCDATA matches
    everything between valid tags (how could there be more than one
    between tags?)

    Made the DOCTYPE optional.

    Reduced lookahead from k=5 to k=1 on the parser and number
    of parser ambiguities to 2.  Reduced lexer lookahead from 6
    to 4; had to left factor a bunch of stuff.

    List items couldn't contain nested lists...fixed it.

    Fixed def of WORD so it can't be an INT.  Removed '-' from WORD.

    Fixed HEXNUM so it will allow letters A..F.

    KNOWN ISSUES:

    1.  Does not handle "staggered" tags, eg: <p> <i> <p> <i>

    2.  Adheres somewhat strictly to the html spec, so many pages
        won't parse without errors.

    3.  Doesn't convert &(a signifier) to its proper single char
        representation

    4.  Checks only the syntax of element attributes, not the semantics,
        e.g. won't verify that a base element's attribute is actually
        called "href"

    5.  Tags split across lines, for example, <A (NEWLINE) some text >
        won't be properly recognized.  TJP: I think I fixed this.

    7.  Lines not counted properly due to the def'n of PCDATA - see the
        alternate def'n for a possible fix.  TJP: I think I fixed this.
*/

// Parser for HTML 3.2 documents. It consumes the token vocabulary named
// "HTML" and uses a single token of lookahead.
class HTMLParser extends Parser;
options {
    exportVocab=HTML;
    k = 1;
}


// Start rule: every structural tag (DOCTYPE, <html>, head, body, </html>)
// is optional, and each may be followed by one PCDATA token.
document
    :   (PCDATA)? (DOCTYPE (PCDATA)?)?
        (OHTML (PCDATA)?)?
        (head)?
        (body)?
        (CHTML (PCDATA)?)?
    ;

// The <head> and </head> tags are optional, but at least one head_element
// is required.
head:   (OHEAD (PCDATA)?)?
        head_element
        (PCDATA | head_element)*
        (CHEAD (PCDATA)?)?
    ;

head_element
    :   title   //bug need at least a title, rest optional
    |   script
    |   style
    |   ISINDEX
    |   BASE
    |   META
    |   LINK
    ;

title
    :   OTITLE (PCDATA)? CTITLE
    ;

// Everything up to the closing tag is accepted, whatever tokens it lexes as.
script
    :   OSCRIPT (~CSCRIPT)+ CSCRIPT
    ;

style
    :   OSTYLE (~CSTYLE)+ CSTYLE
    ;

// The first content item must be a tag (not PCDATA); any number of further
// content items may follow.  The trailing loop is zero-or-more so that a
// body holding a single element, e.g. <body><h1>x</h1></body>, is accepted.
body:   ( OBODY (PCDATA)* )?
        body_content_no_PCDATA
        ( body_content )*
        ( CBODY (PCDATA)* )?
    ;

body_content_no_PCDATA
    :   body_tag | text_tag
    ;

body_tag
    :   heading | block | address
    ;

body_content
    :   body_tag | text
    ;


/*revised*/
heading
    :   h1 | h2 | h3 | h4 | h5 | h6
    ;

block
    :   paragraph | list | preformatted | div |
        center | blockquote | HR | table
    ;   //bug - ?FORM v %form, ISINDEX here too?

// ---------------------------------------------------------------------
// Text-level groupings
// ---------------------------------------------------------------------

font:   teletype | italic | bold | underline | strike |
        big | small | subscript | superscript
    ;

phrase
    :   emphasize | strong | definition | code | sample_output|
        keyboard_text | variable | citation
    ;

special
    :   anchor | IMG | applet | font_dfn | BFONT |
        map | BR
    ;

text_tag
    :   font | phrase | special | form
    ;

text:   PCDATA | text_tag
    ;

/*end*/


/*BLOCK ELEMENTS*/

h1  :   OH1 (block | text)* CH1
    ;
h2  :   OH2 (block | text)* CH2
    ;
h3  :   OH3 (block | text)* CH3
    ;
h4  :   OH4 (block | text)* CH4
    ;
h5  :   OH5 (block | text)* CH5
    ;
h6  :   OH6 (block | text)* CH6
    ;

address_content
    :   text
    |   paragraph
    ;

// An address holds any number of text items and paragraphs (possibly none).
address
    :   OADDRESS (address_content)* CADDRESS
    ;

//NOTE:  according to the standard, paragraphs can't contain block elements
//like HR.  Netscape may insert these elements into paragraphs.
//We adhere strictly here.

paragraph
    :   OPARA
        (
            /*  Rule body_content may also be just plain text because HTML is
                so loose.  When body puts body_content in a loop, ANTLR
                doesn't know whether you want it to match all the text as part
                of this paragraph (in the case where the </p> is missing) or
                if the body rule should scarf it.  This is analogous to the
                dangling-else clause.  I shut off the warning.
            */
            options {
                generateAmbigWarnings=false;
            }
        :   text
        )*
        (CPARA)?
    ;

// dir and menu are list flavours too; they reuse list_item just like
// unordered_list and ordered_list.
list:   unordered_list
    |   ordered_list
    |   def_list
    |   dir
    |   menu
    ;

unordered_list
    :   OULIST (PCDATA)* (list_item)+ CULIST
    ;

ordered_list
    :   OOLIST (PCDATA)* (list_item)+ COLIST
    ;

def_list
    :   ODLIST (PCDATA)* (def_list_item)+ CDLIST
    ;

// The closing </li> is optional; items may contain nested lists.
list_item
    :   OLITEM ( text | list )+ (CLITEM (PCDATA)*)?
    ;

def_list_item
    :   dt | dd
    ;

dt  :   ODTERM (text)+ CDTERM (PCDATA)*
    ;

// A <dd> is closed by </dd>, which the lexer emits as CDDEF
// (CDTERM is the </dt> token).
dd  :   ODDEF (text | block)+ CDDEF (PCDATA)*
    ;

// Like unordered_list/ordered_list, tolerate PCDATA (e.g. whitespace)
// between the opening tag and the first item.
dir :   ODIR (PCDATA)* (list_item)+ CDIR
    ;

menu:   OMENU (PCDATA)* (list_item)+ CMENU
    ;

preformatted
    :   OPRE (text)+ CPRE
    ;

div :   ODIV (body_content)* CDIV       //semi-revised
    ;

center
    :   OCENTER (body_content)* CCENTER //semi-revised
    ;

blockquote
    :   OBQUOTE (body_content)* CBQUOTE
    ;

form:   OFORM (form_field | body_content)* CFORM
    ;

table
    :   OTABLE (caption)? (PCDATA)* (tr)+ CTABLE
    ;

caption
    :   OCAP (text)* CCAP
    ;

// Closing </tr>, </th> and </td> tags are optional.
tr  :   O_TR (PCDATA)* (th_or_td)* (C_TR (PCDATA)*)?
    ;

th_or_td
    :   O_TH_OR_TD (body_content)* (C_TH_OR_TD (PCDATA)*)?
    ;

/*TEXT ELEMENTS*/

/*font style*/

teletype
    :   OTTYPE ( text )+ CTTYPE
    ;

italic
    :   OITALIC ( text )+ CITALIC
    ;

bold:   OBOLD ( text )+ CBOLD
    ;

underline
    :   OUNDER ( text )+ CUNDER
    ;

strike
    :   OSTRIKE ( text )+ CSTRIKE
    ;

big :   OBIG ( text )+ CBIG
    ;

small
    :   OSMALL ( text )+ CSMALL
    ;

subscript
    :   OSUB ( text )+ CSUB
    ;

superscript
    :   OSUP ( text )+ CSUP
    ;

    /*phrase elements*/

emphasize
    :   OEM ( text )+ CEM
    ;

strong
    :   OSTRONG ( text )+ CSTRONG
    ;

definition
    :   ODFN ( text )+ CDFN
    ;

code
    :   OCODE ( text )+ CCODE
    ;

sample_output
    :   OSAMP ( text )+ CSAMP
    ;

keyboard_text
    :   OKBD ( text )+ CKBD
    ;

variable
    :   OVAR ( text )+ CVAR
    ;

citation
    :   OCITE ( text )+ CCITE
    ;

/*  form fields (combined with body_content elsewhere so no PCDATA on end) */
form_field
    :   INPUT | select | textarea
    ;

select
    :   OSELECT (PCDATA)* (select_option)+ CSELECT
    ;

select_option
    :   SELOPT (PCDATA)*
    ;

textarea
    :   OTAREA (PCDATA)* CTAREA
    ;

/*  special text level elements*/
anchor
    :   OANCHOR (text)* CANCHOR
    ;

applet
    :   OAPPLET (APARM)? (PCDATA)* CAPPLET
    ;

//not w3-no blocks allowed; www.microsoft.com uses
font_dfn
    :   OFONT (text)* CFONT
    ;

map :   OMAP (AREA)+ CMAP
    ;

// Lexer for HTML 3.2.  Matching is case-insensitive, uses up to four
// characters of lookahead, and routes any input that no token rule
// matches through UNDEFINED_TOKEN (filter mode).
class HTMLLexer extends Lexer;
options {
    k = 4;
    exportVocab=HTML;
    charVocabulary = '\3'..'\377';
    caseSensitive=false;
    filter=UNDEFINED_TOKEN;
}


/*  STRUCTURAL tags
*/

DOCTYPE
    :   "<!doctype" WS "html" WS "public" (WS)? STRING (WS)? (STRING (WS)?)? '>'
    ;

OHTML
    :   "<html>"
    ;

CHTML
    :   "</html>"
    ;

OHEAD
    :   "<head>"
    ;

CHEAD
    :   "</head>"
    ;

OBODY
    :   "<body" (WS (ATTR )*)? '>'
    ;

CBODY
    :   "</body>"
    ;


/*  HEAD ELEMENTS
*/

OTITLE
    :   "<title>"
    ;

CTITLE
    :   "</title>"
    ;


OSCRIPT
    :   "<script>"
    ;

CSCRIPT
    :   "</script>"
    ;

ISINDEX
    :   "<isindex" WS ATTR '>'
    ;

META
    :   "<meta" WS (ATTR)+ '>'
    ;

LINK
    :   "<link" WS (ATTR)+ '>'
    ;


/* headings */

OH1 :   "<h1" (WS ATTR)? '>'
    ;

CH1 :   "</h1>"
    ;

OH2 :   "<h2" (WS ATTR)?'>'
    ;

CH2 :   "</h2>"
    ;

OH3 :   "<h3" (WS ATTR)? '>'
    ;

CH3 :   "</h3>"
    ;

OH4 :   "<h4" (WS ATTR)? '>'
    ;

CH4 :   "</h4>"
    ;

OH5 :   "<h5" (WS ATTR)? '>'
    ;

CH5 :   "</h5>"
    ;

OH6 :   "<h6" (WS ATTR)? '>'
    ;

CH6 :   "</h6>"
    ;

OADDRESS
    :   "<address>"
    ;

CADDRESS
    :   "</address>"
    ;

OPARA
    :   "<p" (WS ATTR)? '>'
    ;

CPARA
    :   "</p>"      //it's optional
    ;

    /*UNORDERED LIST*/
OULIST
    :   "<ul" (WS ATTR)? '>'
    ;

CULIST
    :   "</ul>"
    ;

    /*ORDERED LIST*/
OOLIST
    :   "<ol" (WS ATTR)? '>'
    ;

COLIST
    :   "</ol>"
    ;

    /*LIST ITEM*/

OLITEM
    :   "<li" (WS ATTR)? '>'
    ;

CLITEM
    :   "</li>"
    ;

    /*DEFINITION LIST*/

ODLIST
    :   "<dl" (WS ATTR)? '>'
    ;

CDLIST
    :   "</dl>"
    ;

ODTERM
    :   "<dt>"
    ;

CDTERM
    :   "</dt>"
    ;

ODDEF
    :   "<dd>"
    ;

CDDEF
    :   "</dd>"
    ;

ODIR:   "<dir>"
    ;

// </dir> and </div> share the prefix "</di"; the token type is chosen
// from the following character.
CDIR_OR_CDIV
    :   "</di"
        (   'r' {$setType(CDIR);}
        |   'v' {$setType(CDIV);}
        )
        '>'
    ;

ODIV:   "<div" (WS ATTR)? '>'
    ;

OMENU
    :   "<menu>"
    ;

CMENU
    :   "</menu>"
    ;

// A newline directly after the opening tag is swallowed as part of the
// token; newline() keeps the lexer's line counter in step when it is.
OPRE:   ("<pre>" | "<xmp>") ('\n' {newline();})?
    ;

CPRE:   "</pre>" | "</xmp>"
    ;

OCENTER
    :   "<center>"
    ;

CCENTER
    :   "</center>"
    ;

OBQUOTE
    :   "<blockquote>"
    ;

CBQUOTE
    :   "</blockquote>"
    ;

//this is block element and thus can't be nested inside of
//other block elements, ex: paragraphs.
//Netscape appears to generate bad HTML vis-a-vis the standard.

HR  :   "<hr" (WS (ATTR)*)? '>'
    ;


OTABLE
    :   "<table" (WS (ATTR)*)? '>'
    ;

CTABLE
    :   "</table>"
    ;

OCAP:   "<caption" (WS (ATTR)*)? '>'
    ;

CCAP:   "</caption>"
    ;

O_TR
    :   "<tr" (WS (ATTR)*)? '>'
    ;

C_TR:   "</tr>"
    ;

O_TH_OR_TD
    :   ("<th" | "<td") (WS (ATTR)*)? '>'
    ;

C_TH_OR_TD
    :   "</th>" | "</td>"
    ;

/*  PCDATA-LEVEL ELEMENTS
*/

/*      font style elements*/

OTTYPE
    :   "<tt>"
    ;

CTTYPE
    :   "</tt>"
    ;

OITALIC
    :   "<i>"
    ;

CITALIC
    :   "</i>"
    ;

OBOLD
    :   "<b>"
    ;

CBOLD
    :   "</b>"
    ;

OUNDER
    :   "<u>"
    ;

CUNDER
    :   "</u>"
    ;

/** Left-factor <strike> and <strong> to reduce lookahead */
OSTRIKE_OR_OSTRONG
    :   "<str"
        (   "ike" {$setType(OSTRIKE);}
        |   "ong" {$setType(OSTRONG);}
        )
        '>'
    ;

// </strike>, </strong> and </style> share the prefix "</st".
CST_LEFT_FACTORED
    :   "</st"
        (   "rike" {$setType(CSTRIKE);}
        |   "rong" {$setType(CSTRONG);}
        |   "yle"  {$setType(CSTYLE);}
        )
        '>'
    ;

OSTYLE
    :   "<style>"
    ;

OBIG:   "<big>"
    ;

CBIG:   "</big>"
    ;

OSMALL
    :   "<small>"
    ;

CSMALL
    :   "</small>"
    ;

OSUB:   "<sub>"
    ;

OSUP:   "<sup>"
    ;

CSUB_OR_CSUP
    :   "</su"
        (   'b' {$setType(CSUB);}
        |   'p' {$setType(CSUP);}
        )
        '>'
    ;

/*      phrase elements*/
OEM :   "<em>"
    ;

CEM :   "</em>"
    ;

ODFN:   "<dfn>"
    ;

CDFN:   "</dfn>"
    ;

OCODE
    :   "<code>"
    ;

CCODE
    :   "</code>"
    ;

OSAMP
    :   "<samp>"
    ;

CSAMP
    :   "</samp>"
    ;

OKBD:   "<kbd>"
    ;

CKBD:   "</kbd>"
    ;

OVAR:   "<var>"
    ;

CVAR:   "</var>"
    ;

OCITE
    :   "<cite>"
    ;

CCITE
    :   "</cite>"
    ;

/*      form fields*/
INPUT
    :   "<input" (WS (ATTR)*)? '>'
    ;

OSELECT
    :   "<select" (WS (ATTR)*)? '>'
    ;

CSELECT
    :   "</select>"
    ;

OTAREA
    :   "<textarea" (WS (ATTR)*)? '>'
    ;

CTAREA
    :   "</textarea>"
    ;

SELOPT
    :   "<option" (WS (ATTR)*)? '>'
    ;

/*      special text level elements*/

OANCHOR
    :   "<a" WS (ATTR)+ '>'
    ;

CANCHOR
    :   "</a>"
    ;

IMG :   "<img" WS (ATTR)+ '>'
    ;


OAPPLET
    :   "<applet" WS (ATTR)+ '>'
    ;

CAPPLET
    :   "</applet>"
    ;

APARM
    :   "<param" WS (ATTR)+ '>'
    ;

OFORM
    :   "<form" WS (ATTR)+ '>'
    ;

OFONT
    :   "<font" WS (ATTR)+ '>'
    ;

// </form> and </font> share the prefix "</fo".
CFORM_OR_CFONT
    :   "</fo"
        (   "rm" {$setType(CFORM);}
        |   "nt" {$setType(CFONT);}
        )
        '>'
    ;

/*
CFORM
    :   "</form>"
    ;

CFONT
    :   "</font>"
    ;
*/

// <basefont ...> and <base ...> share the prefix "<base".
BFONT_OR_BASE
    :   "<base"
        (   "font" WS ATTR {$setType(BFONT);}
        |   WS ATTR        {$setType(BASE);}
        )
        '>'
    ;

/*
BFONT
    :   "<basefont" WS ATTR '>'
    ;

BASE:   "<base" WS ATTR '>'
    ;
*/

BR
    :   "<br" (WS ATTR)? '>'
    ;

OMAP
    :   "<map" WS ATTR '>'
    ;

CMAP:   "</map>"
    ;

AREA:   "<area" WS (ATTR)+ '>'
    ;

/*MISC STUFF*/

// One or more characters of document text.  Newlines of any flavour are
// counted; '<', '>' and '"' are never part of PCDATA.
PCDATA
    :   (
            /* See comment in WS.  Language for combining any flavor
             * newline is ambiguous.  Shutting off the warning.
             */
            options {
                generateAmbigWarnings=false;
            }
        :   '\r' '\n'       {newline();}
        |   '\r'            {newline();}
        |   '\n'            {newline();}
        |   ~('<'|'\n'|'\r'|'"'|'>')
        )+
    ;

// multiple-line comments
protected
COMMENT_DATA
    :   (   /*  '\r' '\n' can be matched in one alternative or by matching
                '\r' in one iteration and '\n' in another.  I am trying to
                handle any flavor of newline that comes in, but the language
                that allows both "\r\n" and "\r" and "\n" to all be valid
                newline is ambiguous.  Consequently, the resulting grammar
                must be ambiguous.  I'm shutting this warning off.
             */
            options {
                generateAmbigWarnings=false;
            }
        :
            {!(LA(2)=='-' && LA(3)=='>')}? '-' // allow '-' if not "-->"
        |   '\r' '\n'       {newline();}
        |   '\r'            {newline();}
        |   '\n'            {newline();}
        |   ~('-'|'\n'|'\r')
        )*
    ;


// Comments, plus any whitespace right after them, are skipped.
COMMENT
    :   "<!--" c:COMMENT_DATA "-->" (WS)?
        { $setType(Token.SKIP); }
    ;

/*
    PROTECTED LEXER RULES
*/

protected
WS  :   (
            /*  '\r' '\n' can be matched in one alternative or by matching
                '\r' in one iteration and '\n' in another.  I am trying to
                handle any flavor of newline that comes in, but the language
                that allows both "\r\n" and "\r" and "\n" to all be valid
                newline is ambiguous.  Consequently, the resulting grammar
                must be ambiguous.  I'm shutting this warning off.
             */
            options {
                generateAmbigWarnings=false;
            }
        :   ' '
        |   '\t'
        |   '\n'    { newline(); }
        |   "\r\n"  { newline(); }
        |   '\r'    { newline(); }
        )+
    ;

// An attribute name, optionally followed by '=' and a value: a word
// (optionally with a trailing '%'), an optionally negative integer,
// a quoted string, or a '#'-prefixed hex number.
protected
ATTR
    :   WORD (WS)? ('=' (WS)? (WORD ('%')? | ('-')? INT | STRING | HEXNUM) (WS)?)?
    ;

//don't need uppercase for case-insen.
//the '.' is for words like "image.gif"
protected
WORD:   (   LCLETTER
        |   '.'
        )

        (
            /*  In reality, a WORD must be followed by whitespace, '=', or
                what can follow an ATTR such as '>'.  In writing this grammar,
                however, we just list all the possibilities as optional
                elements.  This is loose, allowing the case where nothing is
                matched after a WORD and then the (ATTR)* loop means the
                grammar would allow "widthheight" as WORD WORD or WORD, hence,
                an ambiguity.  Naturally, ANTLR will consume the input as soon
                as possible, combining "widthheight" into one WORD.

                I am shutting off the ambiguity here because ANTLR does the
                right thing.  The exit path is ambiguous with every
                alternative.  The only solution would be to write an unnatural
                grammar (lots of extra productions) that laid out the
                possibilities explicitly, preventing the bogus WORD followed
                immediately by WORD without whitespace etc...
             */
            options {
                generateAmbigWarnings=false;
            }
        :   LCLETTER
        |   DIGIT
        |   '.'
        )+
    ;

protected
STRING
    :   '"' (~'"')* '"'
    |   '\'' (~'\'')* '\''
    ;

protected
WSCHARS
    :   ' ' | '\t' | '\n' | '\r'
    ;

protected
SPECIAL
    :   '<' | '~'
    ;

protected
HEXNUM
    :   '#' HEXINT
    ;

protected
INT :   (DIGIT)+
    ;

protected
HEXINT
    :   (
            /*  Technically, HEXINT cannot be followed by a..f, but due to our
                loose grammar, the whitespace that normally would follow this
                rule is optional.  ANTLR reports that #4FACE could parse as
                HEXINT "#4" followed by WORD "FACE", which is clearly bogus.
                ANTLR does the right thing by consuming as much input as
                possible here.  I shut the warning off.
             */
            options {
                generateAmbigWarnings=false;
            }
        :   HEXDIGIT
        )+
    ;

protected
DIGIT
    :   '0'..'9'
    ;

protected
HEXDIGIT
    :   '0'..'9'
    |   'a'..'f'
    ;

protected
LCLETTER
    :   'a'..'z'
    ;

// Filter rule (see the lexer options): an unrecognised <...> tag is
// reported on stderr together with any newlines that follow it; a bare
// newline is counted; any other single character is matched by '.'.
protected
UNDEFINED_TOKEN
    :   '<' (~'>')* '>'
        (
            (   /* the usual newline hassle: \r\n can be matched in alt 1
                 * or by matching alt 2 followed by alt 3 in another iteration.
                 */
                options {
                    generateAmbigWarnings=false;
                }
            :   "\r\n" | '\r' | '\n'
            )
            { newline();}
        )*
        {System.err.println("invalid tag: "+$getText);}
    |   ( "\r\n" | '\r' | '\n' ) {newline();}
    |   .
    ;

/*
    :   ('<'  { System.err.print("Warning: non-standard tag <" + LA(1)); } )
        (~'>' { System.err.print(LA(1)); } )*
        ('>'  { System.err.println(" skipped."); } )
        { _ttype = Token.SKIP; }
    ;
*/