1/*
2	Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
3	Alexander Hinds & Terence Parr
4	Magelang Institute, Ltd.
5	Send comments to:  parrt@jguru.com
6
7	7/4/00	Fixed bug in COMMENT_DATA that wouldn't let - appear in comment
8		Made COMMENT scarf WS after comment
9
10	v1.2	Fixed a bug APARAM->APARM in APPLET tag.
11
12	v1.1	Terence Parr (updated to 2.6.0)
13
14	Fixed CCYTE->CCITE
15	Fixed def of COMMENT_DATA so it scarfs stuff correctly.
16	Also, fixed refs to (PCDATA)? -> (PCDATA)* because a comment
17		between PCDATA returns 2 PCDATA--ya need the loop not optional.
18
19	v1.0	Terence John Parr (version 2.5.0 of ANTLR required)
20
21	Fixed how whitespace is handled, removing some ambiguities; some
22	because of ANTLR lexical filtering in 2.5.0.
23
24	Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
25	everything between valid tags (how could there be more than one
26	between tags?)
27
28	Made the DOCTYPE optional.
29
30	Reduced lookahead from k=5 to k=1 on the parser and number
31	of parser ambiguities to 2.  Reduced lexer lookahead from 6
32	to 4; had to left factor a bunch of stuff.
33
34	List items couldn't contain nested lists...fixed it.
35
36	Fixed def of WORD so it can't be an INT.  Removed '-' from WORD.
37
38	Fixed HEXNUM so it will allow letters A..F.
39
40	KNOWN ISSUES:
41
42	1.  Does not handle "staggered" tags, eg: <p> <i> <p> <i>
43
44	2.  Adheres somewhat strictly to the html spec, so many pages
45	won't parse without errors.
46
47	3.  Doesn't convert &(a signifier) to its proper single char
48	representation
49
50	4.  Checks only the syntax of element attributes, not the semantics,
51	e.g. won't verify that a base element's attribute is actually
52	called "href"
53
54	5.  Tags split across lines, for example, <A (NEWLINE) some text >
55	won't be properly recognized.  TJP: I think I fixed this.
56
57	7.  Lines not counted properly due to the def'n of PCDATA - see the
58	alternate def'n for a possible fix.  TJP: I think I fixed this.
59
60*/
61
// Parser section of the grammar: rules below this header (up to the lexer
// class) describe HTML 3.2 document structure in terms of lexer tokens.
62class HTMLParser extends Parser;
63options {
	// Export the token vocabulary under the name HTML.
64	exportVocab=HTML;
	// One token of lookahead (the file header notes this was reduced from k=5).
65	k = 1;
66}
67
68
// Start rule.  Every component is optional: leading text, a DOCTYPE, the
// <html> open tag, the head, the body and the </html> close tag, each
// optionally followed by a single PCDATA token.
69document
70	: 	(PCDATA)? (DOCTYPE (PCDATA)?)?
71		(OHTML (PCDATA)?)?
72		(head)?
73		(body)?
74		(CHTML (PCDATA)?)?
75	;
76
// Document head.  The <head> and </head> tags are optional, but at least one
// head_element is required; after it, PCDATA and further head elements may be
// freely interleaved.
77head: 	(OHEAD (PCDATA)?)?
78		head_element
79		(PCDATA | head_element)*
80		(CHEAD (PCDATA)?)?
81	;
82
// One element allowed inside the head.  As the inline note says, the grammar
// does not enforce that a title is present; any alternative satisfies head.
83head_element
84	:	title	//bug need at least a title, rest optional
85	|	script
86	|	style
87	|	ISINDEX
88	|	BASE
89	|	META
90	|	LINK
91	;
92
// <title>, at most one PCDATA token, </title>.
93title
94	:	OTITLE (PCDATA)? CTITLE
95	;
96
// <script> ... </script>: accepts any tokens other than CSCRIPT in between.
// The (...)+ loop requires at least one token, so an empty element does not match.
97script
98	:	OSCRIPT (~CSCRIPT)+ CSCRIPT
99	;
100
// <style> ... </style>: same shape as script, terminated by CSTYLE.
101style
102	:	OSTYLE (~CSTYLE)+ CSTYLE
103	;
104
// Document body.  The <body> and </body> tags are optional.  The first content
// item must be a tag-based element (body_content_no_PCDATA), and it must be
// followed by at least one more body_content item because of the (...)+ loop.
105body: 	( OBODY (PCDATA)* )?
106		body_content_no_PCDATA
107		( body_content )+
108		( CBODY (PCDATA)* )?
109	;
110
// Body content that starts with a tag: a body_tag or a text_tag, never plain PCDATA.
111body_content_no_PCDATA
112	:	body_tag | text_tag
113	;
114
// Block-level body constructs: headings, block elements and address.
115body_tag
116	: 	heading | block | address
117	;
118
// Anything that may appear in the body: a body_tag or text (which includes PCDATA).
119body_content
120	: 	body_tag | text
121	;
122
123
124/*revised*/
// Any of the six heading levels.
125heading
126	:	h1 | h2 | h3 | h4 | h5 | h6
127	;
128
// Block-level elements.  HR is a bare token; the others are parser rules.
// Note that form is not listed here; it is reached through text_tag instead.
129block
130	:	paragraph | list | preformatted | div |
131		center | blockquote | HR | table
132	;	//bug - ?FORM v %form, ISINDEX here too?
133
// Font-style elements (<tt>, <i>, <b>, <u>, <strike>, <big>, <small>, <sub>, <sup>).
134font:	teletype | italic | bold | underline | strike |
135		big | small | subscript | superscript
136	;
137
// Phrase elements (<em>, <strong>, <dfn>, <code>, <samp>, <kbd>, <var>, <cite>).
138phrase
139	:	emphasize | strong | definition | code | sample_output|
140		keyboard_text | variable | citation
141	;
142
// Special text-level elements.  IMG, BFONT and BR are single tokens.
143special
144	:	anchor | IMG | applet | font_dfn | BFONT |
145		map | BR
146	;
147
// Any tag-based text-level element, including form.
148text_tag
149	:	font | phrase | special | form
150	;
151
// Text-level content: raw character data or a text_tag.
152text:	PCDATA | text_tag
153	;
154
155/*end*/
156
157
158/*BLOCK ELEMENTS*/
159
// Headings h1..h6 share one shape: the open tag, any mix of block and text
// content (possibly none), and a mandatory matching close tag.
160h1	:	OH1 (block | text)* CH1
161	;
162h2	:	OH2 (block | text)* CH2
163	;
164h3	:	OH3 (block | text)* CH3
165	;
166h4	:	OH4 (block | text)* CH4
167	;
168h5	:	OH5 (block | text)* CH5
169	;
170h6	:	OH6 (block | text)* CH6
171	;
172
// Content permitted inside <address>: text or a paragraph.
173address_content
174	:	text
175	|	paragraph
176	;
177
// <address> element.  Exactly one address_content item is accepted between the
// tags; there is no loop here, so an empty or multi-item address does not match.
178address
179	:	OADDRESS address_content CADDRESS
180	;
181
182//NOTE:  according to the standard, paragraphs can't contain block elements
183//like HR.  Netscape may insert these elements into paragraphs.
184//We adhere strictly here.
185
// Paragraph: <p>, zero or more text items, then an optional </p>.
186paragraph
187	:	OPARA
188		(
189			/*	Rule body_content may also be just plain text because HTML is
190				so loose.  When body puts body_content in a loop, ANTLR
191				doesn't know whether you want it to match all the text as part
192				of this paragraph (in the case where the </p> is missing) or
193				if the body rule should scarf it.  This is analogous to the
194				dangling-else clause.  I shut off the warning.
195			*/
196			options {
197				generateAmbigWarnings=false;
198			}
199		:	text
200		)*
201		(CPARA)?
202	;
203
// Any list element.  Only <ul>, <ol> and <dl> are reachable from here; the
// dir and menu rules defined later are not listed as alternatives.
204list:	unordered_list
205	|	ordered_list
206	|	def_list
207	;
208
// <ul>: optional PCDATA, one or more list items, mandatory </ul>.
209unordered_list
210	:	OULIST (PCDATA)* (list_item)+ CULIST
211	;
212
// <ol>: optional PCDATA, one or more list items, mandatory </ol>.
213ordered_list
214	:	OOLIST (PCDATA)* (list_item)+ COLIST
215	;
216
// <dl>: optional PCDATA, one or more dt/dd items, mandatory </dl>.
217def_list
218	:	ODLIST (PCDATA)* (def_list_item)+ CDLIST
219	;
220
// <li>: one or more text items or nested lists; the </li> close tag (with any
// PCDATA that trails it) is optional.
221list_item
222	:	OLITEM ( text | list )+ (CLITEM (PCDATA)*)?
223	;
224
// A definition-list entry: either a term or a description.
225def_list_item
226	:	dt | dd
227	;
228
// <dt>: one or more text items, a mandatory </dt>, then any trailing PCDATA.
229	dt	:	ODTERM (text)+ CDTERM (PCDATA)*
230	;
231
// <dd>: one or more text or block items, a mandatory </dd>, then any trailing
// PCDATA.  The close token is CDDEF ("</dd>"), the counterpart of ODDEF;
// CDTERM ("</dt>") closes dt only.
232	dd	:	ODDEF (text | block)+ CDDEF (PCDATA)*
233		;
234
// <dir> list: one or more list items.  No other parser rule in this file
// references dir, so it is not reachable from document as written.
235	dir	:	ODIR (list_item)+ CDIR
236	;
237
// <menu> list: one or more list items.  Like dir, it is not referenced by any
// other parser rule in this file.
238menu:	OMENU (list_item)+ CMENU
239	;
240
// Preformatted text (OPRE/CPRE cover both <pre> and <xmp>); at least one text
// item is required.
241preformatted
242	:	OPRE (text)+ CPRE
243	;
244
// <div>: any amount of body content, mandatory close tag.
245	div	:	ODIV (body_content)* CDIV		//semi-revised
246	;
247
// <center>: any amount of body content, mandatory close tag.
248center
249	:	OCENTER (body_content)* CCENTER //semi-revised
250	;
251
// <blockquote>: any amount of body content, mandatory close tag.
252blockquote
253	:	OBQUOTE (body_content)* CBQUOTE
254	;
255
// <form>: any mix of form fields and body content, mandatory </form>.
256form:	OFORM (form_field | body_content)* CFORM
257	;
258
// <table>: optional caption, optional PCDATA, one or more rows, mandatory </table>.
259table
260	:	OTABLE (caption)? (PCDATA)* (tr)+ CTABLE
261	;
262
// <caption>: zero or more text items, mandatory </caption>.
263caption
264	:	OCAP (text)* CCAP
265	;
266
// Table row: optional PCDATA, zero or more cells; the </tr> close tag (with
// any PCDATA that trails it) is optional.
267	tr	:	O_TR (PCDATA)* (th_or_td)* (C_TR (PCDATA)*)?
268	;
269
// Table cell (<th> or <td>, not distinguished): any body content; the close
// tag (with any PCDATA that trails it) is optional.
270th_or_td
271	:	O_TH_OR_TD (body_content)* (C_TH_OR_TD (PCDATA)*)?
272	;
273
274/*TEXT ELEMENTS*/
275
276/*font style*/
277
// Every font-style and phrase rule below has the same shape: the open tag,
// one or more text items (an empty element does not match), and a mandatory
// matching close tag.
278teletype
279	:	OTTYPE ( text )+ CTTYPE
280	;
281
282italic
283	:	OITALIC ( text )+ CITALIC
284	;
285
286bold:	OBOLD ( text )+ CBOLD
287	;
288
289underline
290	:	OUNDER ( text )+ CUNDER
291	;
292
293strike
294	:	OSTRIKE ( text )+ CSTRIKE
295	;
296
297	big	:	OBIG ( text )+ CBIG
298	;
299
300small
301	:	OSMALL ( text )+ CSMALL
302	;
303
304subscript
305	:	OSUB ( text )+ CSUB
306	;
307
308superscript
309	:	OSUP ( text )+ CSUP
310	;
311
312	/*phrase elements*/
313
314emphasize
315	:	OEM ( text )+ CEM
316	;
317
318strong
319	:	OSTRONG ( text )+ CSTRONG
320	;
321
322definition
323	:	ODFN ( text )+ CDFN
324	;
325
326code
327	:	OCODE ( text )+ CCODE
328	;
329
330sample_output
331	:	OSAMP ( text )+ CSAMP
332	;
333
334keyboard_text
335	:	OKBD ( text )+ CKBD
336	;
337
338variable
339	:	OVAR ( text )+ CVAR
340	;
341
342citation
343	:	OCITE ( text )+ CCITE
344	;
345
346/*	form fields (combined with body_content elsewhere so no PCDATA on end) */
// A form control: the INPUT token, a select element or a textarea element.
347form_field
348	:	INPUT | select | textarea
349	;
350
// <select>: optional PCDATA, one or more options, mandatory </select>.
351select
352	:	OSELECT (PCDATA)* (select_option)+ CSELECT
353	;
354
// <option> tag followed by its PCDATA; no close tag is matched.
355select_option
356	:	SELOPT (PCDATA)*
357	;
358
// <textarea>: PCDATA only, mandatory </textarea>.
359textarea
360	:	OTAREA (PCDATA)* CTAREA
361	;
362
363/*	special text level elements*/
// <a ...>: zero or more text items, mandatory </a>.
364anchor
365	:	OANCHOR (text)* CANCHOR
366	;
367
// <applet ...>: at most one <param> token, then PCDATA, then </applet>.
368applet
369	:	OAPPLET (APARM)? (PCDATA)* CAPPLET
370	;
371
372//not w3-no blocks allowed; www.microsoft.com uses
// <font ...>: zero or more text items, mandatory </font>.
373font_dfn
374	:	OFONT (text)* CFONT
375	;
376
// <map ...>: one or more AREA tokens, mandatory </map>.
377map	:	OMAP (AREA)+ CMAP
378	;
379
// Lexer section of the grammar: turns characters into the tag and text tokens
// used by the parser rules.
380class HTMLLexer extends Lexer;
381options {
	// Four characters of lookahead (the file header notes this was reduced from 6).
382	k = 4;
	// Export the token vocabulary under the name HTML.
383	exportVocab=HTML;
	// Characters the lexer accepts: octal \3 through \377.
384	charVocabulary = '\3'..'\377';
	// Case-insensitive matching; the literals below are written in lower case only.
385	caseSensitive=false;
	// Filter mode: input that matches no token rule is handed to UNDEFINED_TOKEN.
386	filter=UNDEFINED_TOKEN;
387}
388
389
390/*	STRUCTURAL tags
391*/
392
// <!doctype html public "..." ["..."]>: one required quoted string and an
// optional second one, with optional whitespace around them.
393DOCTYPE
394	: "<!doctype" WS "html" WS "public" (WS)? STRING (WS)? (STRING (WS)?)? '>'
395	;
396
// <html>, </html>, <head> and </head> are matched only in their bare form
// (no attributes, no inner whitespace).
397OHTML
398 	: 	"<html>"
399	;
400
401CHTML
402	: 	"</html>"
403	;
404
405OHEAD
406	: 	"<head>"
407	;
408
409CHEAD
410	: 	"</head>"
411	;
412
// <body> with optional whitespace followed by any number of attributes.
413OBODY
414	:	"<body" (WS (ATTR )*)? '>'
415	;
416
417CBODY
418	:	"</body>"
419	;
420
421
422/*	HEAD ELEMENTS
423*/
424
// <title>, </title>, <script> and </script> are matched only in their bare
// form; a tag carrying attributes does not match these rules.
425OTITLE
426	: "<title>"
427	;
428
429CTITLE
430	: "</title>"
431	;
432
433
434OSCRIPT
435	: 	"<script>"
436	;
437
438CSCRIPT
439	:	"</script>"
440	;
441
// <isindex ...> requires whitespace and exactly one attribute.
442ISINDEX
443 	: 	"<isindex" WS ATTR '>'
444	;
445
// <meta ...> and <link ...> require whitespace and at least one attribute.
446META
447	: 	"<meta" WS (ATTR)+ '>'
448	;
449
450LINK
451	:	"<link" WS (ATTR)+ '>'
452	;
453
454
455/* headings */
456
// Heading open tags accept at most one attribute (preceded by whitespace);
// heading close tags are matched in bare form only.
457OH1	:	"<h1" (WS ATTR)? '>'
458	;
459
460CH1	:	"</h1>"
461	;
462
463OH2	:	"<h2" (WS ATTR)?'>'
464	;
465
466CH2	:	"</h2>"
467	;
468
469OH3	:	"<h3" (WS ATTR)? '>'
470	;
471
472CH3	:	"</h3>"
473	;
474
475OH4	:	"<h4" (WS ATTR)? '>'
476	;
477
478CH4	:	"</h4>"
479	;
480
481OH5	:	"<h5" (WS ATTR)? '>'
482	;
483
484CH5	:	"</h5>"
485	;
486
487OH6	:	"<h6" (WS ATTR)? '>'
488	;
489
490CH6	:	"</h6>"
491	;
492
// <address> and </address>, bare form only.
493OADDRESS
494	:	"<address>"
495	;
496
497CADDRESS
498	:	"</address>"
499	;
500
// <p> with at most one attribute.
501OPARA
502	:	"<p" (WS ATTR)? '>'
503	;
504
// </p>; the parser's paragraph rule treats this token as optional.
505CPARA
506	: 	"</p>"		//it's optional
507	;
508
509		/*UNORDERED LIST*/
// List open tags (<ul>, <ol>, <li>, <dl>) accept at most one attribute; their
// close tags are matched in bare form only.
510OULIST
511	:	"<ul" (WS ATTR)? '>'
512	;
513
514CULIST
515	:	"</ul>"
516	;
517
518		/*ORDERED LIST*/
519OOLIST
520	:	"<ol" (WS ATTR)? '>'
521	;
522
523COLIST
524	:	"</ol>"
525	;
526
527		/*LIST ITEM*/
528
529OLITEM
530	:	"<li" (WS ATTR)? '>'
531	;
532
533CLITEM
534	:	"</li>"
535	;
536
537		/*DEFINITION LIST*/
538
539ODLIST
540	:	"<dl" (WS ATTR)? '>'
541	;
542
543CDLIST
544	:	"</dl>"
545	;
546
// <dt>, </dt>, <dd> and </dd>, bare form only.
547ODTERM
548	: 	"<dt>"
549	;
550
551CDTERM
552	: 	"</dt>"
553	;
554
555ODDEF
556	: 	"<dd>"
557	;
558
559CDDEF
560	: 	"</dd>"
561	;
562
563ODIR:	"<dir>"
564	;
565
// </dir> and </div> share the prefix "</di", so they are left-factored into
// one rule; the action sets the emitted token type to CDIR or CDIV.
566CDIR_OR_CDIV
567	:	"</di"
568		(	'r' {$setType(CDIR);}
569		|	'v' {$setType(CDIV);}
570		)
571		'>'
572	;
573
// <div> with at most one attribute.
574ODIV:	"<div" (WS ATTR)? '>'
575	;
576
577OMENU
578	:	"<menu>"
579	;
580
581CMENU
582	:	"</menu>"
583	;
584
// <pre> or <xmp>, plus one immediately following '\n' if present.
// NOTE(review): that '\n' is consumed without a newline() call, unlike the
// newline handling in PCDATA and WS -- confirm whether line counts drift here.
585OPRE:	("<pre>" | "<xmp>") ('\n')?
586	;
587
// </pre> or </xmp>; both produce the same CPRE token.
588CPRE:	 "</pre>" | "</xmp>"
589	;
590
591OCENTER
592	:	"<center>"
593	;
594
595CCENTER
596	:	"</center>"
597	;
598
599OBQUOTE
600	:	"<blockquote>"
601	;
602
603CBQUOTE
604	:	"</blockquote>"
605	;
606
607//this is block element and thus can't be nested inside of
608//other block elements, ex: paragraphs.
609//Netscape appears to generate bad HTML vis-a-vis the standard.
610
// <hr> with optional whitespace followed by any number of attributes.
611HR	:	"<hr" (WS (ATTR)*)? '>'
612	;
613
614
// Table open tags (<table>, <caption>, <tr>, <th>, <td>) accept optional
// whitespace followed by any number of attributes; close tags are bare.
615OTABLE
616	:	"<table" (WS (ATTR)*)? '>'
617	;
618
619CTABLE
620	: 	"</table>"
621	;
622
623OCAP:	"<caption" (WS (ATTR)*)? '>'
624	;
625
626CCAP:	"</caption>"
627	;
628
629O_TR
630	:	"<tr" (WS (ATTR)*)? '>'
631	;
632
633C_TR:	"</tr>"
634	;
635
// <th> and <td> produce the same token; header and data cells are not distinguished.
636O_TH_OR_TD
637	:	("<th" | "<td") (WS (ATTR)*)? '>'
638	;
639
640C_TH_OR_TD
641	:	"</th>" | "</td>"
642	;
643
644/*	PCDATA-LEVEL ELEMENTS
645*/
646
647/*		font style elements*/
648
// Font-style and phrase tags in this section are matched in bare form only
// (no attributes, no whitespace inside the tag).
649OTTYPE
650	:	"<tt>"
651	;
652
653CTTYPE
654	:	"</tt>"
655	;
656
657OITALIC
658	:	"<i>"
659	;
660
661CITALIC
662	:	"</i>"
663	;
664
665OBOLD
666 	:	"<b>"
667	;
668
669CBOLD
670	:	"</b>"
671	;
672
673OUNDER
674	:	"<u>"
675	;
676
677CUNDER
678	:	"</u>"
679	;
680
681/** Left-factor <strike> and <strong> to reduce lookahead */
// The action sets the emitted token type to OSTRIKE or OSTRONG.
682OSTRIKE_OR_OSTRONG
683	:	"<str"
684		(	"ike" {$setType(OSTRIKE);}
685		|	"ong" {$setType(OSTRONG);}
686		)
687		'>'
688	;
689
// </strike>, </strong> and </style> share the prefix "</st"; the action sets
// the emitted token type to CSTRIKE, CSTRONG or CSTYLE.
690CST_LEFT_FACTORED
691	:	"</st"
692		(	"rike" {$setType(CSTRIKE);}
693		|	"rong" {$setType(CSTRONG);}
694		|	"yle"  {$setType(CSTYLE);}
695		)
696		'>'
697	;
698
699OSTYLE
700 	: 	"<style>"
701	;
702
703OBIG:	"<big>"
704	;
705
706CBIG:	"</big>"
707	;
708
709OSMALL
710	:	"<small>"
711	;
712
713CSMALL
714	:	"</small>"
715	;
716
717OSUB:	"<sub>"
718	;
719
720OSUP:	"<sup>"
721	;
722
// </sub> and </sup> share the prefix "</su"; the action sets the emitted
// token type to CSUB or CSUP.
723CSUB_OR_CSUP
724	:	"</su"
725		(	'b' {$setType(CSUB);}
726		|	'p' {$setType(CSUP);}
727		)
728		'>'
729	;
730
731/*		phrase elements*/
732OEM	:	"<em>"
733	;
734
735CEM	:	"</em>"
736	;
737
738ODFN:	"<dfn>"
739	;
740
741CDFN:	"</dfn>"
742	;
743
744OCODE
745 	:	"<code>"
746	;
747
748CCODE
749	:	"</code>"
750	;
751
752OSAMP
753	:	"<samp>"
754	;
755
756CSAMP
757	:	"</samp>"
758	;
759
760OKBD:	"<kbd>"
761	;
762
763CKBD:	"</kbd>"
764	;
765
766OVAR:	"<var>"
767	;
768
769CVAR:	"</var>"
770	;
771
772OCITE
773	:	"<cite>"
774	;
775
776CCITE
777	:	"</cite>"
778	;
779
780/* form fields*/
// Form-field open tags accept optional whitespace followed by any number of
// attributes; their close tags are matched in bare form only.
781INPUT
782	:	"<input" (WS (ATTR)*)? '>'
783	;
784
785OSELECT
786	:	"<select" (WS (ATTR)*)? '>'
787	;
788
789CSELECT
790	:	"</select>"
791	;
792
793OTAREA
794	:	"<textarea" (WS (ATTR)*)? '>'
795	;
796
797CTAREA
798	:	"</textarea>"
799	;
800
801SELOPT
802	:	"<option" (WS (ATTR)*)? '>'
803	;
804
805/* special text level elements*/
806
// <a>, <img>, <applet>, <param>, <form> and <font> all require whitespace and
// at least one attribute; a bare tag such as "<a>" does not match.
807OANCHOR
808	:	"<a" WS (ATTR)+ '>'
809	;
810
811CANCHOR
812	:	"</a>"
813	;
814
815IMG	:	"<img" WS (ATTR)+ '>'
816	;
817
818
819OAPPLET
820	:	"<applet" WS (ATTR)+ '>'
821	;
822
823CAPPLET
824	:	"</applet>"
825	;
826
827APARM
828	:	"<param" WS (ATTR)+ '>'
829	;
830
831OFORM
832	:	"<form" WS (ATTR)+ '>'
833	;
834
835OFONT
836	:	"<font" WS (ATTR)+ '>'
837	;
838
// </form> and </font> share the prefix "</fo"; the action sets the emitted
// token type to CFORM or CFONT.  The commented-out rules below are the
// un-factored originals.
839CFORM_OR_CFONT
840	:	"</fo"
841		(	"rm" {$setType(CFORM);}
842		|	"nt" {$setType(CFONT);}
843		)
844		'>'
845	;
846
847/*
848CFORM
849	:	"</form>"
850	;
851
852CFONT
853	:	"</font>"
854	;
855*/
856
// <basefont ...> and <base ...> share the prefix "<base"; each requires
// whitespace and exactly one attribute.  The action sets the emitted token
// type to BFONT or BASE.  The commented-out rules below are the un-factored
// originals.
857BFONT_OR_BASE
858	:	"<base"
859		(	"font" WS ATTR {$setType(BFONT);}
860		|	WS ATTR        {$setType(BASE);}
861		)
862		'>'
863	;
864
865/*
866BFONT
867	:	"<basefont" WS ATTR '>'
868	;
869
870BASE: 	"<base" WS ATTR '>'
871	;
872*/
873
// <br> with at most one attribute.
874BR
875	:	"<br" (WS ATTR)? '>'
876	;
877
// <map ...> requires whitespace and exactly one attribute.
878OMAP
879	:	"<map" WS ATTR '>'
880	;
881
882CMAP:	"</map>"
883	;
884
// <area ...> requires whitespace and at least one attribute.
885AREA:	"<area" WS (ATTR)+ '>'
886	;
887
888/*MISC STUFF*/
889
// Character data between tags: one or more characters, where each newline
// flavor bumps the line counter via newline().  The characters '<', '>' and
// '"' are excluded, so they never become part of a PCDATA token.
890PCDATA
891	:	(
892			/* See comment in WS.  Language for combining any flavor
893			 * newline is ambiguous.  Shutting off the warning.
894			 */
895			options {
896				generateAmbigWarnings=false;
897			}
898		:	'\r' '\n'		{newline();}
899		|	'\r'			{newline();}
900		|	'\n'			{newline();}
901		|	~('<'|'\n'|'\r'|'"'|'>')
902		)+
903	;
904
905// multiple-line comments
// Body of an HTML comment (helper rule, callable only from other lexer rules).
// A '-' is consumed only while the semantic predicate holds, i.e. while the
// next two characters are not "->"; that lets the loop stop at the "-->"
// terminator.  Newlines inside the comment are counted.
906protected
907COMMENT_DATA
908	:	(	/*	'\r' '\n' can be matched in one alternative or by matching
909				'\r' in one iteration and '\n' in another.  I am trying to
910				handle any flavor of newline that comes in, but the language
911				that allows both "\r\n" and "\r" and "\n" to all be valid
912				newline is ambiguous.  Consequently, the resulting grammar
913				must be ambiguous.  I'm shutting this warning off.
914			 */
915			options {
916				generateAmbigWarnings=false;
917			}
918		:
919			{!(LA(2)=='-' && LA(3)=='>')}? '-' // allow '-' if not "-->"
920		|	'\r' '\n'		{newline();}
921		|	'\r'			{newline();}
922		|	'\n'			{newline();}
923		|	~('-'|'\n'|'\r')
924		)*
925	;
926
927
// A complete <!-- ... --> comment plus any whitespace that follows it.  The
// token type is set to Token.SKIP, so it is not passed on to the parser.
928COMMENT
929	:	"<!--" c:COMMENT_DATA "-->" (WS)?
930		{ $setType(Token.SKIP); }
931	;
932
933/*
934	PROTECTED LEXER RULES
935*/
936
// One or more whitespace characters (space, tab, or any newline flavor);
// every newline alternative bumps the line counter.
937protected
938WS	:	(
939			/*	'\r' '\n' can be matched in one alternative or by matching
940				'\r' in one iteration and '\n' in another.  I am trying to
941				handle any flavor of newline that comes in, but the language
942				that allows both "\r\n" and "\r" and "\n" to all be valid
943				newline is ambiguous.  Consequently, the resulting grammar
944				must be ambiguous.  I'm shutting this warning off.
945			 */
946			options {
947				generateAmbigWarnings=false;
948			}
949		:	' '
950		|	'\t'
951		|	'\n'	{ newline(); }
952		|	"\r\n"	{ newline(); }
953		|	'\r'	{ newline(); }
954		)+
955	;
956
// A tag attribute: a name, optionally followed by '=' and a value.  The value
// is a WORD (optionally suffixed with '%'), an optionally negative INT, a
// quoted STRING, or a '#'-prefixed HEXNUM.  Whitespace is allowed around '='
// and after the value.
957protected
958ATTR
959	:       WORD (WS)? ('=' (WS)? (WORD ('%')? | ('-')? INT | STRING | HEXNUM) (WS)?)?
960	;
961
962//don't need uppercase for case-insen.
963//the '.' is for words like "image.gif"
// A word starts with a letter or '.', followed by one or more letters, digits
// or dots.  Because the trailing loop is (...)+, a WORD is at least two
// characters long.
964protected
965WORD:	(	LCLETTER
966		|	'.'
967		)
968
969		(
970			/*	In reality, a WORD must be followed by whitespace, '=', or
971				what can follow an ATTR such as '>'.  In writing this grammar,
972				however, we just list all the possibilities as optional
973				elements.  This is loose, allowing the case where nothing is
974				matched after a WORD and then the (ATTR)* loop means the
975				grammar would allow "widthheight" as WORD WORD or WORD, hence,
976				an ambiguity.  Naturally, ANTLR will consume the input as soon
977				as possible, combining "widthheight" into one WORD.
978
979				I am shutting off the ambiguity here because ANTLR does the
980				right thing.  The exit path is ambiguous with every
981				alternative.  The only solution would be to write an unnatural
982				grammar (lots of extra productions) that laid out the
983				possibilities explicitly, preventing the bogus WORD followed
984				immediately by WORD without whitespace etc...
985			 */
986			options {
987				generateAmbigWarnings=false;
988			}
989		:	LCLETTER
990		|	DIGIT
991		|	'.'
992		)+
993	;
994
// A double- or single-quoted string with no escape mechanism.
// NOTE(review): newlines inside the quotes are consumed without a newline()
// call -- confirm whether multi-line attribute values skew line counts.
995protected
996STRING
997	:	'"' (~'"')* '"'
998	|	'\'' (~'\'')* '\''
999	;
1000
// A single whitespace character.  Not referenced by any rule in this file.
1001protected
1002WSCHARS
1003	:	' ' | '\t' | '\n' | '\r'
1004	;
1005
// Either '<' or '~'.  Not referenced by any rule in this file.
1006protected
1007SPECIAL
1008	:	'<' | '~'
1009	;
1010
// '#' followed by hex digits, e.g. a color value.
1011protected
1012HEXNUM
1013	:	'#' HEXINT
1014	;
1015
// One or more decimal digits.
1016protected
1017INT	:	(DIGIT)+
1018	;
1019
// One or more hex digits.
1020protected
1021HEXINT
1022	:	(
1023			/*	Technically, HEXINT cannot be followed by a..f, but due to our
1024				loose grammar, the whitespace that normally would follow this
1025				rule is optional.  ANTLR reports that #4FACE could parse as
1026				HEXINT "#4" followed by WORD "FACE", which is clearly bogus.
1027				ANTLR does the right thing by consuming as much input as
1028				possible here.  I shut the warning off.
1029			 */
1030			 options {
1031				generateAmbigWarnings=false;
1032			}
1033		:	HEXDIGIT
1034		)+
1035	;
1036
1037protected
1038DIGIT
1039	:	'0'..'9'
1040	;
1041
// Only lower-case a..f is listed; the lexer is declared caseSensitive=false.
1042protected
1043HEXDIGIT
1044	:	'0'..'9'
1045	|	'a'..'f'
1046	;
1047
// Only lower-case letters are listed; the lexer is declared caseSensitive=false.
1048protected
1049LCLETTER
1050	:	'a'..'z'
1051	;
1052
// Filter rule named in the lexer options (filter=UNDEFINED_TOKEN).  Three cases:
//   1. '<' ... '>' : an unrecognized tag.  Everything up to the first '>' is
//      consumed along with any newlines that directly follow, and
//      "invalid tag: <text>" is printed to System.err.
//   2. a newline in any flavor: counted via newline().
//   3. any other single character: consumed silently.
// NOTE(review): newlines inside the tag (matched by ~'>') are consumed without
// a newline() call -- confirm whether that affects line counts.
1053protected
1054UNDEFINED_TOKEN
1055	:	'<' (~'>')* '>'
1056		(
1057			(	/* the usual newline hassle: \r\n can be matched in alt 1
1058				 * or by matching alt 2 followed by alt 3 in another iteration.
1059				 */
1060				 options {
1061					generateAmbigWarnings=false;
1062				}
1063			:	"\r\n" | '\r' | '\n'
1064			)
1065			{ newline();}
1066		)*
1067		{System.err.println("invalid tag: "+$getText);}
1068	|	( "\r\n" | '\r' | '\n' ) {newline();}
1069	|	.
1070	;
1071
1072/*
1073	:	('<'  { System.err.print("Warning: non-standard tag <" + LA(1)); } )
1074		(~'>' { System.err.print(LA(1)); } )*
1075		('>'  { System.err.println(" skipped."); } )
1076		{ _ttype = Token.SKIP; }
1077	;
1078*/
1079