1 /* Parser.java -- HTML parser.
2    Copyright (C) 2005 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package gnu.javax.swing.text.html.parser.support;
40 
41 import gnu.java.lang.CPStringBuilder;
42 
43 import gnu.javax.swing.text.html.parser.htmlAttributeSet;
44 import gnu.javax.swing.text.html.parser.htmlValidator;
45 import gnu.javax.swing.text.html.parser.support.low.Constants;
46 import gnu.javax.swing.text.html.parser.support.low.ParseException;
47 import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
48 import gnu.javax.swing.text.html.parser.support.low.Token;
49 import gnu.javax.swing.text.html.parser.support.low.node;
50 import gnu.javax.swing.text.html.parser.support.low.pattern;
51 
52 import java.io.IOException;
53 import java.io.Reader;
54 
55 import java.util.Comparator;
56 import java.util.Set;
57 import java.util.TreeSet;
58 import java.util.Vector;
59 
60 import javax.swing.text.ChangedCharSetException;
61 import javax.swing.text.SimpleAttributeSet;
62 import javax.swing.text.html.HTML;
63 import javax.swing.text.html.parser.AttributeList;
64 import javax.swing.text.html.parser.DTD;
65 import javax.swing.text.html.parser.DTDConstants;
66 import javax.swing.text.html.parser.Element;
67 import javax.swing.text.html.parser.Entity;
68 import javax.swing.text.html.parser.TagElement;
69 
70 /**
71  * <p>A simple error-tolerant HTML parser that uses a DTD document
72  * to access data on the possible tokens, arguments and syntax.</p>
73  * <p> The parser reads an HTML content from a Reader and calls various
74  * notifying methods (which should be overridden in a subclass)
75  * when tags or data are encountered.</p>
76  * <p>Some HTML elements need no opening or closing tags. The
77  * task of this parser is to invoke the tag handling methods also when
78  * the tags are not explicitly specified and must be supposed using
79  * information, stored in the DTD.
80  * For  example, parsing the document
81  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
 * will invoke the handling methods in exactly the same order
83  * (and with the same parameters) as if parsing the document: <br>
84  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
85  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
86  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
87  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
88  * (supposed tags are given in italics). The parser also supports
89  * obsolete elements of HTML syntax.<p>
90  * </p>
91  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
92  */
93 public class Parser
94   extends ReaderTokenizer
95   implements DTDConstants
96 {
97   /**
98    * The current html tag.
99    */
100   public Token hTag = new Token();
101 
102   /**
103    * The document template description that will be used to parse the documents.
104    */
105   protected DTD dtd;
106 
107   /**
108    * The value of this field determines whether or not the Parser will be
109    * strict in enforcing SGML compatibility. The default value is false,
110    * stating that the parser should do everything to parse and get at least
111    * some information even from the incorrectly written HTML input.
112    */
113   protected boolean strict;
114 
115   /**
116    * This fields has positive values in preformatted tags.
117    */
118   protected int preformatted = 0;
119 
120   /**
121    * The set of the document tags. This field is used for supporting
122    * markFirstTime().
123    */
124   private Set documentTags =
125     new TreeSet(new Comparator()
126       {
127         public int compare(Object a, Object b)
128         {
129           return ((String) a).compareToIgnoreCase((String) b);
130         }
131       }
132                );
133 
134   /**
135   * The buffer to collect the incremental output like text or coment.
136   */
137   private final StringBuffer buffer = new StringBuffer();
138 
139   /**
140    * The buffer to store the document title.
141    */
142   private final StringBuffer title = new StringBuffer();
143 
144   /**
145    * The current token.
146    */
147   private Token t;
148 
149   /**
150    * True means that the 'title' tag of this document has
151    * already been handled.
152    */
153   private boolean titleHandled;
154 
155   /**
156    * True means that the 'title' tag is currently open and all
157    * text is also added to the title buffer.
158    */
159   private boolean titleOpen;
160 
161   /**
162    * The attributes of the current HTML element.
163    * Package-private to avoid an accessor method.
164    */
165   htmlAttributeSet attributes =
166     htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
167 
168   /**
169    * The validator, controlling the forcible closing of the tags that
170    * (in accordance to dtd) are not allowed in the current context.
171    */
172   private htmlValidator validator;
173 
174   /**
175    * Provides the default values for parameters in the case when these
176    * values are defined in the DTD.
177    */
178   private parameterDefaulter defaulter;
179 
180   /**
181    * The text pre-processor for handling line ends and tabs.
182    */
183   private textPreProcessor textProcessor = new textPreProcessor();
184 
185   /**
186    * Creates a new Parser that uses the given
187    * {@link javax.swing.text.html.parser.DTD }. The only standard way
188    * to get an instance of DTD is to construct it manually, filling in
189    * all required fields.
190    * @param a_dtd The DTD to use. The parser behaviour after passing null
191    * as an argument is not documented and may vary between implementations.
192    */
Parser(DTD a_dtd)193   public Parser(DTD a_dtd)
194   {
195     if (a_dtd == null)
196       dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
197     else
198       dtd = a_dtd;
199 
200     defaulter = new parameterDefaulter(dtd);
201 
202     validator =
203       new htmlValidator(dtd)
204         {
205           /**
206            * Handles the error message. This method must be overridden to pass
207            * the message where required.
208            * @param msg The message text.
209            */
210           protected void s_error(String msg)
211           {
212             error(msg);
213           }
214 
215           /**
216            * The method is called when the tag validator decides to close the
217            * tag on its own initiative. After reaching the end of stream,
218            * The tag validator closes all unclosed elements that are required
219            * to have the end (closing) tag.
220            *
221            * @param tElement The tag being fictionally (forcibly) closed.
222            */
223           protected void handleSupposedEndTag(Element tElement)
224           {
225             // The tag is cloned as the original tElement is the
226             // element from the starting tag - may be accidently used
227             // somewhere else.
228             TagElement tag = makeTag(tElement, true);
229             _handleEndTag_remaining(tag);
230           }
231 
232           /**
233            * The method is called when the the tag validator decides to open
234            * the new tag on its own initiative. The tags, opened in this
235            * way, are HTML, HEAD and BODY. The attribute set is temporary
236            * assigned to the empty one, the previous value is
237            * restored before return.
238            *
239            * @param tElement The tag being fictionally (forcibly) closed.
240            */
241           protected void handleSupposedStartTag(Element tElement)
242           {
243             TagElement tag = makeTag(tElement, true);
244             htmlAttributeSet were = attributes;
245             attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
246             _handleStartTag(tag);
247             attributes = were;
248           }
249         };
250   }
251 
252   /**
253    * Get the attributes of the current tag.
254    * @return The attribute set, representing the attributes of the current tag.
255    */
getAttributes()256   public SimpleAttributeSet getAttributes()
257   {
258     return new SimpleAttributeSet(attributes);
259   }
260 
261   /**
262    * Invokes the error handler. The default method in this implementation
263    * delegates the call to handleError, also providing the current line.
264    */
error(String msg)265   public void error(String msg)
266   {
267     error(msg, getTokenAhead());
268   }
269 
error(String msg, Token atToken)270   public void error(String msg, Token atToken)
271   {
272     if (atToken != null)
273       handleError(atToken.where.beginLine,
274                   msg + ": line " + atToken.where.beginLine +
275                   ", absolute pos " + atToken.where.startPosition
276                  );
277     else
278       handleError(0, msg);
279   }
280 
281   /**
282    * Invokes the error handler. The default method in this implementation
283    * delegates the call to error (parm1+": '"+parm2+"'").
284    */
error(String msg, String invalid)285   public void error(String msg, String invalid)
286   {
287     error(msg + ": '" + invalid + "'");
288   }
289 
290   /**
291    * Invokes the error handler. The default method in this implementation
292    * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
293    */
error(String parm1, String parm2, String parm3)294   public void error(String parm1, String parm2, String parm3)
295   {
296     error(parm1 + " " + parm2 + " " + parm3);
297   }
298 
299   /**
300    * Invokes the error handler. The default method in this implementation
301    * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
302    */
error(String parm1, String parm2, String parm3, String parm4)303   public void error(String parm1, String parm2, String parm3, String parm4)
304   {
305     error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
306   }
307 
flushAttributes()308   public void flushAttributes()
309   {
310   }
311 
312   /**
313    * Parse the HTML text, calling various methods in response to the
314    * occurence of the corresponding HTML constructions.
315    * @param reader The reader to read the source HTML from.
316    * @throws IOException If the reader throws one.
317    */
parse(Reader reader)318   public synchronized void parse(Reader reader)
319                           throws IOException
320   {
321     reset(reader);
322     restart();
323     try
324       {
325         parseDocument();
326         validator.closeAll();
327       }
328     catch (ParseException ex)
329       {
330         if (ex != null)
331           {
332             error("Unable to continue parsing the document", ex.getMessage());
333 
334             Throwable cause = ex.getCause();
335             if (cause instanceof IOException)
336               throw (IOException) cause;
337           }
338       }
339   }
340 
341   /**
342    * Parses DTD markup declaration. Currently returns null without action.
343    * @return null.
344    * @throws IOException
345    */
parseDTDMarkup()346   public String parseDTDMarkup()
347                         throws IOException
348   {
349     return null;
350   }
351 
352   /**
353    * Parse SGML insertion ( &lt;! ... &gt; ). When the
354    * the SGML insertion is found, this method is called, passing
355    * SGML in the string buffer as a parameter. The default method
356    * returns false without action and can be overridden to
357    * implement user - defined SGML support.
358    * <p>
359    * If you need more information about SGML insertions in HTML documents,
360    * the author suggests to read SGML tutorial on
361    * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}.
362    * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
363    * Oxford University Press, 688 p, ISBN: 0198537379.
364    * </p>
365    * @param strBuff
366    * @return true if this is a valid DTD markup declaration.
367    * @throws IOException
368    */
parseMarkupDeclarations(StringBuffer strBuff)369   public boolean parseMarkupDeclarations(StringBuffer strBuff)
370                                   throws IOException
371   {
372     return false;
373   }
374 
375   /**
376    * Get the first line of the last parsed token.
377    */
getCurrentLine()378   protected int getCurrentLine()
379   {
380     return hTag.where.beginLine;
381   }
382 
383   /**
384    * Read parseable character data, add to buffer.
385    * @param clearBuffer If true, buffer if filled by CDATA section,
386    * otherwise the section is appended to the existing content of the
387    * buffer.
388    *
389    * @throws ParseException
390    */
CDATA(boolean clearBuffer)391   protected void CDATA(boolean clearBuffer)
392                 throws ParseException
393   {
394     Token start = hTag = getTokenAhead();
395 
396     if (clearBuffer)
397       buffer.setLength(0);
398 
399     // Handle expected EOF.
400     if (start.kind == EOF)
401       return;
402 
403     read:
404     while (true)
405       {
406         t = getTokenAhead();
407         if (t.kind == EOF)
408           {
409             error("unexpected eof", t);
410             break read;
411           }
412         else if (t.kind == BEGIN)
413           break read;
414         else if (t.kind == Constants.ENTITY)
415           {
416             resolveAndAppendEntity(t);
417             getNextToken();
418           }
419         else
420           {
421             append(t);
422             getNextToken();
423           }
424       }
425     hTag = new Token(start, getTokenAhead(0));
426     if (buffer.length() != 0)
427       _handleText();
428   }
429 
430   /**
431   * Process Comment. This method skips till --> without
432   * taking SGML constructs into consideration.  The supported SGML
433   * constructs are handled separately.
434   */
Comment()435   protected void Comment()
436                   throws ParseException
437   {
438     buffer.setLength(0);
439 
440     Token start = hTag = mustBe(BEGIN);
441     optional(WS);
442     mustBe(EXCLAMATION);
443     optional(WS);
444     mustBe(DOUBLE_DASH);
445 
446     Token t;
447     Token last;
448 
449     comment:
450     while (true)
451       {
452         t = getTokenAhead();
453         if (t.kind == EOF)
454           {
455             handleEOFInComment();
456             last = t;
457             break comment;
458           }
459         else if (COMMENT_END.matches(this))
460           {
461             mustBe(DOUBLE_DASH);
462             optional(WS);
463             last = mustBe(END);
464             break comment;
465           }
466         else if (COMMENT_TRIPLEDASH_END.matches(this))
467           {
468             mustBe(DOUBLE_DASH);
469             t = mustBe(NUMTOKEN);
470             if (t.getImage().equals("-"))
471               {
472                 append(t);
473                 last = mustBe(END);
474                 break comment;
475               }
476             else
477               {
478                 buffer.append("--");
479                 append(t);
480                 t = getTokenAhead();
481               }
482           }
483         else
484         /* The lllll-- can match as NUMTOKEN */
485         if ((t.getImage().endsWith("--")) &&
486             (
487               getTokenAhead(1).kind == END ||
488               (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
489             )
490            )
491           {
492             buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
493 
494             /* Skip the closing > that we have already checked. */
495             last = mustBe(t.kind);
496             break comment;
497           }
498         else
499           append(t);
500         mustBe(t.kind);
501       }
502     hTag = new Token(start, last);
503 
504     // Consume any whitespace immediately following a comment.
505     optional(WS);
506     handleComment();
507   }
508 
509   /**
510   * Read a script. The text, returned without any changes,
511   * is terminated only by the closing tag SCRIPT.
512   */
Script()513   protected void Script()
514                  throws ParseException
515   {
516     Token name;
517 
518     Token start = hTag = mustBe(BEGIN);
519     optional(WS);
520 
521     name = mustBe(SCRIPT);
522 
523     optional(WS);
524 
525     restOfTag(false, name, start);
526 
527     buffer.setLength(0);
528 
529     while (!SCRIPT_CLOSE.matches(this))
530       {
531         append(getNextToken());
532       }
533 
534     consume(SCRIPT_CLOSE);
535 
536     _handleText();
537 
538     endTag(false);
539     _handleEndTag(makeTagElement(name.getImage(), false));
540   }
541 
542   /**
543   * Process SGML insertion that is not a comment.
544   */
Sgml()545   protected void Sgml()
546                throws ParseException
547   {
548     if (COMMENT_OPEN.matches(this))
549       Comment();
550     else // skip till ">"
551       {
552         Token start = hTag = mustBe(BEGIN);
553         optional(WS);
554         mustBe(EXCLAMATION);
555 
556         buffer.setLength(0);
557         read:
558         while (true)
559           {
560             t = getNextToken();
561             if (t.kind == Constants.ENTITY)
562               {
563                 resolveAndAppendEntity(t);
564               }
565             else if (t.kind == EOF)
566               {
567                 error("unexpected eof", t);
568                 break read;
569               }
570             else if (t.kind == END)
571               break read;
572             else
573               append(t);
574           }
575 
576         try
577           {
578             parseMarkupDeclarations(buffer);
579           }
580         catch (IOException ex)
581           {
582             error("Unable to parse SGML insertion: '" + buffer + "'",
583                   new Token(start, t)
584                  );
585           }
586       }
587     // Consume any whitespace that follows the Sgml insertion.
588     optional(WS);
589   }
590 
591   /**
592   * Read a style definition. The text, returned without any changes,
593   * is terminated only by the closing tag STYLE.
594   */
Style()595   protected void Style()
596                 throws ParseException
597   {
598     Token name;
599 
600     Token start = hTag = mustBe(BEGIN);
601     optional(WS);
602 
603     name = mustBe(STYLE);
604 
605     optional(WS);
606 
607     restOfTag(false, name, start);
608 
609     buffer.setLength(0);
610 
611     while (!STYLE_CLOSE.matches(this))
612       {
613         append(getNextToken());
614       }
615 
616     consume(STYLE_CLOSE);
617 
618     _handleText();
619 
620     endTag(false);
621     _handleEndTag(makeTagElement(name.getImage(), false));
622   }
623 
624   /**
625    * Read a html tag.
626    */
Tag()627   protected void Tag()
628               throws ParseException
629   {
630     mark(true);
631 
632     boolean closing = false;
633     Token name;
634     Token start = hTag = mustBe(BEGIN);
635 
636     optional(WS);
637     name = getNextToken();
638     optional(WS);
639 
640     if (name.kind == SLASH)
641       {
642         closing = true;
643         name = getNextToken();
644       }
645 
646     restOfTag(closing, name, start);
647   }
648 
649   /**
650    * A hook, for operations, preceeding call to handleText.
651    * Handle text in a string buffer.
652    * In non - preformatted mode, all line breaks immediately following the
653    * start tag and immediately before an end tag is discarded,
654    * \r, \n and \t are replaced by spaces, multiple space are replaced
655    * by the single one and the result is  moved into array,
656    * passing it  to handleText().
657    */
_handleText()658   protected void _handleText()
659   {
660     char[] text;
661 
662     if (preformatted > 0)
663       text = textProcessor.preprocessPreformatted(buffer);
664     else
665       text = textProcessor.preprocess(buffer);
666 
667     if (text != null && text.length > 0
668         // According to the specs we need to discard whitespace immediately
669         // before a closing tag.
670         && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this)))
671       {
672         TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
673         attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
674         _handleEmptyTag(pcdata);
675 
676         handleText(text);
677         if (titleOpen)
678           title.append(text);
679       }
680   }
681 
682   /**
683    * Add the image of this token to the buffer.
684    * @param t A token to append.
685    */
append(Token t)686   protected final void append(Token t)
687   {
688     if (t.kind != EOF)
689       t.appendTo(buffer);
690   }
691 
692   /**
693    * Consume pattern that must match.
694    * @param p A pattern to consume.
695    */
consume(pattern p)696   protected final void consume(pattern p)
697   {
698     node n;
699     for (int i = 0; i < p.nodes.length; i++)
700       {
701         n = p.nodes [ i ];
702         if (n.optional)
703           optional(n.kind);
704         else
705           mustBe(n.kind);
706       }
707   }
708 
709   /**
710    * The method is called when the HTML end (closing) tag is found or if
711    * the parser concludes that the one should be present in the
712    * current position. The method is called immediatly
713    * before calling the handleEndTag().
714    * @param omitted True if the tag is no actually present in the document,
715    * but is supposed by the parser (like &lt;/html&gt; at the end of the
716    * document).
717    */
endTag(boolean omitted)718   protected void endTag(boolean omitted)
719   {
720   }
721 
722   /**
723    * Handle HTML comment. The default method returns without action.
724    * @param comment
725    */
handleComment(char[] comment)726   protected void handleComment(char[] comment)
727   {
728   }
729 
730   /**
731    * This is additionally called in when the HTML content terminates
732    * without closing the HTML comment. This can only happen if the
733    * HTML document contains errors (for example, the closing --;gt is
734    * missing.
735    */
handleEOFInComment()736   protected void handleEOFInComment()
737   {
738     error("Unclosed comment");
739   }
740 
741   /**
742    * Handle the tag with no content, like &lt;br&gt;. The method is
743    * called for the elements that, in accordance with the current DTD,
744    * has an empty content.
745    * @param tag The tag being handled.
746    * @throws javax.swing.text.ChangedCharSetException
747    */
handleEmptyTag(TagElement tag)748   protected void handleEmptyTag(TagElement tag)
749                          throws javax.swing.text.ChangedCharSetException
750   {
751   }
752 
753   /**
754    * The method is called when the HTML closing tag ((like &lt;/table&gt;)
755    * is found or if the parser concludes that the one should be present
756    * in the current position.
757    * @param tag The tag
758    */
handleEndTag(TagElement tag)759   protected void handleEndTag(TagElement tag)
760   {
761   }
762 
763   /* Handle error that has occured in the given line. */
handleError(int line, String message)764   protected void handleError(int line, String message)
765   {
766   }
767 
768   /**
769    * The method is called when the HTML opening tag ((like &lt;table&gt;)
770    * is found or if the parser concludes that the one should be present
771    * in the current position.
772    * @param tag The tag
773    */
handleStartTag(TagElement tag)774   protected void handleStartTag(TagElement tag)
775   {
776   }
777 
778   /**
779    * Handle the text section.
780    * <p> For non-preformatted section, the parser replaces
781    * \t, \r and \n by spaces and then multiple spaces
782    * by a single space. Additionaly, all whitespace around
783    * tags is discarded.
784    * </p>
785    * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
786    * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
787    * if it is present. Additionally, it replaces each occurence of \r or \r\n
788    * by a single \n.</p>
789    *
790    * @param text A section text.
791    */
handleText(char[] text)792   protected void handleText(char[] text)
793   {
794   }
795 
796   /**
797    * Handle HTML &lt;title&gt; tag. This method is invoked when
798    * both title starting and closing tags are already behind.
799    * The passed argument contains the concatenation of all
800    * title text sections.
801    * @param title The title text.
802    */
handleTitle(char[] title)803   protected void handleTitle(char[] title)
804   {
805   }
806 
807   /**
808    * Constructs the tag from the given element. In this implementation,
809    * this is defined, but never called.
810    * @return the tag
811    */
makeTag(Element element)812   protected TagElement makeTag(Element element)
813   {
814     return makeTag(element, false);
815   }
816 
817   /**
818    * Constructs the tag from the given element.
819    * @param the tag base {@link javax.swing.text.html.parser.Element}
820    * @param isSupposed true if the tag is not actually present in the
821    * html input, but the parser supposes that it should to occur in
822    * the current location.
823    * @return the tag
824    */
makeTag(Element element, boolean isSupposed)825   protected TagElement makeTag(Element element, boolean isSupposed)
826   {
827     return new TagElement(element, isSupposed);
828   }
829 
830   /**
831    * This is called when the tag, representing the given element,
832    * occurs first time in the document.
833    * @param element
834    */
markFirstTime(Element element)835   protected void markFirstTime(Element element)
836   {
837   }
838 
839   /**
840    * Consume the token that was checked before and hence MUST be present.
841    * @param kind The kind of token to consume.
842    */
mustBe(int kind)843   protected Token mustBe(int kind)
844   {
845     if (getTokenAhead().kind == kind)
846       return getNextToken();
847     else
848       {
849         String ei = "";
850         if (kind < 1000)
851           ei = " ('" + (char) kind + "') ";
852         throw new AssertionError("The token of kind " + kind + ei +
853                                  " MUST be here,"
854                                 );
855       }
856   }
857 
858   /**
859    * Handle attribute without value. The default method uses
860    * the only allowed attribute value from DTD.
861    * If the attribute is unknown or allows several values,
862    * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
863    * this value is added to the attribute set.
864    * @param element The name of element.
865    * @param attribute The name of attribute without value.
866    */
noValueAttribute(String element, String attribute)867   protected void noValueAttribute(String element, String attribute)
868   {
869     Object value = HTML.NULL_ATTRIBUTE_VALUE;
870 
871     Element e = dtd.elementHash.get(element.toLowerCase());
872     if (e != null)
873       {
874         AttributeList attr = e.getAttribute(attribute);
875         if (attr != null)
876           {
877             Vector values = attr.values;
878             if (values != null && values.size() == 1)
879               value = values.get(0);
880           }
881       }
882     attributes.addAttribute(attribute, value);
883   }
884 
885   /**
886    * Consume the optional token, if present.
887    * @param kind The kind of token to consume.
888    */
optional(int kind)889   protected Token optional(int kind)
890   {
891     if (getTokenAhead().kind == kind)
892       return getNextToken();
893     else
894       return null;
895   }
896 
897   /** Parse the html document. */
parseDocument()898   protected void parseDocument()
899                         throws ParseException
900   {
901     // Read up any initial whitespace.
902     optional(WS);
903     while (getTokenAhead().kind != EOF)
904       {
905         advanced = false;
906         if (TAG.matches(this))
907           Tag();
908         else if (COMMENT_OPEN.matches(this))
909           Comment();
910         else if (STYLE_OPEN.matches(this))
911           Style();
912         else if (SCRIPT_OPEN.matches(this))
913           Script();
914         else if (SGML.matches(this))
915           Sgml();
916         else
917           CDATA(true);
918 
919         // Surely HTML error, treat as a text.
920         if (!advanced)
921           {
922             Token wrong = getNextToken();
923             error("unexpected '" + wrong.getImage() + "'", wrong);
924             buffer.setLength(0);
925             buffer.append(wrong.getImage());
926             _handleText();
927           }
928       }
929   }
930 
931   /**
932    * Read the element attributes, adding them into attribute set.
933    * @param element The element name (needed to access attribute
934    * information in dtd).
935    */
readAttributes(String element)936   protected void readAttributes(String element)
937   {
938     Token name;
939     Token value;
940     Token next;
941     String attrValue;
942 
943     attributes = new htmlAttributeSet();
944 
945     optional(WS);
946 
947     attributeReading:
948       while (getTokenAhead().kind == NUMTOKEN)
949       {
950         name = getNextToken();
951         optional(WS);
952 
953         next = getTokenAhead();
954         if (next.kind == EQ)
955           {
956             mustBe(EQ);
957             optional(WS);
958 
959             next = getNextToken();
960 
961             switch (next.kind)
962               {
963               case QUOT:
964 
965                 // read "quoted" attribute.
966                 buffer.setLength(0);
967                 readTillTokenE(QUOT);
968                 attrValue = buffer.toString();
969                 break;
970 
971               case AP:
972 
973                 // read 'quoted' attribute.
974                 buffer.setLength(0);
975                 readTillTokenE(AP);
976                 attrValue = buffer.toString();
977                 break;
978 
979               // read unquoted attribute.
980               case NUMTOKEN:
981                 value = next;
982                 optional(WS);
983 
984                 // Check maybe the opening quote is missing.
985                 next = getTokenAhead();
986                 if (bQUOTING.get(next.kind))
987                   {
988                     hTag = next;
989                     error("The value without opening quote is closed with '"
990                           + next.getImage() + "'");
991                     attrValue = value.getImage();
992                   }
993                 else if (next.kind == SLASH || next.kind == OTHER)
994                 // The slash and other characters (like %) in this context is
995                 // treated as the ordinary
996                 // character, not as a token. The character may be part of
997                 // the unquoted URL.
998                   {
999                     CPStringBuilder image = new CPStringBuilder(value.getImage());
1000                     while (next.kind == NUMTOKEN || next.kind == SLASH
1001                            || next.kind == OTHER)
1002                       {
1003                         image.append(getNextToken().getImage());
1004                         next = getTokenAhead();
1005                       }
1006                     attrValue = image.toString();
1007                   }
1008                 else
1009                   attrValue = value.getImage();
1010                 break;
1011 
1012               case SLASH:
1013                 value = next;
1014                 optional(WS);
1015 
1016                 // Check maybe the opening quote is missing.
1017                 next = getTokenAhead();
1018                 if (bQUOTING.get(next.kind))
1019                   {
1020                     hTag = next;
1021                     error("The value without opening quote is closed with '"
1022                           + next.getImage() + "'");
1023                     attrValue = value.getImage();
1024                   }
1025                 else if (next.kind == NUMTOKEN || next.kind == SLASH)
1026                 // The slash in this context is treated as the ordinary
1027                 // character, not as a token. The slash may be part of
1028                 // the unquoted URL.
1029                   {
1030                     CPStringBuilder image = new CPStringBuilder(value.getImage());
1031                     while (next.kind == NUMTOKEN || next.kind == SLASH)
1032                       {
1033                         image.append(getNextToken().getImage());
1034                         next = getTokenAhead();
1035                       }
1036                     attrValue = image.toString();
1037                   }
1038                 else
1039                   attrValue = value.getImage();
1040                 break;
1041               default:
1042                 break attributeReading;
1043               }
1044             attributes.addAttribute(name.getImage(), attrValue);
1045             optional(WS);
1046           }
1047         else
1048           // The '=' is missing: attribute without value.
1049           {
1050             noValueAttribute(element, name.getImage());
1051           }
1052       }
1053   }
1054 
1055   /**
1056    * Return string, corresponding the given named entity. The name is passed
1057    * with the preceeding &, but without the ending semicolon.
1058    */
resolveNamedEntity(final String a_tag)1059   protected String resolveNamedEntity(final String a_tag)
1060   {
1061     // Discard &
1062     if (!a_tag.startsWith("&"))
1063       throw new AssertionError("Named entity " + a_tag +
1064                                " must start witn '&'."
1065                               );
1066 
1067     String tag = a_tag.substring(1);
1068 
1069     try
1070       {
1071         Entity entity = dtd.getEntity(tag);
1072         if (entity != null)
1073           return entity.getString();
1074 
1075         entity = dtd.getEntity(tag.toLowerCase());
1076 
1077         if (entity != null)
1078           {
1079             error("The name of this entity should be in lowercase", a_tag);
1080             return entity.getString();
1081           }
1082       }
1083     catch (IndexOutOfBoundsException ibx)
1084       {
1085         /* The error will be reported. */
1086       }
1087 
1088     error("Unknown named entity", a_tag);
1089     return a_tag;
1090   }
1091 
1092   /**
1093    * Return char, corresponding the given numeric entity.
1094    * The name is passed with the preceeding &#, but without
1095    * the ending semicolon.
1096    */
resolveNumericEntity(final String a_tag)1097   protected char resolveNumericEntity(final String a_tag)
1098   {
1099     // Discard &#
1100     if (!a_tag.startsWith("&#"))
1101       throw new AssertionError("Numeric entity " + a_tag +
1102                                " must start witn '&#'."
1103                               );
1104 
1105     String tag = a_tag.substring(2);
1106 
1107     try
1108       {
1109         // Determine the encoding type:
1110         char cx = tag.charAt(0);
1111         if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1112 
1113           return (char) Integer.parseInt(tag.substring(1), 16);
1114 
1115         return (char) Integer.parseInt(tag);
1116       }
1117 
1118     /* The error will be reported. */
1119     catch (NumberFormatException nex)
1120       {
1121       }
1122     catch (IndexOutOfBoundsException ix)
1123       {
1124       }
1125 
1126     error("Invalid numeric entity", a_tag);
1127     return '?';
1128   }
1129 
1130   /**
1131    * Reset all fields into the intial default state, preparing the
1132    * parset for parsing the next document.
1133    */
restart()1134   protected void restart()
1135   {
1136     documentTags.clear();
1137     titleHandled = false;
1138     titleOpen = false;
1139     buffer.setLength(0);
1140     title.setLength(0);
1141     validator.restart();
1142   }
1143 
1144   /**
1145    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1146    * is found or if the parser concludes that the one should be present
1147    * in the current position. The method is called immediately before
1148    * calling the handleStartTag.
1149    * @param tag The tag
1150    */
startTag(TagElement tag)1151   protected void startTag(TagElement tag)
1152                    throws ChangedCharSetException
1153   {
1154   }
1155 
1156   /**
1157    * Handle a complete element, when the tag content is already present in the
1158    * buffer and both starting and heading tags behind. This is called
1159    * in the case when the tag text must not be parsed for the nested
1160    * elements (elements STYLE and SCRIPT).
1161    */
_handleCompleteElement(TagElement tag)1162   private void _handleCompleteElement(TagElement tag)
1163   {
1164     _handleStartTag(tag);
1165 
1166     // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1167     HTML.Tag h = tag.getHTMLTag();
1168     if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1169       {
1170         boolean tmp = titleOpen;
1171         titleOpen = false;
1172         _handleText();
1173         titleOpen = tmp;
1174       }
1175     else
1176       _handleText();
1177 
1178     _handleEndTag(tag);
1179   }
1180 
1181   /**
1182    * A hooks for operations, preceeding call to handleEmptyTag().
1183    * Handle the tag with no content, like &lt;br&gt;. As no any
1184    * nested tags are expected, the tag validator is not involved.
1185    * @param tag The tag being handled.
1186    */
_handleEmptyTag(TagElement tag)1187   private void _handleEmptyTag(TagElement tag)
1188   {
1189     try
1190       {
1191         validator.validateTag(tag, attributes);
1192         handleEmptyTag(tag);
1193         HTML.Tag h = tag.getHTMLTag();
1194         // When a block tag is closed, consume whitespace that follows after
1195         // it.
1196         // For some unknown reason a FRAME tag is not treated as block element.
1197         // However in this case it should be treated as such.
1198         if (isBlock(h))
1199           optional(WS);
1200       }
1201     catch (ChangedCharSetException ex)
1202       {
1203         error("Changed charset exception:", ex.getMessage());
1204       }
1205   }
1206 
1207   /**
1208    * A hooks for operations, preceeding call to handleEndTag().
1209    * The method is called when the HTML closing tag
1210    * is found. Calls handleTitle after closing the 'title' tag.
1211    * @param tag The tag
1212    */
_handleEndTag(TagElement tag)1213   private void _handleEndTag(TagElement tag)
1214   {
1215     if (validator.closeTag(tag))
1216        _handleEndTag_remaining(tag);
1217   }
1218 
1219   /**
1220    * Actions that are also required if the closing action was
1221    * initiated by the tag validator.
1222    * Package-private to avoid an accessor method.
1223    */
_handleEndTag_remaining(TagElement tag)1224   void _handleEndTag_remaining(TagElement tag)
1225   {
1226     HTML.Tag h = tag.getHTMLTag();
1227 
1228     handleEndTag(tag);
1229     endTag(tag.fictional());
1230 
1231     if (h.isPreformatted())
1232       preformatted--;
1233     if (preformatted < 0)
1234       preformatted = 0;
1235 
1236     // When a block tag is closed, consume whitespace that follows after
1237     // it.
1238     if (isBlock(h))
1239       optional(WS);
1240 
1241     if (h == HTML.Tag.TITLE)
1242       {
1243         titleOpen = false;
1244         titleHandled = true;
1245 
1246         char[] a = new char[ title.length() ];
1247         title.getChars(0, a.length, a, 0);
1248         handleTitle(a);
1249       }
1250   }
1251 
1252   /**
1253    * A hooks for operations, preceeding call to handleStartTag().
1254    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1255    * is found.
1256    * Package-private to avoid an accessor method.
1257    * @param tag The tag
1258    */
_handleStartTag(TagElement tag)1259   void _handleStartTag(TagElement tag)
1260   {
1261     validator.openTag(tag, attributes);
1262     startingTag(tag);
1263     handleStartTag(tag);
1264 
1265     HTML.Tag h = tag.getHTMLTag();
1266 
1267     if (isBlock(h))
1268       optional(WS);
1269 
1270     if (h.isPreformatted())
1271       preformatted++;
1272 
1273     if (h == HTML.Tag.TITLE)
1274       {
1275         if (titleHandled)
1276           error("Repetetive <TITLE> tag");
1277         titleOpen = true;
1278         titleHandled = false;
1279       }
1280   }
1281 
1282   /**
1283    * Resume parsing after heavy errors in HTML tag structure.
1284    * @throws ParseException
1285    */
forciblyCloseTheTag()1286   private void forciblyCloseTheTag()
1287                             throws ParseException
1288   {
1289     int closeAt = 0;
1290     buffer.setLength(0);
1291 
1292     ahead:
1293     for (int i = 1; i < 100; i++)
1294       {
1295         t = getTokenAhead(i - 1);
1296         if (t.kind == EOF || t.kind == BEGIN)
1297           break ahead;
1298         if (t.kind == END)
1299           {
1300             /* Closing '>' found. */
1301             closeAt = i;
1302             break ahead;
1303           }
1304       }
1305     if (closeAt > 0)
1306       {
1307         buffer.append("Ignoring '");
1308         for (int i = 1; i <= closeAt; i++)
1309           {
1310             t = getNextToken();
1311             append(t);
1312           }
1313         buffer.append('\'');
1314         error(buffer.toString());
1315       }
1316   }
1317 
1318   /**
1319    * Handle comment in string buffer. You can avoid allocating a char
1320    * array each time by processing your comment directly here.
1321    */
handleComment()1322   private void handleComment()
1323   {
1324     char[] a = new char[ buffer.length() ];
1325     buffer.getChars(0, a.length, a, 0);
1326     handleComment(a);
1327   }
1328 
makeTagElement(String name, boolean isSupposed)1329   private TagElement makeTagElement(String name, boolean isSupposed)
1330   {
1331     Element e = dtd.elementHash.get(name.toLowerCase());
1332     if (e == null)
1333       {
1334         error("Unknown tag <" + name + ">");
1335         e = dtd.getElement(name);
1336         e.name = name.toUpperCase();
1337         e.index = -1;
1338       }
1339 
1340     if (!documentTags.contains(e.name))
1341       {
1342         markFirstTime(e);
1343         documentTags.add(e.name);
1344       }
1345 
1346     return makeTag(e, isSupposed);
1347   }
1348 
1349   /**
1350    * Read till the given token, resolving entities. Consume the given
1351    * token without adding it to buffer.
1352    * @param till The token to read till
1353    * @throws ParseException
1354    */
readTillTokenE(int till)1355   private void readTillTokenE(int till)
1356                        throws ParseException
1357   {
1358     buffer.setLength(0);
1359     read:
1360     while (true)
1361       {
1362         t = getNextToken();
1363         if (t.kind == Constants.ENTITY)
1364           {
1365             resolveAndAppendEntity(t);
1366           }
1367         else if (t.kind == EOF)
1368           {
1369             error("unexpected eof", t);
1370             break read;
1371           }
1372         else if (t.kind == till)
1373           break read;
1374         else if (t.kind == WS)
1375           {
1376             // Processing whitespace in accordance with CDATA rules:
1377             String s = t.getImage();
1378             char c;
1379             for (int i = 0; i < s.length(); i++)
1380               {
1381                 c = s.charAt(i);
1382                 if (c == '\r')
1383                   buffer.append(' '); // CR replaced by space
1384                 else if (c == '\n')
1385                   { /* LF ignored */ }
1386                 else if (c == '\t')
1387                   buffer.append(' '); // Tab replaced by space
1388                 else
1389                   buffer.append(c);
1390               }
1391           }
1392         else
1393           append(t);
1394       }
1395   }
1396 
1397   /**
1398    * Resolve the entity and append it to the end of buffer.
1399    * @param entity
1400    */
resolveAndAppendEntity(Token entity)1401   private void resolveAndAppendEntity(Token entity)
1402   {
1403     switch (entity.category)
1404       {
1405         case ENTITY_NAMED :
1406           buffer.append(resolveNamedEntity(entity.getImage()));
1407           break;
1408 
1409         case ENTITY_NUMERIC :
1410           buffer.append(resolveNumericEntity(entity.getImage()));
1411           break;
1412 
1413         default :
1414           throw new AssertionError("Invalid entity category " +
1415                                    entity.category
1416                                   );
1417       }
1418   }
1419 
1420   /**
1421    * Handle the remaining of HTML tags. This is a common end for
1422    * TAG, SCRIPT and STYLE.
1423    * @param closing True for closing tags ( &lt;/TAG&gt; ).
1424    * @param name Name of element
1425    * @param start Token where element has started
1426    * @throws ParseException
1427    */
restOfTag(boolean closing, Token name, Token start)1428   private void restOfTag(boolean closing, Token name, Token start)
1429                   throws ParseException
1430   {
1431     boolean end = false;
1432     Token next;
1433 
1434     optional(WS);
1435 
1436     readAttributes(name.getImage());
1437 
1438     optional(WS);
1439 
1440     next = getTokenAhead();
1441     if (next.kind == END)
1442       {
1443         mustBe(END);
1444         end = true;
1445       }
1446 
1447     hTag = new Token(start, next);
1448 
1449     if (!end)
1450       {
1451         // The tag body contains errors. If additionally the tag
1452         // name is not valid, this construction is treated as text.
1453         if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1454             backupMode
1455            )
1456           {
1457             error("Errors in tag body and unknown tag name. " +
1458                   "Treating the tag as a text."
1459                  );
1460             reset();
1461 
1462             hTag = mustBe(BEGIN);
1463             buffer.setLength(0);
1464             buffer.append(hTag.getImage());
1465             CDATA(false);
1466             return;
1467           }
1468         else
1469           {
1470             error("Forcibly closing invalid parameter list");
1471             forciblyCloseTheTag();
1472           }
1473       }
1474 
1475     if (closing)
1476       {
1477         endTag(false);
1478         _handleEndTag(makeTagElement(name.getImage(), false));
1479       }
1480     else
1481       {
1482         TagElement te = makeTagElement(name.getImage(), false);
1483         if (te.getElement().type == DTDConstants.EMPTY)
1484           _handleEmptyTag(te);
1485         else
1486           {
1487             // According to the specs we need to consume whitespace following
1488             // immediately after a opening tag.
1489             optional(WS);
1490             _handleStartTag(te);
1491           }
1492       }
1493   }
1494 
1495   /**
1496    * This should fire additional actions in response to the
1497    * ChangedCharSetException.  The current implementation
1498    * does nothing.
1499    * @param tag
1500    */
startingTag(TagElement tag)1501   private void startingTag(TagElement tag)
1502   {
1503     try
1504       {
1505         startTag(tag);
1506       }
1507     catch (ChangedCharSetException cax)
1508       {
1509         error("Invalid change of charset");
1510       }
1511   }
1512 
ws_error()1513   private void ws_error()
1514   {
1515     error("Whitespace here is not permitted");
1516   }
1517 
1518   /**
1519    * Returns true when the specified tag should be considered a block tag
1520    * wrt whitespace handling. We need this special handling, since there
1521    * are a couple of tags that we must treat as block tags but which aren't
1522    * officially block tags.
1523    *
1524    * @param tag the tag to check
1525    * @return true when the specified tag should be considered a block tag
1526    *         wrt whitespace handling
1527    */
isBlock(HTML.Tag tag)1528   private boolean isBlock(HTML.Tag tag)
1529   {
1530     return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;
1531   }
1532 }
1533