1 /* Parser.java -- HTML parser
2    Copyright (C) 2005 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package javax.swing.text.html.parser;
40 
41 import java.io.IOException;
42 import java.io.Reader;
43 
44 import javax.swing.text.ChangedCharSetException;
45 import javax.swing.text.SimpleAttributeSet;
46 
47 /*
48  * FOR DEVELOPERS: To avoid regression, please run the package test
49  * textsuite/javax.swing.text.html.parser/AllParserTests after your
50  * modifications.
51  */
52 
53 /**
54  * <p>A simple error-tolerant HTML parser that uses a DTD document
55  * to access data on the possible tokens, arguments and syntax.</p>
56  * <p> The parser reads an HTML content from a Reader and calls various
57  * notifying methods (which should be overridden in a subclass)
58  * when tags or data are encountered.</p>
59  * <p>Some HTML elements need no opening or closing tags. The
60  * task of this parser is to invoke the tag handling methods also when
61  * the tags are not explicitly specified and must be supposed using
62  * information, stored in the DTD.
63  * For  example, parsing the document
64  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
65  * will invoke exactly the handling methods exactly in the same order
66  * (and with the same parameters) as if parsing the document: <br>
67  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
68  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
69  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
70  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
71  * (supposed tags are given in italics). The parser also supports
72  * obsolete elements of HTML syntax.<p>
73  * </p>
74  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
75  */
76 public class Parser
77   implements DTDConstants
78 {
79   /**
80    * The document template description that will be used to parse the documents.
81    */
82   protected DTD dtd;
83 
84   /**
85    * The value of this field determines whether or not the Parser will be
86    * strict in enforcing SGML compatibility. The default value is false,
87    * stating that the parser should do everything to parse and get at least
88    * some information even from the incorrectly written HTML input.
89    */
90   protected boolean strict;
91 
92   /**
93    * The package level reference to the working HTML parser in this
94    * implementation.
95    */
96   final gnu.javax.swing.text.html.parser.support.Parser gnu;
97 
98   /**
99    * Creates a new parser that uses the given DTD to access data on the
100    * possible tokens, arguments and syntax. There is no single - step way
101    * to get a default DTD; you must either refer to the implementation -
102    * specific packages, write your own DTD or obtain the working instance
103    * of parser in other way, for example, by calling
104    * {@link javax.swing.text.html.HTMLEditorKit#getParser() }.
105    * @param a_dtd A DTD to use.
106    */
Parser(DTD a_dtd)107   public Parser(DTD a_dtd)
108   {
109     dtd = a_dtd;
110 
111     final Parser j = this;
112 
113     gnu =
114       new gnu.javax.swing.text.html.parser.support.Parser(dtd)
115         {
116           protected final void handleComment(char[] comment)
117           {
118             j.handleComment(comment);
119           }
120 
121           protected final void handleEOFInComment()
122           {
123             j.handleEOFInComment();
124           }
125 
126           protected final void handleEmptyTag(TagElement tag)
127             throws javax.swing.text.ChangedCharSetException
128           {
129             j.handleEmptyTag(tag);
130           }
131 
132           protected final void handleStartTag(TagElement tag)
133           {
134             j.handleStartTag(tag);
135           }
136 
137           protected final void handleEndTag(TagElement tag)
138           {
139             j.handleEndTag(tag);
140           }
141 
142           protected final void handleError(int line, String message)
143           {
144             j.handleError(line, message);
145           }
146 
147           protected final void handleText(char[] text)
148           {
149             j.handleText(text);
150           }
151 
152           protected final void handleTitle(char[] title)
153           {
154             j.handleTitle(title);
155           }
156 
157           protected final void markFirstTime(Element element)
158           {
159             j.markFirstTime(element);
160           }
161 
162           protected final void startTag(TagElement tag)
163             throws ChangedCharSetException
164           {
165             j.startTag(tag);
166           }
167 
168           protected final void endTag(boolean omitted)
169           {
170             j.endTag(omitted);
171           }
172 
173           protected TagElement makeTag(Element element)
174           {
175             return j.makeTag(element);
176           }
177 
178           protected TagElement makeTag(Element element, boolean isSupposed)
179           {
180             return j.makeTag(element, isSupposed);
181           }
182         };
183   }
184 
185   /**
186    * Parse the HTML text, calling various methods in response to the
187    * occurence of the corresponding HTML constructions.
188    * @param reader The reader to read the source HTML from.
189    * @throws IOException If the reader throws one.
190    */
parse(Reader reader)191   public synchronized void parse(Reader reader)
192     throws IOException
193   {
194     gnu.parse(reader);
195   }
196 
197   /**
198    * Parses DTD markup declaration. Currently returns without action.
199    * @return null.
200    * @throws java.io.IOException
201    */
parseDTDMarkup()202   public String parseDTDMarkup()
203     throws IOException
204   {
205     return gnu.parseDTDMarkup();
206   }
207 
208   /**
209    * Parse DTD document declarations. Currently only parses the document
210    * type declaration markup.
211    * @param strBuff
212    * @return true if this is a valid DTD markup declaration.
213    * @throws IOException
214    */
parseMarkupDeclarations(StringBuffer strBuff)215   protected boolean parseMarkupDeclarations(StringBuffer strBuff)
216     throws IOException
217   {
218     return gnu.parseMarkupDeclarations(strBuff);
219   }
220 
221   /**
222    * Get the attributes of the current tag.
223    * @return The attribute set, representing the attributes of the current tag.
224    */
getAttributes()225   protected SimpleAttributeSet getAttributes()
226   {
227     return gnu.getAttributes();
228   }
229 
230   /**
231    * Get the number of the document line being parsed.
232    * @return The current line.
233    */
getCurrentLine()234   protected int getCurrentLine()
235   {
236     return gnu.hTag.where.beginLine;
237   }
238 
239   /**
240    * Get the current position in the document being parsed.
241    * @return The current position.
242    */
getCurrentPos()243   protected int getCurrentPos()
244   {
245     return gnu.hTag.where.startPosition;
246   }
247 
248   /**
249    * The method is called when the HTML end (closing) tag is found or if
250    * the parser concludes that the one should be present in the
251    * current position. The method is called immediatly
252    * before calling the handleEndTag().
253    * @param omitted True if the tag is no actually present in the document,
254    * but is supposed by the parser (like &lt;/html&gt; at the end of the
255    * document).
256    */
endTag(boolean omitted)257   protected void endTag(boolean omitted)
258   {
259     // This default implementation does nothing.
260   }
261 
262   /**
263    * Invokes the error handler. The default method in this implementation
264    * finally delegates the call to handleError, also providing the number of the
265    * current line.
266    */
error(String msg)267   protected void error(String msg)
268   {
269     gnu.error(msg);
270   }
271 
272   /**
273    * Invokes the error handler. The default method in this implementation
274    * finally delegates the call to error (msg+": '"+invalid+"'").
275    */
error(String msg, String invalid)276   protected void error(String msg, String invalid)
277   {
278     gnu.error(msg, invalid);
279   }
280 
281   /**
282    * Invokes the error handler. The default method in this implementation
283    * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3).
284    */
error(String parm1, String parm2, String parm3)285   protected void error(String parm1, String parm2, String parm3)
286   {
287     gnu.error(parm1, parm2, parm3);
288   }
289 
290   /**
291    * Invokes the error handler. The default method in this implementation
292    * finally delegates the call to error
293    * (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
294    */
error(String parm1, String parm2, String parm3, String parm4)295   protected void error(String parm1, String parm2, String parm3, String parm4)
296   {
297     gnu.error(parm1, parm2, parm3, parm4);
298   }
299 
300   /**
301    * In this implementation, this is never called and returns without action.
302    */
flushAttributes()303   protected void flushAttributes()
304   {
305     gnu.flushAttributes();
306   }
307 
308   /**
309    * Handle HTML comment. The default method returns without action.
310    * @param comment The comment being handled
311    */
handleComment(char[] comment)312   protected void handleComment(char[] comment)
313   {
314     // This default implementation does nothing.
315   }
316 
317   /**
318    * This is additionally called in when the HTML content terminates
319    * without closing the HTML comment. This can only happen if the
320    * HTML document contains errors (for example, the closing --;gt is
321    * missing. The default method calls the error handler.
322    */
handleEOFInComment()323   protected void handleEOFInComment()
324   {
325     gnu.error("Unclosed comment");
326   }
327 
328   /**
329    * Handle the tag with no content, like &lt;br&gt;. The method is
330    * called for the elements that, in accordance with the current DTD,
331    * has an empty content.
332    * @param tag The tag being handled.
333    * @throws javax.swing.text.ChangedCharSetException
334    */
handleEmptyTag(TagElement tag)335   protected void handleEmptyTag(TagElement tag)
336     throws ChangedCharSetException
337   {
338     // This default implementation does nothing.
339   }
340 
341   /**
342    * The method is called when the HTML closing tag ((like &lt;/table&gt;)
343    * is found or if the parser concludes that the one should be present
344    * in the current position.
345    * @param tag The tag being handled
346    */
handleEndTag(TagElement tag)347   protected void handleEndTag(TagElement tag)
348   {
349     // This default implementation does nothing.
350   }
351 
352   /* Handle error that has occured in the given line. */
handleError(int line, String message)353   protected void handleError(int line, String message)
354   {
355     // This default implementation does nothing.
356   }
357 
358   /**
359    * The method is called when the HTML opening tag ((like &lt;table&gt;)
360    * is found or if the parser concludes that the one should be present
361    * in the current position.
362    * @param tag The tag being handled
363    */
handleStartTag(TagElement tag)364   protected void handleStartTag(TagElement tag)
365   {
366     // This default implementation does nothing.
367   }
368 
369   /**
370    * Handle the text section.
371    * <p> For non-preformatted section, the parser replaces
372    * \t, \r and \n by spaces and then multiple spaces
373    * by a single space. Additionaly, all whitespace around
374    * tags is discarded.
375    * </p>
376    * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
377    * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
378    * if it is present. Additionally, it replaces each occurence of \r or \r\n
379    * by a single \n.</p>
380    *
381    * @param text A section text.
382    */
handleText(char[] text)383   protected void handleText(char[] text)
384   {
385     // This default implementation does nothing.
386   }
387 
388   /**
389    * Handle HTML &lt;title&gt; tag. This method is invoked when
390    * both title starting and closing tags are already behind.
391    * The passed argument contains the concatenation of all
392    * title text sections.
393    * @param title The title text.
394    */
handleTitle(char[] title)395   protected void handleTitle(char[] title)
396   {
397     // This default implementation does nothing.
398   }
399 
400   /**
401    * Constructs the tag from the given element. In this implementation,
402    * this is defined, but never called.
403    * @param element the base element of the tag.
404    * @return the tag
405    */
makeTag(Element element)406   protected TagElement makeTag(Element element)
407   {
408     return makeTag(element, false);
409   }
410 
411   /**
412    * Constructs the tag from the given element.
413    * @param element the tag base {@link javax.swing.text.html.parser.Element}
414    * @param isSupposed true if the tag is not actually present in the
415    * html input, but the parser supposes that it should to occur in
416    * the current location.
417    * @return the tag
418    */
makeTag(Element element, boolean isSupposed)419   protected TagElement makeTag(Element element, boolean isSupposed)
420   {
421     return new TagElement(element, isSupposed);
422   }
423 
424   /**
425    * This is called when the tag, representing the given element,
426    * occurs first time in the document.
427    * @param element
428    */
markFirstTime(Element element)429   protected void markFirstTime(Element element)
430   {
431     // This default implementation does nothing.
432   }
433 
434   /**
435    * The method is called when the HTML opening tag ((like &lt;table&gt;)
436    * is found or if the parser concludes that the one should be present
437    * in the current position. The method is called immediately before
438    * calling the handleStartTag.
439    * @param tag The tag
440    */
startTag(TagElement tag)441   protected void startTag(TagElement tag)
442     throws ChangedCharSetException
443   {
444     // This default implementation does nothing.
445   }
446 }
447