1 /* Parser.java -- HTML parser 2 Copyright (C) 2005 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 39 package javax.swing.text.html.parser; 40 41 import java.io.IOException; 42 import java.io.Reader; 43 44 import javax.swing.text.ChangedCharSetException; 45 import javax.swing.text.SimpleAttributeSet; 46 47 /* 48 * FOR DEVELOPERS: To avoid regression, please run the package test 49 * textsuite/javax.swing.text.html.parser/AllParserTests after your 50 * modifications. 51 */ 52 53 /** 54 * <p>A simple error-tolerant HTML parser that uses a DTD document 55 * to access data on the possible tokens, arguments and syntax.</p> 56 * <p> The parser reads an HTML content from a Reader and calls various 57 * notifying methods (which should be overridden in a subclass) 58 * when tags or data are encountered.</p> 59 * <p>Some HTML elements need no opening or closing tags. The 60 * task of this parser is to invoke the tag handling methods also when 61 * the tags are not explicitly specified and must be supposed using 62 * information, stored in the DTD. 63 * For example, parsing the document 64 * <p><table><tr><td>a<td>b<td>c</tr> <br> 65 * will invoke exactly the handling methods exactly in the same order 66 * (and with the same parameters) as if parsing the document: <br> 67 * <em><html><head></head><body><table>< 68 * tbody></em><tr><td>a<em></td></em><td>b<em> 69 * </td></em><td>c<em></td></tr></em>< 70 * <em>/tbody></table></body></html></em></p> 71 * (supposed tags are given in italics). The parser also supports 72 * obsolete elements of HTML syntax.<p> 73 * </p> 74 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) 75 */ 76 public class Parser 77 implements DTDConstants 78 { 79 /** 80 * The document template description that will be used to parse the documents. 81 */ 82 protected DTD dtd; 83 84 /** 85 * The value of this field determines whether or not the Parser will be 86 * strict in enforcing SGML compatibility. The default value is false, 87 * stating that the parser should do everything to parse and get at least 88 * some information even from the incorrectly written HTML input. 89 */ 90 protected boolean strict; 91 92 /** 93 * The package level reference to the working HTML parser in this 94 * implementation. 95 */ 96 final gnu.javax.swing.text.html.parser.support.Parser gnu; 97 98 /** 99 * Creates a new parser that uses the given DTD to access data on the 100 * possible tokens, arguments and syntax. There is no single - step way 101 * to get a default DTD; you must either refer to the implementation - 102 * specific packages, write your own DTD or obtain the working instance 103 * of parser in other way, for example, by calling 104 * {@link javax.swing.text.html.HTMLEditorKit#getParser() }. 105 * @param a_dtd A DTD to use. 106 */ Parser(DTD a_dtd)107 public Parser(DTD a_dtd) 108 { 109 dtd = a_dtd; 110 111 final Parser j = this; 112 113 gnu = 114 new gnu.javax.swing.text.html.parser.support.Parser(dtd) 115 { 116 protected final void handleComment(char[] comment) 117 { 118 j.handleComment(comment); 119 } 120 121 protected final void handleEOFInComment() 122 { 123 j.handleEOFInComment(); 124 } 125 126 protected final void handleEmptyTag(TagElement tag) 127 throws javax.swing.text.ChangedCharSetException 128 { 129 j.handleEmptyTag(tag); 130 } 131 132 protected final void handleStartTag(TagElement tag) 133 { 134 j.handleStartTag(tag); 135 } 136 137 protected final void handleEndTag(TagElement tag) 138 { 139 j.handleEndTag(tag); 140 } 141 142 protected final void handleError(int line, String message) 143 { 144 j.handleError(line, message); 145 } 146 147 protected final void handleText(char[] text) 148 { 149 j.handleText(text); 150 } 151 152 protected final void handleTitle(char[] title) 153 { 154 j.handleTitle(title); 155 } 156 157 protected final void markFirstTime(Element element) 158 { 159 j.markFirstTime(element); 160 } 161 162 protected final void startTag(TagElement tag) 163 throws ChangedCharSetException 164 { 165 j.startTag(tag); 166 } 167 168 protected final void endTag(boolean omitted) 169 { 170 j.endTag(omitted); 171 } 172 173 protected TagElement makeTag(Element element) 174 { 175 return j.makeTag(element); 176 } 177 178 protected TagElement makeTag(Element element, boolean isSupposed) 179 { 180 return j.makeTag(element, isSupposed); 181 } 182 }; 183 } 184 185 /** 186 * Parse the HTML text, calling various methods in response to the 187 * occurence of the corresponding HTML constructions. 188 * @param reader The reader to read the source HTML from. 189 * @throws IOException If the reader throws one. 190 */ parse(Reader reader)191 public synchronized void parse(Reader reader) 192 throws IOException 193 { 194 gnu.parse(reader); 195 } 196 197 /** 198 * Parses DTD markup declaration. Currently returns without action. 199 * @return null. 200 * @throws java.io.IOException 201 */ parseDTDMarkup()202 public String parseDTDMarkup() 203 throws IOException 204 { 205 return gnu.parseDTDMarkup(); 206 } 207 208 /** 209 * Parse DTD document declarations. Currently only parses the document 210 * type declaration markup. 211 * @param strBuff 212 * @return true if this is a valid DTD markup declaration. 213 * @throws IOException 214 */ parseMarkupDeclarations(StringBuffer strBuff)215 protected boolean parseMarkupDeclarations(StringBuffer strBuff) 216 throws IOException 217 { 218 return gnu.parseMarkupDeclarations(strBuff); 219 } 220 221 /** 222 * Get the attributes of the current tag. 223 * @return The attribute set, representing the attributes of the current tag. 224 */ getAttributes()225 protected SimpleAttributeSet getAttributes() 226 { 227 return gnu.getAttributes(); 228 } 229 230 /** 231 * Get the number of the document line being parsed. 232 * @return The current line. 233 */ getCurrentLine()234 protected int getCurrentLine() 235 { 236 return gnu.hTag.where.beginLine; 237 } 238 239 /** 240 * Get the current position in the document being parsed. 241 * @return The current position. 242 */ getCurrentPos()243 protected int getCurrentPos() 244 { 245 return gnu.hTag.where.startPosition; 246 } 247 248 /** 249 * The method is called when the HTML end (closing) tag is found or if 250 * the parser concludes that the one should be present in the 251 * current position. The method is called immediatly 252 * before calling the handleEndTag(). 253 * @param omitted True if the tag is no actually present in the document, 254 * but is supposed by the parser (like </html> at the end of the 255 * document). 256 */ endTag(boolean omitted)257 protected void endTag(boolean omitted) 258 { 259 // This default implementation does nothing. 260 } 261 262 /** 263 * Invokes the error handler. The default method in this implementation 264 * finally delegates the call to handleError, also providing the number of the 265 * current line. 266 */ error(String msg)267 protected void error(String msg) 268 { 269 gnu.error(msg); 270 } 271 272 /** 273 * Invokes the error handler. The default method in this implementation 274 * finally delegates the call to error (msg+": '"+invalid+"'"). 275 */ error(String msg, String invalid)276 protected void error(String msg, String invalid) 277 { 278 gnu.error(msg, invalid); 279 } 280 281 /** 282 * Invokes the error handler. The default method in this implementation 283 * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3). 284 */ error(String parm1, String parm2, String parm3)285 protected void error(String parm1, String parm2, String parm3) 286 { 287 gnu.error(parm1, parm2, parm3); 288 } 289 290 /** 291 * Invokes the error handler. The default method in this implementation 292 * finally delegates the call to error 293 * (parm1+" "+ parm2+" "+ parm3+" "+ parm4). 294 */ error(String parm1, String parm2, String parm3, String parm4)295 protected void error(String parm1, String parm2, String parm3, String parm4) 296 { 297 gnu.error(parm1, parm2, parm3, parm4); 298 } 299 300 /** 301 * In this implementation, this is never called and returns without action. 302 */ flushAttributes()303 protected void flushAttributes() 304 { 305 gnu.flushAttributes(); 306 } 307 308 /** 309 * Handle HTML comment. The default method returns without action. 310 * @param comment The comment being handled 311 */ handleComment(char[] comment)312 protected void handleComment(char[] comment) 313 { 314 // This default implementation does nothing. 315 } 316 317 /** 318 * This is additionally called in when the HTML content terminates 319 * without closing the HTML comment. This can only happen if the 320 * HTML document contains errors (for example, the closing --;gt is 321 * missing. The default method calls the error handler. 322 */ handleEOFInComment()323 protected void handleEOFInComment() 324 { 325 gnu.error("Unclosed comment"); 326 } 327 328 /** 329 * Handle the tag with no content, like <br>. The method is 330 * called for the elements that, in accordance with the current DTD, 331 * has an empty content. 332 * @param tag The tag being handled. 333 * @throws javax.swing.text.ChangedCharSetException 334 */ handleEmptyTag(TagElement tag)335 protected void handleEmptyTag(TagElement tag) 336 throws ChangedCharSetException 337 { 338 // This default implementation does nothing. 339 } 340 341 /** 342 * The method is called when the HTML closing tag ((like </table>) 343 * is found or if the parser concludes that the one should be present 344 * in the current position. 345 * @param tag The tag being handled 346 */ handleEndTag(TagElement tag)347 protected void handleEndTag(TagElement tag) 348 { 349 // This default implementation does nothing. 350 } 351 352 /* Handle error that has occured in the given line. */ handleError(int line, String message)353 protected void handleError(int line, String message) 354 { 355 // This default implementation does nothing. 356 } 357 358 /** 359 * The method is called when the HTML opening tag ((like <table>) 360 * is found or if the parser concludes that the one should be present 361 * in the current position. 362 * @param tag The tag being handled 363 */ handleStartTag(TagElement tag)364 protected void handleStartTag(TagElement tag) 365 { 366 // This default implementation does nothing. 367 } 368 369 /** 370 * Handle the text section. 371 * <p> For non-preformatted section, the parser replaces 372 * \t, \r and \n by spaces and then multiple spaces 373 * by a single space. Additionaly, all whitespace around 374 * tags is discarded. 375 * </p> 376 * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves 377 * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n, 378 * if it is present. Additionally, it replaces each occurence of \r or \r\n 379 * by a single \n.</p> 380 * 381 * @param text A section text. 382 */ handleText(char[] text)383 protected void handleText(char[] text) 384 { 385 // This default implementation does nothing. 386 } 387 388 /** 389 * Handle HTML <title> tag. This method is invoked when 390 * both title starting and closing tags are already behind. 391 * The passed argument contains the concatenation of all 392 * title text sections. 393 * @param title The title text. 394 */ handleTitle(char[] title)395 protected void handleTitle(char[] title) 396 { 397 // This default implementation does nothing. 398 } 399 400 /** 401 * Constructs the tag from the given element. In this implementation, 402 * this is defined, but never called. 403 * @param element the base element of the tag. 404 * @return the tag 405 */ makeTag(Element element)406 protected TagElement makeTag(Element element) 407 { 408 return makeTag(element, false); 409 } 410 411 /** 412 * Constructs the tag from the given element. 413 * @param element the tag base {@link javax.swing.text.html.parser.Element} 414 * @param isSupposed true if the tag is not actually present in the 415 * html input, but the parser supposes that it should to occur in 416 * the current location. 417 * @return the tag 418 */ makeTag(Element element, boolean isSupposed)419 protected TagElement makeTag(Element element, boolean isSupposed) 420 { 421 return new TagElement(element, isSupposed); 422 } 423 424 /** 425 * This is called when the tag, representing the given element, 426 * occurs first time in the document. 427 * @param element 428 */ markFirstTime(Element element)429 protected void markFirstTime(Element element) 430 { 431 // This default implementation does nothing. 432 } 433 434 /** 435 * The method is called when the HTML opening tag ((like <table>) 436 * is found or if the parser concludes that the one should be present 437 * in the current position. The method is called immediately before 438 * calling the handleStartTag. 439 * @param tag The tag 440 */ startTag(TagElement tag)441 protected void startTag(TagElement tag) 442 throws ChangedCharSetException 443 { 444 // This default implementation does nothing. 445 } 446 } 447