1 /* Parser.java -- HTML parser. 2 Copyright (C) 2005 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. 
*/ 37 38 39 package gnu.javax.swing.text.html.parser.support; 40 41 import gnu.java.lang.CPStringBuilder; 42 43 import gnu.javax.swing.text.html.parser.htmlAttributeSet; 44 import gnu.javax.swing.text.html.parser.htmlValidator; 45 import gnu.javax.swing.text.html.parser.support.low.Constants; 46 import gnu.javax.swing.text.html.parser.support.low.ParseException; 47 import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer; 48 import gnu.javax.swing.text.html.parser.support.low.Token; 49 import gnu.javax.swing.text.html.parser.support.low.node; 50 import gnu.javax.swing.text.html.parser.support.low.pattern; 51 52 import java.io.IOException; 53 import java.io.Reader; 54 55 import java.util.Comparator; 56 import java.util.Set; 57 import java.util.TreeSet; 58 import java.util.Vector; 59 60 import javax.swing.text.ChangedCharSetException; 61 import javax.swing.text.SimpleAttributeSet; 62 import javax.swing.text.html.HTML; 63 import javax.swing.text.html.parser.AttributeList; 64 import javax.swing.text.html.parser.DTD; 65 import javax.swing.text.html.parser.DTDConstants; 66 import javax.swing.text.html.parser.Element; 67 import javax.swing.text.html.parser.Entity; 68 import javax.swing.text.html.parser.TagElement; 69 70 /** 71 * <p>A simple error-tolerant HTML parser that uses a DTD document 72 * to access data on the possible tokens, arguments and syntax.</p> 73 * <p> The parser reads an HTML content from a Reader and calls various 74 * notifying methods (which should be overridden in a subclass) 75 * when tags or data are encountered.</p> 76 * <p>Some HTML elements need no opening or closing tags. The 77 * task of this parser is to invoke the tag handling methods also when 78 * the tags are not explicitly specified and must be supposed using 79 * information, stored in the DTD. 
* For example, parsing the document
 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
 * will invoke the handling methods in exactly the same order
 * (and with the same parameters) as if parsing the document: <br>
 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
 * (supposed tags are given in italics). The parser also supports
 * obsolete elements of HTML syntax.<p>
 * </p>
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public class Parser
  extends ReaderTokenizer
  implements DTDConstants
{
  /**
   * The html tag that is currently being processed.
   */
  public Token hTag = new Token();

  /**
   * The document type definition that drives the parsing.
   */
  protected DTD dtd;

  /**
   * Whether the parser enforces strict SGML compatibility. It is false
   * by default, meaning that the parser does its best to extract at least
   * some information even from incorrectly written HTML input.
   */
  protected boolean strict;

  /**
   * Nesting counter of preformatted tags; it is positive while inside one.
   */
  protected int preformatted;

  /**
   * The set of the document tags, compared ignoring the character case.
   * This field is used for supporting markFirstTime().
   */
  private Set documentTags = new TreeSet(String.CASE_INSENSITIVE_ORDER);

  /**
   * Collects the incremental output like text or comment.
   */
  private final StringBuffer buffer = new StringBuffer();

  /**
   * Collects the text of the document title.
   */
  private final StringBuffer title = new StringBuffer();

  /**
   * The current token.
146 */ 147 private Token t; 148 149 /** 150 * True means that the 'title' tag of this document has 151 * already been handled. 152 */ 153 private boolean titleHandled; 154 155 /** 156 * True means that the 'title' tag is currently open and all 157 * text is also added to the title buffer. 158 */ 159 private boolean titleOpen; 160 161 /** 162 * The attributes of the current HTML element. 163 * Package-private to avoid an accessor method. 164 */ 165 htmlAttributeSet attributes = 166 htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; 167 168 /** 169 * The validator, controlling the forcible closing of the tags that 170 * (in accordance to dtd) are not allowed in the current context. 171 */ 172 private htmlValidator validator; 173 174 /** 175 * Provides the default values for parameters in the case when these 176 * values are defined in the DTD. 177 */ 178 private parameterDefaulter defaulter; 179 180 /** 181 * The text pre-processor for handling line ends and tabs. 182 */ 183 private textPreProcessor textProcessor = new textPreProcessor(); 184 185 /** 186 * Creates a new Parser that uses the given 187 * {@link javax.swing.text.html.parser.DTD }. The only standard way 188 * to get an instance of DTD is to construct it manually, filling in 189 * all required fields. 190 * @param a_dtd The DTD to use. The parser behaviour after passing null 191 * as an argument is not documented and may vary between implementations. 192 */ Parser(DTD a_dtd)193 public Parser(DTD a_dtd) 194 { 195 if (a_dtd == null) 196 dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance(); 197 else 198 dtd = a_dtd; 199 200 defaulter = new parameterDefaulter(dtd); 201 202 validator = 203 new htmlValidator(dtd) 204 { 205 /** 206 * Handles the error message. This method must be overridden to pass 207 * the message where required. 208 * @param msg The message text. 
209 */ 210 protected void s_error(String msg) 211 { 212 error(msg); 213 } 214 215 /** 216 * The method is called when the tag validator decides to close the 217 * tag on its own initiative. After reaching the end of stream, 218 * The tag validator closes all unclosed elements that are required 219 * to have the end (closing) tag. 220 * 221 * @param tElement The tag being fictionally (forcibly) closed. 222 */ 223 protected void handleSupposedEndTag(Element tElement) 224 { 225 // The tag is cloned as the original tElement is the 226 // element from the starting tag - may be accidently used 227 // somewhere else. 228 TagElement tag = makeTag(tElement, true); 229 _handleEndTag_remaining(tag); 230 } 231 232 /** 233 * The method is called when the the tag validator decides to open 234 * the new tag on its own initiative. The tags, opened in this 235 * way, are HTML, HEAD and BODY. The attribute set is temporary 236 * assigned to the empty one, the previous value is 237 * restored before return. 238 * 239 * @param tElement The tag being fictionally (forcibly) closed. 240 */ 241 protected void handleSupposedStartTag(Element tElement) 242 { 243 TagElement tag = makeTag(tElement, true); 244 htmlAttributeSet were = attributes; 245 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; 246 _handleStartTag(tag); 247 attributes = were; 248 } 249 }; 250 } 251 252 /** 253 * Get the attributes of the current tag. 254 * @return The attribute set, representing the attributes of the current tag. 255 */ getAttributes()256 public SimpleAttributeSet getAttributes() 257 { 258 return new SimpleAttributeSet(attributes); 259 } 260 261 /** 262 * Invokes the error handler. The default method in this implementation 263 * delegates the call to handleError, also providing the current line. 
264 */ error(String msg)265 public void error(String msg) 266 { 267 error(msg, getTokenAhead()); 268 } 269 error(String msg, Token atToken)270 public void error(String msg, Token atToken) 271 { 272 if (atToken != null) 273 handleError(atToken.where.beginLine, 274 msg + ": line " + atToken.where.beginLine + 275 ", absolute pos " + atToken.where.startPosition 276 ); 277 else 278 handleError(0, msg); 279 } 280 281 /** 282 * Invokes the error handler. The default method in this implementation 283 * delegates the call to error (parm1+": '"+parm2+"'"). 284 */ error(String msg, String invalid)285 public void error(String msg, String invalid) 286 { 287 error(msg + ": '" + invalid + "'"); 288 } 289 290 /** 291 * Invokes the error handler. The default method in this implementation 292 * delegates the call to error (parm1+" "+ parm2+" "+ parm3). 293 */ error(String parm1, String parm2, String parm3)294 public void error(String parm1, String parm2, String parm3) 295 { 296 error(parm1 + " " + parm2 + " " + parm3); 297 } 298 299 /** 300 * Invokes the error handler. The default method in this implementation 301 * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4). 302 */ error(String parm1, String parm2, String parm3, String parm4)303 public void error(String parm1, String parm2, String parm3, String parm4) 304 { 305 error(parm1 + " " + parm2 + " " + parm3 + " " + parm4); 306 } 307 flushAttributes()308 public void flushAttributes() 309 { 310 } 311 312 /** 313 * Parse the HTML text, calling various methods in response to the 314 * occurence of the corresponding HTML constructions. 315 * @param reader The reader to read the source HTML from. 316 * @throws IOException If the reader throws one. 
317 */ parse(Reader reader)318 public synchronized void parse(Reader reader) 319 throws IOException 320 { 321 reset(reader); 322 restart(); 323 try 324 { 325 parseDocument(); 326 validator.closeAll(); 327 } 328 catch (ParseException ex) 329 { 330 if (ex != null) 331 { 332 error("Unable to continue parsing the document", ex.getMessage()); 333 334 Throwable cause = ex.getCause(); 335 if (cause instanceof IOException) 336 throw (IOException) cause; 337 } 338 } 339 } 340 341 /** 342 * Parses DTD markup declaration. Currently returns null without action. 343 * @return null. 344 * @throws IOException 345 */ parseDTDMarkup()346 public String parseDTDMarkup() 347 throws IOException 348 { 349 return null; 350 } 351 352 /** 353 * Parse SGML insertion ( <! ... > ). When the 354 * the SGML insertion is found, this method is called, passing 355 * SGML in the string buffer as a parameter. The default method 356 * returns false without action and can be overridden to 357 * implement user - defined SGML support. 358 * <p> 359 * If you need more information about SGML insertions in HTML documents, 360 * the author suggests to read SGML tutorial on 361 * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}. 362 * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>, 363 * Oxford University Press, 688 p, ISBN: 0198537379. 364 * </p> 365 * @param strBuff 366 * @return true if this is a valid DTD markup declaration. 367 * @throws IOException 368 */ parseMarkupDeclarations(StringBuffer strBuff)369 public boolean parseMarkupDeclarations(StringBuffer strBuff) 370 throws IOException 371 { 372 return false; 373 } 374 375 /** 376 * Get the first line of the last parsed token. 377 */ getCurrentLine()378 protected int getCurrentLine() 379 { 380 return hTag.where.beginLine; 381 } 382 383 /** 384 * Read parseable character data, add to buffer. 
385 * @param clearBuffer If true, buffer if filled by CDATA section, 386 * otherwise the section is appended to the existing content of the 387 * buffer. 388 * 389 * @throws ParseException 390 */ CDATA(boolean clearBuffer)391 protected void CDATA(boolean clearBuffer) 392 throws ParseException 393 { 394 Token start = hTag = getTokenAhead(); 395 396 if (clearBuffer) 397 buffer.setLength(0); 398 399 // Handle expected EOF. 400 if (start.kind == EOF) 401 return; 402 403 read: 404 while (true) 405 { 406 t = getTokenAhead(); 407 if (t.kind == EOF) 408 { 409 error("unexpected eof", t); 410 break read; 411 } 412 else if (t.kind == BEGIN) 413 break read; 414 else if (t.kind == Constants.ENTITY) 415 { 416 resolveAndAppendEntity(t); 417 getNextToken(); 418 } 419 else 420 { 421 append(t); 422 getNextToken(); 423 } 424 } 425 hTag = new Token(start, getTokenAhead(0)); 426 if (buffer.length() != 0) 427 _handleText(); 428 } 429 430 /** 431 * Process Comment. This method skips till --> without 432 * taking SGML constructs into consideration. The supported SGML 433 * constructs are handled separately. 
*/
  protected void Comment()
    throws ParseException
  {
    buffer.setLength(0);

    // Consume the comment opening: '<', '!' and the double dash, with
    // optional whitespace in between.
    Token start = hTag = mustBe(BEGIN);
    optional(WS);
    mustBe(EXCLAMATION);
    optional(WS);
    mustBe(DOUBLE_DASH);

    // Note: this local variable hides the field with the same name.
    Token t;
    Token last;

    comment:
    while (true)
      {
        t = getTokenAhead();
        if (t.kind == EOF)
          {
            // The document ended inside the comment.
            handleEOFInComment();
            last = t;
            break comment;
          }
        else if (COMMENT_END.matches(this))
          {
            // Regular end: double dash, optional whitespace, closing END.
            mustBe(DOUBLE_DASH);
            optional(WS);
            last = mustBe(END);
            break comment;
          }
        else if (COMMENT_TRIPLEDASH_END.matches(this))
          {
            mustBe(DOUBLE_DASH);
            t = mustBe(NUMTOKEN);
            if (t.getImage().equals("-"))
              {
                // Three dashes before the closing END: the third dash is
                // kept as a part of the comment text.
                append(t);
                last = mustBe(END);
                break comment;
              }
            else
              {
                // Not the end of the comment: keep the consumed double
                // dash and the token as text and continue.
                buffer.append("--");
                append(t);
                t = getTokenAhead();
              }
          }
        else
        /* The lllll-- can match as NUMTOKEN */
        if ((t.getImage().endsWith("--")) &&
            (
              getTokenAhead(1).kind == END ||
              (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
            )
           )
          {
            // Append the token image without its two trailing dashes.
            buffer.append(t.getImage().substring(0, t.getImage().length() - 2));

            /* Skip the closing > that we have already checked. */
            // NOTE(review): this call consumes the token t itself; the
            // END token that was looked ahead does not seem to be consumed
            // here - confirm that this is intended.
            last = mustBe(t.kind);
            break comment;
          }
        else
          append(t);
        // Consume the token that was examined in this iteration.
        mustBe(t.kind);
      }
    hTag = new Token(start, last);

    // Consume any whitespace immediately following a comment.
    optional(WS);
    handleComment();
  }

  /**
   * Read a script. The text, returned without any changes,
   * is terminated only by the closing tag SCRIPT.
512 */ Script()513 protected void Script() 514 throws ParseException 515 { 516 Token name; 517 518 Token start = hTag = mustBe(BEGIN); 519 optional(WS); 520 521 name = mustBe(SCRIPT); 522 523 optional(WS); 524 525 restOfTag(false, name, start); 526 527 buffer.setLength(0); 528 529 while (!SCRIPT_CLOSE.matches(this)) 530 { 531 append(getNextToken()); 532 } 533 534 consume(SCRIPT_CLOSE); 535 536 _handleText(); 537 538 endTag(false); 539 _handleEndTag(makeTagElement(name.getImage(), false)); 540 } 541 542 /** 543 * Process SGML insertion that is not a comment. 544 */ Sgml()545 protected void Sgml() 546 throws ParseException 547 { 548 if (COMMENT_OPEN.matches(this)) 549 Comment(); 550 else // skip till ">" 551 { 552 Token start = hTag = mustBe(BEGIN); 553 optional(WS); 554 mustBe(EXCLAMATION); 555 556 buffer.setLength(0); 557 read: 558 while (true) 559 { 560 t = getNextToken(); 561 if (t.kind == Constants.ENTITY) 562 { 563 resolveAndAppendEntity(t); 564 } 565 else if (t.kind == EOF) 566 { 567 error("unexpected eof", t); 568 break read; 569 } 570 else if (t.kind == END) 571 break read; 572 else 573 append(t); 574 } 575 576 try 577 { 578 parseMarkupDeclarations(buffer); 579 } 580 catch (IOException ex) 581 { 582 error("Unable to parse SGML insertion: '" + buffer + "'", 583 new Token(start, t) 584 ); 585 } 586 } 587 // Consume any whitespace that follows the Sgml insertion. 588 optional(WS); 589 } 590 591 /** 592 * Read a style definition. The text, returned without any changes, 593 * is terminated only by the closing tag STYLE. 
594 */ Style()595 protected void Style() 596 throws ParseException 597 { 598 Token name; 599 600 Token start = hTag = mustBe(BEGIN); 601 optional(WS); 602 603 name = mustBe(STYLE); 604 605 optional(WS); 606 607 restOfTag(false, name, start); 608 609 buffer.setLength(0); 610 611 while (!STYLE_CLOSE.matches(this)) 612 { 613 append(getNextToken()); 614 } 615 616 consume(STYLE_CLOSE); 617 618 _handleText(); 619 620 endTag(false); 621 _handleEndTag(makeTagElement(name.getImage(), false)); 622 } 623 624 /** 625 * Read a html tag. 626 */ Tag()627 protected void Tag() 628 throws ParseException 629 { 630 mark(true); 631 632 boolean closing = false; 633 Token name; 634 Token start = hTag = mustBe(BEGIN); 635 636 optional(WS); 637 name = getNextToken(); 638 optional(WS); 639 640 if (name.kind == SLASH) 641 { 642 closing = true; 643 name = getNextToken(); 644 } 645 646 restOfTag(closing, name, start); 647 } 648 649 /** 650 * A hook, for operations, preceeding call to handleText. 651 * Handle text in a string buffer. 652 * In non - preformatted mode, all line breaks immediately following the 653 * start tag and immediately before an end tag is discarded, 654 * \r, \n and \t are replaced by spaces, multiple space are replaced 655 * by the single one and the result is moved into array, 656 * passing it to handleText(). 657 */ _handleText()658 protected void _handleText() 659 { 660 char[] text; 661 662 if (preformatted > 0) 663 text = textProcessor.preprocessPreformatted(buffer); 664 else 665 text = textProcessor.preprocess(buffer); 666 667 if (text != null && text.length > 0 668 // According to the specs we need to discard whitespace immediately 669 // before a closing tag. 670 && (text.length > 1 || text[0] != ' ' || ! 
TAG_CLOSE.matches(this))) 671 { 672 TagElement pcdata = new TagElement(dtd.getElement("#pcdata")); 673 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; 674 _handleEmptyTag(pcdata); 675 676 handleText(text); 677 if (titleOpen) 678 title.append(text); 679 } 680 } 681 682 /** 683 * Add the image of this token to the buffer. 684 * @param t A token to append. 685 */ append(Token t)686 protected final void append(Token t) 687 { 688 if (t.kind != EOF) 689 t.appendTo(buffer); 690 } 691 692 /** 693 * Consume pattern that must match. 694 * @param p A pattern to consume. 695 */ consume(pattern p)696 protected final void consume(pattern p) 697 { 698 node n; 699 for (int i = 0; i < p.nodes.length; i++) 700 { 701 n = p.nodes [ i ]; 702 if (n.optional) 703 optional(n.kind); 704 else 705 mustBe(n.kind); 706 } 707 } 708 709 /** 710 * The method is called when the HTML end (closing) tag is found or if 711 * the parser concludes that the one should be present in the 712 * current position. The method is called immediatly 713 * before calling the handleEndTag(). 714 * @param omitted True if the tag is no actually present in the document, 715 * but is supposed by the parser (like </html> at the end of the 716 * document). 717 */ endTag(boolean omitted)718 protected void endTag(boolean omitted) 719 { 720 } 721 722 /** 723 * Handle HTML comment. The default method returns without action. 724 * @param comment 725 */ handleComment(char[] comment)726 protected void handleComment(char[] comment) 727 { 728 } 729 730 /** 731 * This is additionally called in when the HTML content terminates 732 * without closing the HTML comment. This can only happen if the 733 * HTML document contains errors (for example, the closing --;gt is 734 * missing. 735 */ handleEOFInComment()736 protected void handleEOFInComment() 737 { 738 error("Unclosed comment"); 739 } 740 741 /** 742 * Handle the tag with no content, like <br>. 
* The method is called for the elements that, in accordance with the
   * current DTD, have an empty content. The default method returns
   * without action.
   * @param tag The tag being handled.
   * @throws javax.swing.text.ChangedCharSetException
   */
  protected void handleEmptyTag(TagElement tag)
    throws ChangedCharSetException
  {
  }

  /**
   * The method is called when the HTML closing tag (like &lt;/table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag
   */
  protected void handleEndTag(TagElement tag)
  {
  }

  /**
   * Handle the error that has occurred in the given line. The default
   * method returns without action.
   * @param line The line where the error was detected.
   * @param message The error message.
   */
  protected void handleError(int line, String message)
  {
  }

  /**
   * The method is called when the HTML opening tag (like &lt;table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag
   */
  protected void handleStartTag(TagElement tag)
  {
  }

  /**
   * Handle the text section. The default method returns without action.
   * <p> For non-preformatted section, the parser replaces
   * \t, \r and \n by spaces and then multiple spaces
   * by a single space. Additionally, all whitespace around
   * tags is discarded.
   * </p>
   * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser
   * preserves all tabs and spaces, but removes <b>one</b> bounding \r, \n
   * or \r\n, if it is present. Additionally, it replaces each occurrence
   * of \r or \r\n by a single \n.</p>
   *
   * @param text A section text.
   */
  protected void handleText(char[] text)
  {
  }

  /**
   * Handle HTML &lt;title&gt; tag. This method is invoked when
   * both title starting and closing tags are already behind.
799 * The passed argument contains the concatenation of all 800 * title text sections. 801 * @param title The title text. 802 */ handleTitle(char[] title)803 protected void handleTitle(char[] title) 804 { 805 } 806 807 /** 808 * Constructs the tag from the given element. In this implementation, 809 * this is defined, but never called. 810 * @return the tag 811 */ makeTag(Element element)812 protected TagElement makeTag(Element element) 813 { 814 return makeTag(element, false); 815 } 816 817 /** 818 * Constructs the tag from the given element. 819 * @param the tag base {@link javax.swing.text.html.parser.Element} 820 * @param isSupposed true if the tag is not actually present in the 821 * html input, but the parser supposes that it should to occur in 822 * the current location. 823 * @return the tag 824 */ makeTag(Element element, boolean isSupposed)825 protected TagElement makeTag(Element element, boolean isSupposed) 826 { 827 return new TagElement(element, isSupposed); 828 } 829 830 /** 831 * This is called when the tag, representing the given element, 832 * occurs first time in the document. 833 * @param element 834 */ markFirstTime(Element element)835 protected void markFirstTime(Element element) 836 { 837 } 838 839 /** 840 * Consume the token that was checked before and hence MUST be present. 841 * @param kind The kind of token to consume. 842 */ mustBe(int kind)843 protected Token mustBe(int kind) 844 { 845 if (getTokenAhead().kind == kind) 846 return getNextToken(); 847 else 848 { 849 String ei = ""; 850 if (kind < 1000) 851 ei = " ('" + (char) kind + "') "; 852 throw new AssertionError("The token of kind " + kind + ei + 853 " MUST be here," 854 ); 855 } 856 } 857 858 /** 859 * Handle attribute without value. The default method uses 860 * the only allowed attribute value from DTD. 861 * If the attribute is unknown or allows several values, 862 * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with 863 * this value is added to the attribute set. 
864 * @param element The name of element. 865 * @param attribute The name of attribute without value. 866 */ noValueAttribute(String element, String attribute)867 protected void noValueAttribute(String element, String attribute) 868 { 869 Object value = HTML.NULL_ATTRIBUTE_VALUE; 870 871 Element e = dtd.elementHash.get(element.toLowerCase()); 872 if (e != null) 873 { 874 AttributeList attr = e.getAttribute(attribute); 875 if (attr != null) 876 { 877 Vector values = attr.values; 878 if (values != null && values.size() == 1) 879 value = values.get(0); 880 } 881 } 882 attributes.addAttribute(attribute, value); 883 } 884 885 /** 886 * Consume the optional token, if present. 887 * @param kind The kind of token to consume. 888 */ optional(int kind)889 protected Token optional(int kind) 890 { 891 if (getTokenAhead().kind == kind) 892 return getNextToken(); 893 else 894 return null; 895 } 896 897 /** Parse the html document. */ parseDocument()898 protected void parseDocument() 899 throws ParseException 900 { 901 // Read up any initial whitespace. 902 optional(WS); 903 while (getTokenAhead().kind != EOF) 904 { 905 advanced = false; 906 if (TAG.matches(this)) 907 Tag(); 908 else if (COMMENT_OPEN.matches(this)) 909 Comment(); 910 else if (STYLE_OPEN.matches(this)) 911 Style(); 912 else if (SCRIPT_OPEN.matches(this)) 913 Script(); 914 else if (SGML.matches(this)) 915 Sgml(); 916 else 917 CDATA(true); 918 919 // Surely HTML error, treat as a text. 920 if (!advanced) 921 { 922 Token wrong = getNextToken(); 923 error("unexpected '" + wrong.getImage() + "'", wrong); 924 buffer.setLength(0); 925 buffer.append(wrong.getImage()); 926 _handleText(); 927 } 928 } 929 } 930 931 /** 932 * Read the element attributes, adding them into attribute set. 933 * @param element The element name (needed to access attribute 934 * information in dtd). 
935 */ readAttributes(String element)936 protected void readAttributes(String element) 937 { 938 Token name; 939 Token value; 940 Token next; 941 String attrValue; 942 943 attributes = new htmlAttributeSet(); 944 945 optional(WS); 946 947 attributeReading: 948 while (getTokenAhead().kind == NUMTOKEN) 949 { 950 name = getNextToken(); 951 optional(WS); 952 953 next = getTokenAhead(); 954 if (next.kind == EQ) 955 { 956 mustBe(EQ); 957 optional(WS); 958 959 next = getNextToken(); 960 961 switch (next.kind) 962 { 963 case QUOT: 964 965 // read "quoted" attribute. 966 buffer.setLength(0); 967 readTillTokenE(QUOT); 968 attrValue = buffer.toString(); 969 break; 970 971 case AP: 972 973 // read 'quoted' attribute. 974 buffer.setLength(0); 975 readTillTokenE(AP); 976 attrValue = buffer.toString(); 977 break; 978 979 // read unquoted attribute. 980 case NUMTOKEN: 981 value = next; 982 optional(WS); 983 984 // Check maybe the opening quote is missing. 985 next = getTokenAhead(); 986 if (bQUOTING.get(next.kind)) 987 { 988 hTag = next; 989 error("The value without opening quote is closed with '" 990 + next.getImage() + "'"); 991 attrValue = value.getImage(); 992 } 993 else if (next.kind == SLASH || next.kind == OTHER) 994 // The slash and other characters (like %) in this context is 995 // treated as the ordinary 996 // character, not as a token. The character may be part of 997 // the unquoted URL. 998 { 999 CPStringBuilder image = new CPStringBuilder(value.getImage()); 1000 while (next.kind == NUMTOKEN || next.kind == SLASH 1001 || next.kind == OTHER) 1002 { 1003 image.append(getNextToken().getImage()); 1004 next = getTokenAhead(); 1005 } 1006 attrValue = image.toString(); 1007 } 1008 else 1009 attrValue = value.getImage(); 1010 break; 1011 1012 case SLASH: 1013 value = next; 1014 optional(WS); 1015 1016 // Check maybe the opening quote is missing. 
1017 next = getTokenAhead(); 1018 if (bQUOTING.get(next.kind)) 1019 { 1020 hTag = next; 1021 error("The value without opening quote is closed with '" 1022 + next.getImage() + "'"); 1023 attrValue = value.getImage(); 1024 } 1025 else if (next.kind == NUMTOKEN || next.kind == SLASH) 1026 // The slash in this context is treated as the ordinary 1027 // character, not as a token. The slash may be part of 1028 // the unquoted URL. 1029 { 1030 CPStringBuilder image = new CPStringBuilder(value.getImage()); 1031 while (next.kind == NUMTOKEN || next.kind == SLASH) 1032 { 1033 image.append(getNextToken().getImage()); 1034 next = getTokenAhead(); 1035 } 1036 attrValue = image.toString(); 1037 } 1038 else 1039 attrValue = value.getImage(); 1040 break; 1041 default: 1042 break attributeReading; 1043 } 1044 attributes.addAttribute(name.getImage(), attrValue); 1045 optional(WS); 1046 } 1047 else 1048 // The '=' is missing: attribute without value. 1049 { 1050 noValueAttribute(element, name.getImage()); 1051 } 1052 } 1053 } 1054 1055 /** 1056 * Return string, corresponding the given named entity. The name is passed 1057 * with the preceeding &, but without the ending semicolon. 1058 */ resolveNamedEntity(final String a_tag)1059 protected String resolveNamedEntity(final String a_tag) 1060 { 1061 // Discard & 1062 if (!a_tag.startsWith("&")) 1063 throw new AssertionError("Named entity " + a_tag + 1064 " must start witn '&'." 1065 ); 1066 1067 String tag = a_tag.substring(1); 1068 1069 try 1070 { 1071 Entity entity = dtd.getEntity(tag); 1072 if (entity != null) 1073 return entity.getString(); 1074 1075 entity = dtd.getEntity(tag.toLowerCase()); 1076 1077 if (entity != null) 1078 { 1079 error("The name of this entity should be in lowercase", a_tag); 1080 return entity.getString(); 1081 } 1082 } 1083 catch (IndexOutOfBoundsException ibx) 1084 { 1085 /* The error will be reported. 
*/ 1086 } 1087 1088 error("Unknown named entity", a_tag); 1089 return a_tag; 1090 } 1091 1092 /** 1093 * Return char, corresponding the given numeric entity. 1094 * The name is passed with the preceeding &#, but without 1095 * the ending semicolon. 1096 */ resolveNumericEntity(final String a_tag)1097 protected char resolveNumericEntity(final String a_tag) 1098 { 1099 // Discard &# 1100 if (!a_tag.startsWith("&#")) 1101 throw new AssertionError("Numeric entity " + a_tag + 1102 " must start witn '&#'." 1103 ); 1104 1105 String tag = a_tag.substring(2); 1106 1107 try 1108 { 1109 // Determine the encoding type: 1110 char cx = tag.charAt(0); 1111 if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn; 1112 1113 return (char) Integer.parseInt(tag.substring(1), 16); 1114 1115 return (char) Integer.parseInt(tag); 1116 } 1117 1118 /* The error will be reported. */ 1119 catch (NumberFormatException nex) 1120 { 1121 } 1122 catch (IndexOutOfBoundsException ix) 1123 { 1124 } 1125 1126 error("Invalid numeric entity", a_tag); 1127 return '?'; 1128 } 1129 1130 /** 1131 * Reset all fields into the intial default state, preparing the 1132 * parset for parsing the next document. 1133 */ restart()1134 protected void restart() 1135 { 1136 documentTags.clear(); 1137 titleHandled = false; 1138 titleOpen = false; 1139 buffer.setLength(0); 1140 title.setLength(0); 1141 validator.restart(); 1142 } 1143 1144 /** 1145 * The method is called when the HTML opening tag ((like <table>) 1146 * is found or if the parser concludes that the one should be present 1147 * in the current position. The method is called immediately before 1148 * calling the handleStartTag. 1149 * @param tag The tag 1150 */ startTag(TagElement tag)1151 protected void startTag(TagElement tag) 1152 throws ChangedCharSetException 1153 { 1154 } 1155 1156 /** 1157 * Handle a complete element, when the tag content is already present in the 1158 * buffer and both starting and heading tags behind. 
This is called 1159 * in the case when the tag text must not be parsed for the nested 1160 * elements (elements STYLE and SCRIPT). 1161 */ _handleCompleteElement(TagElement tag)1162 private void _handleCompleteElement(TagElement tag) 1163 { 1164 _handleStartTag(tag); 1165 1166 // Suppress inclusion of the SCRIPT ans STYLE texts into the title. 1167 HTML.Tag h = tag.getHTMLTag(); 1168 if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE) 1169 { 1170 boolean tmp = titleOpen; 1171 titleOpen = false; 1172 _handleText(); 1173 titleOpen = tmp; 1174 } 1175 else 1176 _handleText(); 1177 1178 _handleEndTag(tag); 1179 } 1180 1181 /** 1182 * A hooks for operations, preceeding call to handleEmptyTag(). 1183 * Handle the tag with no content, like <br>. As no any 1184 * nested tags are expected, the tag validator is not involved. 1185 * @param tag The tag being handled. 1186 */ _handleEmptyTag(TagElement tag)1187 private void _handleEmptyTag(TagElement tag) 1188 { 1189 try 1190 { 1191 validator.validateTag(tag, attributes); 1192 handleEmptyTag(tag); 1193 HTML.Tag h = tag.getHTMLTag(); 1194 // When a block tag is closed, consume whitespace that follows after 1195 // it. 1196 // For some unknown reason a FRAME tag is not treated as block element. 1197 // However in this case it should be treated as such. 1198 if (isBlock(h)) 1199 optional(WS); 1200 } 1201 catch (ChangedCharSetException ex) 1202 { 1203 error("Changed charset exception:", ex.getMessage()); 1204 } 1205 } 1206 1207 /** 1208 * A hooks for operations, preceeding call to handleEndTag(). 1209 * The method is called when the HTML closing tag 1210 * is found. Calls handleTitle after closing the 'title' tag. 1211 * @param tag The tag 1212 */ _handleEndTag(TagElement tag)1213 private void _handleEndTag(TagElement tag) 1214 { 1215 if (validator.closeTag(tag)) 1216 _handleEndTag_remaining(tag); 1217 } 1218 1219 /** 1220 * Actions that are also required if the closing action was 1221 * initiated by the tag validator. 
1222 * Package-private to avoid an accessor method. 1223 */ _handleEndTag_remaining(TagElement tag)1224 void _handleEndTag_remaining(TagElement tag) 1225 { 1226 HTML.Tag h = tag.getHTMLTag(); 1227 1228 handleEndTag(tag); 1229 endTag(tag.fictional()); 1230 1231 if (h.isPreformatted()) 1232 preformatted--; 1233 if (preformatted < 0) 1234 preformatted = 0; 1235 1236 // When a block tag is closed, consume whitespace that follows after 1237 // it. 1238 if (isBlock(h)) 1239 optional(WS); 1240 1241 if (h == HTML.Tag.TITLE) 1242 { 1243 titleOpen = false; 1244 titleHandled = true; 1245 1246 char[] a = new char[ title.length() ]; 1247 title.getChars(0, a.length, a, 0); 1248 handleTitle(a); 1249 } 1250 } 1251 1252 /** 1253 * A hooks for operations, preceeding call to handleStartTag(). 1254 * The method is called when the HTML opening tag ((like <table>) 1255 * is found. 1256 * Package-private to avoid an accessor method. 1257 * @param tag The tag 1258 */ _handleStartTag(TagElement tag)1259 void _handleStartTag(TagElement tag) 1260 { 1261 validator.openTag(tag, attributes); 1262 startingTag(tag); 1263 handleStartTag(tag); 1264 1265 HTML.Tag h = tag.getHTMLTag(); 1266 1267 if (isBlock(h)) 1268 optional(WS); 1269 1270 if (h.isPreformatted()) 1271 preformatted++; 1272 1273 if (h == HTML.Tag.TITLE) 1274 { 1275 if (titleHandled) 1276 error("Repetetive <TITLE> tag"); 1277 titleOpen = true; 1278 titleHandled = false; 1279 } 1280 } 1281 1282 /** 1283 * Resume parsing after heavy errors in HTML tag structure. 1284 * @throws ParseException 1285 */ forciblyCloseTheTag()1286 private void forciblyCloseTheTag() 1287 throws ParseException 1288 { 1289 int closeAt = 0; 1290 buffer.setLength(0); 1291 1292 ahead: 1293 for (int i = 1; i < 100; i++) 1294 { 1295 t = getTokenAhead(i - 1); 1296 if (t.kind == EOF || t.kind == BEGIN) 1297 break ahead; 1298 if (t.kind == END) 1299 { 1300 /* Closing '>' found. 
*/ 1301 closeAt = i; 1302 break ahead; 1303 } 1304 } 1305 if (closeAt > 0) 1306 { 1307 buffer.append("Ignoring '"); 1308 for (int i = 1; i <= closeAt; i++) 1309 { 1310 t = getNextToken(); 1311 append(t); 1312 } 1313 buffer.append('\''); 1314 error(buffer.toString()); 1315 } 1316 } 1317 1318 /** 1319 * Handle comment in string buffer. You can avoid allocating a char 1320 * array each time by processing your comment directly here. 1321 */ handleComment()1322 private void handleComment() 1323 { 1324 char[] a = new char[ buffer.length() ]; 1325 buffer.getChars(0, a.length, a, 0); 1326 handleComment(a); 1327 } 1328 makeTagElement(String name, boolean isSupposed)1329 private TagElement makeTagElement(String name, boolean isSupposed) 1330 { 1331 Element e = dtd.elementHash.get(name.toLowerCase()); 1332 if (e == null) 1333 { 1334 error("Unknown tag <" + name + ">"); 1335 e = dtd.getElement(name); 1336 e.name = name.toUpperCase(); 1337 e.index = -1; 1338 } 1339 1340 if (!documentTags.contains(e.name)) 1341 { 1342 markFirstTime(e); 1343 documentTags.add(e.name); 1344 } 1345 1346 return makeTag(e, isSupposed); 1347 } 1348 1349 /** 1350 * Read till the given token, resolving entities. Consume the given 1351 * token without adding it to buffer. 
1352 * @param till The token to read till 1353 * @throws ParseException 1354 */ readTillTokenE(int till)1355 private void readTillTokenE(int till) 1356 throws ParseException 1357 { 1358 buffer.setLength(0); 1359 read: 1360 while (true) 1361 { 1362 t = getNextToken(); 1363 if (t.kind == Constants.ENTITY) 1364 { 1365 resolveAndAppendEntity(t); 1366 } 1367 else if (t.kind == EOF) 1368 { 1369 error("unexpected eof", t); 1370 break read; 1371 } 1372 else if (t.kind == till) 1373 break read; 1374 else if (t.kind == WS) 1375 { 1376 // Processing whitespace in accordance with CDATA rules: 1377 String s = t.getImage(); 1378 char c; 1379 for (int i = 0; i < s.length(); i++) 1380 { 1381 c = s.charAt(i); 1382 if (c == '\r') 1383 buffer.append(' '); // CR replaced by space 1384 else if (c == '\n') 1385 { /* LF ignored */ } 1386 else if (c == '\t') 1387 buffer.append(' '); // Tab replaced by space 1388 else 1389 buffer.append(c); 1390 } 1391 } 1392 else 1393 append(t); 1394 } 1395 } 1396 1397 /** 1398 * Resolve the entity and append it to the end of buffer. 1399 * @param entity 1400 */ resolveAndAppendEntity(Token entity)1401 private void resolveAndAppendEntity(Token entity) 1402 { 1403 switch (entity.category) 1404 { 1405 case ENTITY_NAMED : 1406 buffer.append(resolveNamedEntity(entity.getImage())); 1407 break; 1408 1409 case ENTITY_NUMERIC : 1410 buffer.append(resolveNumericEntity(entity.getImage())); 1411 break; 1412 1413 default : 1414 throw new AssertionError("Invalid entity category " + 1415 entity.category 1416 ); 1417 } 1418 } 1419 1420 /** 1421 * Handle the remaining of HTML tags. This is a common end for 1422 * TAG, SCRIPT and STYLE. 1423 * @param closing True for closing tags ( </TAG> ). 
1424 * @param name Name of element 1425 * @param start Token where element has started 1426 * @throws ParseException 1427 */ restOfTag(boolean closing, Token name, Token start)1428 private void restOfTag(boolean closing, Token name, Token start) 1429 throws ParseException 1430 { 1431 boolean end = false; 1432 Token next; 1433 1434 optional(WS); 1435 1436 readAttributes(name.getImage()); 1437 1438 optional(WS); 1439 1440 next = getTokenAhead(); 1441 if (next.kind == END) 1442 { 1443 mustBe(END); 1444 end = true; 1445 } 1446 1447 hTag = new Token(start, next); 1448 1449 if (!end) 1450 { 1451 // The tag body contains errors. If additionally the tag 1452 // name is not valid, this construction is treated as text. 1453 if (dtd.elementHash.get(name.getImage().toLowerCase()) == null && 1454 backupMode 1455 ) 1456 { 1457 error("Errors in tag body and unknown tag name. " + 1458 "Treating the tag as a text." 1459 ); 1460 reset(); 1461 1462 hTag = mustBe(BEGIN); 1463 buffer.setLength(0); 1464 buffer.append(hTag.getImage()); 1465 CDATA(false); 1466 return; 1467 } 1468 else 1469 { 1470 error("Forcibly closing invalid parameter list"); 1471 forciblyCloseTheTag(); 1472 } 1473 } 1474 1475 if (closing) 1476 { 1477 endTag(false); 1478 _handleEndTag(makeTagElement(name.getImage(), false)); 1479 } 1480 else 1481 { 1482 TagElement te = makeTagElement(name.getImage(), false); 1483 if (te.getElement().type == DTDConstants.EMPTY) 1484 _handleEmptyTag(te); 1485 else 1486 { 1487 // According to the specs we need to consume whitespace following 1488 // immediately after a opening tag. 1489 optional(WS); 1490 _handleStartTag(te); 1491 } 1492 } 1493 } 1494 1495 /** 1496 * This should fire additional actions in response to the 1497 * ChangedCharSetException. The current implementation 1498 * does nothing. 
1499 * @param tag 1500 */ startingTag(TagElement tag)1501 private void startingTag(TagElement tag) 1502 { 1503 try 1504 { 1505 startTag(tag); 1506 } 1507 catch (ChangedCharSetException cax) 1508 { 1509 error("Invalid change of charset"); 1510 } 1511 } 1512 ws_error()1513 private void ws_error() 1514 { 1515 error("Whitespace here is not permitted"); 1516 } 1517 1518 /** 1519 * Returns true when the specified tag should be considered a block tag 1520 * wrt whitespace handling. We need this special handling, since there 1521 * are a couple of tags that we must treat as block tags but which aren't 1522 * officially block tags. 1523 * 1524 * @param tag the tag to check 1525 * @return true when the specified tag should be considered a block tag 1526 * wrt whitespace handling 1527 */ isBlock(HTML.Tag tag)1528 private boolean isBlock(HTML.Tag tag) 1529 { 1530 return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME; 1531 } 1532 } 1533