1 /*
2  * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package javax.swing.text.html.parser;
27 
28 import javax.swing.text.SimpleAttributeSet;
29 import javax.swing.text.html.HTML;
30 import javax.swing.text.ChangedCharSetException;
31 import java.io.*;
32 import java.util.Hashtable;
33 import java.util.Properties;
34 import java.util.Vector;
35 import java.util.Enumeration;
36 import java.net.URL;
37 
38 import sun.misc.MessageUtils;
39 
40 /**
41  * A simple DTD-driven HTML parser. The parser reads an
42  * HTML file from an InputStream and calls various methods
43  * (which should be overridden in a subclass) when tags and
44  * data are encountered.
45  * <p>
46  * Unfortunately there are many badly implemented HTML parsers
47  * out there, and as a result there are many badly formatted
48  * HTML files. This parser attempts to parse most HTML files.
49  * This means that the implementation sometimes deviates from
50  * the SGML specification in favor of HTML.
51  * <p>
52  * The parser treats \r and \r\n as \n. Newlines after starttags
53  * and before end tags are ignored just as specified in the SGML/HTML
54  * specification.
55  * <p>
56  * The html spec does not specify how spaces are to be coalesced very well.
57  * Specifically, the following scenarios are not discussed (note that a
58  * space should be used here, but I am using &amp;nbsp to force the space to
59  * be displayed):
60  * <p>
61  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
62  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
63  * <p>as well as:
64  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
65  * which appears to be treated as:
66  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
67  * <p>
68  * If <code>strict</code> is false, when a tag that breaks flow,
69  * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
70  * encountered, all whitespace will be ignored until a non whitespace
71  * character is encountered. This appears to give behavior closer to
72  * the popular browsers.
73  *
74  * @see DTD
75  * @see TagElement
76  * @see SimpleAttributeSet
77  * @author Arthur van Hoff
78  * @author Sunita Mani
79  */
80 public
81 class Parser implements DTDConstants {
82 
83     private char text[] = new char[1024];
84     private int textpos = 0;
85     private TagElement last;
86     private boolean space;
87 
88     private char str[] = new char[128];
89     private int strpos = 0;
90 
91     protected DTD dtd = null;
92 
93     private int ch;
94     private int ln;
95     private Reader in;
96 
97     private Element recent;
98     private TagStack stack;
99     private boolean skipTag = false;
100     private TagElement lastFormSent = null;
101     private SimpleAttributeSet attributes = new SimpleAttributeSet();
102 
103     // State for <html>, <head> and <body>.  Since people like to slap
104     // together HTML documents without thinking, occasionally they
105     // have multiple instances of these tags.  These booleans track
106     // the first sightings of these tags so they can be safely ignored
107     // by the parser if repeated.
108     private boolean seenHtml = false;
109     private boolean seenHead = false;
110     private boolean seenBody = false;
111 
112     /**
113      * The html spec does not specify how spaces are coalesced very well.
114      * If strict == false, ignoreSpace is used to try and mimic the behavior
115      * of the popular browsers.
116      * <p>
117      * The problematic scenarios are:
118      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
119      * '&lt;b>blah &lt;i>&lt;strike>foo'
120      * as well as:
121      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
122      * which appears to be treated as:
123      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
124      * <p>
125      * When a tag that breaks flow, or trailing whitespace is encountered
126      * ignoreSpace is set to true. From then on, all whitespace will be
127      * ignored.
128      * ignoreSpace will be set back to false the first time a
129      * non whitespace character is encountered. This appears to give
130      * behavior closer to the popular browsers.
131      */
132     private boolean ignoreSpace;
133 
134     /**
135      * This flag determines whether or not the Parser will be strict
136      * in enforcing SGML compatibility.  If false, it will be lenient
137      * with certain common classes of erroneous HTML constructs.
138      * Strict or not, in either case an error will be recorded.
139      *
140      */
141     protected boolean strict = false;
142 
143 
144     /** Number of \r\n's encountered. */
145     private int crlfCount;
146     /** Number of \r's encountered. A \r\n will not increment this. */
147     private int crCount;
148     /** Number of \n's encountered. A \r\n will not increment this. */
149     private int lfCount;
150 
151     //
152     // To correctly identify the start of a tag/comment/text we need two
153     // ivars. Two are needed as handleText isn't invoked until the tag
154     // after the text has been parsed, that is the parser parses the text,
155     // then a tag, then invokes handleText followed by handleStart.
156     //
157     /** The start position of the current block. Block is overloaded here,
158      * it really means the current start position for the current comment,
159      * tag, text. Use getBlockStartPosition to access this. */
160     private int currentBlockStartPos;
161     /** Start position of the last block. */
162     private int lastBlockStartPos;
163 
164     /**
165      * array for mapping numeric references in range
166      * 130-159 to displayable Unicode characters.
167      */
168     private static final char[] cp1252Map = {
169         8218,  // &#130;
170         402,   // &#131;
171         8222,  // &#132;
172         8230,  // &#133;
173         8224,  // &#134;
174         8225,  // &#135;
175         710,   // &#136;
176         8240,  // &#137;
177         352,   // &#138;
178         8249,  // &#139;
179         338,   // &#140;
180         141,   // &#141;
181         142,   // &#142;
182         143,   // &#143;
183         144,   // &#144;
184         8216,  // &#145;
185         8217,  // &#146;
186         8220,  // &#147;
187         8221,  // &#148;
188         8226,  // &#149;
189         8211,  // &#150;
190         8212,  // &#151;
191         732,   // &#152;
192         8482,  // &#153;
193         353,   // &#154;
194         8250,  // &#155;
195         339,   // &#156;
196         157,   // &#157;
197         158,   // &#158;
198         376    // &#159;
199     };
200 
    /**
     * Creates a parser that is driven by the given DTD.
     *
     * @param dtd the DTD stored in the {@code dtd} field
     */
    public Parser(DTD dtd) {
        this.dtd = dtd;
    }
204 
205 
    /**
     * Returns the parser's line counter.
     *
     * @return the line number of the line currently being parsed
     */
    protected int getCurrentLine() {
        return ln;
    }
212 
213     /**
214      * Returns the start position of the current block. Block is
215      * overloaded here, it really means the current start position for
216      * the current comment tag, text, block.... This is provided for
217      * subclassers that wish to know the start of the current block when
218      * called with one of the handleXXX methods.
219      */
getBlockStartPosition()220     int getBlockStartPosition() {
221         return Math.max(0, lastBlockStartPos - 1);
222     }
223 
    /**
     * Makes a TagElement.
     *
     * @param elem      the element the tag is an instance of
     * @param fictional passed through to the TagElement constructor; within
     *                  this class {@code true} is used for tags the parser
     *                  inserts itself during error recovery
     * @return a new TagElement for {@code elem}
     */
    protected TagElement makeTag(Element elem, boolean fictional) {
        return new TagElement(elem, fictional);
    }
230 
    /**
     * Makes a non-fictional TagElement; shorthand for
     * {@code makeTag(elem, false)}.
     *
     * @param elem the element the tag is an instance of
     * @return a new TagElement for {@code elem}
     */
    protected TagElement makeTag(Element elem) {
        return makeTag(elem, false);
    }
234 
    /**
     * Returns the parser's attribute set. The internal instance itself is
     * returned, not a copy.
     *
     * @return the {@code attributes} field
     */
    protected SimpleAttributeSet getAttributes() {
        return attributes;
    }
238 
    /**
     * Empties the parser's attribute set by asking it to remove every
     * attribute that it contains itself.
     */
    protected void flushAttributes() {
        attributes.removeAttributes(attributes);
    }
242 
243     /**
244      * Called when PCDATA is encountered.
245      */
handleText(char text[])246     protected void handleText(char text[]) {
247     }
248 
249     /**
250      * Called when an HTML title tag is encountered.
251      */
handleTitle(char text[])252     protected void handleTitle(char text[]) {
253         // default behavior is to call handleText. Subclasses
254         // can override if necessary.
255         handleText(text);
256     }
257 
258     /**
259      * Called when an HTML comment is encountered.
260      */
handleComment(char text[])261     protected void handleComment(char text[]) {
262     }
263 
handleEOFInComment()264     protected void handleEOFInComment() {
265         // We've reached EOF.  Our recovery strategy is to
266         // see if we have more than one line in the comment;
267         // if so, we pretend that the comment was an unterminated
268         // single line comment, and reparse the lines after the
269         // first line as normal HTML content.
270 
271         int commentEndPos = strIndexOf('\n');
272         if (commentEndPos >= 0) {
273             handleComment(getChars(0, commentEndPos));
274             try {
275                 in.close();
276                 in = new CharArrayReader(getChars(commentEndPos + 1));
277                 ch = '>';
278             } catch (IOException e) {
279                 error("ioexception");
280             }
281 
282             resetStrBuffer();
283         } else {
284             // no newline, so signal an error
285             error("eof.comment");
286         }
287     }
288 
    /**
     * Called when an empty tag is encountered. This default implementation
     * does nothing.
     *
     * @param tag the empty tag
     * @throws ChangedCharSetException declared for overriding
     *         implementations; the default body throws nothing
     */
    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
    }
294 
    /**
     * Called when a start tag is encountered. This default implementation
     * does nothing.
     *
     * @param tag the tag that was opened
     */
    protected void handleStartTag(TagElement tag) {
    }
300 
    /**
     * Called when an end tag is encountered. This default implementation
     * does nothing.
     *
     * @param tag the tag that is being closed
     */
    protected void handleEndTag(TagElement tag) {
    }
306 
    /**
     * An error has occurred. This default implementation ignores the
     * error; the body only contains disabled debugging output.
     *
     * @param ln  the line number the error is reported for
     * @param msg the error message
     */
    protected void handleError(int ln, String msg) {
        /*
        Thread.dumpStack();
        System.out.println("**** " + stack);
        System.out.println("line " + ln + ": error: " + msg);
        System.out.println();
        */
    }
318 
    /**
     * Output text. Flushes the pending contents of the {@code text} buffer
     * (plus a pending space, if any) to {@code handleText(char[])}, or to
     * {@code handleTitle} when {@code tag} is a title tag, and records
     * {@code tag} as the last tag seen.
     *
     * @param tag the tag that triggers the flush
     */
    void handleText(TagElement tag) {
        // A flow-breaking tag cancels any pending space and, in lenient
        // mode, makes the parser ignore whitespace that follows.
        if (tag.breaksFlow()) {
            space = false;
            if (!strict) {
                ignoreSpace = true;
            }
        }
        if (textpos == 0) {
            // No buffered text. Unless a lone pending space is legal here
            // (PCDATA allowed, last tag did not break flow), there is
            // nothing to report: just update the bookkeeping and leave.
            if ((!space) || (stack == null) || last.breaksFlow() ||
                !stack.advance(dtd.pcdata)) {
                last = tag;
                space = false;
                lastBlockStartPos = currentBlockStartPos;
                return;
            }
        }
        if (space) {
            if (!ignoreSpace) {
                // enlarge buffer if needed
                if (textpos + 1 > text.length) {
                    char newtext[] = new char[text.length + 200];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }

                // output pending space
                text[textpos++] = ' ';
                // In lenient mode, whitespace after this space is ignored
                // when the triggering tag is not an empty element.
                if (!strict && !tag.getElement().isEmpty()) {
                    ignoreSpace = true;
                }
            }
            space = false;
        }
        // Hand out an exact-size copy so the internal buffer can be reused.
        char newtext[] = new char[textpos];
        System.arraycopy(text, 0, newtext, 0, textpos);
        // Handles cases of bad html where the title tag
        // was getting lost when we did error recovery.
        if (tag.getElement().getName().equals("title")) {
            handleTitle(newtext);
        } else {
            handleText(newtext);
        }
        // Reset the buffer and remember this tag for the next call.
        lastBlockStartPos = currentBlockStartPos;
        textpos = 0;
        last = tag;
        space = false;
    }
369 
370     /**
371      * Invoke the error handler.
372      */
error(String err, String arg1, String arg2, String arg3)373     protected void error(String err, String arg1, String arg2,
374         String arg3) {
375         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
376     }
377 
    /**
     * Invokes the error handler with two arguments; the missing third
     * argument is reported as "?".
     *
     * @param err  the error key
     * @param arg1 first argument
     * @param arg2 second argument
     */
    protected void error(String err, String arg1, String arg2) {
        error(err, arg1, arg2, "?");
    }
    /**
     * Invokes the error handler with one argument; the missing second
     * and third arguments are reported as "?".
     *
     * @param err  the error key
     * @param arg1 first argument
     */
    protected void error(String err, String arg1) {
        error(err, arg1, "?", "?");
    }
    /**
     * Invokes the error handler without arguments; all three argument
     * slots are reported as "?".
     *
     * @param err the error key
     */
    protected void error(String err) {
        error(err, "?", "?", "?");
    }
387 
388 
389     /**
390      * Handle a start tag. The new tag is pushed
391      * onto the tag stack. The attribute list is
392      * checked for required attributes.
393      */
startTag(TagElement tag)394     protected void startTag(TagElement tag) throws ChangedCharSetException {
395         Element elem = tag.getElement();
396 
397         // If the tag is an empty tag and texpos != 0
398         // this implies that there is text before the
399         // start tag that needs to be processed before
400         // handling the tag.
401         //
402         if (!elem.isEmpty() ||
403                     ((last != null) && !last.breaksFlow()) ||
404                     (textpos != 0)) {
405             handleText(tag);
406         } else {
407             // this variable gets updated in handleText().
408             // Since in this case we do not call handleText()
409             // we need to update it here.
410             //
411             last = tag;
412             // Note that we should really check last.breakFlows before
413             // assuming this should be false.
414             space = false;
415         }
416         lastBlockStartPos = currentBlockStartPos;
417 
418         // check required attributes
419         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
420             if ((a.modifier == REQUIRED) &&
421                 ((attributes.isEmpty()) ||
422                  ((!attributes.isDefined(a.name)) &&
423                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
424                 error("req.att ", a.getName(), elem.getName());
425             }
426         }
427 
428         if (elem.isEmpty()) {
429             handleEmptyTag(tag);
430             /*
431         } else if (elem.getName().equals("form")) {
432             handleStartTag(tag);
433             */
434         } else {
435             recent = elem;
436             stack = new TagStack(tag, stack);
437             handleStartTag(tag);
438         }
439     }
440 
441     /**
442      * Handle an end tag. The end tag is popped
443      * from the tag stack.
444      */
endTag(boolean omitted)445     protected void endTag(boolean omitted) {
446         handleText(stack.tag);
447 
448         if (omitted && !stack.elem.omitEnd()) {
449             error("end.missing", stack.elem.getName());
450         } else if (!stack.terminate()) {
451             error("end.unexpected", stack.elem.getName());
452         }
453 
454         // handle the tag
455         handleEndTag(stack.tag);
456         stack = stack.next;
457         recent = (stack != null) ? stack.elem : null;
458     }
459 
460 
ignoreElement(Element elem)461     boolean ignoreElement(Element elem) {
462 
463         String stackElement = stack.elem.getName();
464         String elemName = elem.getName();
465         /* We ignore all elements that are not valid in the context of
466            a table except <td>, <th> (these we handle in
467            legalElementContext()) and #pcdata.  We also ignore the
468            <font> tag in the context of <ul> and <ol> We additonally
469            ignore the <meta> and the <style> tag if the body tag has
470            been seen. **/
471         if ((elemName.equals("html") && seenHtml) ||
472             (elemName.equals("head") && seenHead) ||
473             (elemName.equals("body") && seenBody)) {
474             return true;
475         }
476         if (elemName.equals("dt") || elemName.equals("dd")) {
477             TagStack s = stack;
478             while (s != null && !s.elem.getName().equals("dl")) {
479                 s = s.next;
480             }
481             if (s == null) {
482                 return true;
483             }
484         }
485 
486         if (((stackElement.equals("table")) &&
487              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
488             ((elemName.equals("font")) &&
489              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
490             (elemName.equals("meta") && stack != null) ||
491             (elemName.equals("style") && seenBody) ||
492             (stackElement.equals("table") && elemName.equals("a"))) {
493             return true;
494         }
495         return false;
496     }
497 
498 
499     /**
500      * Marks the first time a tag has been seen in a document
501      */
502 
markFirstTime(Element elem)503     protected void markFirstTime(Element elem) {
504         String elemName = elem.getName();
505         if (elemName.equals("html")) {
506             seenHtml = true;
507         } else if (elemName.equals("head")) {
508             seenHead = true;
509         } else if (elemName.equals("body")) {
510             if (buf.length == 1) {
511                 // Refer to note in definition of buf for details on this.
512                 char[] newBuf = new char[256];
513 
514                 newBuf[0] = buf[0];
515                 buf = newBuf;
516             }
517             seenBody = true;
518         }
519     }
520 
521     /**
522      * Create a legal content for an element.
523      */
legalElementContext(Element elem)524     boolean legalElementContext(Element elem) throws ChangedCharSetException {
525 
526         // System.out.println("-- legalContext -- " + elem);
527 
528         // Deal with the empty stack
529         if (stack == null) {
530             // System.out.println("-- stack is empty");
531             if (elem != dtd.html) {
532                 // System.out.println("-- pushing html");
533                 startTag(makeTag(dtd.html, true));
534                 return legalElementContext(elem);
535             }
536             return true;
537         }
538 
539         // Is it allowed in the current context
540         if (stack.advance(elem)) {
541             // System.out.println("-- legal context");
542             markFirstTime(elem);
543             return true;
544         }
545         boolean insertTag = false;
546 
547         // The use of all error recovery strategies are contingent
548         // on the value of the strict property.
549         //
550         // These are commonly occurring errors.  if insertTag is true,
551         // then we want to adopt an error recovery strategy that
552         // involves attempting to insert an additional tag to
553         // legalize the context.  The two errors addressed here
554         // are:
555         // 1) when a <td> or <th> is seen soon after a <table> tag.
556         //    In this case we insert a <tr>.
557         // 2) when any other tag apart from a <tr> is seen
558         //    in the context of a <tr>.  In this case we would
559         //    like to add a <td>.  If a <tr> is seen within a
560         //    <tr> context, then we will close out the current
561         //    <tr>.
562         //
563         // This insertion strategy is handled later in the method.
564         // The reason for checking this now, is that in other cases
565         // we would like to apply other error recovery strategies for example
566         // ignoring tags.
567         //
568         // In certain cases it is better to ignore a tag than try to
569         // fix the situation.  So the first test is to see if this
570         // is what we need to do.
571         //
572         String stackElemName = stack.elem.getName();
573         String elemName = elem.getName();
574 
575 
576         if (!strict &&
577             ((stackElemName.equals("table") && elemName.equals("td")) ||
578              (stackElemName.equals("table") && elemName.equals("th")) ||
579              (stackElemName.equals("tr") && !elemName.equals("tr")))){
580              insertTag = true;
581         }
582 
583 
584         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
585                                       elem.getName().equals("body"))) {
586             if (skipTag = ignoreElement(elem)) {
587                 error("tag.ignore", elem.getName());
588                 return skipTag;
589             }
590         }
591 
592         // Check for anything after the start of the table besides tr, td, th
593         // or caption, and if those aren't there, insert the <tr> and call
594         // legalElementContext again.
595         if (!strict && stackElemName.equals("table") &&
596             !elemName.equals("tr") && !elemName.equals("td") &&
597             !elemName.equals("th") && !elemName.equals("caption")) {
598             Element e = dtd.getElement("tr");
599             TagElement t = makeTag(e, true);
600             legalTagContext(t);
601             startTag(t);
602             error("start.missing", elem.getName());
603             return legalElementContext(elem);
604         }
605 
606         // They try to find a legal context by checking if the current
607         // tag is valid in an enclosing context.  If so
608         // close out the tags by outputing end tags and then
609         // insert the current tag.  If the tags that are
610         // being closed out do not have an optional end tag
611         // specification in the DTD then an html error is
612         // reported.
613         //
614         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
615             for (TagStack s = stack.next ; s != null ; s = s.next) {
616                 if (s.advance(elem)) {
617                     while (stack != s) {
618                         endTag(true);
619                     }
620                     return true;
621                 }
622                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
623                     break;
624                 }
625             }
626         }
627 
628         // Check if we know what tag is expected next.
629         // If so insert the tag.  Report an error if the
630         // tag does not have its start tag spec in the DTD as optional.
631         //
632         Element next = stack.first();
633         if (next != null && (!strict || next.omitStart()) &&
634            !(next==dtd.head && elem==dtd.pcdata) ) {
635             // System.out.println("-- omitting start tag: " + next);
636             TagElement t = makeTag(next, true);
637             legalTagContext(t);
638             startTag(t);
639             if (!next.omitStart()) {
640                 error("start.missing", elem.getName());
641             }
642             return legalElementContext(elem);
643         }
644 
645 
646         // Traverse the list of expected elements and determine if adding
647         // any of these elements would make for a legal context.
648         //
649 
650         if (!strict) {
651             ContentModel content = stack.contentModel();
652             Vector<Element> elemVec = new Vector<Element>();
653             if (content != null) {
654                 content.getElements(elemVec);
655                 for (Element e : elemVec) {
656                     // Ensure that this element has not been included as
657                     // part of the exclusions in the DTD.
658                     //
659                     if (stack.excluded(e.getIndex())) {
660                         continue;
661                     }
662 
663                     boolean reqAtts = false;
664 
665                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
666                         if (a.modifier == REQUIRED) {
667                             reqAtts = true;
668                             break;
669                         }
670                     }
671                     // Ensure that no tag that has required attributes
672                     // gets inserted.
673                     //
674                     if (reqAtts) {
675                         continue;
676                     }
677 
678                     ContentModel m = e.getContent();
679                     if (m != null && m.first(elem)) {
680                         // System.out.println("-- adding a legal tag: " + e);
681                         TagElement t = makeTag(e, true);
682                         legalTagContext(t);
683                         startTag(t);
684                         error("start.missing", e.getName());
685                         return legalElementContext(elem);
686                     }
687                 }
688             }
689         }
690 
691         // Check if the stack can be terminated.  If so add the appropriate
692         // end tag.  Report an error if the tag being ended does not have its
693         // end tag spec in the DTD as optional.
694         //
695         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
696             // System.out.println("-- omitting end tag: " + stack.elem);
697             if (!stack.elem.omitEnd()) {
698                 error("end.missing", elem.getName());
699             }
700 
701             endTag(true);
702             return legalElementContext(elem);
703         }
704 
705         // At this point we know that something is screwed up.
706         return false;
707     }
708 
709     /**
710      * Create a legal context for a tag.
711      */
legalTagContext(TagElement tag)712     void legalTagContext(TagElement tag) throws ChangedCharSetException {
713         if (legalElementContext(tag.getElement())) {
714             markFirstTime(tag.getElement());
715             return;
716         }
717 
718         // Avoid putting a block tag in a flow tag.
719         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
720             endTag(true);
721             legalTagContext(tag);
722             return;
723         }
724 
725         // Avoid putting something wierd in the head of the document.
726         for (TagStack s = stack ; s != null ; s = s.next) {
727             if (s.tag.getElement() == dtd.head) {
728                 while (stack != s) {
729                     endTag(true);
730                 }
731                 endTag(true);
732                 legalTagContext(tag);
733                 return;
734             }
735         }
736 
737         // Everything failed
738         error("tag.unexpected", tag.getElement().getName());
739     }
740 
741     /**
742      * Error context. Something went wrong, make sure we are in
743      * the document's body context
744      */
errorContext()745     void errorContext() throws ChangedCharSetException {
746         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
747             handleEndTag(stack.tag);
748         }
749         if (stack == null) {
750             legalElementContext(dtd.body);
751             startTag(makeTag(dtd.body, true));
752         }
753     }
754 
755     /**
756      * Add a char to the string buffer.
757      */
addString(int c)758     void addString(int c) {
759         if (strpos  == str.length) {
760             char newstr[] = new char[str.length + 128];
761             System.arraycopy(str, 0, newstr, 0, str.length);
762             str = newstr;
763         }
764         str[strpos++] = (char)c;
765     }
766 
767     /**
768      * Get the string that's been accumulated.
769      */
getString(int pos)770     String getString(int pos) {
771         char newStr[] = new char[strpos - pos];
772         System.arraycopy(str, pos, newStr, 0, strpos - pos);
773         strpos = pos;
774         return new String(newStr);
775     }
776 
getChars(int pos)777     char[] getChars(int pos) {
778         char newStr[] = new char[strpos - pos];
779         System.arraycopy(str, pos, newStr, 0, strpos - pos);
780         strpos = pos;
781         return newStr;
782     }
783 
getChars(int pos, int endPos)784     char[] getChars(int pos, int endPos) {
785         char newStr[] = new char[endPos - pos];
786         System.arraycopy(str, pos, newStr, 0, endPos - pos);
787         // REMIND: it's not clear whether this version should set strpos or not
788         // strpos = pos;
789         return newStr;
790     }
791 
    /**
     * Empties the string buffer by resetting its fill level to zero. The
     * backing array is kept as it is.
     */
    void resetStrBuffer() {
        strpos = 0;
    }
795 
strIndexOf(char target)796     int strIndexOf(char target) {
797         for (int i = 0; i < strpos; i++) {
798             if (str[i] == target) {
799                 return i;
800             }
801         }
802 
803         return -1;
804     }
805 
806     /**
807      * Skip space.
808      * [5] 297:5
809      */
skipSpace()810     void skipSpace() throws IOException {
811         while (true) {
812             switch (ch) {
813               case '\n':
814                 ln++;
815                 ch = readCh();
816                 lfCount++;
817                 break;
818 
819               case '\r':
820                 ln++;
821                 if ((ch = readCh()) == '\n') {
822                     ch = readCh();
823                     crlfCount++;
824                 }
825                 else {
826                     crCount++;
827                 }
828                 break;
829               case ' ':
830               case '\t':
831                 ch = readCh();
832                 break;
833 
834               default:
835                 return;
836             }
837         }
838     }
839 
840     /**
841      * Parse identifier. Uppercase characters are folded
842      * to lowercase when lower is true. Returns falsed if
843      * no identifier is found. [55] 346:17
844      */
parseIdentifier(boolean lower)845     boolean parseIdentifier(boolean lower) throws IOException {
846         switch (ch) {
847           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
848           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
849           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
850           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
851           case 'Y': case 'Z':
852             if (lower) {
853                 ch = 'a' + (ch - 'A');
854             }
855 
856           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
857           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
858           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
859           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
860           case 'y': case 'z':
861             break;
862 
863           default:
864             return false;
865         }
866 
867         while (true) {
868             addString(ch);
869 
870             switch (ch = readCh()) {
871               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
872               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
873               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
874               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
875               case 'Y': case 'Z':
876                 if (lower) {
877                     ch = 'a' + (ch - 'A');
878                 }
879 
880               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
881               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
882               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
883               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
884               case 'y': case 'z':
885 
886               case '0': case '1': case '2': case '3': case '4':
887               case '5': case '6': case '7': case '8': case '9':
888 
889               case '.': case '-':
890 
891               case '_': // not officially allowed
892                 break;
893 
894               default:
895                 return true;
896             }
897         }
898     }
899 
900     /**
901      * Parse an entity reference. [59] 350:17
902      */
parseEntityReference()903     private char[] parseEntityReference() throws IOException {
904         int pos = strpos;
905 
906         if ((ch = readCh()) == '#') {
907             int n = 0;
908             ch = readCh();
909             if ((ch >= '0') && (ch <= '9') ||
910                     ch == 'x' || ch == 'X') {
911 
912                 if ((ch >= '0') && (ch <= '9')) {
913                     // parse decimal reference
914                     while ((ch >= '0') && (ch <= '9')) {
915                         n = (n * 10) + ch - '0';
916                         ch = readCh();
917                     }
918                 } else {
919                     // parse hexadecimal reference
920                     ch = readCh();
921                     char lch = (char) Character.toLowerCase(ch);
922                     while ((lch >= '0') && (lch <= '9') ||
923                             (lch >= 'a') && (lch <= 'f')) {
924                         if (lch >= '0' && lch <= '9') {
925                             n = (n * 16) + lch - '0';
926                         } else {
927                             n = (n * 16) + lch - 'a' + 10;
928                         }
929                         ch = readCh();
930                         lch = (char) Character.toLowerCase(ch);
931                     }
932                 }
933                 switch (ch) {
934                     case '\n':
935                         ln++;
936                         ch = readCh();
937                         lfCount++;
938                         break;
939 
940                     case '\r':
941                         ln++;
942                         if ((ch = readCh()) == '\n') {
943                             ch = readCh();
944                             crlfCount++;
945                         }
946                         else {
947                             crCount++;
948                         }
949                         break;
950 
951                     case ';':
952                         ch = readCh();
953                         break;
954                 }
955                 char data[] = mapNumericReference(n);
956                 return data;
957             }
958             addString('#');
959             if (!parseIdentifier(false)) {
960                 error("ident.expected");
961                 strpos = pos;
962                 char data[] = {'&', '#'};
963                 return data;
964             }
965         } else if (!parseIdentifier(false)) {
966             char data[] = {'&'};
967             return data;
968         }
969 
970         boolean semicolon = false;
971 
972         switch (ch) {
973           case '\n':
974             ln++;
975             ch = readCh();
976             lfCount++;
977             break;
978 
979           case '\r':
980             ln++;
981             if ((ch = readCh()) == '\n') {
982                 ch = readCh();
983                 crlfCount++;
984             }
985             else {
986                 crCount++;
987             }
988             break;
989 
990           case ';':
991             semicolon = true;
992 
993             ch = readCh();
994             break;
995         }
996 
997         String nm = getString(pos);
998         Entity ent = dtd.getEntity(nm);
999 
1000         // entities are case sensitive - however if strict
1001         // is false then we will try to make a match by
1002         // converting the string to all lowercase.
1003         //
1004         if (!strict && (ent == null)) {
1005             ent = dtd.getEntity(nm.toLowerCase());
1006         }
1007         if ((ent == null) || !ent.isGeneral()) {
1008 
1009             if (nm.length() == 0) {
1010                 error("invalid.entref", nm);
1011                 return new char[0];
1012             }
1013             /* given that there is not a match restore the entity reference */
1014             String str = "&" + nm + (semicolon ? ";" : "");
1015 
1016             char b[] = new char[str.length()];
1017             str.getChars(0, b.length, b, 0);
1018             return b;
1019         }
1020         return ent.getData();
1021     }
1022 
1023     /**
1024      * Converts numeric character reference to char array.
1025      *
1026      * Normally the code in a reference should be always converted
1027      * to the Unicode character with the same code, but due to
1028      * wide usage of Cp1252 charset most browsers map numeric references
1029      * in the range 130-159 (which are control chars in Unicode set)
1030      * to displayable characters with other codes.
1031      *
1032      * @param c the code of numeric character reference.
1033      * @return a char array corresponding to the reference code.
1034      */
mapNumericReference(int c)1035     private char[] mapNumericReference(int c) {
1036         char[] data;
1037         if (c >= 0xffff) { // outside unicode BMP.
1038             try {
1039                 data = Character.toChars(c);
1040             } catch (IllegalArgumentException e) {
1041                 data = new char[0];
1042             }
1043         } else {
1044             data = new char[1];
1045             data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1046         }
1047         return data;
1048     }
1049 
1050     /**
1051      * Parse a comment. [92] 391:7
1052      */
parseComment()1053     void parseComment() throws IOException {
1054 
1055         while (true) {
1056             int c = ch;
1057             switch (c) {
1058               case '-':
1059                   /** Presuming that the start string of a comment "<!--" has
1060                       already been parsed, the '-' character is valid only as
1061                       part of a comment termination and further more it must
1062                       be present in even numbers. Hence if strict is true, we
1063                       presume the comment has been terminated and return.
1064                       However if strict is false, then there is no even number
1065                       requirement and this character can appear anywhere in the
1066                       comment.  The parser reads on until it sees the following
1067                       pattern: "-->" or "--!>".
1068                    **/
1069                 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1070                     if ((ch = readCh()) == '>') {
1071                         return;
1072                     }
1073                     if (ch == '!') {
1074                         if ((ch = readCh()) == '>') {
1075                             return;
1076                         } else {
1077                             /* to account for extra read()'s that happened */
1078                             addString('-');
1079                             addString('!');
1080                             continue;
1081                         }
1082                     }
1083                     break;
1084                 }
1085 
1086                 if ((ch = readCh()) == '-') {
1087                     ch = readCh();
1088                     if (strict || ch == '>') {
1089                         return;
1090                     }
1091                     if (ch == '!') {
1092                         if ((ch = readCh()) == '>') {
1093                             return;
1094                         } else {
1095                             /* to account for extra read()'s that happened */
1096                             addString('-');
1097                             addString('!');
1098                             continue;
1099                         }
1100                     }
1101                     /* to account for the extra read() */
1102                     addString('-');
1103                 }
1104                 break;
1105 
1106               case -1:
1107                   handleEOFInComment();
1108                   return;
1109 
1110               case '\n':
1111                 ln++;
1112                 ch = readCh();
1113                 lfCount++;
1114                 break;
1115 
1116               case '>':
1117                 ch = readCh();
1118                 break;
1119 
1120               case '\r':
1121                 ln++;
1122                 if ((ch = readCh()) == '\n') {
1123                     ch = readCh();
1124                     crlfCount++;
1125                 }
1126                 else {
1127                     crCount++;
1128                 }
1129                 c = '\n';
1130                 break;
1131               default:
1132                 ch = readCh();
1133                 break;
1134             }
1135 
1136             addString(c);
1137         }
1138     }
1139 
1140     /**
1141      * Parse literal content. [46] 343:1 and [47] 344:1
1142      */
parseLiteral(boolean replace)1143     void parseLiteral(boolean replace) throws IOException {
1144         while (true) {
1145             int c = ch;
1146             switch (c) {
1147               case -1:
1148                 error("eof.literal", stack.elem.getName());
1149                 endTag(true);
1150                 return;
1151 
1152               case '>':
1153                 ch = readCh();
1154                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1155 
1156                 // match end tag
1157                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1158                     while ((++i < textpos) &&
1159                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1160                     if (i == textpos) {
1161                         textpos -= (stack.elem.name.length() + 2);
1162                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1163                             textpos--;
1164                         }
1165                         endTag(false);
1166                         return;
1167                     }
1168                 }
1169                 break;
1170 
1171               case '&':
1172                 char data[] = parseEntityReference();
1173                 if (textpos + data.length > text.length) {
1174                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1175                     System.arraycopy(text, 0, newtext, 0, text.length);
1176                     text = newtext;
1177                 }
1178                 System.arraycopy(data, 0, text, textpos, data.length);
1179                 textpos += data.length;
1180                 continue;
1181 
1182               case '\n':
1183                 ln++;
1184                 ch = readCh();
1185                 lfCount++;
1186                 break;
1187 
1188               case '\r':
1189                 ln++;
1190                 if ((ch = readCh()) == '\n') {
1191                     ch = readCh();
1192                     crlfCount++;
1193                 }
1194                 else {
1195                     crCount++;
1196                 }
1197                 c = '\n';
1198                 break;
1199               default:
1200                 ch = readCh();
1201                 break;
1202             }
1203 
1204             // output character
1205             if (textpos == text.length) {
1206                 char newtext[] = new char[text.length + 128];
1207                 System.arraycopy(text, 0, newtext, 0, text.length);
1208                 text = newtext;
1209             }
1210             text[textpos++] = (char)c;
1211         }
1212     }
1213 
    /**
     * Parse attribute value. [33] 331:1
     * <p>
     * The value may be enclosed in single or double quotes, or be
     * unquoted. An unquoted value ends at white space, a line terminator,
     * '&lt;', '&gt;' or end of input; a quoted value ends at the matching
     * quote (or end of input). Entity references are expanded, except for
     * an unquoted value when <code>strict</code> is set. The characters are
     * collected with <code>addString</code> and the result is obtained
     * with <code>getString(0)</code>.
     *
     * @param lower whether 'A'..'Z' are folded to 'a'..'z'
     * @return the attribute value without its enclosing quotes
     * @throws IOException if reading the input fails
     */
    String parseAttributeValue(boolean lower) throws IOException {
        // The opening quote character, or -1 for an unquoted value.
        int delim = -1;

        // Check for a delimiter
        switch(ch) {
          case '\'':
          case '"':
            delim = ch;
            ch = readCh();
            break;
        }

        // Parse the rest of the value
        while (true) {
            int c = ch;

            switch (c) {
              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                // a line terminator ends an unquoted value
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\r':
                ln++;

                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                if (delim < 0) {
                    return getString(0);
                }
                // Inside a quoted value the '\r' itself is stored below; a
                // following '\n' has already been consumed and is dropped.
                break;

              case '\t':
                  // NOTE(review): this assignment has no visible effect. It
                  // only happens when delim < 0, and in that case the code
                  // below returns before c is added to the buffer.
                  if (delim < 0)
                      c = ' ';
                  // falls through
              case ' ':
                ch = readCh();
                // white space ends an unquoted value
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '>':
              case '<':
                // Ends an unquoted value without being consumed; inside
                // quotes it is an ordinary character.
                if (delim < 0) {
                    return getString(0);
                }
                ch = readCh();
                break;

              case '\'':
              case '"':
                ch = readCh();
                if (c == delim) {
                    // the matching closing quote
                    return getString(0);
                } else if (delim == -1) {
                    // A quote in the middle of an unquoted value. In lenient
                    // mode it is dropped and parsing goes on, unless it is
                    // followed by a blank.
                    error("attvalerr");
                    if (strict || ch == ' ') {
                        return getString(0);
                    } else {
                        continue;
                    }
                }
                // the other kind of quote inside a quoted value: keep it
                break;

            case '=':
                if (delim < 0) {
                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
                       is considered invalid since an = sign can only be contained
                       in an attributes value if the string is quoted.
                       */
                    error("attvalerr");
                    /* If strict is true then we return with the string we have thus far.
                       Otherwise we accept the = sign as part of the attribute's value and
                       process the rest of the img tag. */
                    if (strict) {
                        return getString(0);
                    }
                }
                ch = readCh();
                break;

              case '&':
                // In strict mode an unquoted value keeps the '&' literally.
                if (strict && delim < 0) {
                    ch = readCh();
                    break;
                }

                // Otherwise expand the reference and append the result,
                // applying the same case folding as for ordinary characters.
                char data[] = parseEntityReference();
                for (int i = 0 ; i < data.length ; i++) {
                    c = data[i];
                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
                }
                continue;

              case -1:
                // end of input: return what has been collected so far
                return getString(0);

              default:
                if (lower && (c >= 'A') && (c <= 'Z')) {
                    c = 'a' + c - 'A';
                }
                ch = readCh();
                break;
            }
            addString(c);
        }
    }
1334 
1335 
    /**
     * Parse attribute specification List. [31] 327:17
     * <p>
     * Reads attribute specifications until '/', '&gt;', '&lt;' or end of
     * input is reached (that character is left in <code>ch</code>) and
     * adds each attribute to <code>attributes</code>. Problems are reported
     * through <code>error</code>; when <code>strict</code> is false several
     * malformed constructs are tolerated (see the comments in the body).
     *
     * @param elem the element whose tag is being parsed; used to look up
     *             attribute definitions and for error messages
     * @throws IOException if reading the input fails
     */
    void parseAttributeSpecificationList(Element elem) throws IOException {

        while (true) {
            skipSpace();

            switch (ch) {
              case '/':
              case '>':
              case '<':
              case -1:
                return;

              case '-':
                // "--" starts a comment inside the tag; its text is
                // discarded by resetting strpos.
                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    parseComment();
                    strpos = 0;
                } else {
                    error("invalid.tagchar", "-", elem.getName());
                    ch = readCh();
                }
                continue;
            }

            AttributeList att;
            String attname;
            String attvalue;

            if (parseIdentifier(true)) {
                // Regular form: name, optionally followed by "= value".
                attname = getString(0);
                skipSpace();
                if (ch == '=') {
                    ch = readCh();
                    skipSpace();
                    att = elem.getAttribute(attname);
                    // Bug ID 4102750
                    // Load the NAME of an Attribute Case Sensitive
                    // The case of the NAME  must be intact
                    // MG 021898
                    // (before that fix the call did not test att.type != NAME)
                    attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
                } else {
                    // No '=': the identifier is either an attribute value
                    // standing for its attribute, or an attribute name
                    // without a value.
                    attvalue = attname;
                    att = elem.getAttributeByValue(attvalue);
                    if (att == null) {
                        att = elem.getAttribute(attname);
                        if (att != null) {
                            attvalue = att.getValue();
                        }
                        else {
                            // Make it null so that NULL_ATTRIBUTE_VALUE is
                            // used
                            attvalue = null;
                        }
                    }
                }
            } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
                ch = readCh();
                continue;
            } else if (!strict && ch == '"') { // allows for quoted attributes
                ch = readCh();
                skipSpace();
                if (parseIdentifier(true)) {
                    attname = getString(0);
                    if (ch == '"') {
                        ch = readCh();
                    }
                    skipSpace();
                    if (ch == '=') {
                        ch = readCh();
                        skipSpace();
                        att = elem.getAttribute(attname);
                        // NOTE(review): unlike the unquoted-name branch
                        // above, NAME typed attributes are not exempt from
                        // lower-casing here - confirm whether intended.
                        attvalue = parseAttributeValue((att != null) &&
                                                (att.type != CDATA) &&
                                                (att.type != NOTATION));
                    } else {
                        attvalue = attname;
                        att = elem.getAttributeByValue(attvalue);
                        if (att == null) {
                            att = elem.getAttribute(attname);
                            if (att != null) {
                                attvalue = att.getValue();
                            }
                            // NOTE(review): unlike the unquoted-name branch
                            // above, attvalue is not set to null when no
                            // attribute is found, so the name itself is
                            // stored as the value - confirm whether intended.
                        }
                    }
                } else {
                    char str[] = {(char)ch};
                    error("invalid.tagchar", new String(str), elem.getName());
                    ch = readCh();
                    continue;
                }
            } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
                // A '=' before any attribute has been seen: the element
                // name itself is used as the attribute name.
                ch = readCh();
                skipSpace();
                attname = elem.getName();
                att = elem.getAttribute(attname);
                attvalue = parseAttributeValue((att != null) &&
                                               (att.type != CDATA) &&
                                               (att.type != NOTATION));
            } else if (!strict && (ch == '=')) {
                // A stray '=' later in the tag: the value that follows is
                // read and thrown away, and parsing of the list stops.
                ch = readCh();
                skipSpace();
                attvalue = parseAttributeValue(true);
                error("attvalerr");
                return;
            } else {
                // Any other character: report it, then skip it (lenient)
                // or give up on the list (strict).
                char str[] = {(char)ch};
                error("invalid.tagchar", new String(str), elem.getName());
                if (!strict) {
                    ch = readCh();
                    continue;
                } else {
                    return;
                }
            }

            // Use the name from the attribute definition when there is one;
            // an unknown attribute is reported but still stored below.
            if (att != null) {
                attname = att.getName();
            } else {
                error("invalid.tagatt", attname, elem.getName());
            }

            // Check out the value
            if (attributes.isDefined(attname)) {
                error("multi.tagatt", attname, elem.getName());
            }
            if (attvalue == null) {
                attvalue = ((att != null) && (att.value != null)) ? att.value :
                    HTML.NULL_ATTRIBUTE_VALUE;
            } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
                error("invalid.tagattval", attname, elem.getName());
            }
            // Store under the HTML.Attribute key when the name maps to one,
            // otherwise under the plain name.
            HTML.Attribute attkey = HTML.getAttributeKey(attname);
            if (attkey == null) {
                attributes.addAttribute(attname, attvalue);
            } else {
                attributes.addAttribute(attkey, attvalue);
            }
        }
    }
1479 
1480     /**
1481      * Parses th Document Declaration Type markup declaration.
1482      * Currently ignores it.
1483      */
parseDTDMarkup()1484     public String parseDTDMarkup() throws IOException {
1485 
1486         StringBuilder strBuff = new StringBuilder();
1487         ch = readCh();
1488         while(true) {
1489             switch (ch) {
1490             case '>':
1491                 ch = readCh();
1492                 return strBuff.toString();
1493             case -1:
1494                 error("invalid.markup");
1495                 return strBuff.toString();
1496             case '\n':
1497                 ln++;
1498                 ch = readCh();
1499                 lfCount++;
1500                 break;
1501             case '"':
1502                 ch = readCh();
1503                 break;
1504             case '\r':
1505                 ln++;
1506                 if ((ch = readCh()) == '\n') {
1507                     ch = readCh();
1508                     crlfCount++;
1509                 }
1510                 else {
1511                     crCount++;
1512                 }
1513                 break;
1514             default:
1515                 strBuff.append((char)(ch & 0xFF));
1516                 ch = readCh();
1517                 break;
1518             }
1519         }
1520     }
1521 
1522     /**
1523      * Parse markup declarations.
1524      * Currently only handles the Document Type Declaration markup.
1525      * Returns true if it is a markup declaration false otherwise.
1526      */
parseMarkupDeclarations(StringBuffer strBuff)1527     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1528 
1529         /* Currently handles only the DOCTYPE */
1530         if ((strBuff.length() == "DOCTYPE".length()) &&
1531             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1532             parseDTDMarkup();
1533             return true;
1534         }
1535         return false;
1536     }
1537 
1538     /**
1539      * Parse an invalid tag.
1540      */
parseInvalidTag()1541     void parseInvalidTag() throws IOException {
1542         // ignore all data upto the close bracket '>'
1543         while (true) {
1544             skipSpace();
1545             switch (ch) {
1546               case '>':
1547               case -1:
1548                   ch = readCh();
1549                 return;
1550               case '<':
1551                   return;
1552               default:
1553                   ch = readCh();
1554 
1555             }
1556         }
1557     }
1558 
1559     /**
1560      * Parse a start or end tag.
1561      */
parseTag()1562     void parseTag() throws IOException {
1563         Element elem;
1564         boolean net = false;
1565         boolean warned = false;
1566         boolean unknown = false;
1567 
1568         switch (ch = readCh()) {
1569           case '!':
1570             switch (ch = readCh()) {
1571               case '-':
1572                 // Parse comment. [92] 391:7
1573                 while (true) {
1574                     if (ch == '-') {
1575                         if (!strict || ((ch = readCh()) == '-')) {
1576                             ch = readCh();
1577                             if (!strict && ch == '-') {
1578                                 ch = readCh();
1579                             }
1580                             // send over any text you might see
1581                             // before parsing and sending the
1582                             // comment
1583                             if (textpos != 0) {
1584                                 char newtext[] = new char[textpos];
1585                                 System.arraycopy(text, 0, newtext, 0, textpos);
1586                                 handleText(newtext);
1587                                 lastBlockStartPos = currentBlockStartPos;
1588                                 textpos = 0;
1589                             }
1590                             parseComment();
1591                             last = makeTag(dtd.getElement("comment"), true);
1592                             handleComment(getChars(0));
1593                             continue;
1594                         } else if (!warned) {
1595                             warned = true;
1596                             error("invalid.commentchar", "-");
1597                         }
1598                     }
1599                     skipSpace();
1600                     switch (ch) {
1601                       case '-':
1602                         continue;
1603                       case '>':
1604                         ch = readCh();
1605                       case -1:
1606                         return;
1607                       default:
1608                         ch = readCh();
1609                         if (!warned) {
1610                             warned = true;
1611                             error("invalid.commentchar",
1612                                   String.valueOf((char)ch));
1613                         }
1614                         break;
1615                     }
1616                 }
1617 
1618               default:
1619                 // deal with marked sections
1620                 StringBuffer strBuff = new StringBuffer();
1621                 while (true) {
1622                     strBuff.append((char)ch);
1623                     if (parseMarkupDeclarations(strBuff)) {
1624                         return;
1625                     }
1626                     switch(ch) {
1627                       case '>':
1628                         ch = readCh();
1629                       case -1:
1630                         error("invalid.markup");
1631                         return;
1632                       case '\n':
1633                         ln++;
1634                         ch = readCh();
1635                         lfCount++;
1636                         break;
1637                       case '\r':
1638                         ln++;
1639                         if ((ch = readCh()) == '\n') {
1640                             ch = readCh();
1641                             crlfCount++;
1642                         }
1643                         else {
1644                             crCount++;
1645                         }
1646                         break;
1647 
1648                       default:
1649                         ch = readCh();
1650                         break;
1651                     }
1652                 }
1653             }
1654 
1655           case '/':
1656             // parse end tag [19] 317:4
1657             switch (ch = readCh()) {
1658               case '>':
1659                 ch = readCh();
1660               case '<':
1661                 // empty end tag. either </> or </<
1662                 if (recent == null) {
1663                     error("invalid.shortend");
1664                     return;
1665                 }
1666                 elem = recent;
1667                 break;
1668 
1669               default:
1670                 if (!parseIdentifier(true)) {
1671                     error("expected.endtagname");
1672                     return;
1673                 }
1674                 skipSpace();
1675                 switch (ch) {
1676                   case '>':
1677                     ch = readCh();
1678                   case '<':
1679                     break;
1680 
1681                   default:
1682                     error("expected", "'>'");
1683                     while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1684                         ch = readCh();
1685                     }
1686                     if (ch == '>') {
1687                         ch = readCh();
1688                     }
1689                     break;
1690                 }
1691                 String elemStr = getString(0);
1692                 if (!dtd.elementExists(elemStr)) {
1693                     error("end.unrecognized", elemStr);
1694                     // Ignore RE before end tag
1695                     if ((textpos > 0) && (text[textpos-1] == '\n')) {
1696                         textpos--;
1697                     }
1698                     elem = dtd.getElement("unknown");
1699                     elem.name = elemStr;
1700                     unknown = true;
1701                 } else {
1702                     elem = dtd.getElement(elemStr);
1703                 }
1704                 break;
1705             }
1706 
1707 
1708             // If the stack is null, we're seeing end tags without any begin
1709             // tags.  Ignore them.
1710 
1711             if (stack == null) {
1712                 error("end.extra.tag", elem.getName());
1713                 return;
1714             }
1715 
1716             // Ignore RE before end tag
1717             if ((textpos > 0) && (text[textpos-1] == '\n')) {
1718                 // In a pre tag, if there are blank lines
1719                 // we do not want to remove the newline
1720                 // before the end tag.  Hence this code.
1721                 //
1722                 if (stack.pre) {
1723                     if ((textpos > 1) && (text[textpos-2] != '\n')) {
1724                         textpos--;
1725                     }
1726                 } else {
1727                     textpos--;
1728                 }
1729             }
1730 
1731             // If the end tag is a form, since we did not put it
1732             // on the tag stack, there is no corresponding start
1733             // start tag to find. Hence do not touch the tag stack.
1734             //
1735 
1736             /*
1737             if (!strict && elem.getName().equals("form")) {
1738                 if (lastFormSent != null) {
1739                     handleEndTag(lastFormSent);
1740                     return;
1741                 } else {
1742                     // do nothing.
1743                     return;
1744                 }
1745             }
1746             */
1747 
1748             if (unknown) {
1749                 // we will not see a corresponding start tag
1750                 // on the the stack.  If we are seeing an
1751                 // end tag, lets send this on as an empty
1752                 // tag with the end tag attribute set to
1753                 // true.
1754                 TagElement t = makeTag(elem);
1755                 handleText(t);
1756                 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1757                 handleEmptyTag(makeTag(elem));
1758                 unknown = false;
1759                 return;
1760             }
1761 
1762             // find the corresponding start tag
1763 
1764             // A commonly occurring error appears to be the insertion
1765             // of extra end tags in a table.  The intent here is ignore
1766             // such extra end tags.
1767             //
1768             if (!strict) {
1769                 String stackElem = stack.elem.getName();
1770 
1771                 if (stackElem.equals("table")) {
1772                     // If it is not a valid end tag ignore it and return
1773                     //
1774                     if (!elem.getName().equals(stackElem)) {
1775                         error("tag.ignore", elem.getName());
1776                         return;
1777                     }
1778                 }
1779 
1780 
1781 
1782                 if (stackElem.equals("tr") ||
1783                     stackElem.equals("td")) {
1784                     if ((!elem.getName().equals("table")) &&
1785                         (!elem.getName().equals(stackElem))) {
1786                         error("tag.ignore", elem.getName());
1787                         return;
1788                     }
1789                 }
1790             }
1791             TagStack sp = stack;
1792 
1793             while ((sp != null) && (elem != sp.elem)) {
1794                 sp = sp.next;
1795             }
1796             if (sp == null) {
1797                 error("unmatched.endtag", elem.getName());
1798                 return;
1799             }
1800 
1801             // People put font ending tags in the darndest places.
1802             // Don't close other contexts based on them being between
1803             // a font tag and the corresponding end tag.  Instead,
1804             // ignore the end tag like it doesn't exist and allow the end
1805             // of the document to close us out.
1806             String elemName = elem.getName();
1807             if (stack != sp &&
1808                 (elemName.equals("font") ||
1809                  elemName.equals("center"))) {
1810 
1811                 // Since closing out a center tag can have real wierd
1812                 // effects on the formatting,  make sure that tags
1813                 // for which omitting an end tag is legimitate
1814                 // get closed out.
1815                 //
1816                 if (elemName.equals("center")) {
1817                     while(stack.elem.omitEnd() && stack != sp) {
1818                         endTag(true);
1819                     }
1820                     if (stack.elem == elem) {
1821                         endTag(false);
1822                     }
1823                 }
1824                 return;
1825             }
1826             // People do the same thing with center tags.  In this
1827             // case we would like to close off the center tag but
1828             // not necessarily all enclosing tags.
1829 
1830 
1831 
1832             // end tags
1833             while (stack != sp) {
1834                 endTag(true);
1835             }
1836 
1837             endTag(false);
1838             return;
1839 
1840           case -1:
1841             error("eof");
1842             return;
1843         }
1844 
1845         // start tag [14] 314:1
1846         if (!parseIdentifier(true)) {
1847             elem = recent;
1848             if ((ch != '>') || (elem == null)) {
1849                 error("expected.tagname");
1850                 return;
1851             }
1852         } else {
1853             String elemStr = getString(0);
1854 
1855             if (elemStr.equals("image")) {
1856                 elemStr = "img";
1857             }
1858 
1859             /* determine if this element is part of the dtd. */
1860 
1861             if (!dtd.elementExists(elemStr)) {
1862                 //              parseInvalidTag();
1863                 error("tag.unrecognized ", elemStr);
1864                 elem = dtd.getElement("unknown");
1865                 elem.name = elemStr;
1866                 unknown = true;
1867             } else {
1868                 elem = dtd.getElement(elemStr);
1869             }
1870         }
1871 
1872         // Parse attributes
1873         parseAttributeSpecificationList(elem);
1874 
1875         switch (ch) {
1876           case '/':
1877             net = true;
1878           case '>':
1879             ch = readCh();
1880             if (ch == '>' && net) {
1881                 ch = readCh();
1882             }
1883           case '<':
1884             break;
1885 
1886           default:
1887             error("expected", "'>'");
1888             break;
1889         }
1890 
1891         if (!strict) {
1892           if (elem.getName().equals("script")) {
1893             error("javascript.unsupported");
1894           }
1895         }
1896 
1897         // ignore RE after start tag
1898         //
1899         if (!elem.isEmpty())  {
1900             if (ch == '\n') {
1901                 ln++;
1902                 lfCount++;
1903                 ch = readCh();
1904             } else if (ch == '\r') {
1905                 ln++;
1906                 if ((ch = readCh()) == '\n') {
1907                     ch = readCh();
1908                     crlfCount++;
1909                 }
1910                 else {
1911                     crCount++;
1912                 }
1913             }
1914         }
1915 
1916         // ensure a legal context for the tag
1917         TagElement tag = makeTag(elem, false);
1918 
1919 
1920         /** In dealing with forms, we have decided to treat
1921             them as legal in any context.  Also, even though
1922             they do have a start and an end tag, we will
1923             not put this tag on the stack.  This is to deal
1924             several pages in the web oasis that choose to
1925             start and end forms in any possible location. **/
1926 
1927         /*
1928         if (!strict && elem.getName().equals("form")) {
1929             if (lastFormSent == null) {
1930                 lastFormSent = tag;
1931             } else {
1932                 handleEndTag(lastFormSent);
1933                 lastFormSent = tag;
1934             }
1935         } else {
1936         */
1937             // Smlly, if a tag is unknown, we will apply
1938             // no legalTagContext logic to it.
1939             //
1940             if (!unknown) {
1941                 legalTagContext(tag);
1942 
1943                 // If skip tag is true,  this implies that
1944                 // the tag was illegal and that the error
1945                 // recovery strategy adopted is to ignore
1946                 // the tag.
1947                 if (!strict && skipTag) {
1948                     skipTag = false;
1949                     return;
1950                 }
1951             }
1952             /*
1953         }
1954             */
1955 
1956         startTag(tag);
1957 
1958         if (!elem.isEmpty()) {
1959             switch (elem.getType()) {
1960               case CDATA:
1961                 parseLiteral(false);
1962                 break;
1963               case RCDATA:
1964                 parseLiteral(true);
1965                 break;
1966               default:
1967                 if (stack != null) {
1968                     stack.net = net;
1969                 }
1970                 break;
1971             }
1972         }
1973     }
1974 
1975     private static final String START_COMMENT = "<!--";
1976     private static final String END_COMMENT = "-->";
1977     private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
1978     private static final char[] SCRIPT_END_TAG_UPPER_CASE =
1979                                         "</SCRIPT>".toCharArray();
1980 
parseScript()1981     void parseScript() throws IOException {
1982         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
1983         boolean insideComment = false;
1984 
1985         /* Here, ch should be the first character after <script> */
1986         while (true) {
1987             int i = 0;
1988             while (!insideComment && i < SCRIPT_END_TAG.length
1989                     && (SCRIPT_END_TAG[i] == ch
1990                     || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
1991                 charsToAdd[i] = (char) ch;
1992                 ch = readCh();
1993                 i++;
1994             }
1995             if (i == SCRIPT_END_TAG.length) {
1996                 return;
1997             }
1998 
1999             if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) {
2000                 // it isn't end script tag, but may be it's start comment tag?
2001                 while (i < START_COMMENT.length()
2002                         && START_COMMENT.charAt(i) == ch) {
2003                     charsToAdd[i] = (char) ch;
2004                     ch = readCh();
2005                     i++;
2006                 }
2007                 if (i == START_COMMENT.length()) {
2008                     insideComment = true;
2009                 }
2010             }
2011             if (insideComment) {
2012                 while (i < END_COMMENT.length()
2013                         && END_COMMENT.charAt(i) == ch) {
2014                     charsToAdd[i] = (char) ch;
2015                     ch = readCh();
2016                     i++;
2017                 }
2018                 if (i == END_COMMENT.length()) {
2019                     insideComment = false;
2020                 }
2021             }
2022 
2023             /* To account for extra read()'s that happened */
2024             if (i > 0) {
2025                 for (int j = 0; j < i; j++) {
2026                     addString(charsToAdd[j]);
2027                 }
2028                 continue;
2029             }
2030             switch (ch) {
2031             case -1:
2032                 error("eof.script");
2033                 return;
2034             case '\n':
2035                 ln++;
2036                 ch = readCh();
2037                 lfCount++;
2038                 addString('\n');
2039                 break;
2040             case '\r':
2041                 ln++;
2042                 if ((ch = readCh()) == '\n') {
2043                     ch = readCh();
2044                     crlfCount++;
2045                 } else {
2046                     crCount++;
2047                 }
2048                 addString('\n');
2049                 break;
2050             default:
2051                 addString(ch);
2052                 ch = readCh();
2053                 break;
2054             } // switch
2055         } // while
2056     }
2057 
2058     /**
2059      * Parse Content. [24] 320:1
2060      */
parseContent()2061     void parseContent() throws IOException {
2062         Thread curThread = Thread.currentThread();
2063 
2064         for (;;) {
2065             if (curThread.isInterrupted()) {
2066                 curThread.interrupt(); // resignal the interrupt
2067                 break;
2068             }
2069 
2070             int c = ch;
2071             currentBlockStartPos = currentPosition;
2072 
2073             if (recent == dtd.script) { // means: if after starting <script> tag
2074 
2075                 /* Here, ch has to be the first character after <script> */
2076                 parseScript();
2077                 last = makeTag(dtd.getElement("comment"), true);
2078 
2079                 /* Remove leading and trailing HTML comment declarations */
2080                 String str = new String(getChars(0)).trim();
2081                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2082                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2083                        && str.length() >= (minLength)) {
2084                     str = str.substring(START_COMMENT.length(),
2085                                       str.length() - END_COMMENT.length());
2086                 }
2087 
2088                 /* Handle resulting chars as comment */
2089                 handleComment(str.toCharArray());
2090                 endTag(false);
2091                 lastBlockStartPos = currentPosition;
2092 
2093                 continue;
2094             } else {
2095                 switch (c) {
2096                   case '<':
2097                     parseTag();
2098                     lastBlockStartPos = currentPosition;
2099                     continue;
2100 
2101                   case '/':
2102                     ch = readCh();
2103                     if ((stack != null) && stack.net) {
2104                         // null end tag.
2105                         endTag(false);
2106                         continue;
2107                     } else if (textpos == 0) {
2108                         if (!legalElementContext(dtd.pcdata)) {
2109                             error("unexpected.pcdata");
2110                         }
2111                         if (last.breaksFlow()) {
2112                             space = false;
2113                         }
2114                     }
2115                     break;
2116 
2117                   case -1:
2118                     return;
2119 
2120                   case '&':
2121                     if (textpos == 0) {
2122                         if (!legalElementContext(dtd.pcdata)) {
2123                             error("unexpected.pcdata");
2124                         }
2125                         if (last.breaksFlow()) {
2126                             space = false;
2127                         }
2128                     }
2129                     char data[] = parseEntityReference();
2130                     if (textpos + data.length + 1 > text.length) {
2131                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2132                         System.arraycopy(text, 0, newtext, 0, text.length);
2133                         text = newtext;
2134                     }
2135                     if (space) {
2136                         space = false;
2137                         text[textpos++] = ' ';
2138                     }
2139                     System.arraycopy(data, 0, text, textpos, data.length);
2140                     textpos += data.length;
2141                     ignoreSpace = false;
2142                     continue;
2143 
2144                   case '\n':
2145                     ln++;
2146                     lfCount++;
2147                     ch = readCh();
2148                     if ((stack != null) && stack.pre) {
2149                         break;
2150                     }
2151                     if (textpos == 0) {
2152                         lastBlockStartPos = currentPosition;
2153                     }
2154                     if (!ignoreSpace) {
2155                         space = true;
2156                     }
2157                     continue;
2158 
2159                   case '\r':
2160                     ln++;
2161                     c = '\n';
2162                     if ((ch = readCh()) == '\n') {
2163                         ch = readCh();
2164                         crlfCount++;
2165                     }
2166                     else {
2167                         crCount++;
2168                     }
2169                     if ((stack != null) && stack.pre) {
2170                         break;
2171                     }
2172                     if (textpos == 0) {
2173                         lastBlockStartPos = currentPosition;
2174                     }
2175                     if (!ignoreSpace) {
2176                         space = true;
2177                     }
2178                     continue;
2179 
2180 
2181                   case '\t':
2182                   case ' ':
2183                     ch = readCh();
2184                     if ((stack != null) && stack.pre) {
2185                         break;
2186                     }
2187                     if (textpos == 0) {
2188                         lastBlockStartPos = currentPosition;
2189                     }
2190                     if (!ignoreSpace) {
2191                         space = true;
2192                     }
2193                     continue;
2194 
2195                   default:
2196                     if (textpos == 0) {
2197                         if (!legalElementContext(dtd.pcdata)) {
2198                             error("unexpected.pcdata");
2199                         }
2200                         if (last.breaksFlow()) {
2201                             space = false;
2202                         }
2203                     }
2204                     ch = readCh();
2205                     break;
2206                 }
2207             }
2208 
2209             // enlarge buffer if needed
2210             if (textpos + 2 > text.length) {
2211                 char newtext[] = new char[text.length + 128];
2212                 System.arraycopy(text, 0, newtext, 0, text.length);
2213                 text = newtext;
2214             }
2215 
2216             // output pending space
2217             if (space) {
2218                 if (textpos == 0) {
2219                     lastBlockStartPos--;
2220                 }
2221                 text[textpos++] = ' ';
2222                 space = false;
2223             }
2224             text[textpos++] = (char)c;
2225             ignoreSpace = false;
2226         }
2227     }
2228 
2229     /**
2230      * Returns the end of line string. This will return the end of line
2231      * string that has been encountered the most, one of \r, \n or \r\n.
2232      */
getEndOfLineString()2233     String getEndOfLineString() {
2234         if (crlfCount >= crCount) {
2235             if (lfCount >= crlfCount) {
2236                 return "\n";
2237             }
2238             else {
2239                 return "\r\n";
2240             }
2241         }
2242         else {
2243             if (crCount > lfCount) {
2244                 return "\r";
2245             }
2246             else {
2247                 return "\n";
2248             }
2249         }
2250     }
2251 
2252     /**
2253      * Parse an HTML stream, given a DTD.
2254      */
parse(Reader in)2255     public synchronized void parse(Reader in) throws IOException {
2256         this.in = in;
2257 
2258         this.ln = 1;
2259 
2260         seenHtml = false;
2261         seenHead = false;
2262         seenBody = false;
2263 
2264         crCount = lfCount = crlfCount = 0;
2265 
2266         try {
2267             ch = readCh();
2268             text = new char[1024];
2269             str = new char[128];
2270 
2271             parseContent();
2272             // NOTE: interruption may have occurred.  Control flows out
2273             // of here normally.
2274             while (stack != null) {
2275                 endTag(true);
2276             }
2277             in.close();
2278         } catch (IOException e) {
2279             errorContext();
2280             error("ioexception");
2281             throw e;
2282         } catch (Exception e) {
2283             errorContext();
2284             error("exception", e.getClass().getName(), e.getMessage());
2285             e.printStackTrace();
2286         } catch (ThreadDeath e) {
2287             errorContext();
2288             error("terminated");
2289             e.printStackTrace();
2290             throw e;
2291         } finally {
2292             for (; stack != null ; stack = stack.next) {
2293                 handleEndTag(stack.tag);
2294             }
2295 
2296             text = null;
2297             str = null;
2298         }
2299 
2300     }
2301 
2302 
2303     /*
2304      * Input cache.  This is much faster than calling down to a synchronized
2305      * method of BufferedReader for each byte.  Measurements done 5/30/97
2306      * show that there's no point in having a bigger buffer:  Increasing
2307      * the buffer to 8192 had no measurable impact for a program discarding
2308      * one character at a time (reading from an http URL to a local machine).
2309      * NOTE: If the current encoding is bogus, and we read too much
2310      * (past the content-type) we may suffer a MalformedInputException. For
2311      * this reason the initial size is 1 and when the body is encountered the
2312      * size is adjusted to 256.
2313      */
2314     private char buf[] = new char[1];
2315     private int pos;
2316     private int len;
2317     /*
2318         tracks position relative to the beginning of the
2319         document.
2320     */
2321     private int currentPosition;
2322 
2323 
readCh()2324     private final int readCh() throws IOException {
2325 
2326         if (pos >= len) {
2327 
2328             // This loop allows us to ignore interrupts if the flag
2329             // says so
2330             for (;;) {
2331                 try {
2332                     len = in.read(buf);
2333                     break;
2334                 } catch (InterruptedIOException ex) {
2335                     throw ex;
2336                 }
2337             }
2338 
2339             if (len <= 0) {
2340                 return -1;      // eof
2341             }
2342             pos = 0;
2343         }
2344         ++currentPosition;
2345 
2346         return buf[pos++];
2347     }
2348 
2349 
getCurrentPos()2350     protected int getCurrentPos() {
2351         return currentPosition;
2352     }
2353 }
2354