1 /*
2     This file is part of the KDE libraries
3 
4     Copyright (C) 1997 Martin Jones (mjones@kde.org)
5               (C) 1997 Torben Weis (weis@kde.org)
6               (C) 1999,2001 Lars Knoll (knoll@kde.org)
7               (C) 2000,2001 Dirk Mueller (mueller@kde.org)
8               (C) 2003 Apple Computer, Inc.
9 
10     This library is free software; you can redistribute it and/or
11     modify it under the terms of the GNU Library General Public
12     License as published by the Free Software Foundation; either
13     version 2 of the License, or (at your option) any later version.
14 
15     This library is distributed in the hope that it will be useful,
16     but WITHOUT ANY WARRANTY; without even the implied warranty of
17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18     Library General Public License for more details.
19 
20     You should have received a copy of the GNU Library General Public License
21     along with this library; see the file COPYING.LIB.  If not, write to
22     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23     Boston, MA 02110-1301, USA.
24 */
25 //----------------------------------------------------------------------------
26 //
27 // KDE HTML Widget -- HTML Parser
28 // #define PARSER_DEBUG
29 
30 #include "htmlparser.h"
31 
32 #include <dom/dom_exception.h>
33 
34 #include <html/html_baseimpl.h>
35 #include <html/html_blockimpl.h>
36 #include <html/html_canvasimpl.h>
37 #include <html/html_documentimpl.h>
38 #include <html/html_elementimpl.h>
39 #include <html/html_formimpl.h>
40 #include <html/html_headimpl.h>
41 #include <html/html_imageimpl.h>
42 #include <html/html_inlineimpl.h>
43 #include <html/html_listimpl.h>
44 #include <html/html_miscimpl.h>
45 #include <html/html_tableimpl.h>
46 #include <html/html_objectimpl.h>
47 #include <html/HTMLAudioElement.h>
48 #include <html/HTMLVideoElement.h>
49 #include <html/HTMLSourceElement.h>
50 #include <xml/dom_textimpl.h>
51 #include <xml/dom_nodeimpl.h>
52 #include <html/htmltokenizer.h>
53 #include <khtmlview.h>
54 #include <khtml_part.h>
55 #include <khtml_global.h>
56 #include <css/cssproperties.h>
57 #include <css/cssvalues.h>
58 
59 #include <rendering/render_object.h>
60 
61 #include "khtml_debug.h"
62 #include <klocalizedstring.h>
63 
64 // Turn off gnu90 inlining to avoid linker errors
65 #undef __GNUC_STDC_INLINE__
66 #undef __GNUC_GNU_INLINE__
67 #include <doctypes.h>
68 
69 #undef OPTIONAL  // for win32, MinGW
70 
71 using namespace DOM;
72 using namespace khtml;
73 
74 #ifdef PARSER_DEBUG
getParserPrintableName(int id)75 static QString getParserPrintableName(int id)
76 {
77     if (id >= ID_CLOSE_TAG) {
78         return "/" + getPrintableName(id - ID_CLOSE_TAG);
79     } else {
80         return getPrintableName(id);
81     }
82 }
83 #endif
84 
85 //----------------------------------------------------------------------------
86 
87 /**
88  * @internal
89  */
90 class HTMLStackElem
91 {
92 public:
HTMLStackElem(int _id,int _level,DOM::NodeImpl * _node,bool _inline_,HTMLStackElem * _next)93     HTMLStackElem(int _id,
94                   int _level,
95                   DOM::NodeImpl *_node,
96                   bool _inline_,
97                   HTMLStackElem *_next)
98         :
99         id(_id),
100         level(_level),
101         strayTableContent(false),
102         m_inline(_inline_),
103         node(_node),
104         next(_next)
105     {
106         node->ref();
107     }
108 
~HTMLStackElem()109     ~HTMLStackElem()
110     {
111         node->deref();
112     }
113 
setNode(NodeImpl * newNode)114     void setNode(NodeImpl *newNode)
115     {
116         newNode->ref();
117         node->deref();
118         node = newNode;
119     }
120 
121     int       id;
122     int       level;
123     bool      strayTableContent;
124     bool m_inline;
125     NodeImpl *node;
126     HTMLStackElem *next;
127 };
128 
129 /**
130  * @internal
131  *
132  * The parser parses tokenized input into the document, building up the
133  * document tree. If the document is wellformed, parsing it is
134  * straightforward.
135  * Unfortunately, people can't write wellformed HTML documents, so the parser
136  * has to be tolerant about errors.
137  *
138  * We have to take care of the following error conditions:
139  * 1. The element being added is explicitly forbidden inside some outer tag.
140  *    In this case we should close all tags up to the one, which forbids
141  *    the element, and add it afterwards.
 * 2. We are not allowed to add the element directly. It could be that
 *    the person writing the document forgot some tag in between (or that the
 *    tag in between is optional...). This could be the case with the following
 *    tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?)
 * 3. We want to add a block element inside an inline element. Close all
 *    inline elements up to the next higher block element.
 * 4. If this doesn't help, close elements until we are allowed to add the
 *    element, or ignore the tag.
150  *
151  */
152 
KHTMLParser(KHTMLView * _parent,DocumentImpl * doc)153 KHTMLParser::KHTMLParser(KHTMLView *_parent, DocumentImpl *doc)
154 {
155     //qCDebug(KHTML_LOG) << "parser constructor";
156 #if SPEED_DEBUG > 0
157     qt.start();
158 #endif
159 
160     HTMLWidget    = _parent;
161     document      = doc;
162 
163     blockStack = nullptr;
164     current = nullptr;
165 
166     // ID_CLOSE_TAG == Num of tags
167     forbiddenTag = new ushort[ID_CLOSE_TAG + 1];
168 
169     reset();
170 }
171 
KHTMLParser(DOM::DocumentFragmentImpl * i,DocumentImpl * doc)172 KHTMLParser::KHTMLParser(DOM::DocumentFragmentImpl *i, DocumentImpl *doc)
173 {
174     HTMLWidget = nullptr;
175     document = doc;
176 
177     forbiddenTag = new ushort[ID_CLOSE_TAG + 1];
178 
179     blockStack = nullptr;
180     current = nullptr;
181 
182     reset();
183 
184     setCurrent(i);
185 
186     inBody = true;
187 }
188 
~KHTMLParser()189 KHTMLParser::~KHTMLParser()
190 {
191 #if SPEED_DEBUG > 0
192     qCDebug(KHTML_LOG) << "TIME: parsing time was = " << qt.elapsed();
193 #endif
194 
195     freeBlock();
196 
197     if (current) {
198         current->deref();
199     }
200 
201     delete [] forbiddenTag;
202     delete isindex;
203 }
204 
reset()205 void KHTMLParser::reset()
206 {
207     setCurrent(document);
208 
209     freeBlock();
210 
211     // before parsing no tags are forbidden...
212     memset(forbiddenTag, 0, (ID_CLOSE_TAG + 1)*sizeof(ushort));
213 
214     inBody = false;
215     haveFrameSet = false;
216     haveContent = false;
217     haveBody = false;
218     haveTitle = false;
219     inSelect = false;
220     inStrayTableContent = 0;
221     m_inline = false;
222 
223     form = nullptr;
224     map = nullptr;
225     end = false;
226     isindex = nullptr;
227 
228     discard_until = 0;
229 }
230 
parseToken(Token * t)231 void KHTMLParser::parseToken(Token *t)
232 {
233     if (t->tid > 2 * ID_CLOSE_TAG) {
234         // qCDebug(KHTML_LOG) << "Unknown tag!! tagID = " << t->tid;
235         return;
236     }
237     if (discard_until) {
238         if (t->tid == discard_until) {
239             discard_until = 0;
240         }
241 
242         // do not skip </iframe>
243         if (discard_until || current->id() + ID_CLOSE_TAG != t->tid) {
244             return;
245         }
246     }
247 
248 #ifdef PARSER_DEBUG
249     qCDebug(KHTML_LOG) << "\n\n==> parser: processing token " << getParserPrintableName(t->tid) << "(" << t->tid << ")"
250              << " current = " << getParserPrintableName(current->id()) << "(" << current->id() << ")";
251     qCDebug(KHTML_LOG) << "inline=" << m_inline << " inBody=" << inBody << " haveFrameSet=" << haveFrameSet << " haveContent=" << haveContent;
252 #endif
253 
254     // holy shit. apparently some sites use </br> instead of <br>
255     // be compatible with IE and NS
256     if (t->tid == ID_BR + ID_CLOSE_TAG && document->inCompatMode()) {
257         t->tid -= ID_CLOSE_TAG;
258     }
259 
260     if (t->tid > ID_CLOSE_TAG) {
261         processCloseTag(t);
262         return;
263     }
264 
265     // ignore spaces, if we're not inside a paragraph or other inline code
266     if (t->tid == ID_TEXT && t->text) {
267         if (inBody && !skipMode() &&
268                 current->id() != ID_STYLE && current->id() != ID_TITLE &&
269                 current->id() != ID_SCRIPT &&
270                 !t->text->containsOnlyWhitespace()) {
271             haveContent = true;
272         }
273 #ifdef PARSER_DEBUG
274 
275         qCDebug(KHTML_LOG) << "length=" << t->text->l << " text='" << QString::fromRawData(t->text->s, t->text->l) << "'";
276 #endif
277     }
278 
279     NodeImpl *n = getElement(t);
280     // just to be sure, and to catch currently unimplemented stuff
281     if (!n) {
282         return;
283     }
284 
285     // set attributes
286     if (n->isElementNode() && t->tid != ID_ISINDEX) {
287         ElementImpl *e = static_cast<ElementImpl *>(n);
288         e->setAttributeMap(t->attrs);
289     }
290 
291     // if this tag is forbidden inside the current context, pop
292     // blocks until we are allowed to add it...
293     while (blockStack && forbiddenTag[t->tid]) {
294 #ifdef PARSER_DEBUG
295         qCDebug(KHTML_LOG) << "t->id: " << t->tid << " is forbidden :-( ";
296 #endif
297         popOneBlock();
298     }
299 
300     // sometimes flat doesn't make sense
301     switch (t->tid) {
302     case ID_SELECT:
303     case ID_OPTION:
304         t->flat = false;
305     }
306 
307     // the tokenizer needs the feedback for space discarding
308     if (tagPriority(t->tid) == 0) {
309         t->flat = true;
310     }
311 
312     if (!insertNode(n, t->flat)) {
313         // we couldn't insert the node...
314 #ifdef PARSER_DEBUG
315         qCDebug(KHTML_LOG) << "insertNode failed current=" << current->id() << ", new=" << n->id() << "!";
316 #endif
317         if (map == n) {
318 #ifdef PARSER_DEBUG
319             qCDebug(KHTML_LOG) << "  --> resetting map!";
320 #endif
321             map = nullptr;
322         }
323         if (form == n) {
324 #ifdef PARSER_DEBUG
325             qCDebug(KHTML_LOG) << "   --> resetting form!";
326 #endif
327             form = nullptr;
328         }
329         delete n;
330     }
331 }
332 
parseDoctypeToken(DoctypeToken * t)333 void KHTMLParser::parseDoctypeToken(DoctypeToken *t)
334 {
335     // Ignore any doctype after the first. TODO It should be also ignored when processing DocumentFragment
336     if (current != document || document->doctype()) {
337         return;
338     }
339 
340     DocumentTypeImpl *doctype = new DocumentTypeImpl(document->implementation(), document, t->name, t->publicID, t->systemID);
341     if (!t->internalSubset.isEmpty()) {
342         doctype->setInternalSubset(t->internalSubset);
343     }
344     document->addChild(doctype);
345 
346     // Determine parse mode here
347     // This code more or less mimics Mozilla's implementation.
348     //
349     // There are three possible parse modes:
350     // COMPAT - quirks mode emulates WinIE
351     // and NS4.  CSS parsing is also relaxed in this mode, e.g., unit types can
352     // be omitted from numbers.
353     // ALMOST STRICT - This mode is identical to strict mode
354     // except for its treatment of line-height in the inline box model.  For
355     // now (until the inline box model is re-written), this mode is identical
356     // to STANDARDS mode.
357     // STRICT - no quirks apply.  Web pages will obey the specifications to
358     // the letter.
359 
360     if (!document->isHTMLDocument()) { // FIXME Could document be non-HTML?
361         return;
362     }
363     DOM::HTMLDocumentImpl *htmldoc = static_cast<DOM::HTMLDocumentImpl *>(document);
364     if (t->name.toLower() == "html") {
365         if (!t->internalSubset.isEmpty() || t->publicID.isEmpty()) {
366             // Internal subsets always denote full standards, as does
367             // a doctype without a public ID.
368             htmldoc->changeModes(DOM::DocumentImpl::Strict, DOM::DocumentImpl::Html4);
369         } else {
370             // We have to check a list of public IDs to see what we
371             // should do.
372             QString lowerPubID = t->publicID.toLower();
373             QByteArray pubIDStr = lowerPubID.toLocal8Bit();
374 
375             // Look up the entry in our gperf-generated table.
376             const PubIDInfo *doctypeEntry = Perfect_Hash::findDoctypeEntry(pubIDStr.constData(), t->publicID.length());
377             if (!doctypeEntry) {
378                 // The DOCTYPE is not in the list.  Assume strict mode.
379                 // ### Doesn't make any sense, but it's what Mozilla does.
380                 htmldoc->changeModes(DOM::DocumentImpl::Strict, DOM::DocumentImpl::Html4);
381             } else {
382                 switch ((!t->systemID.isEmpty()) ?
383                         doctypeEntry->mode_if_sysid :
384                         doctypeEntry->mode_if_no_sysid) {
385                 case PubIDInfo::eQuirks3:
386                     htmldoc->changeModes(DOM::DocumentImpl::Compat, DOM::DocumentImpl::Html3);
387                     break;
388                 case PubIDInfo::eQuirks:
389                     htmldoc->changeModes(DOM::DocumentImpl::Compat, DOM::DocumentImpl::Html4);
390                     break;
391                 case PubIDInfo::eAlmostStandards:
392                     htmldoc->changeModes(DOM::DocumentImpl::Transitional, DOM::DocumentImpl::Html4);
393                     break;
394                 default:
395                     assert(!"Unknown parse mode");
396                 }
397             }
398         }
399     } else {
400         // Malformed doctype implies quirks mode.
401         htmldoc->changeModes(DOM::DocumentImpl::Compat, DOM::DocumentImpl::Html3);
402     }
403 }
404 
isTableRelatedTag(int id)405 static bool isTableRelatedTag(int id)
406 {
407     return (id == ID_TR || id == ID_TD || id == ID_TABLE || id == ID_TBODY || id == ID_TFOOT || id == ID_THEAD ||
408             id == ID_TH);
409 }
410 
insertNode(NodeImpl * n,bool flat)411 bool KHTMLParser::insertNode(NodeImpl *n, bool flat)
412 {
413     int id = n->id();
414 
415     // <table> is never allowed inside stray table content.  Always pop out of the stray table content
416     // and close up the first table, and then start the second table as a sibling.
417     if (inStrayTableContent && id == ID_TABLE) {
418         popBlock(ID_TABLE);
419     }
420 
421     // let's be stupid and just try to insert it.
422     // this should work if the document is wellformed
423 #ifdef PARSER_DEBUG
424     NodeImpl *tmp = current;
425 #endif
426     NodeImpl *newNode = current->addChild(n);
427     if (newNode) {
428 #ifdef PARSER_DEBUG
429         qCDebug(KHTML_LOG) << "added " << n->nodeName().string() << " to " << tmp->nodeName().string() << ", new current=" << newNode->nodeName().string();
430 #endif
431         // We allow TABLE > FORM in dtd.cpp, but do not allow the form have children in this case
432         if (current->id() == ID_TABLE && id == ID_FORM) {
433             flat = true;
434             static_cast<HTMLFormElementImpl *>(n)->setMalformed(true);
435         }
436 
437         // don't push elements without end tag on the stack
438         if (tagPriority(id) != 0 && !flat) {
439 #if SPEED_DEBUG < 2
440             if (!n->attached() && HTMLWidget) {
441                 n->attach();
442             }
443 #endif
444             if (n->isInline()) {
445                 m_inline = true;
446             }
447             pushBlock(id, tagPriority(id));
448             setCurrent(newNode);
449         } else {
450 #if SPEED_DEBUG < 2
451             if (!n->attached() && HTMLWidget) {
452                 n->attach();
453             }
454             if (n->maintainsState()) {
455                 document->registerMaintainsState(n);
456                 document->attemptRestoreState(n);
457             }
458             n->close();
459 #endif
460             if (n->isInline()) {
461                 m_inline = true;
462             }
463         }
464 
465 #if SPEED_DEBUG < 1
466         if (tagPriority(id) == 0 && n->renderer()) {
467             n->renderer()->calcMinMaxWidth();
468         }
469 #endif
470         return true;
471     } else {
472 #ifdef PARSER_DEBUG
473         qCDebug(KHTML_LOG) << "ADDING NODE FAILED!!!! current = " << current->nodeName().string() << ", new = " << n->nodeName().string();
474 #endif
475         // error handling...
476         HTMLElementImpl *e;
477         bool handled = false;
478 
479         // first switch on current element for elements with optional end-tag and inline-only content
480         switch (current->id()) {
481         case ID_P:
482         case ID_DT:
483             if (!n->isInline()) {
484                 popBlock(current->id());
485                 return insertNode(n);
486             }
487             break;
488         case ID_TITLE:
489             popBlock(current->id());
490             return insertNode(n);
491         default:
492             break;
493         }
494 
495         // switch according to the element to insert
496         switch (id) {
497         case ID_TR:
498         case ID_TH:
499         case ID_TD:
500             if (inStrayTableContent && !isTableRelatedTag(current->id())) {
501                 // pop out to the nearest enclosing table-related tag.
502                 while (blockStack && !isTableRelatedTag(current->id())) {
503                     popOneBlock();
504                 }
505                 return insertNode(n);
506             }
507             break;
508         case ID_HEAD:
509             // ### allow not having <HTML> in at all, as per HTML spec
510             if (!current->isDocumentNode() && current->id() != ID_HTML) {
511                 return false;
512             }
513             break;
514         case ID_COMMENT:
515             if (head) {
516                 break;
517             }
518         case ID_META:
519         case ID_LINK:
520         case ID_ISINDEX:
521         case ID_BASE:
522             if (!head) {
523                 createHead();
524             }
525             if (head) {
526                 if (head->addChild(n)) {
527 #if SPEED_DEBUG < 2
528                     if (!n->attached() && HTMLWidget) {
529                         n->attach();
530                     }
531 #endif
532                 }
533 
534                 return true;
535             }
536 
537             break;
538         case ID_HTML:
539             if (!current->isDocumentNode()) {
540                 if (doc()->documentElement()->id() == ID_HTML) {
541                     // we have another <HTML> element.... apply attributes to existing one
542                     // make sure we don't overwrite already existing attributes
543                     NamedAttrMapImpl *map = static_cast<ElementImpl *>(n)->attributes(true);
544                     NamedAttrMapImpl *bmap = static_cast<ElementImpl *>(doc()->documentElement())->attributes(false);
545                     bool changed = false;
546                     for (unsigned long l = 0; map && l < map->length(); ++l) {
547                         NodeImpl::Id attrId = map->idAt(l);
548                         DOMStringImpl *attrValue = map->valueAt(l);
549                         changed = !bmap->getValue(attrId);
550                         bmap->setValue(attrId, attrValue);
551                     }
552                     if (changed) {
553                         doc()->recalcStyle(NodeImpl::Inherit);
554                     }
555                 }
556                 return false;
557             }
558             break;
559         case ID_TITLE:
560         case ID_STYLE:
561             if (!head) {
562                 createHead();
563             }
564             if (head) {
565                 DOM::NodeImpl *newNode = head->addChild(n);
566                 if (newNode) {
567                     pushBlock(id, tagPriority(id));
568                     setCurrent(newNode);
569 #if SPEED_DEBUG < 2
570                     if (!n->attached() && HTMLWidget) {
571                         n->attach();
572                     }
573 #endif
574                 } else {
575 #ifdef PARSER_DEBUG
576                     qCDebug(KHTML_LOG) << "adding style before to body failed!!!!";
577 #endif
578                     discard_until = ID_STYLE + ID_CLOSE_TAG;
579                     return false;
580                 }
581                 return true;
582             } else if (inBody) {
583                 discard_until = id + ID_CLOSE_TAG;
584                 return false;
585             }
586             break;
587         case ID_SCRIPT:
588             // if we failed to insert it, go into skip mode
589             discard_until = id + ID_CLOSE_TAG;
590             break;
591         case ID_BODY:
592             if (inBody && doc()->body()) {
593                 // we have another <BODY> element.... apply attributes to existing one
594                 // make sure we don't overwrite already existing attributes
595                 // some sites use <body bgcolor=rightcolor>...<body bgcolor=wrongcolor>
596                 NamedAttrMapImpl *map = static_cast<ElementImpl *>(n)->attributes(true);
597                 NamedAttrMapImpl *bmap = doc()->body()->attributes(false);
598                 bool changed = false;
599                 for (unsigned long l = 0; map && l < map->length(); ++l) {
600                     NodeImpl::Id attrId = map->idAt(l);
601                     DOMStringImpl *attrValue = map->valueAt(l);
602                     if (!bmap->getValue(attrId)) {
603                         bmap->setValue(attrId, attrValue);
604                         changed = true;
605                     }
606                 }
607                 if (changed) {
608                     doc()->recalcStyle(NodeImpl::Inherit);
609                 }
610             } else if (current->isDocumentNode()) {
611                 break;
612             }
613             return false;
614             break;
615 
616         // the following is a hack to move non rendered elements
617         // outside of tables.
618         // needed for broken constructs like <table><form ...><tr>....
619         case ID_INPUT: {
620             ElementImpl *e = static_cast<ElementImpl *>(n);
621             DOMString type = e->getAttribute(ATTR_TYPE);
622 
623             if (strcasecmp(type, "hidden") != 0) {
624                 break;
625             }
626             // Fall through!
627         }
628         case ID_TEXT: {
629             // Don't try to fit random white-space anywhere
630             TextImpl *t = static_cast<TextImpl *>(n);
631             if (t->containsOnlyWhitespace()) {
632                 return false;
633             }
634             // ignore text inside the following elements.
635             switch (current->id()) {
636             case ID_SELECT:
637                 return false;
638             default:
639                 ;
640                 // fall through!!
641             };
642             break;
643         }
644         case ID_DL:
645             popBlock(ID_DT);
646             if (current->id() == ID_DL) {
647                 e = new HTMLGenericElementImpl(document, ID_DD);
648                 insertNode(e);
649                 handled = true;
650             }
651             break;
652         case ID_DT:
653             e = new HTMLDListElementImpl(document);
654             if (insertNode(e)) {
655                 insertNode(n);
656                 return true;
657             }
658             break;
659         case ID_AREA: {
660             if (map) {
661                 map->addChild(n);
662 #if SPEED_DEBUG < 2
663                 if (!n->attached() && HTMLWidget) {
664                     n->attach();
665                 }
666 #endif
667                 handled = true;
668                 return true;
669             } else {
670                 return false;
671             }
672         }
673 
674         case ID_THEAD:
675         case ID_TBODY:
676         case ID_TFOOT:
677         case ID_CAPTION:
678         case ID_COLGROUP: {
679             if (isTableRelatedTag(current->id())) {
680                 while (blockStack && current->id() != ID_TABLE && isTableRelatedTag(current->id())) {
681                     popOneBlock();
682                 }
683                 return insertNode(n);
684             }
685         }
686         default:
687             break;
688         }
689 
690         // switch on the currently active element
691         switch (current->id()) {
692         case ID_HTML:
693             switch (id) {
694             case ID_SCRIPT:
695             case ID_STYLE:
696             case ID_META:
697             case ID_LINK:
698             case ID_OBJECT:
699             case ID_EMBED:
700             case ID_TITLE:
701             case ID_ISINDEX:
702             case ID_BASE:
703                 if (!head) {
704                     head = new HTMLHeadElementImpl(document);
705                     insertNode(head.get());
706                     handled = true;
707                 }
708                 break;
709             case ID_TEXT: {
710                 TextImpl *t = static_cast<TextImpl *>(n);
711                 if (t->containsOnlyWhitespace()) {
712                     return false;
713                 }
714                 /* Fall through to default */
715             }
716             default:
717                 if (haveFrameSet) {
718                     break;
719                 }
720                 e = new HTMLBodyElementImpl(document);
721                 startBody();
722                 insertNode(e);
723                 handled = true;
724                 break;
725             }
726             break;
727         case ID_HEAD:
728             // we can get here only if the element is not allowed in head.
729             if (id == ID_HTML) {
730                 return false;
731             } else {
732                 // This means the body starts here...
733                 if (haveFrameSet) {
734                     break;
735                 }
736                 popBlock(ID_HEAD);
737                 e = new HTMLBodyElementImpl(document);
738                 startBody();
739                 insertNode(e);
740                 handled = true;
741             }
742             break;
743         case ID_BODY:
744             break;
745         case ID_CAPTION:
746             // Illegal content in a caption. Close the caption and try again.
747             popBlock(ID_CAPTION);
748             switch (id) {
749             case ID_THEAD:
750             case ID_TFOOT:
751             case ID_TBODY:
752             case ID_TR:
753             case ID_TD:
754             case ID_TH:
755                 return insertNode(n, flat);
756             }
757             break;
758         case ID_TABLE:
759         case ID_THEAD:
760         case ID_TFOOT:
761         case ID_TBODY:
762         case ID_TR:
763             switch (id) {
764             case ID_TABLE:
765                 popBlock(ID_TABLE); // end the table
766                 handled = checkChild(current->id(), id, doc()->inStrictMode());
767                 break;
768             default: {
769                 NodeImpl *node = current;
770                 NodeImpl *parent = node->parentNode();
771                 // A script may have removed the current node's parent from the DOM
772                 // http://bugzilla.opendarwin.org/show_bug.cgi?id=7137
773                 // FIXME: we should do real recovery here and re-parent with the correct node.
774                 if (!parent) {
775                     return false;
776                 }
777                 NodeImpl *parentparent = parent->parentNode();
778 
779                 if (n->isTextNode() ||
780                         (node->id() == ID_TR &&
781                          (parent->id() == ID_THEAD ||
782                           parent->id() == ID_TBODY ||
783                           parent->id() == ID_TFOOT) && parentparent->id() == ID_TABLE) ||
784                         (!checkChild(ID_TR, id) && (node->id() == ID_THEAD || node->id() == ID_TBODY || node->id() == ID_TFOOT) &&
785                          parent->id() == ID_TABLE)) {
786                     node = (node->id() == ID_TABLE) ? node :
787                            ((node->id() == ID_TR) ? parentparent : parent);
788                     NodeImpl *parent = node->parentNode();
789                     if (!parent) {
790                         return false;
791                     }
792                     int exceptioncode = 0;
793 #ifdef PARSER_DEBUG
794                     qCDebug(KHTML_LOG) << "calling insertBefore(" << n->nodeName().string() << "," << node->nodeName().string() << ")";
795 #endif
796                     parent->insertBefore(n, node, exceptioncode);
797                     if (exceptioncode) {
798 #ifndef PARSER_DEBUG
799                         if (!n->isTextNode())
800 #endif
801                             // qCDebug(KHTML_LOG) << "adding content before table failed..";
802                             break;
803                     }
804                     if (n->isElementNode() && tagPriority(id) != 0 &&
805                             !flat && endTagRequirement(id) != DOM::FORBIDDEN) {
806 
807                         pushBlock(id, tagPriority(id));
808                         setCurrent(n);
809                         inStrayTableContent++;
810                         blockStack->strayTableContent = true;
811                     }
812                     return true;
813                 }
814 
815                 if (current->id() == ID_TR) {
816                     e = new HTMLTableCellElementImpl(document, ID_TD);
817                 } else if (current->id() == ID_TABLE) {
818                     e = new HTMLTableSectionElementImpl(document, ID_TBODY, true /* implicit */);
819                 } else {
820                     e = new HTMLTableRowElementImpl(document);
821                 }
822 
823                 insertNode(e);
824                 handled = true;
825                 break;
826             } // end default
827             } // end switch
828             break;
829         case ID_OBJECT:
830             discard_until = id + ID_CLOSE_TAG;
831             return false;
832         case ID_UL:
833         case ID_OL:
834         case ID_DIR:
835         case ID_MENU:
836             e = new HTMLLIElementImpl(document);
837             e->addCSSProperty(CSS_PROP_LIST_STYLE_TYPE, CSS_VAL_NONE);
838             insertNode(e);
839             handled = true;
840             break;
841         case ID_FORM:
842             popBlock(ID_FORM);
843             handled = true;
844             break;
845         case ID_SELECT:
846             if (n->isInline()) {
847                 return false;
848             }
849             break;
850         case ID_P:
851         case ID_H1:
852         case ID_H2:
853         case ID_H3:
854         case ID_H4:
855         case ID_H5:
856         case ID_H6:
857             if (!n->isInline()) {
858                 popBlock(current->id());
859                 handled = true;
860             }
861             break;
862         case ID_OPTION:
863         case ID_OPTGROUP:
864             if (id == ID_OPTGROUP) {
865                 popBlock(current->id());
866                 handled = true;
867             } else if (id == ID_SELECT) {
868                 // IE treats a nested select as </select>. Let's do the same
869                 popBlock(ID_SELECT);
870                 break;
871             }
872             break;
873         // head elements in the body should be ignored.
874 
875         case ID_ADDRESS:
876         case ID_COLGROUP:
877         case ID_FONT:
878             popBlock(current->id());
879             handled = true;
880             break;
881         default:
882             if (current->isDocumentNode()) {
883                 DocumentImpl *doc = static_cast<DocumentImpl *>(current);
884                 if (!doc->documentElement()) {
885                     e = new HTMLHtmlElementImpl(document);
886                     insertNode(e);
887                     handled = true;
888                 }
889             } else if (current->isInline()) {
890                 popInlineBlocks();
891                 handled = true;
892             }
893         }
894 
895         // if we couldn't handle the error, just rethrow the exception...
896         if (!handled) {
897             //qCDebug(KHTML_LOG) << "Exception handler failed in HTMLPArser::insertNode()";
898             return false;
899         }
900 
901         return insertNode(n);
902     }
903 }
904 
// Creates the DOM node that corresponds to the token |t| (an element start tag,
// a text run or a comment), dispatching on |t->tid|.
//
// Returns the newly created node, or nullptr when no node is to be inserted for
// this token.  nullptr is returned for:
//  - <head> when a head already exists or |current| is neither <html> nor the document,
//  - <body> once a frameset has been seen,
//  - a further <title> after haveTitle was set (its content is then discarded),
//  - <frameset> when body/content/frameset was already seen and |current| is <html>,
//  - a nested <form>,
//  - <input type=image> / <img> whose URL matches the ad filter,
//  - <isindex> outside the body (the node is stashed in |isindex| instead),
//  - <noembed>/<noframes>/<noscript> when |t->flat| is set, and always for <nolayer>.
//
// Besides creating the node, several cases update parser state: they close open
// blocks (popBlock / popOptionalBlock), set flags such as haveBody, haveFrameSet,
// inSelect and inBody, remember the current |form|, |map| and |head|, or set
// |discard_until| so that tokens up to the matching close tag are dropped.
NodeImpl *KHTMLParser::getElement(Token *t)
{
    NodeImpl *n = nullptr;

    switch (t->tid) {
    case ID_HTML:
        n = new HTMLHtmlElementImpl(document);
        break;
    case ID_HEAD:
        // only one head is created, and only directly below <html> or the document
        if (!head && (current->id() == ID_HTML || current->isDocumentNode())) {
            head = new HTMLHeadElementImpl(document);
            n = head.get();
        }
        break;
    case ID_BODY:
        // body no longer allowed if we have a frameset
        if (haveFrameSet) {
            break;
        }
        popBlock(ID_HEAD);
        n = new HTMLBodyElementImpl(document);
        haveBody =  true;
        startBody();
        break;

// head elements
    case ID_BASE:
        n = new HTMLBaseElementImpl(document);
        break;
    case ID_LINK:
        n = new HTMLLinkElementImpl(document);
        break;
    case ID_META:
        n = new HTMLMetaElementImpl(document);
        break;
    case ID_STYLE:
        n = new HTMLStyleElementImpl(document);
        break;
    case ID_TITLE:
        // only one non-empty <title> allowed
        if (haveTitle) {
            discard_until = ID_TITLE + ID_CLOSE_TAG;
            break;
        }
        n = new HTMLTitleElementImpl(document);
        // we'll set haveTitle when closing the tag
        break;

// frames
    case ID_FRAME:
        n = new HTMLFrameElementImpl(document);
        break;
    case ID_FRAMESET:
        popBlock(ID_HEAD);
        // A body that was opened but has neither content nor an explicit <body> tag
        // is closed and hidden so that the frameset can take its place.
        if (inBody && !haveFrameSet && !haveContent && !haveBody) {
            popBlock(ID_BODY);
            // ### actually for IE document.body returns the now hidden "body" element
            // we can't implement that behavior now because it could cause too many
            // regressions and the headaches are not worth the work as long as there is
            // no site actually relying on that detail (Dirk)
            if (static_cast<HTMLDocumentImpl *>(document)->body())
                static_cast<HTMLDocumentImpl *>(document)->body()
                ->addCSSProperty(CSS_PROP_DISPLAY, CSS_VAL_NONE);
            inBody = false;
        }
        // no (further) top-level frameset once body, content or a frameset was seen
        if ((haveBody || haveContent || haveFrameSet) && current->id() == ID_HTML) {
            break;
        }
        n = new HTMLFrameSetElementImpl(document);
        haveFrameSet = true;
        startBody();
        break;
    // a bit of a special case, since the frame is inlined...
    case ID_IFRAME:
        n = new HTMLIFrameElementImpl(document);
        break;

// form elements
    case ID_FORM:
        // thou shall not nest <form> - NS/IE quirk
        if (form) {
            break;
        }
        n = form = new HTMLFormElementImpl(document, false);
        break;
    case ID_BUTTON:
        n = new HTMLButtonElementImpl(document, form);
        break;
    case ID_FIELDSET:
        n = new HTMLFieldSetElementImpl(document, form);
        break;
    case ID_INPUT:
        // With ad filtering and ad hiding both enabled, an <input type="image"> whose
        // completed src URL is filtered produces no node at all.
        if (t->attrs &&
                KHTMLGlobal::defaultHTMLSettings()->isAdFilterEnabled() &&
                KHTMLGlobal::defaultHTMLSettings()->isHideAdsEnabled() &&
                !strcasecmp(t->attrs->getValue(ATTR_TYPE), "image")) {
            const QString url = doc()->completeURL(DOMString(t->attrs->getValue(ATTR_SRC)).trimSpaces().string());
            if (KHTMLGlobal::defaultHTMLSettings()->isAdFiltered(url)) {
                return nullptr;
            }
        }
        n = new HTMLInputElementImpl(document, form);
        break;
    case ID_ISINDEX:
        n = handleIsindex(t);
        if (!inBody) {
            // not in the body yet: keep the node in |isindex| and insert nothing now
            isindex = n;
            n = nullptr;
        } else {
            t->flat = true;
        }
        break;
    case ID_KEYGEN:
        n = new HTMLKeygenElementImpl(document, form);
        break;
    case ID_LABEL:
        n = new HTMLLabelElementImpl(document);
        break;
    case ID_LEGEND:
        n = new HTMLLegendElementImpl(document, form);
        break;
    case ID_OPTGROUP:
        n = new HTMLOptGroupElementImpl(document, form);
        break;
    case ID_OPTION:
        // a new <option> implicitly closes a still open one
        popOptionalBlock(ID_OPTION);
        n = new HTMLOptionElementImpl(document, form);
        break;
    case ID_SELECT:
        inSelect = true;
        n = new HTMLSelectElementImpl(document, form);
        break;
    case ID_TEXTAREA:
        n = new HTMLTextAreaElementImpl(document, form);
        break;

// lists
    case ID_DL:
        n = new HTMLDListElementImpl(document);
        break;
    case ID_DD:
        // <dd> implicitly closes an open <dt> or <dd>
        popOptionalBlock(ID_DT);
        popOptionalBlock(ID_DD);
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
    case ID_DT:
        // <dt> implicitly closes an open <dd> or <dt>
        popOptionalBlock(ID_DD);
        popOptionalBlock(ID_DT);
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
    case ID_UL: {
        n = new HTMLUListElementImpl(document);
        break;
    }
    case ID_OL: {
        n = new HTMLOListElementImpl(document);
        break;
    }
    case ID_DIR:
        n = new HTMLDirectoryElementImpl(document);
        break;
    case ID_MENU:
        n = new HTMLMenuElementImpl(document);
        break;
    case ID_LI:
        // <li> implicitly closes an open <li>
        popOptionalBlock(ID_LI);
        n = new HTMLLIElementImpl(document);
        break;
// formatting elements (block)
    case ID_BLOCKQUOTE:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
    case ID_LAYER:
    case ID_ILAYER:
        n = new HTMLLayerElementImpl(document, t->tid);
        break;
    case ID_P:
    case ID_DIV:
        n = new HTMLDivElementImpl(document, t->tid);
        break;
    case ID_H1:
    case ID_H2:
    case ID_H3:
    case ID_H4:
    case ID_H5:
    case ID_H6:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
    case ID_HR:
        n = new HTMLHRElementImpl(document);
        break;
    case ID_PRE:
    case ID_XMP:
    case ID_PLAINTEXT:
    case ID_LISTING:
        n = new HTMLPreElementImpl(document, t->tid);
        break;

// font stuff
    case ID_BASEFONT:
        n = new HTMLBaseFontElementImpl(document);
        break;
    case ID_FONT:
        n = new HTMLFontElementImpl(document);
        break;

// ins/del
    case ID_DEL:
    case ID_INS:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;

// anchor
    case ID_A:
        // anchors do not nest: close an anchor that is still open first
        popBlock(ID_A);

        n = new HTMLAnchorElementImpl(document);
        break;

// images
    case ID_IMAGE:
    case ID_IMG:
        // With ad filtering and ad hiding both enabled, an image whose completed
        // src URL is filtered produces no node at all.
        if (t->attrs &&
                KHTMLGlobal::defaultHTMLSettings()->isAdFilterEnabled() &&
                KHTMLGlobal::defaultHTMLSettings()->isHideAdsEnabled()) {
            const QString url = doc()->completeURL(DOMString(t->attrs->getValue(ATTR_SRC)).trimSpaces().string());
            if (KHTMLGlobal::defaultHTMLSettings()->isAdFiltered(url)) {
                return nullptr;
            }
        }
        n = new HTMLImageElementImpl(document, form);
        break;

    case ID_CANVAS:
        n = new HTMLCanvasElementImpl(document);
        break;

    case ID_MAP:
        // remembered in |map|; processCloseTag() resets it on </map>
        map = new HTMLMapElementImpl(document);
        n = map;
        break;
    case ID_AREA:
        n = new HTMLAreaElementImpl(document);
        break;

// objects, applets and scripts
    case ID_APPLET:
        n = new HTMLAppletElementImpl(document);
        break;
    case ID_EMBED:
        n = new HTMLEmbedElementImpl(document);
        break;
    case ID_OBJECT:
        n = new HTMLObjectElementImpl(document);
        break;
    case ID_PARAM:
        n = new HTMLParamElementImpl(document);
        break;
    case ID_SCRIPT: {
        HTMLScriptElementImpl *scriptElement = new HTMLScriptElementImpl(document);
        scriptElement->setCreatedByParser(true);
        n = scriptElement;
        break;
    }

// media
    case ID_AUDIO:
        n = new HTMLAudioElement(document);
        break;
    case ID_VIDEO:
        n = new HTMLVideoElement(document);
        break;
    case ID_SOURCE:
        n = new HTMLSourceElement(document);
        break;

// tables
    case ID_TABLE:
        n = new HTMLTableElementImpl(document);
        break;
    case ID_CAPTION:
        n = new HTMLTableCaptionElementImpl(document);
        break;
    case ID_COLGROUP:
    case ID_COL:
        n = new HTMLTableColElementImpl(document, t->tid);
        break;
    case ID_TR:
        // a new row closes a row that is still open
        popBlock(ID_TR);
        n = new HTMLTableRowElementImpl(document);
        break;
    case ID_TD:
    case ID_TH:
        // a new cell closes a cell (of either kind) that is still open
        popBlock(ID_TH);
        popBlock(ID_TD);
        n = new HTMLTableCellElementImpl(document, t->tid);
        break;
    case ID_TBODY:
    case ID_THEAD:
    case ID_TFOOT:
        // a new table section closes a section that is still open
        popBlock(ID_THEAD);
        popBlock(ID_TBODY);
        popBlock(ID_TFOOT);
        n = new HTMLTableSectionElementImpl(document, t->tid, false);
        break;

// inline elements
    case ID_BR:
        n = new HTMLBRElementImpl(document);
        break;
    case ID_Q:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;

// elements with no special representation in the DOM

// block:
    case ID_ADDRESS:
    case ID_CENTER:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
// inline
    // %fontstyle
    case ID_TT:
    case ID_U:
    case ID_B:
    case ID_I:
    case ID_S:
    case ID_STRIKE:
    case ID_BIG:
    case ID_SMALL:

    // %phrase
    case ID_EM:
    case ID_STRONG:
    case ID_DFN:
    case ID_CODE:
    case ID_SAMP:
    case ID_KBD:
    case ID_VAR:
    case ID_CITE:
    case ID_ABBR:
    case ID_ACRONYM:

    // %special
    case ID_SUB:
    case ID_SUP:
    case ID_SPAN:
    case ID_WBR:
    case ID_NOBR:
        // <nobr> and <wbr> first close an open element of the same kind
        if (t->tid == ID_NOBR || t->tid == ID_WBR) {
            popOptionalBlock(t->tid);
        }
        // fall through
    case ID_BDO:
        n = new HTMLGenericElementImpl(document, t->tid);
        break;

    // these are special, and normally not rendered
    case ID_NOEMBED:
        if (!t->flat) {
            n = new HTMLGenericElementImpl(document, t->tid);
            discard_until = ID_NOEMBED + ID_CLOSE_TAG;
        }
        return n;
    case ID_NOFRAMES:
        if (!t->flat) {
            n = new HTMLGenericElementImpl(document, t->tid);
            discard_until = ID_NOFRAMES + ID_CLOSE_TAG;
        }
        return n;
    case ID_NOSCRIPT:
        if (!t->flat) {
            n = new HTMLGenericElementImpl(document, t->tid);
            // the content is only discarded when a widget exists and scripting is enabled
            if (HTMLWidget && HTMLWidget->part()->jScriptEnabled()) {
                discard_until = ID_NOSCRIPT + ID_CLOSE_TAG;
            }
        }
        return n;
    case ID_NOLAYER:
//        discard_until = ID_NOLAYER + ID_CLOSE_TAG;
        return nullptr;
        break;
    case ID_MARQUEE:
        n = new HTMLMarqueeElementImpl(document);
        break;
// text
    case ID_TEXT:
//        qCDebug(KHTML_LOG) << "ID_TEXT: \"" << DOMString(t->text).string() << "\"";
        n = new TextImpl(document, t->text);
        break;
    case ID_COMMENT:
        n = new CommentImpl(document, t->text);
        break;
    default:
        // any other tag id gets a generic element
        n = new HTMLGenericElementImpl(document, t->tid);
        break;
//         qCDebug(KHTML_LOG) << "Unknown tag " << t->tid << "!";
    }
    return n;
}
1305 
processCloseTag(Token * t)1306 void KHTMLParser::processCloseTag(Token *t)
1307 {
1308     // FIXME: the below only behaves according to "in body" insertion mode (HTML5 8.2.5.10)
1309     //    - might need fixing when we have other insertion modes.
1310     switch (t->tid) {
1311     case ID_HTML+ID_CLOSE_TAG:
1312     case ID_BODY+ID_CLOSE_TAG:
1313         // we never trust those close tags, since stupid webpages close
1314         // them prematurely
1315         return;
1316     case ID_FORM+ID_CLOSE_TAG: // needs additional error checking. See spec.
1317         form = nullptr;
1318         if (!isElementInScope(ID_FORM)) {
1319             // Parse error. Ignore.
1320             return;
1321         }
1322         // this one is to get the right style on the body element
1323         break;
1324     case ID_MAP+ID_CLOSE_TAG:
1325         map = nullptr;
1326         break;
1327     case ID_SELECT+ID_CLOSE_TAG:
1328         inSelect = false;
1329         break;
1330     case ID_TITLE+ID_CLOSE_TAG:
1331         // Set haveTitle only if <title> isn't empty
1332         if (current->firstChild()) {
1333             haveTitle = true;
1334         }
1335         break;
1336     case ID_P+ID_CLOSE_TAG:
1337         if (!isElementInScope(ID_P)) {
1338             // Parse error. Handle as if <p> had been seen.
1339             t->tid = ID_P;
1340             parseToken(t);
1341             popBlock(ID_P);
1342             return;
1343         }
1344         break;
1345     case ID_ADDRESS+ID_CLOSE_TAG:
1346 //    case ID_ARTICLE+ID_CLOSE_TAG:
1347     case ID_BLOCKQUOTE+ID_CLOSE_TAG:
1348     case ID_CENTER+ID_CLOSE_TAG:
1349 //    case ID_DATAGRID+ID_CLOSE_TAG:
1350 //    case ID_DETAILS+ID_CLOSE_TAG:
1351 //    case ID_DIALOG+ID_CLOSE_TAG:
1352     case ID_DIR+ID_CLOSE_TAG:
1353     case ID_DIV+ID_CLOSE_TAG:
1354     case ID_DL+ID_CLOSE_TAG:
1355     case ID_FIELDSET+ID_CLOSE_TAG:
1356 //    case ID_FIGURE+ID_CLOSE_TAG:
1357 //    case ID_FOOTER+ID_CLOSE_TAG:
1358 //    case ID_HEADER+ID_CLOSE_TAG:
1359     case ID_LISTING+ID_CLOSE_TAG:
1360     case ID_MENU+ID_CLOSE_TAG:
1361 //    case ID_NAV+ID_CLOSE_TAG:
1362     case ID_OL+ID_CLOSE_TAG:
1363     case ID_PRE+ID_CLOSE_TAG:
1364 //    case ID_SECTION+ID_CLOSE_TAG:
1365     case ID_UL+ID_CLOSE_TAG:
1366 
1367     case ID_DD+ID_CLOSE_TAG:
1368     case ID_DT+ID_CLOSE_TAG:
1369     case ID_LI+ID_CLOSE_TAG:
1370 
1371     case ID_APPLET+ID_CLOSE_TAG: // those four should also "Clear the list of active formatting elements
1372     case ID_BUTTON+ID_CLOSE_TAG: // up to the last marker." whenever we implement adoption agency.
1373     case ID_MARQUEE+ID_CLOSE_TAG:
1374     case ID_OBJECT+ID_CLOSE_TAG:
1375 
1376     case ID_HEAD+ID_CLOSE_TAG: // ### according to HTML5, should be treated as 'Any other end tag'
1377         //     We'll do that when proper 'Any other end tag' handling is implemented.
1378         //     In the meantime, test scoping at least (#170694)
1379 
1380         if (!isElementInScope(t->tid - ID_CLOSE_TAG)) {
1381             // Parse error. Ignore token.
1382             return;
1383         }
1384         break;
1385     case ID_H1:
1386     case ID_H2:
1387     case ID_H3:
1388     case ID_H4:
1389     case ID_H5:
1390     case ID_H6:
1391         if (!isHeadingInScope()) {
1392             // Parse error. Ignore token.
1393             return;
1394         }
1395         break;
1396     case ID_A: // Formatting elements - will need special handling - cf. HTML5 "adoption agency algorithm"
1397     case ID_B: //                       meant to replace the "residual style" handling we have now.
1398     case ID_BIG:
1399     case ID_CODE:
1400     case ID_EM:
1401     case ID_FONT:
1402     case ID_I:
1403     case ID_NOBR:
1404     case ID_S:
1405     case ID_SMALL:
1406     case ID_STRIKE:
1407     case ID_STRONG:
1408     case ID_TT:
1409     case ID_U:
1410         break;
1411 
1412     default:
1413 //      otherTag = true; // FIXME: implement 'Any other end tag' handling
1414         break;
1415     }
1416 
1417 #ifdef PARSER_DEBUG
1418     qCDebug(KHTML_LOG) << "added the following children to " << current->nodeName().string();
1419     NodeImpl *child = current->firstChild();
1420     while (child != 0) {
1421         qCDebug(KHTML_LOG) << "    " << child->nodeName().string();
1422         child = child->nextSibling();
1423     }
1424 #endif
1425 
1426     generateImpliedEndTags(t->tid - ID_CLOSE_TAG);
1427     popBlock(t->tid - ID_CLOSE_TAG);
1428 
1429 #ifdef PARSER_DEBUG
1430     qCDebug(KHTML_LOG) << "closeTag --> current = " << current->nodeName().string();
1431 #endif
1432 }
1433 
isResidualStyleTag(int _id)1434 bool KHTMLParser::isResidualStyleTag(int _id)
1435 {
1436     switch (_id) {
1437     case ID_A:
1438     case ID_B:
1439     case ID_BIG:
1440     case ID_EM:
1441     case ID_FONT:
1442     case ID_I:
1443     case ID_NOBR:
1444     case ID_S:
1445     case ID_SMALL:
1446     case ID_STRIKE:
1447     case ID_STRONG:
1448     case ID_TT:
1449     case ID_U:
1450     case ID_DFN:
1451     case ID_CODE:
1452     case ID_SAMP:
1453     case ID_KBD:
1454     case ID_VAR:
1455     case ID_DEL:
1456     case ID_INS:
1457         return true;
1458     default:
1459         return false;
1460     }
1461 }
1462 
isAffectedByResidualStyle(int _id)1463 bool KHTMLParser::isAffectedByResidualStyle(int _id)
1464 {
1465     if (isResidualStyleTag(_id)) {
1466         return true;
1467     }
1468 
1469     switch (_id) {
1470     case ID_P:
1471     case ID_DIV:
1472     case ID_BLOCKQUOTE:
1473     case ID_ADDRESS:
1474     case ID_H1:
1475     case ID_H2:
1476     case ID_H3:
1477     case ID_H4:
1478     case ID_H5:
1479     case ID_H6:
1480     case ID_CENTER:
1481     case ID_UL:
1482     case ID_OL:
1483     case ID_LI:
1484     case ID_DL:
1485     case ID_DT:
1486     case ID_DD:
1487     case ID_PRE:
1488     case ID_LISTING:
1489         return true;
1490     default:
1491         return false;
1492     }
1493 }
1494 
// Fixes up the DOM and the block stack when the residual style tag described by the
// stack entry |elem| is closed while block-level content opened after it is still open,
// e.g. <b>...<p>Foo</b>Goo</p> is turned into <b>...</b><p><b>Foo</b>Goo</p>.
//
// The function returns without changing anything when
//  - an entry above |elem| with a higher level is not affected by residual style
//    (see isAffectedByResidualStyle()),
//  - |elem| is not found on the block stack, or no entry above it has a higher level,
//  - the DOM does not allow the block to be reparented (childAllowed() fails).
// Otherwise |elem| is unlinked from the block stack and deleted here (step 6), and the
// residual style tags that were still open above the block are reopened at the end
// via reopenResidualStyleTags().
void KHTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem *elem)
{
    // Find the element that crosses over to a higher level.
    // ### For now, if there is more than one, we will only make sure we close the residual style.
    int exceptionCode = 0;
    HTMLStackElem *curr = blockStack;
    HTMLStackElem *maxElem = nullptr;      // entry closest to |elem| whose level exceeds elem->level
    HTMLStackElem *endElem = nullptr;      // first (topmost) such entry; step 7 pops down to it
    HTMLStackElem *prev = nullptr;         // entry visited just before |curr|
    HTMLStackElem *prevMaxElem = nullptr;  // entry directly above |maxElem|, if any
    bool advancedResidual = false; // ### if set we only close the residual style
    while (curr && curr != elem) {
        if (curr->level > elem->level) {
            if (!isAffectedByResidualStyle(curr->id)) {
                return;
            }
            if (maxElem) {
                // more than one entry crosses over
                advancedResidual = true;
            } else {
                endElem = curr;
            }
            maxElem = curr;
            prevMaxElem = prev;
        }

        prev = curr;
        curr = curr->next;
    }

    if (!curr || !maxElem) {
        return;
    }

    // |prev| is the entry directly above |elem| here (non-null, since the loop ran at
    // least once when |maxElem| is set).
    NodeImpl *residualElem = prev->node;
    NodeImpl *blockElem = prevMaxElem ? prevMaxElem->node : current;
    RefPtr<NodeImpl> parentElem = elem->node;

    // Check to see if the reparenting that is going to occur is allowed according to the DOM.
    // FIXME: We should either always allow it or perform an additional fixup instead of
    // just bailing here.
    // Example: <p><font><center>blah</font></center></p> isn't doing a fixup right now.
    if (!parentElem->childAllowed(blockElem)) {
        return;
    }

    if (maxElem->node->parentNode() != elem->node && !advancedResidual) {
        // Walk the stack and remove any elements that aren't residual style tags.  These
        // are basically just being closed up.  Example:
        // <font><span>Moo<p>Goo</font></p>.
        // In the above example, the <span> doesn't need to be reopened.  It can just close.
        HTMLStackElem *currElem = maxElem->next;
        HTMLStackElem *prevElem = maxElem;
        while (currElem != elem) {
            HTMLStackElem *nextElem = currElem->next;
            if (!isResidualStyleTag(currElem->id)) {
                // unlink the entry; the entry above it inherits its node
                prevElem->next = nextElem;
                prevElem->setNode(currElem->node);
                delete currElem;
            } else {
                prevElem = currElem;
            }
            currElem = nextElem;
        }

        // We have to reopen residual tags in between maxElem and elem.  An example of this case is:
        // <font><i>Moo<p>Foo</font>.
        // In this case, we need to transform the part before the <p> into:
        // <font><i>Moo</i></font><i>
        // so that the <i> will remain open.  This involves the modification of elements
        // in the block stack.
        // This will also affect how we ultimately reparent the block, since we want it to end up
        // under the reopened residual tags (e.g., the <i> in the above example.)
        RefPtr<NodeImpl> prevNode = nullptr;
        RefPtr<NodeImpl> currNode = nullptr;
        currElem = maxElem;
        while (currElem->node != residualElem) {
            if (isResidualStyleTag(currElem->node->id())) {
                // Create a clone of this element.
                currNode = currElem->node->cloneNode(false);
                currElem->node->close();
                removeForbidden(currElem->id, forbiddenTag);

                // Change the stack element's node to point to the clone.
                currElem->setNode(currNode.get());

                // Attach the previous node as a child of this new node.
                if (prevNode) {
                    currNode->appendChild(prevNode.get(), exceptionCode);
                } else { // The new parent for the block element is going to be the innermost clone.
                    parentElem = currNode;
                }

                prevNode = currNode;
            }

            currElem = currElem->next;
        }

        // Now append the chain of new residual style elements if one exists.
        if (prevNode) {
            elem->node->appendChild(prevNode.get(), exceptionCode);
        }
    }

    // We need to make a clone of |residualElem| and place it just inside |blockElem|.
    // All content of |blockElem| is reparented to be under this clone.  We then
    // reparent |blockElem| using real DOM calls so that attachment/detachment will
    // be performed to fix up the rendering tree.
    // So for this example: <b>...<p>Foo</b>Goo</p>
    // The end result will be: <b>...</b><p><b>Foo</b>Goo</p>
    //
    // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids.
    SharedPtr<NodeImpl> guard(blockElem); // keeps |blockElem| referenced while it has no parent
    blockElem->parentNode()->removeChild(blockElem, exceptionCode);

    if (!advancedResidual) {
        // Step 2: Clone |residualElem|.
        RefPtr<NodeImpl> newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids.

        // Step 3: Place |blockElem|'s children under |newNode|.  Remove all of the children of |blockElem|
        // before we've put |newNode| into the document.  That way we'll only do one attachment of all
        // the new content (instead of a bunch of individual attachments).
        NodeImpl *currNode = blockElem->firstChild();
        while (currNode) {
            NodeImpl *nextNode = currNode->nextSibling();
            SharedPtr<NodeImpl> guard(currNode); //Protect from deletion while moving
            blockElem->removeChild(currNode, exceptionCode);
            newNode->appendChild(currNode, exceptionCode);
            currNode = nextNode;

// TODO - To be replaced.
            // Re-register form elements with currently active form, step 1 will have removed them
            // NOTE(review): |currNode| has already been advanced to the next sibling at this
            // point, so the check applies to the child that is moved in the following
            // iteration and the first child is never re-registered - confirm whether
            // that is intended.
            if (form && currNode && currNode->isGenericFormElement()) {
                HTMLGenericFormElementImpl *e = static_cast<HTMLGenericFormElementImpl *>(currNode);
                form->registerFormElement(e);
            }
        }

        // Step 4: Place |newNode| under |blockElem|.  |blockElem| is still out of the document, so no
        // attachment can occur yet.
        blockElem->appendChild(newNode.get(), exceptionCode);
    }

    // Step 5: Reparent |blockElem|.  Now the full attachment of the fixed up tree takes place.
    parentElem->appendChild(blockElem, exceptionCode);

    // Step 6: Elide |elem|, since it is effectively no longer open.  Also update
    // the node associated with the previous stack element so that when it gets popped,
    // it doesn't make the residual element the next current node.
    HTMLStackElem *currElem = maxElem;
    HTMLStackElem *prevElem = nullptr;
    while (currElem != elem) {
        prevElem = currElem;
        currElem = currElem->next;
    }
    // |maxElem| differs from |elem|, so the loop above ran and |prevElem| is set.
    prevElem->next = elem->next;
    prevElem->setNode(elem->node);
    delete elem;

    // Step 7: Reopen intermediate inlines, e.g., <b><p><i>Foo</b>Goo</p>.
    // In the above example, Goo should stay italic.
    curr = blockStack;
    HTMLStackElem *residualStyleStack = nullptr;
    while (curr && curr != endElem) {
        // We will actually schedule this tag for reopening
        // after we complete the close of this entire block.
        NodeImpl *currNode = current;
        if (isResidualStyleTag(curr->id)) {
            // We've overloaded the use of stack elements and are just reusing the
            // struct with a slightly different meaning to the variables.  Instead of chaining
            // from innermost to outermost, we build up a list of all the tags we need to reopen
            // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
            // to the outermost tag we need to reopen.
            // We also set curr->node to be the actual element that corresponds to the ID stored in
            // curr->id rather than the node that you should pop to when the element gets pulled off
            // the stack.
            popOneBlock(false);
            curr->setNode(currNode);
            curr->next = residualStyleStack;
            residualStyleStack = curr;
        } else {
            popOneBlock();
        }

        curr = blockStack;
    }

    reopenResidualStyleTags(residualStyleStack, nullptr); // FIXME: Deal with stray table content some day
    // if it becomes necessary to do so.
}
1685 
reopenResidualStyleTags(HTMLStackElem * elem,DOM::NodeImpl * malformedTableParent)1686 void KHTMLParser::reopenResidualStyleTags(HTMLStackElem *elem, DOM::NodeImpl *malformedTableParent)
1687 {
1688     // Loop for each tag that needs to be reopened.
1689     while (elem) {
1690         // Create a shallow clone of the DOM node for this element.
1691         RefPtr<NodeImpl> newNode = elem->node->cloneNode(false);
1692 
1693         // Append the new node. In the malformed table case, we need to insert before the table,
1694         // which will be the last child.
1695         int exceptionCode = 0;
1696         if (malformedTableParent) {
1697             malformedTableParent->insertBefore(newNode.get(), malformedTableParent->lastChild(), exceptionCode);
1698         } else {
1699             current->appendChild(newNode.get(), exceptionCode);
1700         }
1701         // FIXME: Is it really OK to ignore the exceptions here?
1702 
1703         // Now push a new stack element for this node we just created.
1704         pushBlock(elem->id, elem->level);
1705 
1706         // Set our strayTableContent boolean if needed, so that the reopened tag also knows
1707         // that it is inside a malformed table.
1708         blockStack->strayTableContent = malformedTableParent != nullptr;
1709         if (blockStack->strayTableContent) {
1710             inStrayTableContent++;
1711         }
1712 
1713         // Clear our malformed table parent variable.
1714         malformedTableParent = nullptr;
1715 
1716         // Update |current| manually to point to the new node.
1717         setCurrent(newNode.get());
1718 
1719         // Advance to the next tag that needs to be reopened.
1720         HTMLStackElem *next = elem->next;
1721         delete elem;
1722         elem = next;
1723     }
1724 }
1725 
pushBlock(int _id,int _level)1726 void KHTMLParser::pushBlock(int _id, int _level)
1727 {
1728     HTMLStackElem *Elem = new HTMLStackElem(_id, _level, current, m_inline, blockStack);
1729 
1730     blockStack = Elem;
1731     addForbidden(_id, forbiddenTag);
1732 }
1733 
generateImpliedEndTags(int _id)1734 void KHTMLParser::generateImpliedEndTags(int _id)
1735 {
1736     HTMLStackElem *Elem = blockStack;
1737 
1738     int level = tagPriority(_id);
1739     while (Elem && Elem->id != _id) {
1740         HTMLStackElem *NextElem = Elem->next;
1741         if (endTagRequirement(Elem->id) == DOM::OPTIONAL && Elem->level <= level) {
1742             popOneBlock();
1743         } else {
1744             break;
1745         }
1746         Elem = NextElem;
1747     }
1748 }
1749 
popOptionalBlock(int _id)1750 void KHTMLParser::popOptionalBlock(int _id)
1751 {
1752     bool found = false;
1753     HTMLStackElem *Elem = blockStack;
1754 
1755     int level = tagPriority(_id);
1756     while (Elem) {
1757         if (Elem->id == _id) {
1758             found = true;
1759             break;
1760         }
1761         if (Elem->level > level || (endTagRequirement(Elem->id) != DOM::OPTIONAL && !isResidualStyleTag(Elem->id))) {
1762             break;
1763         }
1764         Elem = Elem->next;
1765     }
1766 
1767     if (found) {
1768         generateImpliedEndTags(_id);
1769         popBlock(_id);
1770     }
1771 }
1772 
isElementInScope(int _id)1773 bool KHTMLParser::isElementInScope(int _id)
1774 {
1775     // HTML5 8.2.3.2
1776     HTMLStackElem *Elem = blockStack;
1777     while (Elem && Elem->id != _id) {
1778         if (DOM::checkIsScopeBoundary(Elem->id)) {
1779             return false;
1780         }
1781         Elem = Elem->next;
1782     }
1783     return Elem;
1784 }
1785 
isHeadingInScope()1786 bool KHTMLParser::isHeadingInScope()
1787 {
1788     HTMLStackElem *Elem = blockStack;
1789     while (Elem && (Elem->id < ID_H1 || Elem->id > ID_H6)) {
1790         if (DOM::checkIsScopeBoundary(Elem->id)) {
1791             return false;
1792         }
1793         Elem = Elem->next;
1794     }
1795     return Elem;
1796 }
1797 
popBlock(int _id)1798 void KHTMLParser::popBlock(int _id)
1799 {
1800     HTMLStackElem *Elem = blockStack;
1801     int maxLevel = 0;
1802 
1803 #ifdef PARSER_DEBUG
1804     qCDebug(KHTML_LOG) << "popBlock(" << getParserPrintableName(_id) << ")";
1805     while (Elem) {
1806         qCDebug(KHTML_LOG) << "   > " << getParserPrintableName(Elem->id);
1807         Elem = Elem->next;
1808     }
1809     Elem = blockStack;
1810 #endif
1811 
1812     while (Elem && (Elem->id != _id)) {
1813         if (maxLevel < Elem->level) {
1814             maxLevel = Elem->level;
1815         }
1816         Elem = Elem->next;
1817     }
1818     if (!Elem) {
1819         return;
1820     }
1821 
1822     if (maxLevel > Elem->level) {
1823         // We didn't match because the tag is in a different scope, e.g.,
1824         // <b><p>Foo</b>.  Try to correct the problem.
1825         if (!isResidualStyleTag(_id)) {
1826             return;
1827         }
1828         return handleResidualStyleCloseTagAcrossBlocks(Elem);
1829     }
1830 
1831     bool isAffectedByStyle = isAffectedByResidualStyle(Elem->id);
1832     HTMLStackElem *residualStyleStack = nullptr;
1833     NodeImpl *malformedTableParent = nullptr;
1834 
1835     Elem = blockStack;
1836 
1837     while (Elem) {
1838         if (Elem->id == _id) {
1839             int strayTable = inStrayTableContent;
1840             popOneBlock();
1841             Elem = nullptr;
1842 
1843             // This element was the root of some malformed content just inside an implicit or
1844             // explicit <tbody> or <tr>.
1845             // If we end up needing to reopen residual style tags, the root of the reopened chain
1846             // must also know that it is the root of malformed content inside a <tbody>/<tr>.
1847             if (strayTable && (inStrayTableContent < strayTable) && residualStyleStack) {
1848                 NodeImpl *curr = current;
1849                 while (curr && curr->id() != ID_TABLE) {
1850                     curr = curr->parentNode();
1851                 }
1852                 malformedTableParent = curr ? curr->parentNode() : nullptr;
1853             }
1854         } else {
1855             // Schedule this tag for reopening
1856             // after we complete the close of this entire block.
1857             NodeImpl *currNode = current;
1858             if (isAffectedByStyle && isResidualStyleTag(Elem->id)) {
1859                 // We've overloaded the use of stack elements and are just reusing the
1860                 // struct with a slightly different meaning to the variables.  Instead of chaining
1861                 // from innermost to outermost, we build up a list of all the tags we need to reopen
1862                 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1863                 // to the outermost tag we need to reopen.
1864                 // We also set Elem->node to be the actual element that corresponds to the ID stored in
1865                 // Elem->id rather than the node that you should pop to when the element gets pulled off
1866                 // the stack.
1867                 popOneBlock(false);
1868                 Elem->next = residualStyleStack;
1869                 Elem->setNode(currNode);
1870                 residualStyleStack = Elem;
1871             } else {
1872                 popOneBlock();
1873             }
1874             Elem = blockStack;
1875         }
1876     }
1877 
1878     reopenResidualStyleTags(residualStyleStack, malformedTableParent);
1879 }
1880 
popOneBlock(bool delBlock)1881 void KHTMLParser::popOneBlock(bool delBlock)
1882 {
1883     HTMLStackElem *Elem = blockStack;
1884 
1885     // we should never get here, but some bad html might cause it.
1886 #ifndef PARSER_DEBUG
1887     if (!Elem) {
1888         return;
1889     }
1890 #else
1891     qCDebug(KHTML_LOG) << "popping block: " << getParserPrintableName(Elem->id) << "(" << Elem->id << ")";
1892 #endif
1893 
1894 #if SPEED_DEBUG < 1
1895     if ((Elem->node != current)) {
1896         if (current->maintainsState() && document) {
1897             document->registerMaintainsState(current);
1898             document->attemptRestoreState(current);
1899         }
1900         current->close();
1901     }
1902 #endif
1903 
1904     removeForbidden(Elem->id, forbiddenTag);
1905 
1906     blockStack = Elem->next;
1907     // we only set inline to false, if the element we close is a block level element.
1908     // This helps getting cases as <p><b>bla</b> <b>bla</b> right.
1909 
1910     m_inline = Elem->m_inline;
1911 
1912     if (current->id() == ID_FORM && form && inStrayTableContent) {
1913         form->setMalformed(true);
1914     }
1915 
1916     setCurrent(Elem->node);
1917 
1918     if (Elem->strayTableContent) {
1919         inStrayTableContent--;
1920     }
1921 
1922     if (delBlock) {
1923         delete Elem;
1924     }
1925 }
1926 
popInlineBlocks()1927 void KHTMLParser::popInlineBlocks()
1928 {
1929     while (blockStack && current->isInline() && current->id() != ID_FONT) {
1930         popOneBlock();
1931     }
1932 }
1933 
freeBlock()1934 void KHTMLParser::freeBlock()
1935 {
1936     while (blockStack) {
1937         popOneBlock();
1938     }
1939     blockStack = nullptr;
1940 }
1941 
createHead()1942 void KHTMLParser::createHead()
1943 {
1944     if (head || !doc()->documentElement()) {
1945         return;
1946     }
1947 
1948     head = new HTMLHeadElementImpl(document);
1949     HTMLElementImpl *body = doc()->body();
1950     int exceptioncode = 0;
1951     doc()->documentElement()->insertBefore(head.get(), body, exceptioncode);
1952     if (exceptioncode) {
1953 #ifdef PARSER_DEBUG
1954         qCDebug(KHTML_LOG) << "creation of head failed!!!!:" << exceptioncode;
1955 #endif
1956         delete head.get();
1957         head = nullptr;
1958     }
1959 
1960     // If the body does not exist yet, then the <head> should be pushed as the current block.
1961     if (head && !body) {
1962         pushBlock(head->id(), tagPriority(head->id()));
1963         setCurrent(head.get());
1964     }
1965 }
1966 
handleIsindex(Token * t)1967 NodeImpl *KHTMLParser::handleIsindex(Token *t)
1968 {
1969     NodeImpl *n;
1970     HTMLFormElementImpl *myform = form;
1971     if (!myform) {
1972         myform = new HTMLFormElementImpl(document, true);
1973         n = myform;
1974     } else {
1975         n = new HTMLDivElementImpl(document, ID_DIV);
1976     }
1977     NodeImpl *child = new HTMLHRElementImpl(document);
1978     n->addChild(child);
1979     DOMStringImpl *a = t->attrs ? t->attrs->getValue(ATTR_PROMPT) : nullptr;
1980     DOMString text = i18n("This is a searchable index. Enter search keywords: ");
1981     if (a) {
1982         text = a;
1983     }
1984     child = new TextImpl(document, text.implementation());
1985     n->addChild(child);
1986     child = new HTMLIsIndexElementImpl(document, myform);
1987     static_cast<ElementImpl *>(child)->setAttribute(ATTR_TYPE, "khtml_isindex");
1988     n->addChild(child);
1989     child = new HTMLHRElementImpl(document);
1990     n->addChild(child);
1991 
1992     return n;
1993 }
1994 
startBody()1995 void KHTMLParser::startBody()
1996 {
1997     if (inBody) {
1998         return;
1999     }
2000 
2001     inBody = true;
2002 
2003     if (isindex) {
2004         insertNode(isindex, true /* don't decend into this node */);
2005         isindex = nullptr;
2006     }
2007 }
2008