1 /* parser.c -- HTML Parser
2 
3   (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4   See tidyp.h for the copyright notice.
5 
6 */
7 
8 #include "tidy-int.h"
9 #include "lexer.h"
10 #include "parser.h"
11 #include "message.h"
12 #include "clean.h"
13 #include "tags.h"
14 #include "tmbstr.h"
15 
16 #ifdef AUTO_INPUT_ENCODING
17 #include "charsets.h"
18 #endif
19 
TY_(CheckNodeIntegrity)20 Bool TY_(CheckNodeIntegrity)(Node *node)
21 {
22 #ifndef NO_NODE_INTEGRITY_CHECK
23     Node *child;
24 
25     if (node->prev)
26     {
27         if (node->prev->next != node)
28             return no;
29     }
30 
31     if (node->next)
32     {
33         if (node->next == node || node->next->prev != node)
34             return no;
35     }
36 
37     if (node->parent)
38     {
39         if (node->prev == NULL && node->parent->content != node)
40             return no;
41 
42         if (node->next == NULL && node->parent->last != node)
43             return no;
44     }
45 
46     for (child = node->content; child; child = child->next)
47         if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
48             return no;
49 
50 #endif
51     return yes;
52 }
53 
54 /*
55  used to determine how attributes
56  without values should be printed
57  this was introduced to deal with
58  user defined tags e.g. Cold Fusion
59 */
TY_(IsNewNode)60 Bool TY_(IsNewNode)(Node *node)
61 {
62     if (node && node->tag)
63     {
64         return (node->tag->model & CM_NEW);
65     }
66     return yes;
67 }
68 
TY_(CoerceNode)69 void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
70 {
71     const Dict* tag = TY_(LookupTagDef)(tid);
72     Node* tmp = TY_(InferredTag)(doc, tag->id);
73 
74     if (obsolete)
75         TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT);
76     else if (unexpected)
77         TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
78     else
79         TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT);
80 
81     TidyDocFree(doc, tmp->element);
82     TidyDocFree(doc, tmp);
83 
84     node->was = node->tag;
85     node->tag = tag;
86     node->type = StartTag;
87     node->implicit = yes;
88     TidyDocFree(doc, node->element);
89     node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
90 }
91 
92 /* extract a node and its children from a markup tree */
TY_(RemoveNode)93 Node *TY_(RemoveNode)(Node *node)
94 {
95     if (node->prev)
96         node->prev->next = node->next;
97 
98     if (node->next)
99         node->next->prev = node->prev;
100 
101     if (node->parent)
102     {
103         if (node->parent->content == node)
104             node->parent->content = node->next;
105 
106         if (node->parent->last == node)
107             node->parent->last = node->prev;
108     }
109 
110     node->parent = node->prev = node->next = NULL;
111     return node;
112 }
113 
114 /* remove node from markup tree and discard it */
TY_(DiscardElement)115 Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
116 {
117     Node *next = NULL;
118 
119     if (element)
120     {
121         next = element->next;
122         TY_(RemoveNode)(element);
123         TY_(FreeNode)( doc, element);
124     }
125 
126     return next;
127 }
128 
129 /*
130  insert "node" into markup tree as the firt element
131  of content of "element"
132 */
TY_(InsertNodeAtStart)133 void TY_(InsertNodeAtStart)(Node *element, Node *node)
134 {
135     node->parent = element;
136 
137     if (element->content == NULL)
138         element->last = node;
139     else
140         element->content->prev = node;
141 
142     node->next = element->content;
143     node->prev = NULL;
144     element->content = node;
145 }
146 
147 /*
148  insert "node" into markup tree as the last element
149  of content of "element"
150 */
TY_(InsertNodeAtEnd)151 void TY_(InsertNodeAtEnd)(Node *element, Node *node)
152 {
153     node->parent = element;
154     node->prev = element->last;
155 
156     if (element->last != NULL)
157         element->last->next = node;
158     else
159         element->content = node;
160 
161     element->last = node;
162 }
163 
164 /*
165  insert "node" into markup tree in place of "element"
166  which is moved to become the child of the node
167 */
InsertNodeAsParent(Node * element,Node * node)168 static void InsertNodeAsParent(Node *element, Node *node)
169 {
170     node->content = element;
171     node->last = element;
172     node->parent = element->parent;
173     element->parent = node;
174 
175     if (node->parent->content == element)
176         node->parent->content = node;
177 
178     if (node->parent->last == element)
179         node->parent->last = node;
180 
181     node->prev = element->prev;
182     element->prev = NULL;
183 
184     if (node->prev)
185         node->prev->next = node;
186 
187     node->next = element->next;
188     element->next = NULL;
189 
190     if (node->next)
191         node->next->prev = node;
192 }
193 
194 /* insert "node" into markup tree before "element" */
TY_(InsertNodeBeforeElement)195 void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
196 {
197     Node *parent;
198 
199     parent = element->parent;
200     node->parent = parent;
201     node->next = element;
202     node->prev = element->prev;
203     element->prev = node;
204 
205     if (node->prev)
206         node->prev->next = node;
207 
208     if (parent->content == element)
209         parent->content = node;
210 }
211 
212 /* insert "node" into markup tree after "element" */
TY_(InsertNodeAfterElement)213 void TY_(InsertNodeAfterElement)(Node *element, Node *node)
214 {
215     Node *parent;
216 
217     parent = element->parent;
218     node->parent = parent;
219 
220     /* AQ - 13 Jan 2000 fix for parent == NULL */
221     if (parent != NULL && parent->last == element)
222         parent->last = node;
223     else
224     {
225         node->next = element->next;
226         /* AQ - 13 Jan 2000 fix for node->next == NULL */
227         if (node->next != NULL)
228             node->next->prev = node;
229     }
230 
231     element->next = node;
232     node->prev = element;
233 }
234 
CanPrune(TidyDocImpl * doc,Node * element)235 static Bool CanPrune( TidyDocImpl* doc, Node *element )
236 {
237     if ( TY_(nodeIsText)(element) )
238         return yes;
239 
240     if ( element->content )
241         return no;
242 
243     if ( element->tag == NULL )
244         return no;
245 
246     if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
247         return no;
248 
249     if ( nodeIsA(element) && element->attributes != NULL )
250         return no;
251 
252     if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
253         return no;
254 
255     if ( element->tag->model & CM_ROW )
256         return no;
257 
258     if ( element->tag->model & CM_EMPTY )
259         return no;
260 
261     if ( nodeIsAPPLET(element) )
262         return no;
263 
264     if ( nodeIsOBJECT(element) )
265         return no;
266 
267     if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
268         return no;
269 
270     if ( nodeIsTITLE(element) )
271         return no;
272 
273     /* #433359 - fix by Randy Waki 12 Mar 01 */
274     if ( nodeIsIFRAME(element) )
275         return no;
276 
277     /* fix for bug 770297 */
278     if (nodeIsTEXTAREA(element))
279         return no;
280 
281     if ( attrGetID(element) || attrGetNAME(element) )
282         return no;
283 
284     /* fix for bug 695408; a better fix would look for unknown and    */
285     /* known proprietary attributes that make the element significant */
286     if (attrGetDATAFLD(element))
287         return no;
288 
289     /* fix for bug 723772, don't trim new-...-tags */
290     if (element->tag->id == TidyTag_UNKNOWN)
291         return no;
292 
293     if (nodeIsBODY(element))
294         return no;
295 
296     if (nodeIsCOLGROUP(element))
297         return no;
298 
299     return yes;
300 }
301 
302 /* return next element */
TY_(TrimEmptyElement)303 Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
304 {
305     if ( CanPrune(doc, element) )
306     {
307        if (element->type != TextNode)
308             TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
309 
310         return TY_(DiscardElement)(doc, element);
311     }
312     return element->next;
313 }
314 
TY_(DropEmptyElements)315 Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
316 {
317     Node* next;
318 
319     while (node)
320     {
321         next = node->next;
322 
323         if (node->content)
324             TY_(DropEmptyElements)(doc, node->content);
325 
326         if (!TY_(nodeIsElement)(node) &&
327             !(TY_(nodeIsText)(node) && !(node->start < node->end)))
328         {
329             node = next;
330             continue;
331         }
332 
333         next = TY_(TrimEmptyElement)(doc, node);
334         node = next;
335     }
336 
337     return node;
338 }
339 
340 /*
341   errors in positioning of form start or end tags
342   generally require human intervention to fix
343 */
BadForm(TidyDocImpl * doc)344 static void BadForm( TidyDocImpl* doc )
345 {
346     doc->badForm = yes;
347     /* doc->errors++; */
348 }
349 
350 /*
351   This maps
352        <em>hello </em><strong>world</strong>
353   to
354        <em>hello</em> <strong>world</strong>
355 
356   If last child of element is a text node
357   then trim trailing white space character
358   moving it to after element's end tag.
359 */
TrimTrailingSpace(TidyDocImpl * doc,Node * element,Node * last)360 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
361 {
362     Lexer* lexer = doc->lexer;
363     byte c;
364 
365     if (TY_(nodeIsText)(last))
366     {
367         if (last->end > last->start)
368         {
369             c = (byte) lexer->lexbuf[ last->end - 1 ];
370 
371             if (   c == ' '
372 #ifdef COMMENT_NBSP_FIX
373                 || c == 160
374 #endif
375                )
376             {
377 #ifdef COMMENT_NBSP_FIX
378                 /* take care with <td>&nbsp;</td> */
379                 if ( c == 160 &&
380                      ( element->tag == doc->tags.tag_td ||
381                        element->tag == doc->tags.tag_th )
382                    )
383                 {
384                     if (last->end > last->start + 1)
385                         last->end -= 1;
386                 }
387                 else
388 #endif
389                 {
390                     last->end -= 1;
391                     if ( (element->tag->model & CM_INLINE) &&
392                          !(element->tag->model & CM_FIELD) )
393                         lexer->insertspace = yes;
394                 }
395             }
396         }
397     }
398 }
399 
400 /* Only true for text nodes. */
TY_(IsBlank)401 Bool TY_(IsBlank)(Lexer *lexer, Node *node)
402 {
403     Bool isBlank = TY_(nodeIsText)(node);
404     if ( isBlank )
405         isBlank = ( node->end == node->start ||       /* Zero length */
406                     ( node->end == node->start+1      /* or one blank. */
407                       && lexer->lexbuf[node->start] == ' ' ) );
408     return isBlank;
409 }
410 
411 /*
412   This maps
413        <p>hello<em> world</em>
414   to
415        <p>hello <em>world</em>
416 
417   Trims initial space, by moving it before the
418   start tag, or if this element is the first in
419   parent's content, then by discarding the space
420 */
TrimInitialSpace(TidyDocImpl * doc,Node * element,Node * text)421 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
422 {
423     Lexer* lexer = doc->lexer;
424     Node *prev, *node;
425 
426     if ( TY_(nodeIsText)(text) &&
427          lexer->lexbuf[text->start] == ' ' &&
428          text->start < text->end )
429     {
430         if ( (element->tag->model & CM_INLINE) &&
431              !(element->tag->model & CM_FIELD) )
432         {
433             prev = element->prev;
434 
435             if (TY_(nodeIsText)(prev))
436             {
437                 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
438                     lexer->lexbuf[(prev->end)++] = ' ';
439 
440                 ++(element->start);
441             }
442             else /* create new node */
443             {
444                 node = TY_(NewNode)(lexer->allocator, lexer);
445                 node->start = (element->start)++;
446                 node->end = element->start;
447                 lexer->lexbuf[node->start] = ' ';
448                 TY_(InsertNodeBeforeElement)(element ,node);
449             }
450         }
451 
452         /* discard the space in current node */
453         ++(text->start);
454     }
455 }
456 
IsPreDescendant(Node * node)457 static Bool IsPreDescendant(Node* node)
458 {
459     Node *parent = node->parent;
460 
461     while (parent)
462     {
463         if (parent->tag && parent->tag->parser == TY_(ParsePre))
464             return yes;
465 
466         parent = parent->parent;
467     }
468 
469     return no;
470 }
471 
CleanTrailingWhitespace(TidyDocImpl * doc,Node * node)472 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
473 {
474     Node* next;
475 
476     if (!TY_(nodeIsText)(node))
477         return no;
478 
479     if (node->parent->type == DocTypeTag)
480         return no;
481 
482     if (IsPreDescendant(node))
483         return no;
484 
485     if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
486         return no;
487 
488     next = node->next;
489 
490     /* <p>... </p> */
491     if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
492         return yes;
493 
494     /* <div><small>... </small><h3>...</h3></div> */
495     if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
496         return yes;
497 
498     if (!next)
499         return no;
500 
501     if (nodeIsBR(next))
502         return yes;
503 
504     if (TY_(nodeHasCM)(next, CM_INLINE))
505         return no;
506 
507     /* <a href='/'>...</a> <p>...</p> */
508     if (next->type == StartTag)
509         return yes;
510 
511     /* <strong>...</strong> <hr /> */
512     if (next->type == StartEndTag)
513         return yes;
514 
515     /* evil adjacent text nodes, Tidy should not generate these :-( */
516     if (TY_(nodeIsText)(next) && next->start < next->end
517         && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
518         return yes;
519 
520     return no;
521 }
522 
CleanLeadingWhitespace(TidyDocImpl * ARG_UNUSED (doc),Node * node)523 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
524 {
525     if (!TY_(nodeIsText)(node))
526         return no;
527 
528     if (node->parent->type == DocTypeTag)
529         return no;
530 
531     if (IsPreDescendant(node))
532         return no;
533 
534     if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
535         return no;
536 
537     /* <p>...<br> <em>...</em>...</p> */
538     if (nodeIsBR(node->prev))
539         return yes;
540 
541     /* <p> ...</p> */
542     if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
543         return yes;
544 
545     /* <h4>...</h4> <em>...</em> */
546     if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
547         TY_(nodeIsElement)(node->prev))
548         return yes;
549 
550     /* <p><span> ...</span></p> */
551     if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
552         return yes;
553 
554     return no;
555 }
556 
/*
 Walk "node" and its following siblings.  For every text node,
 strip leading and trailing white space where
 CleanLeadingWhitespace() / CleanTrailingWhitespace() allow it,
 and remove and free the node when nothing is left of it.
 Content of the remaining nodes is processed recursively.
*/
static void CleanSpaces(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following )
    {
        /* fetched up front, the node may be freed below */
        following = node->next;

        if (TY_(nodeIsText)(node))
        {
            if (CleanLeadingWhitespace(doc, node))
            {
                while (node->start < node->end &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
                    ++(node->start);
            }

            if (CleanTrailingWhitespace(doc, node))
            {
                while (node->end > node->start &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
                    --(node->end);
            }

            /* drop text nodes that have become empty */
            if (node->start >= node->end)
            {
                TY_(RemoveNode)(node);
                TY_(FreeNode)(doc, node);
                continue;
            }
        }

        if (node->content)
            CleanSpaces(doc, node->content);
    }
}
588 
589 /*
590   Move initial and trailing space out.
591   This routine maps:
592 
593        hello<em> world</em>
594   to
595        hello <em>world</em>
596   and
597        <em>hello </em><strong>world</strong>
598   to
599        <em>hello</em> <strong>world</strong>
600 */
TrimSpaces(TidyDocImpl * doc,Node * element)601 static void TrimSpaces( TidyDocImpl* doc, Node *element)
602 {
603     Node* text = element->content;
604 
605     if (nodeIsPRE(element) || IsPreDescendant(element))
606         return;
607 
608     if (TY_(nodeIsText)(text))
609         TrimInitialSpace(doc, element, text);
610 
611     text = element->last;
612 
613     if (TY_(nodeIsText)(text))
614         TrimTrailingSpace(doc, element, text);
615 }
616 
DescendantOf(Node * element,TidyTagId tid)617 static Bool DescendantOf( Node *element, TidyTagId tid )
618 {
619     Node *parent;
620     for ( parent = element->parent;
621           parent != NULL;
622           parent = parent->parent )
623     {
624         if ( TagIsId(parent, tid) )
625             return yes;
626     }
627     return no;
628 }
629 
InsertMisc(Node * element,Node * node)630 static Bool InsertMisc(Node *element, Node *node)
631 {
632     if (node->type == CommentTag ||
633         node->type == ProcInsTag ||
634         node->type == CDATATag ||
635         node->type == SectionTag ||
636         node->type == AspTag ||
637         node->type == JsteTag ||
638         node->type == PhpTag )
639     {
640         TY_(InsertNodeAtEnd)(element, node);
641         return yes;
642     }
643 
644     if ( node->type == XmlDecl )
645     {
646         Node* root = element;
647         while ( root && root->parent )
648             root = root->parent;
649         if ( root && !(root->content && root->content->type == XmlDecl))
650         {
651           TY_(InsertNodeAtStart)( root, node );
652           return yes;
653         }
654     }
655 
656     /* Declared empty tags seem to be slipping through
657     ** the cracks.  This is an experiment to figure out
658     ** a decent place to pick them up.
659     */
660     if ( node->tag &&
661          TY_(nodeIsElement)(node) &&
662          TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
663          (node->tag->versions & VERS_PROPRIETARY) != 0 )
664     {
665         TY_(InsertNodeAtEnd)(element, node);
666         return yes;
667     }
668 
669     return no;
670 }
671 
672 
/*
 Adjust the lexer's white space state for "node" and then run the
 parser registered for its tag, if any.  The parser is not run for
 StartEndTag nodes.

 Fix by GLP 2000-12-21: the lexer state is reset for empty and for
 non-inline tags (base, link, meta, isindex, hr, area).  For
 CM_EMPTY tags lexer->waswhite is cleared, for other tags that are
 not CM_INLINE lexer->insertspace is cleared.
*/
static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    const Dict* tag = node->tag;

    if (tag->model & CM_EMPTY)
        lexer->waswhite = no;
    else if (!(tag->model & CM_INLINE))
        lexer->insertspace = no;

    if (tag->parser == NULL || node->type == StartEndTag)
        return;

    (*tag->parser)( doc, node, mode );
}
697 
698 /*
699  the doctype has been found after other tags,
700  and needs moving to before the html element
701 */
InsertDocType(TidyDocImpl * doc,Node * element,Node * doctype)702 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
703 {
704     Node* existing = TY_(FindDocType)( doc );
705     if ( existing )
706     {
707         TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
708         TY_(FreeNode)( doc, doctype );
709     }
710     else
711     {
712         TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
713         while ( !nodeIsHTML(element) )
714             element = element->parent;
715         TY_(InsertNodeBeforeElement)( element, doctype );
716     }
717 }
718 
719 /*
720  move node to the head, where element is used as starting
721  point in hunt for head. normally called during parsing
722 */
MoveToHead(TidyDocImpl * doc,Node * element,Node * node)723 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
724 {
725     Node *head;
726 
727     TY_(RemoveNode)( node );  /* make sure that node is isolated */
728 
729     if ( TY_(nodeIsElement)(node) )
730     {
731         TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
732 
733         head = TY_(FindHEAD)(doc);
734         assert(head != NULL);
735 
736         TY_(InsertNodeAtEnd)(head, node);
737 
738         if ( node->tag->parser )
739             ParseTag( doc, node, IgnoreWhitespace );
740     }
741     else
742     {
743         TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
744         TY_(FreeNode)( doc, node );
745     }
746 }
747 
748 /* moves given node to end of body element */
MoveNodeToBody(TidyDocImpl * doc,Node * node)749 static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
750 {
751     Node* body = TY_(FindBody)( doc );
752     if ( body )
753     {
754         TY_(RemoveNode)( node );
755         TY_(InsertNodeAtEnd)( body, node );
756     }
757 }
758 
/*
 Give "node" a fixed set of padding/margin style properties, but
 only when the TidyDecorateInferredUL option is set.  With
 TidyMakeClean the properties are added as a class, otherwise
 as a style property.
*/
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
{
    ctmbstr sprop =
        "padding-left: 2ex; margin-left: 0ex"
        "; margin-top: 0ex; margin-bottom: 0ex";

    if ( cfgBool(doc, TidyDecorateInferredUL) )
    {
        if ( cfgBool(doc, TidyMakeClean) )
            TY_(AddStyleAsClass)( doc, node, sprop );
        else
            TY_(AddStyleProperty)( doc, node, sprop );
    }
}
771 
772 /*
773    element is node created by the lexer
774    upon seeing the start tag, or by the
775    parser when the start tag is inferred
776 */
TY_(ParseBlock)777 void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
778 {
779     Lexer* lexer = doc->lexer;
780     Node *node;
781     Bool checkstack = yes;
782     uint istackbase = 0;
783 
784     if ( element->tag->model & CM_EMPTY )
785         return;
786 
787     if ( nodeIsFORM(element) &&
788          DescendantOf(element, TidyTag_FORM) )
789         TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
790 
791     /*
792      InlineDup() asks the lexer to insert inline emphasis tags
793      currently pushed on the istack, but take care to avoid
794      propagating inline emphasis inside OBJECT or APPLET.
795      For these elements a fresh inline stack context is created
796      and disposed of upon reaching the end of the element.
797      They thus behave like table cells in this respect.
798     */
799     if (element->tag->model & CM_OBJECT)
800     {
801         istackbase = lexer->istackbase;
802         lexer->istackbase = lexer->istacksize;
803     }
804 
805     if (!(element->tag->model & CM_MIXED))
806         TY_(InlineDup)( doc, NULL );
807 
808     mode = IgnoreWhitespace;
809 
810     while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
811     {
812         /* end tag for this element */
813         if (node->type == EndTag && node->tag &&
814             (node->tag == element->tag || element->was == node->tag))
815         {
816             TY_(FreeNode)( doc, node );
817 
818             if (element->tag->model & CM_OBJECT)
819             {
820                 /* pop inline stack */
821                 while (lexer->istacksize > lexer->istackbase)
822                     TY_(PopInline)( doc, NULL );
823                 lexer->istackbase = istackbase;
824             }
825 
826             element->closed = yes;
827             TrimSpaces( doc, element );
828             return;
829         }
830 
831         if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
832         {
833             /*  If we're in the HEAD, close it before proceeding.
834                 This is an extremely rare occurance, but has been observed.
835             */
836             TY_(UngetToken)( doc );
837             break;
838         }
839 
840         if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
841         {
842             if ( TY_(nodeIsElement)(node) )
843                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
844             TY_(FreeNode)( doc, node );
845             continue;
846         }
847 
848 
849         if (node->type == EndTag)
850         {
851             if (node->tag == NULL)
852             {
853                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
854                 TY_(FreeNode)( doc, node );
855                 continue;
856             }
857             else if ( nodeIsBR(node) )
858                 node->type = StartTag;
859             else if ( nodeIsP(node) )
860             {
861                 /* Cannot have a block inside a paragraph, so no checking
862                    for an ancestor is necessary -- but we _can_ have
863                    paragraphs inside a block, so change it to an implicit
864                    empty paragraph, to be dealt with according to the user's
865                    options
866                 */
867                 node->type = StartEndTag;
868                 node->implicit = yes;
869 #if OBSOLETE
870                 TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
871                 TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
872                 TY_(InsertNodeAtEnd)( element, node );
873                 node = InferredTag(doc, TidyTag_BR);
874 #endif
875             }
876             else if (DescendantOf( element, node->tag->id ))
877             {
878                 /*
879                   if this is the end tag for an ancestor element
880                   then infer end tag for this element
881                 */
882                 TY_(UngetToken)( doc );
883                 break;
884 #if OBSOLETE
885                 Node *parent;
886                 for ( parent = element->parent;
887                       parent != NULL;
888                       parent = parent->parent )
889                 {
890                     if (node->tag == parent->tag)
891                     {
892                         if (!(element->tag->model & CM_OPT))
893                             TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
894 
895                         TY_(UngetToken)( doc );
896 
897                         if (element->tag->model & CM_OBJECT)
898                         {
899                             /* pop inline stack */
900                             while (lexer->istacksize > lexer->istackbase)
901                                 TY_(PopInline)( doc, NULL );
902                             lexer->istackbase = istackbase;
903                         }
904 
905                         TrimSpaces( doc, element );
906                         return;
907                     }
908                 }
909 #endif
910             }
911             else
912             {
913                 /* special case </tr> etc. for stuff moved in front of table */
914                 if ( lexer->exiled
915                      && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
916                 {
917                     TY_(UngetToken)( doc );
918                     TrimSpaces( doc, element );
919                     return;
920                 }
921             }
922         }
923 
924         /* mixed content model permits text */
925         if (TY_(nodeIsText)(node))
926         {
927             if ( checkstack )
928             {
929                 checkstack = no;
930                 if (!(element->tag->model & CM_MIXED))
931                 {
932                     if ( TY_(InlineDup)(doc, node) > 0 )
933                         continue;
934                 }
935             }
936 
937             TY_(InsertNodeAtEnd)(element, node);
938             mode = MixedContent;
939 
940             /*
941               HTML4 strict doesn't allow mixed content for
942               elements with %block; as their content model
943             */
944             /*
945               But only body, map, blockquote, form and
946               noscript have content model %block;
947             */
948             if ( nodeIsBODY(element)       ||
949                  nodeIsMAP(element)        ||
950                  nodeIsBLOCKQUOTE(element) ||
951                  nodeIsFORM(element)       ||
952                  nodeIsNOSCRIPT(element) )
953                 TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
954             continue;
955         }
956 
957         if ( InsertMisc(element, node) )
958             continue;
959 
960         /* allow PARAM elements? */
961         if ( nodeIsPARAM(node) )
962         {
963             if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
964             {
965                 TY_(InsertNodeAtEnd)(element, node);
966                 continue;
967             }
968 
969             /* otherwise discard it */
970             TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
971             TY_(FreeNode)( doc, node );
972             continue;
973         }
974 
975         /* allow AREA elements? */
976         if ( nodeIsAREA(node) )
977         {
978             if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
979             {
980                 TY_(InsertNodeAtEnd)(element, node);
981                 continue;
982             }
983 
984             /* otherwise discard it */
985             TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
986             TY_(FreeNode)( doc, node );
987             continue;
988         }
989 
990         /* ignore unknown start/end tags */
991         if ( node->tag == NULL )
992         {
993             TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
994             TY_(FreeNode)( doc, node );
995             continue;
996         }
997 
998         /*
999           Allow CM_INLINE elements here.
1000 
1001           Allow CM_BLOCK elements here unless
1002           lexer->excludeBlocks is yes.
1003 
1004           LI and DD are special cased.
1005 
1006           Otherwise infer end tag for this element.
1007         */
1008 
1009         if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1010         {
1011             if ( !TY_(nodeIsElement)(node) )
1012             {
1013                 if ( nodeIsFORM(node) )
1014                     BadForm( doc );
1015 
1016                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1017                 TY_(FreeNode)( doc, node );
1018                 continue;
1019             }
1020 
1021             /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1022             /*
1023              If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1024              start tag, discard the start tag and let the subsequent content get
1025              parsed as content of the enclosing LI.  This seems to mimic IE and
1026              Netscape, and avoids an infinite loop: without this check,
1027              ParseBlock (which is parsing the LI's content) and ParseList (which
1028              is parsing the LI's parent's content) repeatedly defer to each
1029              other to parse the illegal start tag, each time inferring a missing
1030              </li> or <li> respectively.
1031 
1032              NOTE: This check is a bit fragile.  It specifically checks for the
1033              four tags that happen to weave their way through the current series
1034              of tests performed by ParseBlock and ParseList to trigger the
1035              infinite loop.
1036             */
1037             if ( nodeIsLI(element) )
1038             {
1039                 if ( nodeIsFRAME(node)    ||
1040                      nodeIsFRAMESET(node) ||
1041                      nodeIsOPTGROUP(node) ||
1042                      nodeIsOPTION(node) )
1043                 {
1044                     TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1045                     TY_(FreeNode)( doc, node );  /* DSR - 27Apr02 avoid memory leak */
1046                     continue;
1047                 }
1048             }
1049 
1050             if ( nodeIsTD(element) || nodeIsTH(element) )
1051             {
1052                 /* if parent is a table cell, avoid inferring the end of the cell */
1053 
1054                 if ( TY_(nodeHasCM)(node, CM_HEAD) )
1055                 {
1056                     MoveToHead( doc, element, node );
1057                     continue;
1058                 }
1059 
1060                 if ( TY_(nodeHasCM)(node, CM_LIST) )
1061                 {
1062                     TY_(UngetToken)( doc );
1063                     node = TY_(InferredTag)(doc, TidyTag_UL);
1064                     AddClassNoIndent(doc, node);
1065                     lexer->excludeBlocks = yes;
1066                 }
1067                 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1068                 {
1069                     TY_(UngetToken)( doc );
1070                     node = TY_(InferredTag)(doc, TidyTag_DL);
1071                     lexer->excludeBlocks = yes;
1072                 }
1073 
1074                 /* infer end of current table cell */
1075                 if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1076                 {
1077                     TY_(UngetToken)( doc );
1078                     TrimSpaces( doc, element );
1079                     return;
1080                 }
1081             }
1082             else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1083             {
1084                 if ( lexer->excludeBlocks )
1085                 {
1086                     if ( !TY_(nodeHasCM)(element, CM_OPT) )
1087                         TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1088 
1089                     TY_(UngetToken)( doc );
1090 
1091                     if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1092                         lexer->istackbase = istackbase;
1093 
1094                     TrimSpaces( doc, element );
1095                     return;
1096                 }
1097             }
1098             else /* things like list items */
1099             {
1100                 if (node->tag->model & CM_HEAD)
1101                 {
1102                     MoveToHead( doc, element, node );
1103                     continue;
1104                 }
1105 
1106                 /*
1107                  special case where a form start tag
1108                  occurs in a tr and is followed by td or th
1109                 */
1110 
1111                 if ( nodeIsFORM(element) &&
1112                      nodeIsTD(element->parent) &&
1113                      element->parent->implicit )
1114                 {
1115                     if ( nodeIsTD(node) )
1116                     {
1117                         TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1118                         TY_(FreeNode)( doc, node );
1119                         continue;
1120                     }
1121 
1122                     if ( nodeIsTH(node) )
1123                     {
1124                         TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1125                         TY_(FreeNode)( doc, node );
1126                         node = element->parent;
1127                         TidyDocFree(doc, node->element);
1128                         node->element = TY_(tmbstrdup)(doc->allocator, "th");
1129                         node->tag = TY_(LookupTagDef)( TidyTag_TH );
1130                         continue;
1131                     }
1132                 }
1133 
1134                 if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1135                     TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1136 
1137                 TY_(UngetToken)( doc );
1138 
1139                 if ( TY_(nodeHasCM)(node, CM_LIST) )
1140                 {
1141                     if ( element->parent && element->parent->tag &&
1142                          element->parent->tag->parser == TY_(ParseList) )
1143                     {
1144                         TrimSpaces( doc, element );
1145                         return;
1146                     }
1147 
1148                     node = TY_(InferredTag)(doc, TidyTag_UL);
1149                     AddClassNoIndent(doc, node);
1150                 }
1151                 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1152                 {
1153                     if ( nodeIsDL(element->parent) )
1154                     {
1155                         TrimSpaces( doc, element );
1156                         return;
1157                     }
1158 
1159                     node = TY_(InferredTag)(doc, TidyTag_DL);
1160                 }
1161                 else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1162                 {
1163                     /* http://tidy.sf.net/issue/1316307 */
1164                     /* In exiled mode, return so table processing can
1165                        continue. */
1166                     if (lexer->exiled)
1167                         return;
1168                     node = TY_(InferredTag)(doc, TidyTag_TABLE);
1169                 }
1170                 else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1171                 {
1172                     /* pop inline stack */
1173                     while ( lexer->istacksize > lexer->istackbase )
1174                         TY_(PopInline)( doc, NULL );
1175                     lexer->istackbase = istackbase;
1176                     TrimSpaces( doc, element );
1177                     return;
1178 
1179                 }
1180                 else
1181                 {
1182                     TrimSpaces( doc, element );
1183                     return;
1184                 }
1185             }
1186         }
1187 
1188         /* parse known element */
1189         if (TY_(nodeIsElement)(node))
1190         {
1191             if (node->tag->model & CM_INLINE)
1192             {
1193                 if (checkstack && !node->implicit)
1194                 {
1195                     checkstack = no;
1196 
1197                     if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1198                     {
1199                         if ( TY_(InlineDup)(doc, node) > 0 )
1200                             continue;
1201                     }
1202                 }
1203 
1204                 mode = MixedContent;
1205             }
1206             else
1207             {
1208                 checkstack = yes;
1209                 mode = IgnoreWhitespace;
1210             }
1211 
1212             /* trim white space before <br> */
1213             if ( nodeIsBR(node) )
1214                 TrimSpaces( doc, element );
1215 
1216             TY_(InsertNodeAtEnd)(element, node);
1217 
1218             if (node->implicit)
1219                 TY_(ReportError)(doc, element, node, INSERTING_TAG );
1220 
1221             ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1222             continue;
1223         }
1224 
1225         /* discard unexpected tags */
1226         if (node->type == EndTag)
1227             TY_(PopInline)( doc, node );  /* if inline end tag */
1228 
1229         TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1230         TY_(FreeNode)( doc, node );
1231         continue;
1232     }
1233 
1234     if (!(element->tag->model & CM_OPT))
1235         TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1236 
1237     if (element->tag->model & CM_OBJECT)
1238     {
1239         /* pop inline stack */
1240         while ( lexer->istacksize > lexer->istackbase )
1241             TY_(PopInline)( doc, NULL );
1242         lexer->istackbase = istackbase;
1243     }
1244 
1245     TrimSpaces( doc, element );
1246 }
1247 
/*
 ParseInline: collect the children of an element with inline content.

 Reads tokens from the lexer and appends text and inline elements to
 `element` until one of the following happens:
   - the matching end tag is seen (element->closed is set to yes);
   - a token arrives that implies the element has ended (an ancestor's
     end tag, a block-level start tag, another heading, a nested <a> ...);
     such a token is normally handed back with UngetToken before
     returning;
   - GetToken returns NULL, in which case MISSING_ENDTAG_FOR is reported
     unless the element's tag is CM_OPT.

 doc     - document whose lexer supplies the tokens
 element - node being filled; nothing is done if its tag is CM_EMPTY
 mode    - Preformatted is kept as is, any other mode is replaced by
           MixedContent

 In the heading/<dt> splitting branches `element` is rebound to a clone
 of itself and the loop carries on filling the clone.
*/
TY_(ParseInline)1248 void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
1249 {
1250     Lexer* lexer = doc->lexer;
1251     Node *node, *parent;
1252 
         /* an element whose tag is CM_EMPTY has no content to collect */
1253     if (element->tag->model & CM_EMPTY)
1254         return;
1255 
1256     /*
1257      ParseInline is used for some block level elements like H1 to H6
1258      For such elements we need to insert inline emphasis tags currently
1259      on the inline stack. For Inline elements, we normally push them
1260      onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1261      This test is carried out in PushInline and PopInline, see istack.c
1262 
1263      InlineDup(...) is not called for elements with a CM_MIXED (inline and
1264      block) content model, e.g. <del> or <ins>, otherwise constructs like
1265 
1266        <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1267        <p>111<span>222<del>333</del>444</span>555</p>
1268        <p>111<em>222<del>333</del>444</em>555</p>
1269 
1270      will get corrupted.
1271     */
1272     if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
1273         !TY_(nodeHasCM)(element, CM_MIXED))
1274         TY_(InlineDup)(doc, NULL);
1275     else if (TY_(nodeHasCM)(element, CM_INLINE))
1276         TY_(PushInline)(doc, element);
1277 
         /* remember in doc->badLayout that NOBR / FONT was used */
1278     if ( nodeIsNOBR(element) )
1279         doc->badLayout |= USING_NOBR;
1280     else if ( nodeIsFONT(element) )
1281         doc->badLayout |= USING_FONT;
1282 
1283     /* Inline elements may or may not be within a preformatted element */
1284     if (mode != Preformatted)
1285         mode = MixedContent;
1286 
         /* the loop is only left by `return` or by GetToken yielding NULL */
1287     while ((node = TY_(GetToken)(doc, mode)) != NULL)
1288     {
1289         /* end tag for current element */
1290         if (node->tag == element->tag && node->type == EndTag)
1291         {
1292             if (element->tag->model & CM_INLINE)
1293                 TY_(PopInline)( doc, node );
1294 
1295             TY_(FreeNode)( doc, node );
1296 
1297             if (!(mode & Preformatted))
1298                 TrimSpaces(doc, element);
1299 
1300             /*
1301              if a font element wraps an anchor and nothing else
1302              then move the font element inside the anchor since
1303              otherwise it won't alter the anchor text color
1304             */
1305             if ( nodeIsFONT(element) &&
1306                  element->content && element->content == element->last )
1307             {
1308                 Node *child = element->content;
1309 
1310                 if ( nodeIsA(child) )
1311                 {
                         /* the anchor takes over the font's place among its siblings */
1312                     child->parent = element->parent;
1313                     child->next = element->next;
1314                     child->prev = element->prev;
1315 
                         /* the font is detached and re-parented under the anchor */
1316                     element->next = NULL;
1317                     element->prev = NULL;
1318                     element->parent = child;
1319 
                         /* the font adopts the anchor's former children and
                            becomes the anchor's first child */
1320                     element->content = child->content;
1321                     element->last = child->last;
1322                     child->content = element;
1323 
1324                     TY_(FixNodeLinks)(child);
1325                     TY_(FixNodeLinks)(element);
1326                 }
1327             }
1328 
1329             element->closed = yes;
                 /* NOTE(review): this second trim is unconditional, unlike the
                    Preformatted-guarded one above - confirm that is intended */
1330             TrimSpaces( doc, element );
1331             return;
1332         }
1333 
1334         /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
1335         /* (see additional conditions below) */
1336         /* otherwise emphasis nesting is probably unintentional */
1337         /* big, small, sub, sup have a cumulative effect, so leave them alone */
1338         if ( node->type == StartTag
1339              && node->tag == element->tag
1340              && TY_(IsPushed)( doc, node )
1341              && !node->implicit
1342              && !element->implicit
1343              && node->tag && (node->tag->model & CM_INLINE)
1344              && !nodeIsA(node)
1345              && !nodeIsFONT(node)
1346              && !nodeIsBIG(node)
1347              && !nodeIsSMALL(node)
1348              && !nodeIsSUB(node)
1349              && !nodeIsSUP(node)
1350              && !nodeIsQ(node)
1351              && !nodeIsSPAN(node)
1352            )
1353         {
1354             /* proceeds only if "node" does not have any attribute and
1355                follows a text node not finishing with a space */
1356             if (element->content != NULL && node->attributes == NULL
1357                 && TY_(nodeIsText)(element->last)
1358                 && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
1359             {
1360                 TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
                     /* retype the token and hand it back; the next iteration
                        then sees it as the end tag for the current element */
1361                 node->type = EndTag;
1362                 TY_(UngetToken)(doc);
1363                 continue;
1364             }
1365 
1366             if (node->attributes == NULL || element->attributes == NULL)
1367                 TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
1368         }
1369         else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
1370                   nodeIsQ(node) )
1371         {
1372             TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
1373         }
1374 
             /* text token: append it unless it is empty */
1375         if ( TY_(nodeIsText)(node) )
1376         {
1377             /* only called for 1st child */
1378             if ( element->content == NULL && !(mode & Preformatted) )
1379                 TrimSpaces( doc, element );
1380 
                 /* nothing between start and end: discard the node */
1381             if ( node->start >= node->end )
1382             {
1383                 TY_(FreeNode)( doc, node );
1384                 continue;
1385             }
1386 
1387             TY_(InsertNodeAtEnd)(element, node);
1388             continue;
1389         }
1390 
1391         /* mixed content model so allow text */
             /* presumably InsertMisc has taken ownership of node when it
                returns true (node is not freed here) - TODO confirm */
1392         if (InsertMisc(element, node))
1393             continue;
1394 
1395         /* deal with HTML tags */
1396         if ( nodeIsHTML(node) )
1397         {
1398             if ( TY_(nodeIsElement)(node) )
1399             {
1400                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1401                 TY_(FreeNode)( doc, node );
1402                 continue;
1403             }
1404 
1405             /* otherwise infer end of inline element */
1406             TY_(UngetToken)( doc );
1407 
1408             if (!(mode & Preformatted))
1409                 TrimSpaces(doc, element);
1410 
1411             return;
1412         }
1413 
1414         /* within <dt> or <pre> map <p> to <br> */
1415         if ( nodeIsP(node) &&
1416              node->type == StartTag &&
1417              ( (mode & Preformatted) ||
1418                nodeIsDT(element) ||
1419                DescendantOf(element, TidyTag_DT )
1420              )
1421            )
1422         {
                 /* rewrite the token in place: new tag definition and name */
1423             node->tag = TY_(LookupTagDef)( TidyTag_BR );
1424             TidyDocFree(doc, node->element);
1425             node->element = TY_(tmbstrdup)(doc->allocator, "br");
1426             TrimSpaces(doc, element);
1427             TY_(InsertNodeAtEnd)(element, node);
1428             continue;
1429         }
1430 
1431         /* <p> allowed within <address> in HTML 4.01 Transitional */
1432         if ( nodeIsP(node) &&
1433              node->type == StartTag &&
1434              nodeIsADDRESS(element) )
1435         {
1436             TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1437             TY_(InsertNodeAtEnd)(element, node);
                 /* parse the <p> with the parser registered on its tag */
1438             (*node->tag->parser)( doc, node, mode );
1439             continue;
1440         }
1441 
1442         /* ignore unknown and PARAM tags */
1443         if ( node->tag == NULL || nodeIsPARAM(node) )
1444         {
1445             TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1446             TY_(FreeNode)( doc, node );
1447             continue;
1448         }
1449 
             /* treat </br> as <br>; from here on node->tag is non-NULL */
1450         if ( nodeIsBR(node) && node->type == EndTag )
1451             node->type = StartTag;
1452 
1453         if ( node->type == EndTag )
1454         {
1455            /* coerce </br> to <br> (already done just above, so this arm is not expected to fire) */
1456            if ( nodeIsBR(node) )
1457                 node->type = StartTag;
1458            else if ( nodeIsP(node) )
1459            {
1460                /* coerce unmatched </p> to <br><br> */
1461                 if ( !DescendantOf(element, TidyTag_P) )
1462                 {
1463                     TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
1464                     TrimSpaces( doc, element );
1465                     TY_(InsertNodeAtEnd)( element, node );
1466                     node = TY_(InferredTag)(doc, TidyTag_BR);
1467                     TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
1468                     continue;
1469                 }
1470            }
1471            else if ( TY_(nodeHasCM)(node, CM_INLINE)
1472                      && !nodeIsA(node)
1473                      && !TY_(nodeHasCM)(node, CM_OBJECT)
1474                      && TY_(nodeHasCM)(element, CM_INLINE) )
1475             {
1476                 /* allow any inline end tag to end current element */
1477 
1478                 /* http://tidy.sf.net/issue/1426419 */
1479                 /* but, like the browser, retain an earlier inline element.
1480                    This is implemented by setting the lexer into a mode
1481                    where it gets tokens from the inline stack rather than
1482                    from the input stream. Check if the scenario fits. */
1483                 if ( !nodeIsA(element)
1484                      && (node->tag != element->tag)
1485                      && TY_(IsPushed)( doc, node )
1486                      && TY_(IsPushed)( doc, element ) )
1487                 {
1488                     /* we have something like
1489                        <b>bold <i>bold and italic</b> italics</i> */
1490                     if ( TY_(SwitchInline)( doc, element, node ) )
1491                     {
1492                         TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
1493                         TY_(UngetToken)( doc ); /* put this back */
1494                         TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
1495                         if (!(mode & Preformatted))
1496                             TrimSpaces( doc, element );
1497                         return; /* close <i>, but will re-open it, after </b> */
1498                     }
1499                 }
1500                 TY_(PopInline)( doc, element );
1501 
1502                 if ( !nodeIsA(element) )
1503                 {
                         /* NOTE(review): the enclosing else-if already requires
                            !nodeIsA(node), so unless a callee above changes
                            node->tag this first arm looks unreachable */
1504                     if ( nodeIsA(node) && node->tag != element->tag )
1505                     {
1506                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1507                        TY_(UngetToken)( doc );
1508                     }
1509                     else
1510                     {
1511                         TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
1512                         TY_(FreeNode)( doc, node);
1513                     }
1514 
1515                     if (!(mode & Preformatted))
1516                         TrimSpaces(doc, element);
1517 
1518                     return;
1519                 }
1520 
1521                 /* if parent is <a> then discard unexpected inline end tag */
1522                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1523                 TY_(FreeNode)( doc, node);
1524                 continue;
1525             }  /* special case </tr> etc. for stuff moved in front of table */
1526             else if ( lexer->exiled
1527                      && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
1528             {
1529                 TY_(UngetToken)( doc );
1530                 TrimSpaces(doc, element);
1531                 return;
1532             }
1533         }
1534 
1535         /* allow any header tag to end current header */
1536         if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
1537         {
1538 
                 /* same heading tag: the token is dropped; a different
                    heading tag is handed back for the caller */
1539             if ( node->tag == element->tag )
1540             {
1541                 TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
1542                 TY_(FreeNode)( doc, node);
1543             }
1544             else
1545             {
1546                 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1547                 TY_(UngetToken)( doc );
1548             }
1549 
1550             if (!(mode & Preformatted))
1551                 TrimSpaces(doc, element);
1552 
1553             return;
1554         }
1555 
1556         /*
1557            an <A> tag ends any open <A> element
1558            but <A href=...> is mapped to </A><A href=...>
1559         */
1560         /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1561         /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
1562         if ( nodeIsA(node) && !node->implicit &&
1563              (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1564         {
1565             /* coerce <a> to </a> unless it has some attributes */
1566             /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1567             /* other fixes by Dave Raggett */
1568             /* if (node->attributes == NULL) */
1569             if (node->type != EndTag && node->attributes == NULL)
1570             {
1571                 node->type = EndTag;
1572                 TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
1573                 /* TY_(PopInline)( doc, node ); */
1574                 TY_(UngetToken)( doc );
1575                 continue;
1576             }
1577 
                 /* otherwise end the current element and leave the <a> token
                    for the caller */
1578             TY_(UngetToken)( doc );
1579             TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1580             /* TY_(PopInline)( doc, element ); */
1581 
1582             if (!(mode & Preformatted))
1583                 TrimSpaces(doc, element);
1584 
1585             return;
1586         }
1587 
             /* <center>, <div> and <hr> are not allowed inside a heading:
                move them out, splitting the heading if it already has content */
1588         if (element->tag->model & CM_HEADING)
1589         {
1590             if ( nodeIsCENTER(node) || nodeIsDIV(node) )
1591             {
1592                 if (!TY_(nodeIsElement)(node))
1593                 {
1594                     TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1595                     TY_(FreeNode)( doc, node);
1596                     continue;
1597                 }
1598 
1599                 TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1600 
1601                 /* insert center as parent if heading is empty */
1602                 if (element->content == NULL)
1603                 {
1604                     InsertNodeAsParent(element, node);
1605                     continue;
1606                 }
1607 
1608                 /* split heading and make center parent of 2nd part */
1609                 TY_(InsertNodeAfterElement)(element, node);
1610 
1611                 if (!(mode & Preformatted))
1612                     TrimSpaces(doc, element);
1613 
                     /* from here on the loop fills a clone of the heading,
                        placed inside the center/div */
1614                 element = TY_(CloneNode)( doc, element );
1615                 TY_(InsertNodeAtEnd)(node, element);
1616                 continue;
1617             }
1618 
1619             if ( nodeIsHR(node) )
1620             {
1621                 if ( !TY_(nodeIsElement)(node) )
1622                 {
1623                     TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1624                     TY_(FreeNode)( doc, node);
1625                     continue;
1626                 }
1627 
1628                 TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1629 
1630                 /* insert hr before heading if heading is empty */
1631                 if (element->content == NULL)
1632                 {
1633                     TY_(InsertNodeBeforeElement)(element, node);
1634                     continue;
1635                 }
1636 
1637                 /* split heading and insert hr before 2nd part */
1638                 TY_(InsertNodeAfterElement)(element, node);
1639 
1640                 if (!(mode & Preformatted))
1641                     TrimSpaces(doc, element);
1642 
                     /* from here on the loop fills a clone of the heading,
                        placed after the hr */
1643                 element = TY_(CloneNode)( doc, element );
1644                 TY_(InsertNodeAfterElement)(node, element);
1645                 continue;
1646             }
1647         }
1648 
             /* <hr> inside <dt>: wrap the hr in an inferred <dd> next to the dt */
1649         if ( nodeIsDT(element) )
1650         {
1651             if ( nodeIsHR(node) )
1652             {
1653                 Node *dd;
1654                 if ( !TY_(nodeIsElement)(node) )
1655                 {
1656                     TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1657                     TY_(FreeNode)( doc, node);
1658                     continue;
1659                 }
1660 
1661                 TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1662                 dd = TY_(InferredTag)(doc, TidyTag_DD);
1663 
1664                 /* insert hr within dd before dt if dt is empty */
1665                 if (element->content == NULL)
1666                 {
1667                     TY_(InsertNodeBeforeElement)(element, dd);
1668                     TY_(InsertNodeAtEnd)(dd, node);
1669                     continue;
1670                 }
1671 
1672                 /* split dt and insert hr within dd before 2nd part */
1673                 TY_(InsertNodeAfterElement)(element, dd);
1674                 TY_(InsertNodeAtEnd)(dd, node);
1675 
1676                 if (!(mode & Preformatted))
1677                     TrimSpaces(doc, element);
1678 
                     /* from here on the loop fills a clone of the dt,
                        placed after the dd */
1679                 element = TY_(CloneNode)( doc, element );
1680                 TY_(InsertNodeAfterElement)(dd, element);
1681                 continue;
1682             }
1683         }
1684 
1685 
1686         /*
1687           if this is the end tag for an ancestor element
1688           then infer end tag for this element
1689         */
1690         if (node->type == EndTag)
1691         {
1692             for (parent = element->parent;
1693                     parent != NULL; parent = parent->parent)
1694             {
1695                 if (node->tag == parent->tag)
1696                 {
1697                     if (!(element->tag->model & CM_OPT) && !element->implicit)
1698                         TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1699 
1700                     if( TY_(IsPushedLast)( doc, element, node ) )
1701                         TY_(PopInline)( doc, element );
                         /* hand the ancestor's end tag back for the caller */
1702                     TY_(UngetToken)( doc );
1703 
1704                     if (!(mode & Preformatted))
1705                         TrimSpaces(doc, element);
1706 
1707                     return;
1708                 }
1709             }
1710         }
1711 
1712         /* block level tags end this element */
1713         if (!(node->tag->model & CM_INLINE) &&
1714             !(element->tag->model & CM_MIXED))
1715         {
1716             if ( !TY_(nodeIsElement)(node) )
1717             {
1718                 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1719                 TY_(FreeNode)( doc, node);
1720                 continue;
1721             }
1722 
                 /* NOTE(review): this is also reported on the MoveToHead path
                    just below, where parsing of element simply continues */
1723             if (!(element->tag->model & CM_OPT))
1724                 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1725 
                 /* tag is CM_HEAD but not CM_BLOCK: pass it to MoveToHead
                    and keep parsing the current element */
1726             if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
1727             {
1728                 MoveToHead(doc, element, node);
1729                 continue;
1730             }
1731 
1732             /*
1733                prevent anchors from propagating into block tags
1734                except for headings h1 to h6
1735             */
1736             if ( nodeIsA(element) )
1737             {
1738                 if (node->tag && !(node->tag->model & CM_HEADING))
1739                     TY_(PopInline)( doc, element );
1740                 else if (!(element->content))
1741                 {
                         /* empty anchor directly before a heading: drop the anchor */
1742                     TY_(DiscardElement)( doc, element );
1743                     TY_(UngetToken)( doc );
1744                     return;
1745                 }
1746             }
1747 
1748             TY_(UngetToken)( doc );
1749 
1750             if (!(mode & Preformatted))
1751                 TrimSpaces(doc, element);
1752 
1753             return;
1754         }
1755 
1756         /* parse inline element */
1757         if (TY_(nodeIsElement)(node))
1758         {
1759             if (node->implicit)
1760                 TY_(ReportError)(doc, element, node, INSERTING_TAG);
1761 
1762             /* trim white space before <br> */
1763             if ( nodeIsBR(node) )
1764                 TrimSpaces(doc, element);
1765 
1766             TY_(InsertNodeAtEnd)(element, node);
1767             ParseTag(doc, node, mode);
1768             continue;
1769         }
1770 
1771         /* discard unexpected tags */
1772         TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1773         TY_(FreeNode)( doc, node );
1774         continue;
1775     }
1776 
         /* no more tokens: node is NULL here, the end tag never arrived */
1777     if (!(element->tag->model & CM_OPT))
1778         TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1779 
1780 }
1781 
TY_(ParseEmpty)1782 void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
1783 {
1784     Lexer* lexer = doc->lexer;
1785     if ( lexer->isvoyager )
1786     {
1787         Node *node = TY_(GetToken)( doc, mode);
1788         if ( node )
1789         {
1790             if ( !(node->type == EndTag && node->tag == element->tag) )
1791             {
1792                 TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
1793                 TY_(UngetToken)( doc );
1794             }
1795             else
1796             {
1797                 TY_(FreeNode)( doc, node );
1798             }
1799         }
1800     }
1801 }
1802 
/*
 Parses the content of a definition list element.

 doc  - document being parsed
 list - the list element whose children are being collected
 mode - only forwarded to ParseTag when a <center> element is parsed;
        tokens themselves are always read with IgnoreWhitespace

 Tokens are read until the list's own end tag is found (list->closed
 is then set) or no tokens remain, in which case MISSING_ENDTAG_FOR is
 reported.  Text gets an inferred <dt>, other block/inline content an
 inferred <dd>, and a <center> element is moved out of the list.
*/
TY_(ParseDefList)1803 void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
1804 {
1805     Lexer* lexer = doc->lexer;
1806     Node *node, *parent;
1807 
     /* an element with the CM_EMPTY content model has no content to parse */
1808     if (list->tag->model & CM_EMPTY)
1809         return;
1810 
1811     lexer->insert = NULL;  /* defer implicit inline start tags */
1812 
1813     while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1814     {
         /* the list's own end tag: done */
1815         if (node->tag == list->tag && node->type == EndTag)
1816         {
1817             TY_(FreeNode)( doc, node);
1818             list->closed = yes;
1819             return;
1820         }
1821 
1822         /* deal with comments etc. */
1823         if (InsertMisc(list, node))
1824             continue;
1825 
1826         if (TY_(nodeIsText)(node))
1827         {
             /* push the text token back and carry on with an inferred <dt> */
1828             TY_(UngetToken)( doc );
1829             node = TY_(InferredTag)(doc, TidyTag_DT);
1830             TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1831         }
1832 
         /* tokens without a tag definition are discarded */
1833         if (node->tag == NULL)
1834         {
1835             TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1836             TY_(FreeNode)( doc, node);
1837             continue;
1838         }
1839 
1840         /*
1841           if this is the end tag for an ancestor element
1842           then infer end tag for this element
1843         */
1844         if (node->type == EndTag)
1845         {
1846             Bool discardIt = no;
1847             if ( nodeIsFORM(node) )
1848             {
1849                 BadForm( doc );
1850                 TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1851                 TY_(FreeNode)( doc, node );
1852                 continue;
1853             }
1854 
1855             for (parent = list->parent;
1856                     parent != NULL; parent = parent->parent)
1857             {
1858                /* Do not match across BODY to avoid infinite loop
1859                   between ParseBody and this parser,
1860                   See http://tidy.sf.net/bug/1098012. */
1861                 if (nodeIsBODY(parent))
1862                 {
1863                     discardIt = yes;
1864                     break;
1865                 }
1866                 if (node->tag == parent->tag)
1867                 {
1868                     TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
1869 
                     /* leave the end tag pending for the caller */
1870                     TY_(UngetToken)( doc );
1871                     return;
1872                 }
1873             }
1874             if (discardIt)
1875             {
1876                 TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1877                 TY_(FreeNode)( doc, node);
1878                 continue;
1879             }
1880         }
1881 
1882         /* center in a dt or a dl breaks the dl list in two */
1883         if ( nodeIsCENTER(node) )
1884         {
1885             if (list->content)
1886                 TY_(InsertNodeAfterElement)(list, node);
1887             else /* trim empty dl list */
1888             {
1889                 TY_(InsertNodeBeforeElement)(list, node);
1890 
1891             }
1892 
1893             /* #426885 - fix by Glenn Carroll 19 Apr 00, and
1894                          Gary Dechaines 11 Aug 00 */
1895             /* ParseTag can destroy node, if it finds that
1896              * this <center> is followed immediately by </center>.
1897              * It's awkward but necessary to determine if this
1898              * has happened.
1899              */
1900             parent = node->parent;
1901 
1902             /* and parse contents of center */
             /* note: excludeBlocks is forced to yes afterwards, not restored */
1903             lexer->excludeBlocks = no;
1904             ParseTag( doc, node, mode);
1905             lexer->excludeBlocks = yes;
1906 
1907             /* now create a new dl element,
1908              * unless node has been blown away because the
1909              * center was empty, as above.
1910              */
1911             if (parent->last == node)
1912             {
                 /* later items are collected in the new inferred <dl> */
1913                 list = TY_(InferredTag)(doc, TidyTag_DL);
1914                 TY_(InsertNodeAfterElement)(node, list);
1915             }
1916             continue;
1917         }
1918 
1919         if ( !(nodeIsDT(node) || nodeIsDD(node)) )
1920         {
             /* push the token back; below we either return with it still
                pending or continue with an inferred <dd> */
1921             TY_(UngetToken)( doc );
1922 
1923             if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
1924             {
1925                 TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
1926                 return;
1927             }
1928 
1929             /* if DD appeared directly in BODY then exclude blocks */
1930             if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
1931                 return;
1932 
1933             node = TY_(InferredTag)(doc, TidyTag_DD);
1934             TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1935         }
1936 
         /* any end tag still left at this point is dropped */
1937         if (node->type == EndTag)
1938         {
1939             TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1940             TY_(FreeNode)( doc, node);
1941             continue;
1942         }
1943 
1944         /* node should be <DT> or <DD>*/
1945         TY_(InsertNodeAtEnd)(list, node);
1946         ParseTag( doc, node, IgnoreWhitespace);
1947     }
1948 
     /* the loop only falls through when GetToken returned NULL,
        so node is NULL here */
1949     TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
1950 }
1951 
FindLastLI(Node * list,Node ** lastli)1952 static Bool FindLastLI( Node *list, Node **lastli )
1953 {
1954     Node *node;
1955 
1956     *lastli = NULL;
1957     for ( node = list->content; node ; node = node->next )
1958         if ( nodeIsLI(node) && node->type == StartTag )
1959             *lastli=node;
1960     return *lastli ? yes:no;
1961 }
1962 
/*
 Parses the content of a list element.

 doc  - document being parsed
 list - the list element whose children are being collected
 mode - unused (ARG_UNUSED); tokens are always read with IgnoreWhitespace

 Tokens are read until the list's own end tag is found (list->closed
 is then set) or no tokens remain, in which case MISSING_ENDTAG_FOR is
 reported.  <li> elements are appended to the list and parsed.  Any
 other start tag or text is pushed back and, unless one of the early
 returns below applies, ParseTag is run either on the last existing
 <li> (when list is an <ol> that has one) or on a newly inferred <li>
 that carries a "list-style: none" style.
*/
TY_(ParseList)1963 void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
1964 {
1965     Lexer* lexer = doc->lexer;
1966     Node *node, *parent, *lastli;
1967     Bool wasblock;
1968 
     /* an element with the CM_EMPTY content model has no content to parse */
1969     if (list->tag->model & CM_EMPTY)
1970         return;
1971 
1972     lexer->insert = NULL;  /* defer implicit inline start tags */
1973 
1974     while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1975     {
         /* the list's own end tag: done */
1976         if (node->tag == list->tag && node->type == EndTag)
1977         {
1978             TY_(FreeNode)( doc, node);
1979             list->closed = yes;
1980             return;
1981         }
1982 
1983         /* deal with comments etc. */
1984         if (InsertMisc(list, node))
1985             continue;
1986 
         /* non-text tokens without a tag definition are discarded */
1987         if (node->type != TextNode && node->tag == NULL)
1988         {
1989             TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1990             TY_(FreeNode)( doc, node);
1991             continue;
1992         }
1993 
1994         /*
1995           if this is the end tag for an ancestor element
1996           then infer end tag for this element
1997         */
1998         if (node->type == EndTag)
1999         {
2000             if ( nodeIsFORM(node) )
2001             {
2002                 BadForm( doc );
2003                 TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2004                 TY_(FreeNode)( doc, node );
2005                 continue;
2006             }
2007 
             /* stray inline end tag: PopInline is called for it as well
                as discarding the token */
2008             if (TY_(nodeHasCM)(node,CM_INLINE))
2009             {
2010                 TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2011                 TY_(PopInline)( doc, node );
2012                 TY_(FreeNode)( doc, node);
2013                 continue;
2014             }
2015 
2016             for ( parent = list->parent;
2017                   parent != NULL; parent = parent->parent )
2018             {
2019                /* Do not match across BODY to avoid infinite loop
2020                   between ParseBody and this parser,
2021                   See http://tidy.sf.net/bug/1053626. */
2022                 if (nodeIsBODY(parent))
2023                     break;
2024                 if (node->tag == parent->tag)
2025                 {
2026                     TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2027                     TY_(UngetToken)( doc );
2028                     return;
2029                 }
2030             }
2031 
             /* no matching ancestor below BODY: drop the end tag */
2032             TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2033             TY_(FreeNode)( doc, node);
2034             continue;
2035         }
2036 
2037         if ( !nodeIsLI(node) )
2038         {
             /* push the token back; it stays pending for the early returns
                below or for the ParseTag call at the end of the loop body */
2039             TY_(UngetToken)( doc );
2040 
2041             if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
2042             {
2043                 TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2044                 return;
2045             }
2046             /* http://tidy.sf.net/issue/1316307 */
2047             /* In exiled mode, return so table processing can continue. */
2048             else if ( lexer->exiled
2049                       && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
2050                           || nodeIsTABLE(node)) )
2051                 return;
2052 
2053             /* http://tidy.sf.net/issue/836462
2054                If "list" is an ordered list (<ol>), insert the next tag within
2055                the last <li> to preserve the numbering to match the visual
2056                rendering of most browsers. */
2057             if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
2058             {
2059                 /* Create a node for error reporting */
2060                 node = TY_(InferredTag)(doc, TidyTag_LI);
2061                 TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2062                 TY_(FreeNode)( doc, node);
                 /* ParseTag below continues inside the existing <li> */
2063                 node = lastli;
2064             }
2065             else
2066             {
2067                 /* Add an inferred <li> */
                 /* remember whether the pushed-back token was block-level
                    before node is replaced by the inferred <li> */
2068                 wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
2069                 node = TY_(InferredTag)(doc, TidyTag_LI);
2070                 /* Add "display: inline" to avoid a blank line after <li> with
2071                    Internet Explorer. See http://tidy.sf.net/issue/836462 */
2072                 TY_(AddStyleProperty)( doc, node,
2073                                        wasblock
2074                                        ? "list-style: none; display: inline"
2075                                        : "list-style: none"
2076                                        );
2077                 TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2078                 TY_(InsertNodeAtEnd)(list,node);
2079             }
2080         }
2081         else
2082             /* node is <LI> */
2083             TY_(InsertNodeAtEnd)(list,node);
2084 
2085         ParseTag( doc, node, IgnoreWhitespace);
2086     }
2087 
     /* the loop only falls through when GetToken returned NULL,
        so node is NULL here */
2088     TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
2089 }
2090 
2091 /*
2092  unexpected content in table row is moved to just before
2093  the table in accordance with Netscape and IE. This code
2094  assumes that node hasn't been inserted into the row.
2095 */
MoveBeforeTable(TidyDocImpl * ARG_UNUSED (doc),Node * row,Node * node)2096 static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2097                              Node *node )
2098 {
2099     Node *table;
2100 
2101     /* first find the table element */
2102     for (table = row->parent; table; table = table->parent)
2103     {
2104         if ( nodeIsTABLE(table) )
2105         {
2106             TY_(InsertNodeBeforeElement)( table, node );
2107             return;
2108         }
2109     }
2110     /* No table element */
2111     TY_(InsertNodeBeforeElement)( row->parent, node );
2112 }
2113 
2114 /*
2115  if a table row is empty then insert an empty cell
2116  this practice is consistent with browser behavior
2117  and avoids potential problems with row spanning cells
2118 */
FixEmptyRow(TidyDocImpl * doc,Node * row)2119 static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2120 {
2121     Node *cell;
2122 
2123     if (row->content == NULL)
2124     {
2125         cell = TY_(InferredTag)(doc, TidyTag_TD);
2126         TY_(InsertNodeAtEnd)(row, cell);
2127         TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
2128     }
2129 }
2130 
/*
 Parses the content of a table row.

 doc  - document being parsed
 row  - the row element whose cells are being collected
 mode - unused (ARG_UNUSED); tokens are always read with IgnoreWhitespace

 Returns when the row's own end tag is found (row->closed is set),
 when another token carrying the row's own tag, a row group (CM_ROWGRP)
 tag or the end tag of an enclosing table-level element is found (the
 token is pushed back), or when no tokens remain.  <td> and <th> are
 appended to the row and parsed; <form> gets an inferred <td>; text and
 block/inline elements are moved before the table; CM_HEAD elements
 are handed to MoveToHead.
*/
TY_(ParseRow)2131 void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
2132 {
2133     Lexer* lexer = doc->lexer;
2134     Node *node;
2135     Bool exclude_state;
2136 
     /* an element with the CM_EMPTY content model has no content to parse */
2137     if (row->tag->model & CM_EMPTY)
2138         return;
2139 
2140     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2141     {
2142         if (node->tag == row->tag)
2143         {
2144             if (node->type == EndTag)
2145             {
2146                 TY_(FreeNode)( doc, node);
2147                 row->closed = yes;
2148                 FixEmptyRow( doc, row);
2149                 return;
2150             }
2151 
2152             /* New row start implies end of current row */
2153             TY_(UngetToken)( doc );
2154             FixEmptyRow( doc, row);
2155             return;
2156         }
2157 
2158         /*
2159           if this is the end tag for an ancestor element
2160           then infer end tag for this element
2161         */
2162         if ( node->type == EndTag )
2163         {
2164             if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
2165                  && DescendantOf(row, TagId(node)) )
2166             {
2167                 TY_(UngetToken)( doc );
2168                 return;
2169             }
2170 
2171             if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2172             {
2173                 if ( nodeIsFORM(node) )
2174                     BadForm( doc );
2175 
2176                 TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2177                 TY_(FreeNode)( doc, node);
2178                 continue;
2179             }
2180 
             /* stray </td> or </th> */
2181             if ( nodeIsTD(node) || nodeIsTH(node) )
2182             {
2183                 TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2184                 TY_(FreeNode)( doc, node);
2185                 continue;
2186             }
2187         }
2188 
2189         /* deal with comments etc. */
2190         if (InsertMisc(row, node))
2191             continue;
2192 
2193         /* discard unknown tags */
2194         if (node->tag == NULL && node->type != TextNode)
2195         {
2196             TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2197             TY_(FreeNode)( doc, node);
2198             continue;
2199         }
2200 
2201         /* discard unexpected <table> element */
2202         if ( nodeIsTABLE(node) )
2203         {
2204             TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2205             TY_(FreeNode)( doc, node);
2206             continue;
2207         }
2208 
2209         /* THEAD, TFOOT or TBODY */
2210         if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
2211         {
2212             TY_(UngetToken)( doc );
2213             return;
2214         }
2215 
         /* every end tag that was not handled above is dropped here */
2216         if (node->type == EndTag)
2217         {
2218             TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2219             TY_(FreeNode)( doc, node);
2220             continue;
2221         }
2222 
2223         /*
2224           if text or inline or block move before table
2225           if head content move to head
2226         */
2227 
         /* always true at this point: end tags were discarded just above */
2228         if (node->type != EndTag)
2229         {
2230             if ( nodeIsFORM(node) )
2231             {
                 /* push <form> back and continue with an inferred <td> */
2232                 TY_(UngetToken)( doc );
2233                 node = TY_(InferredTag)(doc, TidyTag_TD);
2234                 TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
2235             }
2236             else if ( TY_(nodeIsText)(node)
2237                       || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
2238             {
2239                 MoveBeforeTable( doc, row, node );
2240                 TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
                 /* parse the moved element with the exiled flag set and
                    block exclusion off; excludeBlocks is restored after */
2241                 lexer->exiled = yes;
2242                 exclude_state = lexer->excludeBlocks;
2243                 lexer->excludeBlocks = no;
2244 
2245                 if (node->type != TextNode)
2246                     ParseTag( doc, node, IgnoreWhitespace);
2247 
2248                 lexer->exiled = no;
2249                 lexer->excludeBlocks = exclude_state;
2250                 continue;
2251             }
2252             else if (node->tag->model & CM_HEAD)
2253             {
2254                 TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2255                 MoveToHead( doc, row, node);
2256                 continue;
2257             }
2258         }
2259 
2260         if ( !(nodeIsTD(node) || nodeIsTH(node)) )
2261         {
2262             TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2263             TY_(FreeNode)( doc, node);
2264             continue;
2265         }
2266 
2267         /* node should be <TD> or <TH> */
2268         TY_(InsertNodeAtEnd)(row, node);
         /* cells are parsed with block exclusion off; restored afterwards */
2269         exclude_state = lexer->excludeBlocks;
2270         lexer->excludeBlocks = no;
2271         ParseTag( doc, node, IgnoreWhitespace);
2272         lexer->excludeBlocks = exclude_state;
2273 
2274         /* pop inline stack */
2275 
         /* keep popping until the stack is back down to istackbase */
2276         while ( lexer->istacksize > lexer->istackbase )
2277             TY_(PopInline)( doc, NULL );
2278     }
2279 
2280 }
2281 
/*
 Parses the content of a table row group.

 doc      - document being parsed
 rowgroup - the row group element whose rows are being collected
 mode     - unused (ARG_UNUSED); tokens are always read with
            IgnoreWhitespace

 Returns when the group's own end tag is found (rowgroup->closed is
 set), when a token is found that ends the group implicitly (another
 token with the group's own tag, </table>, the end tag of an ancestor,
 or the start of another CM_ROWGRP element; the token is pushed back),
 or when no tokens remain.  <tr> elements are appended and parsed;
 <td>/<th> get an inferred <tr>; text and block/inline elements are
 moved before the table; CM_HEAD elements are handed to MoveToHead.
*/
TY_(ParseRowGroup)2282 void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
2283 {
2284     Lexer* lexer = doc->lexer;
2285     Node *node, *parent;
2286 
     /* an element with the CM_EMPTY content model has no content to parse */
2287     if (rowgroup->tag->model & CM_EMPTY)
2288         return;
2289 
2290     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2291     {
2292         if (node->tag == rowgroup->tag)
2293         {
2294             if (node->type == EndTag)
2295             {
2296                 rowgroup->closed = yes;
2297                 TY_(FreeNode)( doc, node);
2298                 return;
2299             }
2300 
             /* another token with the same tag ends this group */
2301             TY_(UngetToken)( doc );
2302             return;
2303         }
2304 
2305         /* if </table> infer end tag */
2306         if ( nodeIsTABLE(node) && node->type == EndTag )
2307         {
2308             TY_(UngetToken)( doc );
2309             return;
2310         }
2311 
2312         /* deal with comments etc. */
2313         if (InsertMisc(rowgroup, node))
2314             continue;
2315 
2316         /* discard unknown tags */
2317         if (node->tag == NULL && node->type != TextNode)
2318         {
2319             TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2320             TY_(FreeNode)( doc, node);
2321             continue;
2322         }
2323 
2324         /*
2325           if TD or TH then infer <TR>
2326           if text or inline or block move before table
2327           if head content move to head
2328         */
2329 
2330         if (node->type != EndTag)
2331         {
2332             if ( nodeIsTD(node) || nodeIsTH(node) )
2333             {
                 /* push the cell back and continue with an inferred <tr> */
2334                 TY_(UngetToken)( doc );
2335                 node = TY_(InferredTag)(doc, TidyTag_TR);
2336                 TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2337             }
2338             else if ( TY_(nodeIsText)(node)
2339                       || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2340             {
2341                 MoveBeforeTable( doc, rowgroup, node );
2342                 TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
                 /* the moved element is parsed with the exiled flag set */
2343                 lexer->exiled = yes;
2344 
2345                 if (node->type != TextNode)
2346                     ParseTag(doc, node, IgnoreWhitespace);
2347 
2348                 lexer->exiled = no;
2349                 continue;
2350             }
2351             else if (node->tag->model & CM_HEAD)
2352             {
2353                 TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2354                 MoveToHead(doc, rowgroup, node);
2355                 continue;
2356             }
2357         }
2358 
2359         /*
2360           if this is the end tag for ancestor element
2361           then infer end tag for this element
2362         */
2363         if (node->type == EndTag)
2364         {
2365             if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2366             {
2367                 if ( nodeIsFORM(node) )
2368                     BadForm( doc );
2369 
2370                 TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2371                 TY_(FreeNode)( doc, node);
2372                 continue;
2373             }
2374 
             /* stray </tr>, </td> or </th> */
2375             if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
2376             {
2377                 TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2378                 TY_(FreeNode)( doc, node);
2379                 continue;
2380             }
2381 
2382             for ( parent = rowgroup->parent;
2383                   parent != NULL;
2384                   parent = parent->parent )
2385             {
2386                 if (node->tag == parent->tag)
2387                 {
2388                     TY_(UngetToken)( doc );
2389                     return;
2390                 }
2391             }
2392         }
2393 
2394         /*
2395           if THEAD, TFOOT or TBODY then implied end tag
2396 
2397         */
2398         if (node->tag->model & CM_ROWGRP)
2399         {
2400             if (node->type != EndTag)
2401             {
2402                 TY_(UngetToken)( doc );
2403                 return;
2404             }
2405         }
2406 
         /* any end tag still left at this point is dropped */
2407         if (node->type == EndTag)
2408         {
2409             TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2410             TY_(FreeNode)( doc, node);
2411             continue;
2412         }
2413 
2414         if ( !nodeIsTR(node) )
2415         {
             /* push the token back and continue with an inferred <tr> */
2416             node = TY_(InferredTag)(doc, TidyTag_TR);
2417             TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2418             TY_(UngetToken)( doc );
2419         }
2420 
2421        /* node should be <TR> */
2422         TY_(InsertNodeAtEnd)(rowgroup, node);
2423         ParseTag(doc, node, IgnoreWhitespace);
2424     }
2425 
2426 }
2427 
TY_(ParseColGroup)2428 void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
2429 {
2430     Node *node, *parent;
2431 
2432     if (colgroup->tag->model & CM_EMPTY)
2433         return;
2434 
2435     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2436     {
2437         if (node->tag == colgroup->tag && node->type == EndTag)
2438         {
2439             TY_(FreeNode)( doc, node);
2440             colgroup->closed = yes;
2441             return;
2442         }
2443 
2444         /*
2445           if this is the end tag for an ancestor element
2446           then infer end tag for this element
2447         */
2448         if (node->type == EndTag)
2449         {
2450             if ( nodeIsFORM(node) )
2451             {
2452                 BadForm( doc );
2453                 TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2454                 TY_(FreeNode)( doc, node);
2455                 continue;
2456             }
2457 
2458             for ( parent = colgroup->parent;
2459                   parent != NULL;
2460                   parent = parent->parent )
2461             {
2462                 if (node->tag == parent->tag)
2463                 {
2464                     TY_(UngetToken)( doc );
2465                     return;
2466                 }
2467             }
2468         }
2469 
2470         if (TY_(nodeIsText)(node))
2471         {
2472             TY_(UngetToken)( doc );
2473             return;
2474         }
2475 
2476         /* deal with comments etc. */
2477         if (InsertMisc(colgroup, node))
2478             continue;
2479 
2480         /* discard unknown tags */
2481         if (node->tag == NULL)
2482         {
2483             TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2484             TY_(FreeNode)( doc, node);
2485             continue;
2486         }
2487 
2488         if ( !nodeIsCOL(node) )
2489         {
2490             TY_(UngetToken)( doc );
2491             return;
2492         }
2493 
2494         if (node->type == EndTag)
2495         {
2496             TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2497             TY_(FreeNode)( doc, node);
2498             continue;
2499         }
2500 
2501         /* node should be <COL> */
2502         TY_(InsertNodeAtEnd)(colgroup, node);
2503         ParseTag(doc, node, IgnoreWhitespace);
2504     }
2505 }
2506 
/*
 Parses the content of a table element.

 doc   - document being parsed
 table - the table element whose children are being collected
 mode  - unused (ARG_UNUSED); tokens are always read with
         IgnoreWhitespace

 While the table is parsed lexer->istackbase is raised to the current
 lexer->istacksize; the saved value is put back on every exit path.
 The function returns when the table's own end tag is found
 (table->closed is set), when the end tag of an ancestor or a tag
 without CM_TABLE in its content model is found (the token is pushed
 back), or when no tokens remain, in which case MISSING_ENDTAG_FOR is
 reported.  <td>, <th> and nested <table> get an inferred <tr>; text
 and block/inline elements are moved before the table; CM_HEAD
 elements are handed to MoveToHead.
*/
TY_(ParseTableTag)2507 void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
2508 {
2509     Lexer* lexer = doc->lexer;
2510     Node *node, *parent;
2511     uint istackbase;
2512 
2513     TY_(DeferDup)( doc );
     /* remember the old base; it must be restored before every return */
2514     istackbase = lexer->istackbase;
2515     lexer->istackbase = lexer->istacksize;
2516 
2517     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2518     {
         /* the table's own end tag: done */
2519         if (node->tag == table->tag && node->type == EndTag)
2520         {
2521             TY_(FreeNode)( doc, node);
2522             lexer->istackbase = istackbase;
2523             table->closed = yes;
2524             return;
2525         }
2526 
2527         /* deal with comments etc. */
2528         if (InsertMisc(table, node))
2529             continue;
2530 
2531         /* discard unknown tags */
2532         if (node->tag == NULL && node->type != TextNode)
2533         {
2534             TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2535             TY_(FreeNode)( doc, node);
2536             continue;
2537         }
2538 
2539         /* if TD or TH or text or inline or block then infer <TR> */
2540 
2541         if (node->type != EndTag)
2542         {
2543             if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
2544             {
                 /* push the token back and continue with an inferred <tr> */
2545                 TY_(UngetToken)( doc );
2546                 node = TY_(InferredTag)(doc, TidyTag_TR);
2547                 TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
2548             }
2549             else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
2550             {
                 /* text and block/inline content goes in front of the table
                    and is parsed with the exiled flag set */
2551                 TY_(InsertNodeBeforeElement)(table, node);
2552                 TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2553                 lexer->exiled = yes;
2554 
2555                 if (node->type != TextNode)
2556                     ParseTag(doc, node, IgnoreWhitespace);
2557 
2558                 lexer->exiled = no;
2559                 continue;
2560             }
2561             else if (node->tag->model & CM_HEAD)
2562             {
2563                 MoveToHead(doc, table, node);
2564                 continue;
2565             }
2566         }
2567 
2568         /*
2569           if this is the end tag for an ancestor element
2570           then infer end tag for this element
2571         */
2572         if (node->type == EndTag)
2573         {
2574             if ( nodeIsFORM(node) )
2575             {
2576                 BadForm( doc );
2577                 TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2578                 TY_(FreeNode)( doc, node);
2579                 continue;
2580             }
2581 
2582             /* best to discard unexpected block/inline end tags */
2583             if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
2584                  TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2585             {
2586                 TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2587                 TY_(FreeNode)( doc, node);
2588                 continue;
2589             }
2590 
2591             for ( parent = table->parent;
2592                   parent != NULL;
2593                   parent = parent->parent )
2594             {
2595                 if (node->tag == parent->tag)
2596                 {
2597                     TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
2598                     TY_(UngetToken)( doc );
2599                     lexer->istackbase = istackbase;
2600                     return;
2601                 }
2602             }
2603         }
2604 
         /* a tag that is not table content ends the table; the token is
            left pending */
2605         if (!(node->tag->model & CM_TABLE))
2606         {
2607             TY_(UngetToken)( doc );
2608             TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2609             lexer->istackbase = istackbase;
2610             return;
2611         }
2612 
2613         if (TY_(nodeIsElement)(node))
2614         {
2615             TY_(InsertNodeAtEnd)(table, node);
2616             ParseTag(doc, node, IgnoreWhitespace);
2617             continue;
2618         }
2619 
2620         /* discard unexpected text nodes and end tags */
2621         TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2622         TY_(FreeNode)( doc, node);
2623     }
2624 
     /* the loop only falls through when GetToken returned NULL,
        so node is NULL here */
2625     TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
2626     lexer->istackbase = istackbase;
2627 }
2628 
2629 /* acceptable content for pre elements */
PreContent(TidyDocImpl * ARG_UNUSED (doc),Node * node)2630 static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2631 {
2632     /* p is coerced to br's, Text OK too */
2633     if ( nodeIsP(node) || TY_(nodeIsText)(node) )
2634         return yes;
2635 
2636     if ( node->tag == NULL ||
2637          nodeIsPARAM(node) ||
2638          !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
2639         return no;
2640 
2641     return yes;
2642 }
2643 
TY_(ParsePre)2644 void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
2645 {
2646     Node *node;
2647 
2648     if (pre->tag->model & CM_EMPTY)
2649         return;
2650 
2651     TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
2652 
2653     while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
2654     {
2655         if ( node->type == EndTag &&
2656              (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
2657         {
2658             if (nodeIsBODY(node) || nodeIsHTML(node))
2659             {
2660                 TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2661                 TY_(FreeNode)(doc, node);
2662                 continue;
2663             }
2664             if (node->tag == pre->tag)
2665             {
2666                 TY_(FreeNode)(doc, node);
2667             }
2668             else
2669             {
2670                 TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
2671                 TY_(UngetToken)( doc );
2672             }
2673             pre->closed = yes;
2674             TrimSpaces(doc, pre);
2675             return;
2676         }
2677 
2678         if (TY_(nodeIsText)(node))
2679         {
2680             TY_(InsertNodeAtEnd)(pre, node);
2681             continue;
2682         }
2683 
2684         /* deal with comments etc. */
2685         if (InsertMisc(pre, node))
2686             continue;
2687 
2688         if (node->tag == NULL)
2689         {
2690             TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2691             TY_(FreeNode)(doc, node);
2692             continue;
2693         }
2694 
2695         /* strip unexpected tags */
2696         if ( !PreContent(doc, node) )
2697         {
2698             Node *newnode;
2699 
2700             /* fix for http://tidy.sf.net/bug/772205 */
2701             if (node->type == EndTag)
2702             {
2703                 /* http://tidy.sf.net/issue/1590220 */
2704                if ( doc->lexer->exiled
2705                    && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
2706                {
2707                   TY_(UngetToken)(doc);
2708                   TrimSpaces(doc, pre);
2709                   return;
2710                }
2711 
2712                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2713                TY_(FreeNode)(doc, node);
2714                continue;
2715             }
2716             /* http://tidy.sf.net/issue/1590220 */
2717             else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
2718                      || nodeIsTABLE(node) )
2719             {
2720                 if (!doc->lexer->exiled)
2721                     /* No missing close warning if exiled. */
2722                     TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2723 
2724                 TY_(UngetToken)(doc);
2725                 return;
2726             }
2727 
2728             /*
2729               This is basically what Tidy 04 August 2000 did and far more accurate
2730               with respect to browser behaviour than the code commented out above.
2731               Tidy could try to propagate the <pre> into each disallowed child where
2732               <pre> is allowed in order to replicate some browsers' behaviour, but
2733               there are a lot of exceptions, e.g. Internet Explorer does not propagate
2734               <pre> into table cells while Mozilla does. Opera 6 never propagates
2735               <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
2736 
2737               Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
2738               which are handled like Mozilla takes them (Opera6 closes all <pre> after
2739               the first </pre>).
2740 
2741               There are similar issues like replacing <p> in <pre> with <br>, for
2742               example
2743 
2744                 <pre>...<p>...</pre>                 (Input)
2745                 <pre>...<br>...</pre>                (Tidy)
2746                 <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
2747                 <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)
2748 
2749                 <pre>...<p>...</p>...</pre>          (Input)
2750                 <pre>...<br>......</pre>             (Tidy, BUG!)
2751                 <pre>...<br>...<br>...</pre>         (Internet Explorer)
2752                 <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
2753                 <pre>...<br>...<br><br>...</pre>     (Opera 7)
2754 
2755               or something similar, they could also be closing the <pre> and propagate
2756               the <pre> into the newly opened <p>.
2757 
2758               Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
2759               dissallowed in <pre>, Tidy neither detects this nor does it perform any
2760               cleanup operation. Tidy should at least issue a warning if it encounters
2761               such constructs.
2762 
2763               Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
2764             */
2765             TY_(InsertNodeAfterElement)(pre, node);
2766             TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2767             ParseTag(doc, node, IgnoreWhitespace);
2768 
2769             newnode = TY_(InferredTag)(doc, TidyTag_PRE);
2770             TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
2771             pre = newnode;
2772             TY_(InsertNodeAfterElement)(node, pre);
2773 
2774             continue;
2775         }
2776 
2777         if ( nodeIsP(node) )
2778         {
2779             if (node->type == StartTag)
2780             {
2781                 TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);
2782 
2783                 /* trim white space before <p> in <pre>*/
2784                 TrimSpaces(doc, pre);
2785 
2786                 /* coerce both <p> and </p> to <br> */
2787                 TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
2788                 TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
2789                 TY_(InsertNodeAtEnd)( pre, node );
2790             }
2791             else
2792             {
2793                 TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2794                 TY_(FreeNode)( doc, node);
2795             }
2796             continue;
2797         }
2798 
2799         if ( TY_(nodeIsElement)(node) )
2800         {
2801             /* trim white space before <br> */
2802             if ( nodeIsBR(node) )
2803                 TrimSpaces(doc, pre);
2804 
2805             TY_(InsertNodeAtEnd)(pre, node);
2806             ParseTag(doc, node, Preformatted);
2807             continue;
2808         }
2809 
2810         /* discard unexpected tags */
2811         TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2812         TY_(FreeNode)( doc, node);
2813     }
2814 
2815     TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR);
2816 }
2817 
TY_(ParseOptGroup)2818 void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2819 {
2820     Lexer* lexer = doc->lexer;
2821     Node *node;
2822 
2823     lexer->insert = NULL;  /* defer implicit inline start tags */
2824 
2825     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2826     {
2827         if (node->tag == field->tag && node->type == EndTag)
2828         {
2829             TY_(FreeNode)( doc, node);
2830             field->closed = yes;
2831             TrimSpaces(doc, field);
2832             return;
2833         }
2834 
2835         /* deal with comments etc. */
2836         if (InsertMisc(field, node))
2837             continue;
2838 
2839         if ( node->type == StartTag &&
2840              (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2841         {
2842             if ( nodeIsOPTGROUP(node) )
2843                 TY_(ReportError)(doc, field, node, CANT_BE_NESTED);
2844 
2845             TY_(InsertNodeAtEnd)(field, node);
2846             ParseTag(doc, node, MixedContent);
2847             continue;
2848         }
2849 
2850         /* discard unexpected tags */
2851         TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED );
2852         TY_(FreeNode)( doc, node);
2853     }
2854 }
2855 
2856 
TY_(ParseSelect)2857 void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2858 {
2859     Lexer* lexer = doc->lexer;
2860     Node *node;
2861 
2862     lexer->insert = NULL;  /* defer implicit inline start tags */
2863 
2864     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2865     {
2866         if (node->tag == field->tag && node->type == EndTag)
2867         {
2868             TY_(FreeNode)( doc, node);
2869             field->closed = yes;
2870             TrimSpaces(doc, field);
2871             return;
2872         }
2873 
2874         /* deal with comments etc. */
2875         if (InsertMisc(field, node))
2876             continue;
2877 
2878         if ( node->type == StartTag &&
2879              ( nodeIsOPTION(node)   ||
2880                nodeIsOPTGROUP(node) ||
2881                nodeIsSCRIPT(node))
2882            )
2883         {
2884             TY_(InsertNodeAtEnd)(field, node);
2885             ParseTag(doc, node, IgnoreWhitespace);
2886             continue;
2887         }
2888 
2889         /* discard unexpected tags */
2890         TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2891         TY_(FreeNode)( doc, node);
2892     }
2893 
2894     TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2895 }
2896 
TY_(ParseText)2897 void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
2898 {
2899     Lexer* lexer = doc->lexer;
2900     Node *node;
2901 
2902     lexer->insert = NULL;  /* defer implicit inline start tags */
2903 
2904     if ( nodeIsTEXTAREA(field) )
2905         mode = Preformatted;
2906     else
2907         mode = MixedContent;  /* kludge for font tags */
2908 
2909     while ((node = TY_(GetToken)(doc, mode)) != NULL)
2910     {
2911         if (node->tag == field->tag && node->type == EndTag)
2912         {
2913             TY_(FreeNode)( doc, node);
2914             field->closed = yes;
2915             TrimSpaces(doc, field);
2916             return;
2917         }
2918 
2919         /* deal with comments etc. */
2920         if (InsertMisc(field, node))
2921             continue;
2922 
2923         if (TY_(nodeIsText)(node))
2924         {
2925             /* only called for 1st child */
2926             if (field->content == NULL && !(mode & Preformatted))
2927                 TrimSpaces(doc, field);
2928 
2929             if (node->start >= node->end)
2930             {
2931                 TY_(FreeNode)( doc, node);
2932                 continue;
2933             }
2934 
2935             TY_(InsertNodeAtEnd)(field, node);
2936             continue;
2937         }
2938 
2939         /* for textarea should all cases of < and & be escaped? */
2940 
2941         /* discard inline tags e.g. font */
2942         if (   node->tag
2943             && node->tag->model & CM_INLINE
2944             && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2945         {
2946             TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2947             TY_(FreeNode)( doc, node);
2948             continue;
2949         }
2950 
2951         /* terminate element on other tags */
2952         if (!(field->tag->model & CM_OPT))
2953             TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE);
2954 
2955         TY_(UngetToken)( doc );
2956         TrimSpaces(doc, field);
2957         return;
2958     }
2959 
2960     if (!(field->tag->model & CM_OPT))
2961         TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2962 }
2963 
2964 
TY_(ParseTitle)2965 void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
2966 {
2967     Node *node;
2968     while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
2969     {
2970         if (node->tag == title->tag && node->type == StartTag)
2971         {
2972             TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
2973             node->type = EndTag;
2974             TY_(UngetToken)( doc );
2975             continue;
2976         }
2977         else if (node->tag == title->tag && node->type == EndTag)
2978         {
2979             TY_(FreeNode)( doc, node);
2980             title->closed = yes;
2981             TrimSpaces(doc, title);
2982             return;
2983         }
2984 
2985         if (TY_(nodeIsText)(node))
2986         {
2987             /* only called for 1st child */
2988             if (title->content == NULL)
2989                 TrimInitialSpace(doc, title, node);
2990 
2991             if (node->start >= node->end)
2992             {
2993                 TY_(FreeNode)( doc, node);
2994                 continue;
2995             }
2996 
2997             TY_(InsertNodeAtEnd)(title, node);
2998             continue;
2999         }
3000 
3001         /* deal with comments etc. */
3002         if (InsertMisc(title, node))
3003             continue;
3004 
3005         /* discard unknown tags */
3006         if (node->tag == NULL)
3007         {
3008             TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED);
3009             TY_(FreeNode)( doc, node);
3010             continue;
3011         }
3012 
3013         /* pushback unexpected tokens */
3014         TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE);
3015         TY_(UngetToken)( doc );
3016         TrimSpaces(doc, title);
3017         return;
3018     }
3019 
3020     TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR);
3021 }
3022 
3023 /*
3024   This isn't quite right for CDATA content as it recognises
3025   tags within the content and parses them accordingly.
3026   This will unfortunately screw up scripts which include
3027   < + letter,  < + !, < + ?  or  < + / + letter
3028 */
3029 
TY_(ParseScript)3030 void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
3031 {
3032     Node *node;
3033 
3034     doc->lexer->parent = script;
3035     node = TY_(GetToken)(doc, CdataContent);
3036     doc->lexer->parent = NULL;
3037 
3038     if (node)
3039     {
3040         TY_(InsertNodeAtEnd)(script, node);
3041     }
3042     else
3043     {
3044         /* handle e.g. a document like "<script>" */
3045         TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR);
3046         return;
3047     }
3048 
3049     node = TY_(GetToken)(doc, IgnoreWhitespace);
3050 
3051     if (!(node && node->type == EndTag && node->tag &&
3052         node->tag->id == script->tag->id))
3053     {
3054         TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR);
3055 
3056         if (node)
3057             TY_(UngetToken)(doc);
3058     }
3059     else
3060     {
3061         TY_(FreeNode)(doc, node);
3062     }
3063 }
3064 
TY_(IsJavaScript)3065 Bool TY_(IsJavaScript)(Node *node)
3066 {
3067     Bool result = no;
3068     AttVal *attr;
3069 
3070     if (node->attributes == NULL)
3071         return yes;
3072 
3073     for (attr = node->attributes; attr; attr = attr->next)
3074     {
3075         if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3076              && AttrContains(attr, "javascript") )
3077         {
3078             result = yes;
3079             break;
3080         }
3081     }
3082 
3083     return result;
3084 }
3085 
TY_(ParseHead)3086 void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
3087 {
3088     Lexer* lexer = doc->lexer;
3089     Node *node;
3090     int HasTitle = 0;
3091     int HasBase = 0;
3092 
3093     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3094     {
3095         if (node->tag == head->tag && node->type == EndTag)
3096         {
3097             TY_(FreeNode)( doc, node);
3098             head->closed = yes;
3099             break;
3100         }
3101 
3102         /* find and discard multiple <head> elements */
3103         /* find and discard <html> in <head> elements */
3104         if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3105         {
3106             TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3107             TY_(FreeNode)(doc, node);
3108             continue;
3109         }
3110 
3111         if (TY_(nodeIsText)(node))
3112         {
3113             TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3114             TY_(UngetToken)( doc );
3115             break;
3116         }
3117 
3118         if (node->type == ProcInsTag && node->element &&
3119             TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
3120         {
3121             TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3122             TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
3123             continue;
3124         }
3125 
3126         /* deal with comments etc. */
3127         if (InsertMisc(head, node))
3128             continue;
3129 
3130         if (node->type == DocTypeTag)
3131         {
3132             InsertDocType(doc, head, node);
3133             continue;
3134         }
3135 
3136         /* discard unknown tags */
3137         if (node->tag == NULL)
3138         {
3139             TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3140             TY_(FreeNode)( doc, node);
3141             continue;
3142         }
3143 
3144         /*
3145          if it doesn't belong in the head then
3146          treat as implicit end of head and deal
3147          with as part of the body
3148         */
3149         if (!(node->tag->model & CM_HEAD))
3150         {
3151             /* #545067 Implicit closing of head broken - warn only for XHTML input */
3152             if ( lexer->isvoyager )
3153                 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN );
3154             TY_(UngetToken)( doc );
3155             break;
3156         }
3157 
3158         if (TY_(nodeIsElement)(node))
3159         {
3160             if ( nodeIsTITLE(node) )
3161             {
3162                 ++HasTitle;
3163 
3164                 if (HasTitle > 1)
3165                     TY_(ReportError)(doc, head, node,
3166                                      head ?
3167                                      TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3168             }
3169             else if ( nodeIsBASE(node) )
3170             {
3171                 ++HasBase;
3172 
3173                 if (HasBase > 1)
3174                     TY_(ReportError)(doc, head, node,
3175                                      head ?
3176                                      TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3177             }
3178             else if ( nodeIsNOSCRIPT(node) )
3179             {
3180                 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3181             }
3182 
3183 #ifdef AUTO_INPUT_ENCODING
3184             else if (nodeIsMETA(node))
3185             {
3186                 AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3187                 AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3188                 if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3189                 {
3190                     tmbstr val, charset;
3191                     uint end = 0;
3192                     val = charset = TY_(tmbstrdup)(doc->allocator, content->value);
3193                     val = TY_(tmbstrtolower)(val);
3194                     val = strstr(content->value, "charset");
3195 
3196                     if (val)
3197                         val += 7;
3198 
3199                     while(val && *val && (TY_(IsWhite)((tchar)*val) ||
3200                           *val == '=' || *val == '"' || *val == '\''))
3201                         ++val;
3202 
3203                     while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) ||
3204                           val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3205                         ++end;
3206 
3207                     if (val && end)
3208                     {
3209                         tmbstr encoding = TY_(tmbstrndup)(doc->allocator,val, end);
3210                         uint id = TY_(GetEncodingIdFromName)(encoding);
3211 
3212                         /* todo: detect mismatch with BOM/XMLDecl/declared */
3213                         /* todo: error for unsupported encodings */
3214                         /* todo: try to re-init transcoder */
3215                         /* todo: change input/output encoding settings */
3216                         /* todo: store id in StreamIn */
3217 
3218                         TidyDocFree(doc, encoding);
3219                     }
3220 
3221                     TidyDocFree(doc, charset);
3222                 }
3223             }
3224 #endif /* AUTO_INPUT_ENCODING */
3225 
3226             TY_(InsertNodeAtEnd)(head, node);
3227             ParseTag(doc, node, IgnoreWhitespace);
3228             continue;
3229         }
3230 
3231         /* discard unexpected text nodes and end tags */
3232         TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3233         TY_(FreeNode)( doc, node);
3234     }
3235 }
3236 
/*
  Parse the content of the <body> element.

  doc   the document being parsed; tokens come from its lexer
  body  the <body> node that receives the parsed children
  mode  the incoming value is ignored: it is reset to IgnoreWhitespace
        on entry and then switched between IgnoreWhitespace and
        MixedContent depending on what was inserted last

  The loop runs until the input is exhausted.  It ends early (break)
  when the body sits inside <noframes> and </body>, </noframes>, <frame>
  or <frameset> is seen, and it returns early, pushing the token back,
  for a tag that is neither block nor inline and matches none of the
  content models handled below.
*/
void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;
    /* checkstack: whether InlineDup should be tried for the next inline
       content; iswhitenode: current token is a lone-space text node */
    Bool checkstack, iswhitenode;

    mode = IgnoreWhitespace;
    checkstack = yes;

    TY_(BumpObject)( doc, body->parent );

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        /* find and discard multiple <body> elements */
        if (node->tag == body->tag && node->type == StartTag)
        {
            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* #538536 Extra endtags not detected */
        /* <html> start tags and repeated </html> are reported; the first
           </html> is only recorded in seenEndHtml.  All are dropped. */
        if ( nodeIsHTML(node) )
        {
            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
                TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            else
                lexer->seenEndHtml = 1;

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* tags after </body> are reported, but processing continues */
        if ( lexer->seenEndBody &&
             ( node->type == StartTag ||
               node->type == EndTag   ||
               node->type == StartEndTag ) )
        {
            TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY );
        }

        /* </body>: mark the body closed, but keep reading tokens unless
           this body lives inside <noframes> */
        if ( node->tag == body->tag && node->type == EndTag )
        {
            body->closed = yes;
            TrimSpaces(doc, body);
            TY_(FreeNode)( doc, node);
            lexer->seenEndBody = 1;
            mode = IgnoreWhitespace;

            if ( nodeIsNOFRAMES(body->parent) )
                break;

            continue;
        }

        if ( nodeIsNOFRAMES(node) )
        {
            if (node->type == StartTag)
            {
                TY_(InsertNodeAtEnd)(body, node);
                TY_(ParseBlock)(doc, node, mode);
                continue;
            }

            /* </noframes> closes the enclosing <noframes>; hand it back */
            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
            {
                TrimSpaces(doc, body);
                TY_(UngetToken)( doc );
                break;
            }
        }

        /* frame markup ends a body that sits inside <noframes> */
        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
             && nodeIsNOFRAMES(body->parent) )
        {
            TrimSpaces(doc, body);
            TY_(UngetToken)( doc );
            break;
        }

        iswhitenode = no;

        /* a text node of at most one character whose first buffer
           character is a space */
        if ( TY_(nodeIsText)(node) &&
             node->end <= node->start + 1 &&
             lexer->lexbuf[node->start] == ' ' )
            iswhitenode = yes;

        /* deal with comments etc. */
        if (InsertMisc(body, node))
            continue;

        /* mixed content model permits text */
        if (TY_(nodeIsText)(node))
        {
            if (iswhitenode && mode == IgnoreWhitespace)
            {
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* HTML 2 and HTML4 strict don't allow text here */
            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));

            /* InlineDup is tried at most once until a non-inline element
               sets checkstack again; a positive result restarts the loop
               without inserting this token */
            if (checkstack)
            {
                checkstack = no;

                if ( TY_(InlineDup)(doc, node) > 0 )
                    continue;
            }

            TY_(InsertNodeAtEnd)(body, node);
            mode = MixedContent;
            continue;
        }

        if (node->type == DocTypeTag)
        {
            InsertDocType(doc, body, node);
            continue;
        }
        /* discard unknown  and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          Netscape allows LI and DD directly in BODY
          We infer UL or DL respectively and use this
          Bool to exclude block-level elements so as
          to match Netscape's observed behaviour.
        */
        lexer->excludeBlocks = no;

        /* <input>, and tags that are neither block nor inline, need a
           wrapper element or must be moved or dropped */
        if ( nodeIsINPUT(node) ||
             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
           )
        {
            /* avoid this error message being issued twice */
            if (!(node->tag->model & CM_HEAD))
                TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN);

            if (node->tag->model & CM_HTML)
            {
                /* copy body attributes if current body was inferred */
                if ( nodeIsBODY(node) && body->implicit
                     && body->attributes == NULL )
                {
                    /* ownership of the attribute list moves to body, so
                       it must be detached before the node is freed */
                    body->attributes = node->attributes;
                    node->attributes = NULL;
                }

                TY_(FreeNode)( doc, node);
                continue;
            }

            if (node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, body, node);
                continue;
            }

            /* In the branches below the offending token is pushed back
               and 'node' is replaced by an inferred container element,
               which is inserted further down. */
            if (node->tag->model & CM_LIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_UL);
                AddClassNoIndent(doc, node);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & CM_DEFLIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_DL);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
            {
                /* http://tidy.sf.net/issue/2855621 */
                /* no <table> is inferred for an end tag; the end tag
                   itself falls through to the code below */
                if (node->type != EndTag) {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
                }
                lexer->excludeBlocks = yes;
            }
            else if ( nodeIsINPUT(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_FORM);
                lexer->excludeBlocks = yes;
            }
            else
            {
                /* anything else that is not a row or field item is left
                   for the caller */
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
                {
                    TY_(UngetToken)( doc );
                    return;
                }

                /* ignore </td> </th> <option> etc. */
                TY_(FreeNode)( doc, node );
                continue;
            }
        }

        if (node->type == EndTag)
        {
            /* </br> is treated as <br> */
            if ( nodeIsBR(node) )
                node->type = StartTag;
            /* a stray </p> becomes an implicit empty <p/> */
            else if ( nodeIsP(node) )
            {
                node->type = StartEndTag;
                node->implicit = yes;
#if OBSOLETE
                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                FreeAttrs( doc, node ); /* discard align attribute etc. */
                TY_(InsertNodeAtEnd)(body, node);
                node = TY_(InferredTag)(doc, TidyTag_BR);
#endif
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
                TY_(PopInline)( doc, node );
        }

        if (TY_(nodeIsElement)(node))
        {
            if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
            {
                /* HTML4 strict doesn't allow inline content here */
                /* but HTML2 does allow img elements as children of body */
                if ( nodeIsIMG(node) )
                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
                else
                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));

                /* same one-shot InlineDup attempt as for text above,
                   skipped for implicit nodes */
                if (checkstack && !node->implicit)
                {
                    checkstack = no;

                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }

                mode = MixedContent;
            }
            else
            {
                /* a non-inline element re-arms the InlineDup attempt */
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            if (node->implicit)
                TY_(ReportError)(doc, body, node, INSERTING_TAG);

            TY_(InsertNodeAtEnd)(body, node);
            ParseTag(doc, node, mode);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }
}
3503 
TY_(ParseNoFrames)3504 void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
3505 {
3506     Lexer* lexer = doc->lexer;
3507     Node *node;
3508 
3509     if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3510     {
3511         doc->badAccess |=  BA_USING_NOFRAMES;
3512     }
3513     mode = IgnoreWhitespace;
3514 
3515     while ( (node = TY_(GetToken)(doc, mode)) != NULL )
3516     {
3517         if ( node->tag == noframes->tag && node->type == EndTag )
3518         {
3519             TY_(FreeNode)( doc, node);
3520             noframes->closed = yes;
3521             TrimSpaces(doc, noframes);
3522             return;
3523         }
3524 
3525         if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
3526         {
3527             TrimSpaces(doc, noframes);
3528             if (node->type == EndTag)
3529             {
3530                 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3531                 TY_(FreeNode)( doc, node);       /* Throw it away */
3532             }
3533             else
3534             {
3535                 TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
3536                 TY_(UngetToken)( doc );
3537             }
3538             return;
3539         }
3540 
3541         if ( nodeIsHTML(node) )
3542         {
3543             if (TY_(nodeIsElement)(node))
3544                 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3545 
3546             TY_(FreeNode)( doc, node);
3547             continue;
3548         }
3549 
3550         /* deal with comments etc. */
3551         if (InsertMisc(noframes, node))
3552             continue;
3553 
3554         if ( nodeIsBODY(node) && node->type == StartTag )
3555         {
3556             Bool seen_body = lexer->seenEndBody;
3557             TY_(InsertNodeAtEnd)(noframes, node);
3558             ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);
3559 
3560             /* fix for bug http://tidy.sf.net/bug/887259 */
3561             if (seen_body && TY_(FindBody)(doc) != node)
3562             {
3563                 TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
3564                 MoveNodeToBody(doc, node);
3565             }
3566             continue;
3567         }
3568 
3569         /* implicit body element inferred */
3570         if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
3571         {
3572             Node *body = TY_(FindBody)( doc );
3573             if ( body || lexer->seenEndBody )
3574             {
3575                 if ( body == NULL )
3576                 {
3577                     TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3578                     TY_(FreeNode)( doc, node);
3579                     continue;
3580                 }
3581                 if ( TY_(nodeIsText)(node) )
3582                 {
3583                     TY_(UngetToken)( doc );
3584                     node = TY_(InferredTag)(doc, TidyTag_P);
3585                     TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY );
3586                 }
3587                 TY_(InsertNodeAtEnd)( body, node );
3588             }
3589             else
3590             {
3591                 TY_(UngetToken)( doc );
3592                 node = TY_(InferredTag)(doc, TidyTag_BODY);
3593                 if ( cfgBool(doc, TidyXmlOut) )
3594                     TY_(ReportError)(doc, noframes, node, INSERTING_TAG);
3595                 TY_(InsertNodeAtEnd)( noframes, node );
3596             }
3597 
3598             ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
3599             continue;
3600         }
3601 
3602         /* discard unexpected end tags */
3603         TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3604         TY_(FreeNode)( doc, node);
3605     }
3606 
3607     TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR);
3608 }
3609 
TY_(ParseFrameSet)3610 void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode))
3611 {
3612     Lexer* lexer = doc->lexer;
3613     Node *node;
3614 
3615     if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3616     {
3617         doc->badAccess |= BA_USING_FRAMES;
3618     }
3619 
3620     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3621     {
3622         if (node->tag == frameset->tag && node->type == EndTag)
3623         {
3624             TY_(FreeNode)( doc, node);
3625             frameset->closed = yes;
3626             TrimSpaces(doc, frameset);
3627             return;
3628         }
3629 
3630         /* deal with comments etc. */
3631         if (InsertMisc(frameset, node))
3632             continue;
3633 
3634         if (node->tag == NULL)
3635         {
3636             TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3637             TY_(FreeNode)( doc, node);
3638             continue;
3639         }
3640 
3641         if (TY_(nodeIsElement)(node))
3642         {
3643             if (node->tag && node->tag->model & CM_HEAD)
3644             {
3645                 MoveToHead(doc, frameset, node);
3646                 continue;
3647             }
3648         }
3649 
3650         if ( nodeIsBODY(node) )
3651         {
3652             TY_(UngetToken)( doc );
3653             node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3654             TY_(ReportError)(doc, frameset, node, INSERTING_TAG);
3655         }
3656 
3657         if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3658         {
3659             TY_(InsertNodeAtEnd)(frameset, node);
3660             lexer->excludeBlocks = no;
3661             ParseTag(doc, node, MixedContent);
3662             continue;
3663         }
3664         else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3665         {
3666             TY_(InsertNodeAtEnd)(frameset, node);
3667             continue;
3668         }
3669 
3670         /* discard unexpected tags */
3671 #if SUPPORT_ACCESSIBILITY_CHECKS
3672         /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
3673         if ( nodeIsA(node) )
3674            doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
3675 #endif
3676 
3677         TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3678         TY_(FreeNode)( doc, node);
3679     }
3680 
3681     TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR);
3682 }
3683 
/*
  Parses the children of the <html> element.

  Phase 1 locates the <head>: tokens are read until a head tag is seen;
  stray <html> start/end tags are reported and discarded, comments etc.
  are attached via InsertMisc, and any other token is pushed back and a
  <head> is inferred. The head is appended to html and parsed.

  Phase 2 handles what follows the head. Two locals track state:
    frameset - the first <frameset> start tag seen (NULL until then)
    noframes - the <noframes> element content should be merged into
  The loop either returns (end of input), continues (token consumed),
  or breaks with `node` holding the <body> to insert and parse.

  doc   - document being parsed
  html  - the <html> node whose content is being built
  mode  - token mode forwarded to the head/body/frameset parsers
*/
void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
{
    Node *node, *head;
    Node *frameset = NULL;
    Node *noframes = NULL;

    /* the XML-tags option is switched off while parsing as HTML */
    TY_(SetOptionBool)( doc, TidyXmlTags, no );

    /* phase 1: find (or infer) the head element */
    for (;;)
    {
        node = TY_(GetToken)(doc, IgnoreWhitespace);

        /* end of input before any head: infer one */
        if (node == NULL)
        {
            node = TY_(InferredTag)(doc, TidyTag_HEAD);
            break;
        }

        if ( nodeIsHEAD(node) )
            break;

        /* a stray </html> before the head is dropped */
        if (node->tag == html->tag && node->type == EndTag)
        {
            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* find and discard multiple <html> elements */
        if (node->tag == html->tag && node->type == StartTag)
        {
            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(html, node))
            continue;

        /* anything else belongs after an implied <head>: push it back */
        TY_(UngetToken)( doc );
        node = TY_(InferredTag)(doc, TidyTag_HEAD);
        break;
    }

    head = node;
    TY_(InsertNodeAtEnd)(html, head);
    TY_(ParseHead)(doc, head, mode);

    /* phase 2: body / frameset / noframes */
    for (;;)
    {
        node = TY_(GetToken)(doc, IgnoreWhitespace);

        if (node == NULL)
        {
            if (frameset == NULL) /* implied body */
            {
                node = TY_(InferredTag)(doc, TidyTag_BODY);
                TY_(InsertNodeAtEnd)(html, node);
                TY_(ParseBody)(doc, node, mode);
            }

            return;
        }

        /* robustly handle html tags */
        if (node->tag == html->tag)
        {
            /* extra <html> start tags are dropped silently; other forms
               are reported unless a frameset has already been seen */
            if (node->type != StartTag && frameset == NULL)
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(html, node))
            continue;

        /* if frameset document coerce <body> to <noframes> */
        if ( nodeIsBODY(node) )
        {
            /* only a start tag can open the body */
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* NOTE(review): the coercion is only attempted when the
               accessibility check level is 0; at any other level a
               <body> start tag falls through to the break below even
               if a frameset was seen - confirm this is intended */
            if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
            {
                if (frameset != NULL)
                {
                    /* push <body> back so it is re-read while parsing
                       the noframes element */
                    TY_(UngetToken)( doc );

                    if (noframes == NULL)
                    {
                        noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
                        TY_(InsertNodeAtEnd)(frameset, noframes);
                        TY_(ReportError)(doc, html, noframes, INSERTING_TAG);
                    }
                    else
                    {
                        /* an empty-element noframes must become a start
                           tag before it is given content */
                        if (noframes->type == StartEndTag)
                            noframes->type = StartTag;
                    }

                    ParseTag(doc, noframes, mode);
                    continue;
                }
            }

            TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
            break;  /* to parse body */
        }

        /* flag an error if we see more than one frameset */
        if ( nodeIsFRAMESET(node) )
        {
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* a duplicate is reported but still inserted and parsed;
               `frameset` keeps pointing at the first one */
            if (frameset != NULL)
                TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET);
            else
                frameset = node;

            TY_(InsertNodeAtEnd)(html, node);
            ParseTag(doc, node, mode);

            /*
              see if it includes a noframes element so
              that we can merge subsequent noframes elements
            */

            /* the scan walks the children of `frameset` (the first
               frameset) and remembers the last noframes child found */
            for (node = frameset->content; node; node = node->next)
            {
                if ( nodeIsNOFRAMES(node) )
                    noframes = node;
            }
            continue;
        }

        /* if not a frameset document coerce <noframes> to <body> */
        if ( nodeIsNOFRAMES(node) )
        {
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* no frameset: the noframes tag itself is dropped and an
               inferred body takes its place */
            if (frameset == NULL)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                node = TY_(InferredTag)(doc, TidyTag_BODY);
                break;
            }

            /* first noframes is attached to the frameset; a later one
               is freed and its content parsed into the existing one */
            if (noframes == NULL)
            {
                noframes = node;
                TY_(InsertNodeAtEnd)(frameset, noframes);
            }
            else
                TY_(FreeNode)( doc, node);

            ParseTag(doc, noframes, mode);
            continue;
        }

        if (TY_(nodeIsElement)(node))
        {
            /* head-model elements seen after the head are handed to
               MoveToHead */
            if (node->tag && node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, html, node);
                continue;
            }

            /* discard illegal frame element following a frameset */
            if ( frameset != NULL && nodeIsFRAME(node) )
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
        }

        /* the token is content: push it back so the element parser
           chosen below reads it */
        TY_(UngetToken)( doc );

        /* insert other content into noframes element */

        if (frameset)
        {
            if (noframes == NULL)
            {
                noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
                TY_(InsertNodeAtEnd)(frameset, noframes);
            }
            else
            {
                /* `node` still refers to the token that was just pushed
                   back; it is used here only for the report */
                TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT);
                if (noframes->type == StartEndTag)
                    noframes->type = StartTag;
            }

            TY_(ConstrainVersion)(doc, VERS_FRAMESET);
            ParseTag(doc, noframes, mode);
            continue;
        }

        /* no frameset: content implies a body */
        node = TY_(InferredTag)(doc, TidyTag_BODY);
        TY_(ReportError)(doc, html, node, INSERTING_TAG );
        TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
        break;
    }

    /* node must be body */

    TY_(InsertNodeAtEnd)(html, node);
    ParseTag(doc, node, mode);
}
3912 
nodeCMIsOnlyInline(Node * node)3913 static Bool nodeCMIsOnlyInline( Node* node )
3914 {
3915     return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
3916 }
3917 
/*
  Wraps loose text in the document body in inferred <p> elements.

  Walks the direct children of the node returned by TY_(FindBody).
  Whenever a non-blank text node or an inline-only element is met, a
  new <p> is inserted in front of it and that node, together with every
  following sibling that is either not an element or is an inline-only
  element, is moved into the <p>. TrimSpaces is then applied to the <p>
  and the walk resumes at the sibling that ended the run.
  Does nothing when there is no body.
*/
static void EncloseBodyText(TidyDocImpl* doc)
{
    Node* body = TY_(FindBody)(doc);
    Node* cur;

    if (body == NULL)
        return;

    cur = body->content;

    while (cur != NULL)
    {
        Bool startsRun =
            (TY_(nodeIsText)(cur) && !TY_(IsBlank)(doc->lexer, cur)) ||
            (TY_(nodeIsElement)(cur) && nodeCMIsOnlyInline(cur));
        Node* para;

        if (!startsRun)
        {
            cur = cur->next;
            continue;
        }

        para = TY_(InferredTag)(doc, TidyTag_P);
        TY_(InsertNodeBeforeElement)(cur, para);

        /* move the whole run (text, other non-elements, inline-only
           elements) into the new paragraph */
        while (cur != NULL &&
               (!TY_(nodeIsElement)(cur) || nodeCMIsOnlyInline(cur)))
        {
            Node* following = cur->next;

            TY_(RemoveNode)(cur);
            TY_(InsertNodeAtEnd)(para, cur);
            cur = following;
        }

        TrimSpaces(doc, para);
        /* cur is now the node that stopped the run (or NULL) */
    }
}
3948 
3949 /* <form>, <blockquote> and <noscript> do not allow #PCDATA in
3950    HTML 4.01 Strict (%block; model instead of %flow;).
3951   When requested, text nodes in these elements are wrapped in <p>. */
EncloseBlockText(TidyDocImpl * doc,Node * node)3952 static void EncloseBlockText(TidyDocImpl* doc, Node* node)
3953 {
3954     Node *next;
3955     Node *block;
3956 
3957     while (node)
3958     {
3959         next = node->next;
3960 
3961         if (node->content)
3962             EncloseBlockText(doc, node->content);
3963 
3964         if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
3965               nodeIsBLOCKQUOTE(node))
3966             || !node->content)
3967         {
3968             node = next;
3969             continue;
3970         }
3971 
3972         block = node->content;
3973 
3974         if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
3975             (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
3976         {
3977             Node* p = TY_(InferredTag)(doc, TidyTag_P);
3978             TY_(InsertNodeBeforeElement)(block, p);
3979             while (block &&
3980                    (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
3981             {
3982                 Node* tempNext = block->next;
3983                 TY_(RemoveNode)(block);
3984                 TY_(InsertNodeAtEnd)(p, block);
3985                 block = tempNext;
3986             }
3987             TrimSpaces(doc, p);
3988             continue;
3989         }
3990 
3991         node = next;
3992     }
3993 }
3994 
/*
  Walks the sibling list starting at `node`, and recursively each node's
  content, passing obsolete elements to TY_(CoerceNode):
    <dir>, <menu>                   -> TidyTag_UL
    <xmp>, <listing>, <plaintext>   -> TidyTag_PRE
  Both calls pass `yes` for the obsolete and unexpected arguments.
  The next sibling is captured before a node is touched.
*/
static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
{
    Node *cur;
    Node *following;

    for (cur = node; cur != NULL; cur = following)
    {
        Bool isPlainText;

        following = cur->next;

        if (nodeIsDIR(cur) || nodeIsMENU(cur))
            TY_(CoerceNode)(doc, cur, TidyTag_UL, yes, yes);

        /* evaluated after the list coercion above, as a separate test */
        isPlainText = (cur->tag && cur->tag->id == TidyTag_PLAINTEXT);
        if (nodeIsXMP(cur) || nodeIsLISTING(cur) || isPlainText)
            TY_(CoerceNode)(doc, cur, TidyTag_PRE, yes, yes);

        if (cur->content)
            ReplaceObsoleteElements(doc, cur->content);
    }
}
4016 
/*
  Runs attribute checking over the sibling list starting at `node` and,
  recursively, over each node's content.

  For every element node the tag's own chkattrs callback is used when
  the tag definition provides one; otherwise the generic
  TY_(CheckAttributes) is called.

  An element may reach this point without a tag definition (other code
  in this file tests node->tag for NULL before using it), so the tag
  pointer is checked before chkattrs is read. Such nodes get the generic
  check instead of crashing on a NULL dereference.
  NOTE(review): assumes TY_(CheckAttributes) tolerates node->tag == NULL
  - confirm against its definition.
*/
static void AttributeChecks(TidyDocImpl* doc, Node* node)
{
    Node *next;

    while (node)
    {
        /* captured first: the checkers may modify the node */
        next = node->next;

        if (TY_(nodeIsElement)(node))
        {
            if (node->tag && node->tag->chkattrs)
                node->tag->chkattrs(doc, node);
            else
                TY_(CheckAttributes)(doc, node);
        }

        if (node->content)
            AttributeChecks(doc, node->content);

        assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
        node = next;
    }
}
4040 
/*
  HTML is the top level element

  Entry point for parsing an HTML document into doc->root.

  The token loop handles the prolog (XML declaration, comments etc.,
  doctype), discards stray end tags, and on the first remaining token
  either uses it as <html> (when it is an <html> start tag) or pushes it
  back and infers <html>. TY_(ParseHTML) is then run once and the loop
  is left.

  Afterwards: optional accessibility checks, an inferred <html> if none
  was found, an inferred <title> if none was found, then the tree-wide
  passes AttributeChecks, ReplaceObsoleteElements,
  TY_(DropEmptyElements) and CleanSpaces, and finally the optional
  enclose-text passes.
*/
void TY_(ParseDocument)(TidyDocImpl* doc)
{
    Node *node, *html, *doctype = NULL;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->type == XmlDecl)
        {
            /* a second XML declaration, once the root has content, is
               reported and dropped */
            if (TY_(FindXmlDecl)(doc) && doc->root.content)
            {
                TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
            /* the declaration must sit at line 1, column 1; the test is
               equivalent to (line != 1 || column != 1) */
            if (node->line != 1 || (node->line == 1 && node->column != 1))
            {
                TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
            }
        }
#ifdef AUTO_INPUT_ENCODING
        if (node->type == XmlDecl)
        {
            /* NOTE(review): GetAttrByName is called without the TY_()
               wrapper used for other library functions here - confirm
               this compiles when AUTO_INPUT_ENCODING is defined */
            AttVal* encoding = GetAttrByName(node, "encoding");
            if (AttrHasValue(encoding))
            {
                /* id is looked up but not used yet; see the todos */
                uint id = TY_(GetEncodingIdFromName)(encoding->value);

                /* todo: detect mismatch with BOM/XMLDecl/declared */
                /* todo: error for unsupported encodings */
                /* todo: try to re-init transcoder */
                /* todo: change input/output encoding settings */
                /* todo: store id in StreamIn */
            }
        }
#endif /* AUTO_INPUT_ENCODING */

        /* deal with comments etc. */
        if (InsertMisc( &doc->root, node ))
            continue;

        /* the first doctype is kept, any later one is discarded */
        if (node->type == DocTypeTag)
        {
            if (doctype == NULL)
            {
                TY_(InsertNodeAtEnd)( &doc->root, node);
                doctype = node;
            }
            else
            {
                TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
            }
            continue;
        }

        /* end tags cannot appear before <html> */
        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* an <html> carrying the XHTML namespace switches the lexer and
           the output options to XHTML, unless HTML output was requested */
        if (node->type == StartTag && nodeIsHTML(node))
        {
            AttVal *xmlns;

            xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);

            if (AttrValueIs(xmlns, XHTML_NAMESPACE))
            {
                Bool htmlOut = cfgBool( doc, TidyHtmlOut );
                doc->lexer->isvoyager = yes;                  /* Unless plain HTML */
                TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
                TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut );   /* will be XHTML. */

                /* adjust other config options, just as in config.c */
                if ( !htmlOut )
                {
                    TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
                    TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no );
                }
            }
        }

        /* anything other than an <html> start tag is pushed back and
           becomes content of an inferred <html> */
        if ( node->type != StartTag || !nodeIsHTML(node) )
        {
            TY_(UngetToken)( doc );
            html = TY_(InferredTag)(doc, TidyTag_HTML);
        }
        else
            html = node;

        if (!TY_(FindDocType)(doc))
            TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE);

        TY_(InsertNodeAtEnd)( &doc->root, html);
        TY_(ParseHTML)( doc, html, IgnoreWhitespace );
        /* ParseHTML is run at most once from this loop */
        break;
    }

#if SUPPORT_ACCESSIBILITY_CHECKS
    /* do this before any more document fixes */
    if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
        TY_(AccessibilityChecks)( doc );
#endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */

    /* reached without an <html> when the loop above never got as far as
       inserting one (e.g. the tokens ran out first) */
    if (!TY_(FindHTML)(doc))
    {
        /* a later check should complain if <body> is empty */
        html = TY_(InferredTag)(doc, TidyTag_HTML);
        TY_(InsertNodeAtEnd)( &doc->root, html);
        TY_(ParseHTML)(doc, html, IgnoreWhitespace);
    }

    /* make sure the head has a <title> */
    if (!TY_(FindTITLE)(doc))
    {
        /* NOTE(review): head is used without a NULL check; this relies
           on TY_(ParseHTML) always appending a head to <html> */
        Node* head = TY_(FindHEAD)(doc);
        TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT);
        TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
    }

    /* whole-tree passes, in this fixed order */
    AttributeChecks(doc, &doc->root);
    ReplaceObsoleteElements(doc, &doc->root);
    TY_(DropEmptyElements)(doc, &doc->root);
    CleanSpaces(doc, &doc->root);

    /* optional wrapping of loose text, controlled by configuration */
    if (cfgBool(doc, TidyEncloseBodyText))
        EncloseBodyText(doc);
    if (cfgBool(doc, TidyEncloseBlockText))
        EncloseBlockText(doc, &doc->root);
}
4175 
TY_(XMLPreserveWhiteSpace)4176 Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
4177 {
4178     AttVal *attribute;
4179 
4180     /* search attributes for xml:space */
4181     for (attribute = element->attributes; attribute; attribute = attribute->next)
4182     {
4183         if (attrIsXML_SPACE(attribute))
4184         {
4185             if (AttrValueIs(attribute, "preserve"))
4186                 return yes;
4187 
4188             return no;
4189         }
4190     }
4191 
4192     if (element->element == NULL)
4193         return no;
4194 
4195     /* kludge for html docs without explicit xml:space attribute */
4196     if (nodeIsPRE(element)    ||
4197         nodeIsSCRIPT(element) ||
4198         nodeIsSTYLE(element)  ||
4199         TY_(FindParser)(doc, element) == TY_(ParsePre))
4200         return yes;
4201 
4202     /* kludge for XSL docs */
4203     if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
4204         return yes;
4205 
4206     return no;
4207 }
4208 
4209 /*
4210   XML documents
4211 */
/*
  Parses the content of an XML element.

  Tokens are read until an end tag whose name equals the element's name
  (case-sensitive comparison) is found, or until input runs out. Other
  end tags are reported as fatal and discarded. A start tag has its own
  content parsed recursively before being appended; every other token
  is appended to `element` as is.

  When TY_(XMLPreserveWhiteSpace) says so, the token mode is switched to
  Preformatted for this element, and that mode is also what nested
  elements are called with. Unless preformatted, one leading space is
  trimmed from a first-child text node and one trailing space from a
  last-child text node; a text node emptied by that is discarded.

  doc      - document being parsed
  element  - element to fill; must not be NULL
  mode     - token mode requested by the caller
*/
static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;

    /* if node is pre or has xml:space="preserve" then do so */

    if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
        mode = Preformatted;

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        if (node->type == EndTag)
        {
            /* our own end tag closes the element */
            if (node->element && element->element &&
                TY_(tmbstrcmp)(node->element, element->element) == 0)
            {
                TY_(FreeNode)( doc, node);
                element->closed = yes;
                break;
            }

            /*
             discard unexpected end tags; element cannot be NULL here
             because TY_(XMLPreserveWhiteSpace) above has already
             dereferenced it, so the report always names the element
            */
            TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* parse content on seeing start tag */
        if (node->type == StartTag)
            ParseXMLElement( doc, node, mode );

        TY_(InsertNodeAtEnd)(element, node);
    }

    /*
     if first child is text then trim initial space and
     delete text node if it is empty.
    */

    node = element->content;

    if (TY_(nodeIsText)(node) && mode != Preformatted)
    {
        if ( lexer->lexbuf[node->start] == ' ' )
        {
            node->start++;

            if (node->start >= node->end)
                TY_(DiscardElement)( doc, node );
        }
    }

    /*
     if last child is text then trim final space and
     delete the text node if it is empty
    */

    /* re-read after the possible discard above */
    node = element->last;

    if (TY_(nodeIsText)(node) && mode != Preformatted)
    {
        if ( lexer->lexbuf[node->end - 1] == ' ' )
        {
            node->end--;

            if (node->start >= node->end)
                TY_(DiscardElement)( doc, node );
        }
    }
}
4288 
TY_(ParseXMLDocument)4289 void TY_(ParseXMLDocument)(TidyDocImpl* doc)
4290 {
4291     Node *node, *doctype = NULL;
4292 
4293     TY_(SetOptionBool)( doc, TidyXmlTags, yes );
4294 
4295     while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4296     {
4297         /* discard unexpected end tags */
4298         if (node->type == EndTag)
4299         {
4300             TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG);
4301             TY_(FreeNode)( doc, node);
4302             continue;
4303         }
4304 
4305          /* deal with comments etc. */
4306         if (InsertMisc( &doc->root, node))
4307             continue;
4308 
4309         if (node->type == DocTypeTag)
4310         {
4311             if (doctype == NULL)
4312             {
4313                 TY_(InsertNodeAtEnd)( &doc->root, node);
4314                 doctype = node;
4315             }
4316             else
4317             {
4318                 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4319                 TY_(FreeNode)( doc, node);
4320             }
4321             continue;
4322         }
4323 
4324         if (node->type == StartEndTag)
4325         {
4326             TY_(InsertNodeAtEnd)( &doc->root, node);
4327             continue;
4328         }
4329 
4330        /* if start tag then parse element's content */
4331         if (node->type == StartTag)
4332         {
4333             TY_(InsertNodeAtEnd)( &doc->root, node );
4334             ParseXMLElement( doc, node, IgnoreWhitespace );
4335             continue;
4336         }
4337 
4338         TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4339         TY_(FreeNode)( doc, node);
4340     }
4341 
4342     /* ensure presence of initial <?xml version="1.0"?> */
4343     if ( cfgBool(doc, TidyXmlDecl) )
4344         TY_(FixXmlDecl)( doc );
4345 }
4346 
4347 /*
4348  * local variables:
4349  * mode: c
4350  * indent-tabs-mode: nil
4351  * c-basic-offset: 4
4352  * eval: (c-set-offset 'substatement-open 0)
4353  * end:
4354  */
4355