1 /* parser.c -- HTML Parser
2
3 (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4 See tidyp.h for the copyright notice.
5
6 */
7
8 #include "tidy-int.h"
9 #include "lexer.h"
10 #include "parser.h"
11 #include "message.h"
12 #include "clean.h"
13 #include "tags.h"
14 #include "tmbstr.h"
15
16 #ifdef AUTO_INPUT_ENCODING
17 #include "charsets.h"
18 #endif
19
TY_(CheckNodeIntegrity)20 Bool TY_(CheckNodeIntegrity)(Node *node)
21 {
22 #ifndef NO_NODE_INTEGRITY_CHECK
23 Node *child;
24
25 if (node->prev)
26 {
27 if (node->prev->next != node)
28 return no;
29 }
30
31 if (node->next)
32 {
33 if (node->next == node || node->next->prev != node)
34 return no;
35 }
36
37 if (node->parent)
38 {
39 if (node->prev == NULL && node->parent->content != node)
40 return no;
41
42 if (node->next == NULL && node->parent->last != node)
43 return no;
44 }
45
46 for (child = node->content; child; child = child->next)
47 if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
48 return no;
49
50 #endif
51 return yes;
52 }
53
54 /*
55 used to determine how attributes
56 without values should be printed
57 this was introduced to deal with
58 user defined tags e.g. Cold Fusion
59 */
TY_(IsNewNode)60 Bool TY_(IsNewNode)(Node *node)
61 {
62 if (node && node->tag)
63 {
64 return (node->tag->model & CM_NEW);
65 }
66 return yes;
67 }
68
TY_(CoerceNode)69 void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
70 {
71 const Dict* tag = TY_(LookupTagDef)(tid);
72 Node* tmp = TY_(InferredTag)(doc, tag->id);
73
74 if (obsolete)
75 TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT);
76 else if (unexpected)
77 TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
78 else
79 TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT);
80
81 TidyDocFree(doc, tmp->element);
82 TidyDocFree(doc, tmp);
83
84 node->was = node->tag;
85 node->tag = tag;
86 node->type = StartTag;
87 node->implicit = yes;
88 TidyDocFree(doc, node->element);
89 node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
90 }
91
92 /* extract a node and its children from a markup tree */
TY_(RemoveNode)93 Node *TY_(RemoveNode)(Node *node)
94 {
95 if (node->prev)
96 node->prev->next = node->next;
97
98 if (node->next)
99 node->next->prev = node->prev;
100
101 if (node->parent)
102 {
103 if (node->parent->content == node)
104 node->parent->content = node->next;
105
106 if (node->parent->last == node)
107 node->parent->last = node->prev;
108 }
109
110 node->parent = node->prev = node->next = NULL;
111 return node;
112 }
113
114 /* remove node from markup tree and discard it */
TY_(DiscardElement)115 Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
116 {
117 Node *next = NULL;
118
119 if (element)
120 {
121 next = element->next;
122 TY_(RemoveNode)(element);
123 TY_(FreeNode)( doc, element);
124 }
125
126 return next;
127 }
128
129 /*
130 insert "node" into markup tree as the firt element
131 of content of "element"
132 */
TY_(InsertNodeAtStart)133 void TY_(InsertNodeAtStart)(Node *element, Node *node)
134 {
135 node->parent = element;
136
137 if (element->content == NULL)
138 element->last = node;
139 else
140 element->content->prev = node;
141
142 node->next = element->content;
143 node->prev = NULL;
144 element->content = node;
145 }
146
147 /*
148 insert "node" into markup tree as the last element
149 of content of "element"
150 */
TY_(InsertNodeAtEnd)151 void TY_(InsertNodeAtEnd)(Node *element, Node *node)
152 {
153 node->parent = element;
154 node->prev = element->last;
155
156 if (element->last != NULL)
157 element->last->next = node;
158 else
159 element->content = node;
160
161 element->last = node;
162 }
163
164 /*
165 insert "node" into markup tree in place of "element"
166 which is moved to become the child of the node
167 */
InsertNodeAsParent(Node * element,Node * node)168 static void InsertNodeAsParent(Node *element, Node *node)
169 {
170 node->content = element;
171 node->last = element;
172 node->parent = element->parent;
173 element->parent = node;
174
175 if (node->parent->content == element)
176 node->parent->content = node;
177
178 if (node->parent->last == element)
179 node->parent->last = node;
180
181 node->prev = element->prev;
182 element->prev = NULL;
183
184 if (node->prev)
185 node->prev->next = node;
186
187 node->next = element->next;
188 element->next = NULL;
189
190 if (node->next)
191 node->next->prev = node;
192 }
193
194 /* insert "node" into markup tree before "element" */
TY_(InsertNodeBeforeElement)195 void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
196 {
197 Node *parent;
198
199 parent = element->parent;
200 node->parent = parent;
201 node->next = element;
202 node->prev = element->prev;
203 element->prev = node;
204
205 if (node->prev)
206 node->prev->next = node;
207
208 if (parent->content == element)
209 parent->content = node;
210 }
211
212 /* insert "node" into markup tree after "element" */
TY_(InsertNodeAfterElement)213 void TY_(InsertNodeAfterElement)(Node *element, Node *node)
214 {
215 Node *parent;
216
217 parent = element->parent;
218 node->parent = parent;
219
220 /* AQ - 13 Jan 2000 fix for parent == NULL */
221 if (parent != NULL && parent->last == element)
222 parent->last = node;
223 else
224 {
225 node->next = element->next;
226 /* AQ - 13 Jan 2000 fix for node->next == NULL */
227 if (node->next != NULL)
228 node->next->prev = node;
229 }
230
231 element->next = node;
232 node->prev = element;
233 }
234
CanPrune(TidyDocImpl * doc,Node * element)235 static Bool CanPrune( TidyDocImpl* doc, Node *element )
236 {
237 if ( TY_(nodeIsText)(element) )
238 return yes;
239
240 if ( element->content )
241 return no;
242
243 if ( element->tag == NULL )
244 return no;
245
246 if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
247 return no;
248
249 if ( nodeIsA(element) && element->attributes != NULL )
250 return no;
251
252 if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
253 return no;
254
255 if ( element->tag->model & CM_ROW )
256 return no;
257
258 if ( element->tag->model & CM_EMPTY )
259 return no;
260
261 if ( nodeIsAPPLET(element) )
262 return no;
263
264 if ( nodeIsOBJECT(element) )
265 return no;
266
267 if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
268 return no;
269
270 if ( nodeIsTITLE(element) )
271 return no;
272
273 /* #433359 - fix by Randy Waki 12 Mar 01 */
274 if ( nodeIsIFRAME(element) )
275 return no;
276
277 /* fix for bug 770297 */
278 if (nodeIsTEXTAREA(element))
279 return no;
280
281 if ( attrGetID(element) || attrGetNAME(element) )
282 return no;
283
284 /* fix for bug 695408; a better fix would look for unknown and */
285 /* known proprietary attributes that make the element significant */
286 if (attrGetDATAFLD(element))
287 return no;
288
289 /* fix for bug 723772, don't trim new-...-tags */
290 if (element->tag->id == TidyTag_UNKNOWN)
291 return no;
292
293 if (nodeIsBODY(element))
294 return no;
295
296 if (nodeIsCOLGROUP(element))
297 return no;
298
299 return yes;
300 }
301
302 /* return next element */
TY_(TrimEmptyElement)303 Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
304 {
305 if ( CanPrune(doc, element) )
306 {
307 if (element->type != TextNode)
308 TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
309
310 return TY_(DiscardElement)(doc, element);
311 }
312 return element->next;
313 }
314
TY_(DropEmptyElements)315 Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
316 {
317 Node* next;
318
319 while (node)
320 {
321 next = node->next;
322
323 if (node->content)
324 TY_(DropEmptyElements)(doc, node->content);
325
326 if (!TY_(nodeIsElement)(node) &&
327 !(TY_(nodeIsText)(node) && !(node->start < node->end)))
328 {
329 node = next;
330 continue;
331 }
332
333 next = TY_(TrimEmptyElement)(doc, node);
334 node = next;
335 }
336
337 return node;
338 }
339
340 /*
341 errors in positioning of form start or end tags
342 generally require human intervention to fix
343 */
BadForm(TidyDocImpl * doc)344 static void BadForm( TidyDocImpl* doc )
345 {
346 doc->badForm = yes;
347 /* doc->errors++; */
348 }
349
350 /*
351 This maps
352 <em>hello </em><strong>world</strong>
353 to
354 <em>hello</em> <strong>world</strong>
355
356 If last child of element is a text node
357 then trim trailing white space character
358 moving it to after element's end tag.
359 */
TrimTrailingSpace(TidyDocImpl * doc,Node * element,Node * last)360 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
361 {
362 Lexer* lexer = doc->lexer;
363 byte c;
364
365 if (TY_(nodeIsText)(last))
366 {
367 if (last->end > last->start)
368 {
369 c = (byte) lexer->lexbuf[ last->end - 1 ];
370
371 if ( c == ' '
372 #ifdef COMMENT_NBSP_FIX
373 || c == 160
374 #endif
375 )
376 {
377 #ifdef COMMENT_NBSP_FIX
378 /* take care with <td> </td> */
379 if ( c == 160 &&
380 ( element->tag == doc->tags.tag_td ||
381 element->tag == doc->tags.tag_th )
382 )
383 {
384 if (last->end > last->start + 1)
385 last->end -= 1;
386 }
387 else
388 #endif
389 {
390 last->end -= 1;
391 if ( (element->tag->model & CM_INLINE) &&
392 !(element->tag->model & CM_FIELD) )
393 lexer->insertspace = yes;
394 }
395 }
396 }
397 }
398 }
399
400 /* Only true for text nodes. */
TY_(IsBlank)401 Bool TY_(IsBlank)(Lexer *lexer, Node *node)
402 {
403 Bool isBlank = TY_(nodeIsText)(node);
404 if ( isBlank )
405 isBlank = ( node->end == node->start || /* Zero length */
406 ( node->end == node->start+1 /* or one blank. */
407 && lexer->lexbuf[node->start] == ' ' ) );
408 return isBlank;
409 }
410
411 /*
412 This maps
413 <p>hello<em> world</em>
414 to
415 <p>hello <em>world</em>
416
417 Trims initial space, by moving it before the
418 start tag, or if this element is the first in
419 parent's content, then by discarding the space
420 */
TrimInitialSpace(TidyDocImpl * doc,Node * element,Node * text)421 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
422 {
423 Lexer* lexer = doc->lexer;
424 Node *prev, *node;
425
426 if ( TY_(nodeIsText)(text) &&
427 lexer->lexbuf[text->start] == ' ' &&
428 text->start < text->end )
429 {
430 if ( (element->tag->model & CM_INLINE) &&
431 !(element->tag->model & CM_FIELD) )
432 {
433 prev = element->prev;
434
435 if (TY_(nodeIsText)(prev))
436 {
437 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
438 lexer->lexbuf[(prev->end)++] = ' ';
439
440 ++(element->start);
441 }
442 else /* create new node */
443 {
444 node = TY_(NewNode)(lexer->allocator, lexer);
445 node->start = (element->start)++;
446 node->end = element->start;
447 lexer->lexbuf[node->start] = ' ';
448 TY_(InsertNodeBeforeElement)(element ,node);
449 }
450 }
451
452 /* discard the space in current node */
453 ++(text->start);
454 }
455 }
456
IsPreDescendant(Node * node)457 static Bool IsPreDescendant(Node* node)
458 {
459 Node *parent = node->parent;
460
461 while (parent)
462 {
463 if (parent->tag && parent->tag->parser == TY_(ParsePre))
464 return yes;
465
466 parent = parent->parent;
467 }
468
469 return no;
470 }
471
CleanTrailingWhitespace(TidyDocImpl * doc,Node * node)472 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
473 {
474 Node* next;
475
476 if (!TY_(nodeIsText)(node))
477 return no;
478
479 if (node->parent->type == DocTypeTag)
480 return no;
481
482 if (IsPreDescendant(node))
483 return no;
484
485 if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
486 return no;
487
488 next = node->next;
489
490 /* <p>... </p> */
491 if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
492 return yes;
493
494 /* <div><small>... </small><h3>...</h3></div> */
495 if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
496 return yes;
497
498 if (!next)
499 return no;
500
501 if (nodeIsBR(next))
502 return yes;
503
504 if (TY_(nodeHasCM)(next, CM_INLINE))
505 return no;
506
507 /* <a href='/'>...</a> <p>...</p> */
508 if (next->type == StartTag)
509 return yes;
510
511 /* <strong>...</strong> <hr /> */
512 if (next->type == StartEndTag)
513 return yes;
514
515 /* evil adjacent text nodes, Tidy should not generate these :-( */
516 if (TY_(nodeIsText)(next) && next->start < next->end
517 && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
518 return yes;
519
520 return no;
521 }
522
CleanLeadingWhitespace(TidyDocImpl * ARG_UNUSED (doc),Node * node)523 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
524 {
525 if (!TY_(nodeIsText)(node))
526 return no;
527
528 if (node->parent->type == DocTypeTag)
529 return no;
530
531 if (IsPreDescendant(node))
532 return no;
533
534 if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
535 return no;
536
537 /* <p>...<br> <em>...</em>...</p> */
538 if (nodeIsBR(node->prev))
539 return yes;
540
541 /* <p> ...</p> */
542 if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
543 return yes;
544
545 /* <h4>...</h4> <em>...</em> */
546 if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
547 TY_(nodeIsElement)(node->prev))
548 return yes;
549
550 /* <p><span> ...</span></p> */
551 if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
552 return yes;
553
554 return no;
555 }
556
/*
 Walk the sibling list starting at "node" and, recursively, all
 content below it.  Text nodes have leading and/or trailing white
 space stripped where CleanLeadingWhitespace() and
 CleanTrailingWhitespace() permit; a text node left empty is removed
 from the tree and freed.
*/
static void CleanSpaces(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        /* remember the successor: node may be freed below */
        following = node->next;

        if (TY_(nodeIsText)(node))
        {
            if (CleanLeadingWhitespace(doc, node))
            {
                while (node->start < node->end &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
                    ++(node->start);
            }

            if (CleanTrailingWhitespace(doc, node))
            {
                while (node->end > node->start &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
                    --(node->end);
            }

            if (!(node->start < node->end))
            {
                TY_(RemoveNode)(node);
                TY_(FreeNode)(doc, node);
                continue;
            }
        }

        if (node->content)
            CleanSpaces(doc, node->content);
    }
}
588
589 /*
590 Move initial and trailing space out.
591 This routine maps:
592
593 hello<em> world</em>
594 to
595 hello <em>world</em>
596 and
597 <em>hello </em><strong>world</strong>
598 to
599 <em>hello</em> <strong>world</strong>
600 */
TrimSpaces(TidyDocImpl * doc,Node * element)601 static void TrimSpaces( TidyDocImpl* doc, Node *element)
602 {
603 Node* text = element->content;
604
605 if (nodeIsPRE(element) || IsPreDescendant(element))
606 return;
607
608 if (TY_(nodeIsText)(text))
609 TrimInitialSpace(doc, element, text);
610
611 text = element->last;
612
613 if (TY_(nodeIsText)(text))
614 TrimTrailingSpace(doc, element, text);
615 }
616
DescendantOf(Node * element,TidyTagId tid)617 static Bool DescendantOf( Node *element, TidyTagId tid )
618 {
619 Node *parent;
620 for ( parent = element->parent;
621 parent != NULL;
622 parent = parent->parent )
623 {
624 if ( TagIsId(parent, tid) )
625 return yes;
626 }
627 return no;
628 }
629
InsertMisc(Node * element,Node * node)630 static Bool InsertMisc(Node *element, Node *node)
631 {
632 if (node->type == CommentTag ||
633 node->type == ProcInsTag ||
634 node->type == CDATATag ||
635 node->type == SectionTag ||
636 node->type == AspTag ||
637 node->type == JsteTag ||
638 node->type == PhpTag )
639 {
640 TY_(InsertNodeAtEnd)(element, node);
641 return yes;
642 }
643
644 if ( node->type == XmlDecl )
645 {
646 Node* root = element;
647 while ( root && root->parent )
648 root = root->parent;
649 if ( root && !(root->content && root->content->type == XmlDecl))
650 {
651 TY_(InsertNodeAtStart)( root, node );
652 return yes;
653 }
654 }
655
656 /* Declared empty tags seem to be slipping through
657 ** the cracks. This is an experiment to figure out
658 ** a decent place to pick them up.
659 */
660 if ( node->tag &&
661 TY_(nodeIsElement)(node) &&
662 TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
663 (node->tag->versions & VERS_PROPRIETARY) != 0 )
664 {
665 TY_(InsertNodeAtEnd)(element, node);
666 return yes;
667 }
668
669 return no;
670 }
671
672
/*
 Dispatch "node" to the parser registered for its tag.

 Before dispatching, lexer state is reset: empty-model tags clear
 lexer->waswhite, and other non-inline tags clear lexer->insertspace
 (fix by GLP 2000-12-21).  Nothing is parsed when the tag has no
 parser or when the node is a StartEndTag.
*/
static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;

    if (node->tag->model & CM_EMPTY)
        lexer->waswhite = no;
    else if (!(node->tag->model & CM_INLINE))
        lexer->insertspace = no;

    if (node->tag->parser == NULL || node->type == StartEndTag)
        return;

    (*node->tag->parser)( doc, node, mode );
}
697
698 /*
699 the doctype has been found after other tags,
700 and needs moving to before the html element
701 */
InsertDocType(TidyDocImpl * doc,Node * element,Node * doctype)702 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
703 {
704 Node* existing = TY_(FindDocType)( doc );
705 if ( existing )
706 {
707 TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
708 TY_(FreeNode)( doc, doctype );
709 }
710 else
711 {
712 TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
713 while ( !nodeIsHTML(element) )
714 element = element->parent;
715 TY_(InsertNodeBeforeElement)( element, doctype );
716 }
717 }
718
719 /*
720 move node to the head, where element is used as starting
721 point in hunt for head. normally called during parsing
722 */
MoveToHead(TidyDocImpl * doc,Node * element,Node * node)723 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
724 {
725 Node *head;
726
727 TY_(RemoveNode)( node ); /* make sure that node is isolated */
728
729 if ( TY_(nodeIsElement)(node) )
730 {
731 TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
732
733 head = TY_(FindHEAD)(doc);
734 assert(head != NULL);
735
736 TY_(InsertNodeAtEnd)(head, node);
737
738 if ( node->tag->parser )
739 ParseTag( doc, node, IgnoreWhitespace );
740 }
741 else
742 {
743 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
744 TY_(FreeNode)( doc, node );
745 }
746 }
747
748 /* moves given node to end of body element */
MoveNodeToBody(TidyDocImpl * doc,Node * node)749 static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
750 {
751 Node* body = TY_(FindBody)( doc );
752 if ( body )
753 {
754 TY_(RemoveNode)( node );
755 TY_(InsertNodeAtEnd)( body, node );
756 }
757 }
758
/*
 Attach a fixed set of padding/margin style properties to "node",
 but only when the TidyDecorateInferredUL option is on.  With
 TidyMakeClean the style is added as a class, otherwise as a style
 property.
*/
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
{
    ctmbstr noIndentStyle =
        "padding-left: 2ex; margin-left: 0ex"
        "; margin-top: 0ex; margin-bottom: 0ex";

    if ( cfgBool(doc, TidyDecorateInferredUL) )
    {
        if ( cfgBool(doc, TidyMakeClean) )
            TY_(AddStyleAsClass)( doc, node, noIndentStyle );
        else
            TY_(AddStyleProperty)( doc, node, noIndentStyle );
    }
}
771
772 /*
773 element is node created by the lexer
774 upon seeing the start tag, or by the
775 parser when the start tag is inferred
776 */
TY_(ParseBlock)777 void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
778 {
779 Lexer* lexer = doc->lexer;
780 Node *node;
781 Bool checkstack = yes;
782 uint istackbase = 0;
783
784 if ( element->tag->model & CM_EMPTY )
785 return;
786
787 if ( nodeIsFORM(element) &&
788 DescendantOf(element, TidyTag_FORM) )
789 TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
790
791 /*
792 InlineDup() asks the lexer to insert inline emphasis tags
793 currently pushed on the istack, but take care to avoid
794 propagating inline emphasis inside OBJECT or APPLET.
795 For these elements a fresh inline stack context is created
796 and disposed of upon reaching the end of the element.
797 They thus behave like table cells in this respect.
798 */
799 if (element->tag->model & CM_OBJECT)
800 {
801 istackbase = lexer->istackbase;
802 lexer->istackbase = lexer->istacksize;
803 }
804
805 if (!(element->tag->model & CM_MIXED))
806 TY_(InlineDup)( doc, NULL );
807
808 mode = IgnoreWhitespace;
809
810 while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
811 {
812 /* end tag for this element */
813 if (node->type == EndTag && node->tag &&
814 (node->tag == element->tag || element->was == node->tag))
815 {
816 TY_(FreeNode)( doc, node );
817
818 if (element->tag->model & CM_OBJECT)
819 {
820 /* pop inline stack */
821 while (lexer->istacksize > lexer->istackbase)
822 TY_(PopInline)( doc, NULL );
823 lexer->istackbase = istackbase;
824 }
825
826 element->closed = yes;
827 TrimSpaces( doc, element );
828 return;
829 }
830
831 if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
832 {
833 /* If we're in the HEAD, close it before proceeding.
834 This is an extremely rare occurance, but has been observed.
835 */
836 TY_(UngetToken)( doc );
837 break;
838 }
839
840 if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
841 {
842 if ( TY_(nodeIsElement)(node) )
843 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
844 TY_(FreeNode)( doc, node );
845 continue;
846 }
847
848
849 if (node->type == EndTag)
850 {
851 if (node->tag == NULL)
852 {
853 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
854 TY_(FreeNode)( doc, node );
855 continue;
856 }
857 else if ( nodeIsBR(node) )
858 node->type = StartTag;
859 else if ( nodeIsP(node) )
860 {
861 /* Cannot have a block inside a paragraph, so no checking
862 for an ancestor is necessary -- but we _can_ have
863 paragraphs inside a block, so change it to an implicit
864 empty paragraph, to be dealt with according to the user's
865 options
866 */
867 node->type = StartEndTag;
868 node->implicit = yes;
869 #if OBSOLETE
870 TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
871 TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
872 TY_(InsertNodeAtEnd)( element, node );
873 node = InferredTag(doc, TidyTag_BR);
874 #endif
875 }
876 else if (DescendantOf( element, node->tag->id ))
877 {
878 /*
879 if this is the end tag for an ancestor element
880 then infer end tag for this element
881 */
882 TY_(UngetToken)( doc );
883 break;
884 #if OBSOLETE
885 Node *parent;
886 for ( parent = element->parent;
887 parent != NULL;
888 parent = parent->parent )
889 {
890 if (node->tag == parent->tag)
891 {
892 if (!(element->tag->model & CM_OPT))
893 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
894
895 TY_(UngetToken)( doc );
896
897 if (element->tag->model & CM_OBJECT)
898 {
899 /* pop inline stack */
900 while (lexer->istacksize > lexer->istackbase)
901 TY_(PopInline)( doc, NULL );
902 lexer->istackbase = istackbase;
903 }
904
905 TrimSpaces( doc, element );
906 return;
907 }
908 }
909 #endif
910 }
911 else
912 {
913 /* special case </tr> etc. for stuff moved in front of table */
914 if ( lexer->exiled
915 && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
916 {
917 TY_(UngetToken)( doc );
918 TrimSpaces( doc, element );
919 return;
920 }
921 }
922 }
923
924 /* mixed content model permits text */
925 if (TY_(nodeIsText)(node))
926 {
927 if ( checkstack )
928 {
929 checkstack = no;
930 if (!(element->tag->model & CM_MIXED))
931 {
932 if ( TY_(InlineDup)(doc, node) > 0 )
933 continue;
934 }
935 }
936
937 TY_(InsertNodeAtEnd)(element, node);
938 mode = MixedContent;
939
940 /*
941 HTML4 strict doesn't allow mixed content for
942 elements with %block; as their content model
943 */
944 /*
945 But only body, map, blockquote, form and
946 noscript have content model %block;
947 */
948 if ( nodeIsBODY(element) ||
949 nodeIsMAP(element) ||
950 nodeIsBLOCKQUOTE(element) ||
951 nodeIsFORM(element) ||
952 nodeIsNOSCRIPT(element) )
953 TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
954 continue;
955 }
956
957 if ( InsertMisc(element, node) )
958 continue;
959
960 /* allow PARAM elements? */
961 if ( nodeIsPARAM(node) )
962 {
963 if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
964 {
965 TY_(InsertNodeAtEnd)(element, node);
966 continue;
967 }
968
969 /* otherwise discard it */
970 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
971 TY_(FreeNode)( doc, node );
972 continue;
973 }
974
975 /* allow AREA elements? */
976 if ( nodeIsAREA(node) )
977 {
978 if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
979 {
980 TY_(InsertNodeAtEnd)(element, node);
981 continue;
982 }
983
984 /* otherwise discard it */
985 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
986 TY_(FreeNode)( doc, node );
987 continue;
988 }
989
990 /* ignore unknown start/end tags */
991 if ( node->tag == NULL )
992 {
993 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
994 TY_(FreeNode)( doc, node );
995 continue;
996 }
997
998 /*
999 Allow CM_INLINE elements here.
1000
1001 Allow CM_BLOCK elements here unless
1002 lexer->excludeBlocks is yes.
1003
1004 LI and DD are special cased.
1005
1006 Otherwise infer end tag for this element.
1007 */
1008
1009 if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1010 {
1011 if ( !TY_(nodeIsElement)(node) )
1012 {
1013 if ( nodeIsFORM(node) )
1014 BadForm( doc );
1015
1016 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1017 TY_(FreeNode)( doc, node );
1018 continue;
1019 }
1020
1021 /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1022 /*
1023 If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1024 start tag, discard the start tag and let the subsequent content get
1025 parsed as content of the enclosing LI. This seems to mimic IE and
1026 Netscape, and avoids an infinite loop: without this check,
1027 ParseBlock (which is parsing the LI's content) and ParseList (which
1028 is parsing the LI's parent's content) repeatedly defer to each
1029 other to parse the illegal start tag, each time inferring a missing
1030 </li> or <li> respectively.
1031
1032 NOTE: This check is a bit fragile. It specifically checks for the
1033 four tags that happen to weave their way through the current series
1034 of tests performed by ParseBlock and ParseList to trigger the
1035 infinite loop.
1036 */
1037 if ( nodeIsLI(element) )
1038 {
1039 if ( nodeIsFRAME(node) ||
1040 nodeIsFRAMESET(node) ||
1041 nodeIsOPTGROUP(node) ||
1042 nodeIsOPTION(node) )
1043 {
1044 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1045 TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */
1046 continue;
1047 }
1048 }
1049
1050 if ( nodeIsTD(element) || nodeIsTH(element) )
1051 {
1052 /* if parent is a table cell, avoid inferring the end of the cell */
1053
1054 if ( TY_(nodeHasCM)(node, CM_HEAD) )
1055 {
1056 MoveToHead( doc, element, node );
1057 continue;
1058 }
1059
1060 if ( TY_(nodeHasCM)(node, CM_LIST) )
1061 {
1062 TY_(UngetToken)( doc );
1063 node = TY_(InferredTag)(doc, TidyTag_UL);
1064 AddClassNoIndent(doc, node);
1065 lexer->excludeBlocks = yes;
1066 }
1067 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1068 {
1069 TY_(UngetToken)( doc );
1070 node = TY_(InferredTag)(doc, TidyTag_DL);
1071 lexer->excludeBlocks = yes;
1072 }
1073
1074 /* infer end of current table cell */
1075 if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1076 {
1077 TY_(UngetToken)( doc );
1078 TrimSpaces( doc, element );
1079 return;
1080 }
1081 }
1082 else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1083 {
1084 if ( lexer->excludeBlocks )
1085 {
1086 if ( !TY_(nodeHasCM)(element, CM_OPT) )
1087 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1088
1089 TY_(UngetToken)( doc );
1090
1091 if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1092 lexer->istackbase = istackbase;
1093
1094 TrimSpaces( doc, element );
1095 return;
1096 }
1097 }
1098 else /* things like list items */
1099 {
1100 if (node->tag->model & CM_HEAD)
1101 {
1102 MoveToHead( doc, element, node );
1103 continue;
1104 }
1105
1106 /*
1107 special case where a form start tag
1108 occurs in a tr and is followed by td or th
1109 */
1110
1111 if ( nodeIsFORM(element) &&
1112 nodeIsTD(element->parent) &&
1113 element->parent->implicit )
1114 {
1115 if ( nodeIsTD(node) )
1116 {
1117 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1118 TY_(FreeNode)( doc, node );
1119 continue;
1120 }
1121
1122 if ( nodeIsTH(node) )
1123 {
1124 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1125 TY_(FreeNode)( doc, node );
1126 node = element->parent;
1127 TidyDocFree(doc, node->element);
1128 node->element = TY_(tmbstrdup)(doc->allocator, "th");
1129 node->tag = TY_(LookupTagDef)( TidyTag_TH );
1130 continue;
1131 }
1132 }
1133
1134 if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1135 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1136
1137 TY_(UngetToken)( doc );
1138
1139 if ( TY_(nodeHasCM)(node, CM_LIST) )
1140 {
1141 if ( element->parent && element->parent->tag &&
1142 element->parent->tag->parser == TY_(ParseList) )
1143 {
1144 TrimSpaces( doc, element );
1145 return;
1146 }
1147
1148 node = TY_(InferredTag)(doc, TidyTag_UL);
1149 AddClassNoIndent(doc, node);
1150 }
1151 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1152 {
1153 if ( nodeIsDL(element->parent) )
1154 {
1155 TrimSpaces( doc, element );
1156 return;
1157 }
1158
1159 node = TY_(InferredTag)(doc, TidyTag_DL);
1160 }
1161 else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1162 {
1163 /* http://tidy.sf.net/issue/1316307 */
1164 /* In exiled mode, return so table processing can
1165 continue. */
1166 if (lexer->exiled)
1167 return;
1168 node = TY_(InferredTag)(doc, TidyTag_TABLE);
1169 }
1170 else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1171 {
1172 /* pop inline stack */
1173 while ( lexer->istacksize > lexer->istackbase )
1174 TY_(PopInline)( doc, NULL );
1175 lexer->istackbase = istackbase;
1176 TrimSpaces( doc, element );
1177 return;
1178
1179 }
1180 else
1181 {
1182 TrimSpaces( doc, element );
1183 return;
1184 }
1185 }
1186 }
1187
1188 /* parse known element */
1189 if (TY_(nodeIsElement)(node))
1190 {
1191 if (node->tag->model & CM_INLINE)
1192 {
1193 if (checkstack && !node->implicit)
1194 {
1195 checkstack = no;
1196
1197 if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1198 {
1199 if ( TY_(InlineDup)(doc, node) > 0 )
1200 continue;
1201 }
1202 }
1203
1204 mode = MixedContent;
1205 }
1206 else
1207 {
1208 checkstack = yes;
1209 mode = IgnoreWhitespace;
1210 }
1211
1212 /* trim white space before <br> */
1213 if ( nodeIsBR(node) )
1214 TrimSpaces( doc, element );
1215
1216 TY_(InsertNodeAtEnd)(element, node);
1217
1218 if (node->implicit)
1219 TY_(ReportError)(doc, element, node, INSERTING_TAG );
1220
1221 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1222 continue;
1223 }
1224
1225 /* discard unexpected tags */
1226 if (node->type == EndTag)
1227 TY_(PopInline)( doc, node ); /* if inline end tag */
1228
1229 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1230 TY_(FreeNode)( doc, node );
1231 continue;
1232 }
1233
1234 if (!(element->tag->model & CM_OPT))
1235 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1236
1237 if (element->tag->model & CM_OBJECT)
1238 {
1239 /* pop inline stack */
1240 while ( lexer->istacksize > lexer->istackbase )
1241 TY_(PopInline)( doc, NULL );
1242 lexer->istackbase = istackbase;
1243 }
1244
1245 TrimSpaces( doc, element );
1246 }
1247
/*
  Parses the content of "element" as inline (text plus markup) content.

  Tokens are read with TY_(GetToken) until the end of "element" is seen
  or has to be inferred; text and inline children are appended to
  "element" along the way.

  doc     - document being parsed
  element - node whose content is parsed; nothing is done when its tag
            has CM_EMPTY
  mode    - Preformatted is kept as is, any other value is replaced by
            MixedContent before the token loop starts

  element->closed is set only when the matching end tag is found.
  Every early return from the token loop has either freed the current
  token or pushed it back with TY_(UngetToken).
*/
void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (element->tag->model & CM_EMPTY)
        return;

    /*
     ParseInline is used for some block level elements like H1 to H6
     For such elements we need to insert inline emphasis tags currently
     on the inline stack. For Inline elements, we normally push them
     onto the inline stack provided they aren't implicit or OBJECT/APPLET.
     This test is carried out in PushInline and PopInline, see istack.c

     InlineDup(...) is not called for elements with a CM_MIXED (inline and
     block) content model, e.g. <del> or <ins>, otherwise constructs like

       <p>111<a name='foo'>222<del>333</del>444</a>555</p>
       <p>111<span>222<del>333</del>444</span>555</p>
       <p>111<em>222<del>333</del>444</em>555</p>

     will get corrupted.
    */
    if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
        !TY_(nodeHasCM)(element, CM_MIXED))
        TY_(InlineDup)(doc, NULL);
    else if (TY_(nodeHasCM)(element, CM_INLINE))
        TY_(PushInline)(doc, element);

    /* flag use of NOBR / FONT in doc->badLayout */
    if ( nodeIsNOBR(element) )
        doc->badLayout |= USING_NOBR;
    else if ( nodeIsFONT(element) )
        doc->badLayout |= USING_FONT;

    /* Inline elements may or may not be within a preformatted element */
    if (mode != Preformatted)
        mode = MixedContent;

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        /* end tag for current element */
        if (node->tag == element->tag && node->type == EndTag)
        {
            if (element->tag->model & CM_INLINE)
                TY_(PopInline)( doc, node );

            TY_(FreeNode)( doc, node );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            /*
             if a font element wraps an anchor and nothing else
             then move the font element inside the anchor since
             otherwise it won't alter the anchor text color
            */
            if ( nodeIsFONT(element) &&
                 element->content && element->content == element->last )
            {
                Node *child = element->content;

                if ( nodeIsA(child) )
                {
                    /* the anchor takes over the font element's place */
                    child->parent = element->parent;
                    child->next = element->next;
                    child->prev = element->prev;

                    /* the font element becomes the anchor's only child */
                    element->next = NULL;
                    element->prev = NULL;
                    element->parent = child;

                    /* and inherits what used to be the anchor's content */
                    element->content = child->content;
                    element->last = child->last;
                    child->content = element;

                    TY_(FixNodeLinks)(child);
                    TY_(FixNodeLinks)(element);
                }
            }

            element->closed = yes;
            TrimSpaces( doc, element );
            return;
        }

        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
        /* (see additional conditions below) */
        /* otherwise emphasis nesting is probably unintentional */
        /* big, small, sub, sup have cumulative effect to leave them alone */
        if ( node->type == StartTag
             && node->tag == element->tag
             && TY_(IsPushed)( doc, node )
             && !node->implicit
             && !element->implicit
             && node->tag && (node->tag->model & CM_INLINE)
             && !nodeIsA(node)
             && !nodeIsFONT(node)
             && !nodeIsBIG(node)
             && !nodeIsSMALL(node)
             && !nodeIsSUB(node)
             && !nodeIsSUP(node)
             && !nodeIsQ(node)
             && !nodeIsSPAN(node)
           )
        {
            /* proceeds only if "node" does not have any attribute and
               follows a text node not finishing with a space */
            if (element->content != NULL && node->attributes == NULL
                && TY_(nodeIsText)(element->last)
                && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
            {
                TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
                /* retype the token and push it back so the next iteration
                   sees it as the end tag of "element" */
                node->type = EndTag;
                TY_(UngetToken)(doc);
                continue;
            }

            if (node->attributes == NULL || element->attributes == NULL)
                TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
        }
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
                  nodeIsQ(node) )
        {
            TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
        }

        if ( TY_(nodeIsText)(node) )
        {
            /* only called for 1st child */
            if ( element->content == NULL && !(mode & Preformatted) )
                TrimSpaces( doc, element );

            /* drop text nodes with an empty range */
            if ( node->start >= node->end )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* mixed content model so allow text */
        if (InsertMisc(element, node))
            continue;

        /* deal with HTML tags */
        if ( nodeIsHTML(node) )
        {
            if ( TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* otherwise infer end of inline element */
            TY_(UngetToken)( doc );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /* within <dt> or <pre> map <p> to <br> */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             ( (mode & Preformatted) ||
               nodeIsDT(element) ||
               DescendantOf(element, TidyTag_DT )
             )
           )
        {
            /* rewrite the token in place: new tag definition and new name */
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
            TidyDocFree(doc, node->element);
            node->element = TY_(tmbstrdup)(doc->allocator, "br");
            TrimSpaces(doc, element);
            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* <p> allowed within <address> in HTML 4.01 Transitional */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             nodeIsADDRESS(element) )
        {
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            TY_(InsertNodeAtEnd)(element, node);
            /* the <p> is parsed by the parser registered for its tag */
            (*node->tag->parser)( doc, node, mode );
            continue;
        }

        /* ignore unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* </br> is turned into <br> before the end tag handling below */
        if ( nodeIsBR(node) && node->type == EndTag )
            node->type = StartTag;

        if ( node->type == EndTag )
        {
            /* coerce </br> to <br> */
            /* NOTE: this first branch cannot be taken, because a </br>
               token was already retyped to StartTag just above */
            if ( nodeIsBR(node) )
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                /* coerce unmatched </p> to <br><br> */
                if ( !DescendantOf(element, TidyTag_P) )
                {
                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                    TrimSpaces( doc, element );
                    TY_(InsertNodeAtEnd)( element, node );
                    /* the second <br> is a freshly inferred node */
                    node = TY_(InferredTag)(doc, TidyTag_BR);
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
                    continue;
                }
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE)
                      && !nodeIsA(node)
                      && !TY_(nodeHasCM)(node, CM_OBJECT)
                      && TY_(nodeHasCM)(element, CM_INLINE) )
            {
                /* allow any inline end tag to end current element */

                /* http://tidy.sf.net/issue/1426419 */
                /* but, like the browser, retain an earlier inline element.
                   This is implemented by setting the lexer into a mode
                   where it gets tokens from the inline stack rather than
                   from the input stream. Check if the scenario fits. */
                if ( !nodeIsA(element)
                     && (node->tag != element->tag)
                     && TY_(IsPushed)( doc, node )
                     && TY_(IsPushed)( doc, element ) )
                {
                    /* we have something like
                       <b>bold <i>bold and italic</b> italics</i> */
                    if ( TY_(SwitchInline)( doc, element, node ) )
                    {
                        TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
                        TY_(UngetToken)( doc ); /* put this back */
                        TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
                        if (!(mode & Preformatted))
                            TrimSpaces( doc, element );
                        return; /* close <i>, but will re-open it, after </b> */
                    }
                }
                TY_(PopInline)( doc, element );

                if ( !nodeIsA(element) )
                {
                    /* NOTE: the enclosing else-if requires !nodeIsA(node),
                       so the MISSING_ENDTAG_BEFORE branch below cannot be
                       taken from here */
                    if ( nodeIsA(node) && node->tag != element->tag )
                    {
                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                        TY_(UngetToken)( doc );
                    }
                    else
                    {
                        TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
                        TY_(FreeNode)( doc, node);
                    }

                    if (!(mode & Preformatted))
                        TrimSpaces(doc, element);

                    return;
                }

                /* if parent is <a> then discard unexpected inline end tag */
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            } /* special case </tr> etc. for stuff moved in front of table */
            else if ( lexer->exiled
                      && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
            {
                TY_(UngetToken)( doc );
                TrimSpaces(doc, element);
                return;
            }
        }

        /* allow any header tag to end current header */
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
        {

            /* a token with the same tag cannot be the matching end tag
               here (that case returned at the top of the loop), so it is
               reported and dropped */
            if ( node->tag == element->tag )
            {
                TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
                TY_(FreeNode)( doc, node);
            }
            else
            {
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /*
           an <A> tag ends any open <A> element
           but <A href=...> is mapped to </A><A href=...>
        */
        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
        /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
        if ( nodeIsA(node) && !node->implicit &&
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
        {
            /* coerce <a> to </a> unless it has some attributes */
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
            /* other fixes by Dave Raggett */
            /* if (node->attributes == NULL) */
            if (node->type != EndTag && node->attributes == NULL)
            {
                node->type = EndTag;
                TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
                /* TY_(PopInline)( doc, node ); */
                TY_(UngetToken)( doc );
                continue;
            }

            TY_(UngetToken)( doc );
            TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
            /* TY_(PopInline)( doc, element ); */

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        if (element->tag->model & CM_HEADING)
        {
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
            {
                /* anything but an element token (e.g. an end tag) is dropped */
                if (!TY_(nodeIsElement)(node))
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert center as parent if heading is empty */
                if (element->content == NULL)
                {
                    InsertNodeAsParent(element, node);
                    continue;
                }

                /* split heading and make center parent of 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* from here on "element" is the clone placed inside node */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAtEnd)(node, element);
                continue;
            }

            if ( nodeIsHR(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert hr before heading if heading is empty */
                if (element->content == NULL)
                {
                    TY_(InsertNodeBeforeElement)(element, node);
                    continue;
                }

                /* split heading and insert hr before 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* from here on "element" is the clone placed after the hr */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)(node, element);
                continue;
            }
        }

        if ( nodeIsDT(element) )
        {
            if ( nodeIsHR(node) )
            {
                Node *dd;
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
                dd = TY_(InferredTag)(doc, TidyTag_DD);

                /* insert hr within dd before dt if dt is empty */
                if (element->content == NULL)
                {
                    TY_(InsertNodeBeforeElement)(element, dd);
                    TY_(InsertNodeAtEnd)(dd, node);
                    continue;
                }

                /* split dt and insert hr within dd before 2nd part */
                TY_(InsertNodeAfterElement)(element, dd);
                TY_(InsertNodeAtEnd)(dd, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* from here on "element" is the clone placed after the dd */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)(dd, element);
                continue;
            }
        }


        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            for (parent = element->parent;
                    parent != NULL; parent = parent->parent)
            {
                if (node->tag == parent->tag)
                {
                    if (!(element->tag->model & CM_OPT) && !element->implicit)
                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);

                    if( TY_(IsPushedLast)( doc, element, node ) )
                        TY_(PopInline)( doc, element );
                    /* leave the end tag for the ancestor's parser */
                    TY_(UngetToken)( doc );

                    if (!(mode & Preformatted))
                        TrimSpaces(doc, element);

                    return;
                }
            }
        }

        /* block level tags end this element */
        if (!(node->tag->model & CM_INLINE) &&
            !(element->tag->model & CM_MIXED))
        {
            if ( !TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if (!(element->tag->model & CM_OPT))
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);

            /* head-only content is handed to MoveToHead and parsing of
               "element" goes on */
            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
            {
                MoveToHead(doc, element, node);
                continue;
            }

            /*
               prevent anchors from propagating into block tags
               except for headings h1 to h6
            */
            if ( nodeIsA(element) )
            {
                if (node->tag && !(node->tag->model & CM_HEADING))
                    TY_(PopInline)( doc, element );
                else if (!(element->content))
                {
                    /* empty anchor followed by a heading: the anchor
                       itself is discarded */
                    TY_(DiscardElement)( doc, element );
                    TY_(UngetToken)( doc );
                    return;
                }
            }

            TY_(UngetToken)( doc );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /* parse inline element */
        if (TY_(nodeIsElement)(node))
        {
            if (node->implicit)
                TY_(ReportError)(doc, element, node, INSERTING_TAG);

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces(doc, element);

            TY_(InsertNodeAtEnd)(element, node);
            ParseTag(doc, node, mode);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node );
        continue;
    }

    /* only reached when TY_(GetToken) returned NULL, so "node" is NULL
       in the call below */
    if (!(element->tag->model & CM_OPT))
        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);

}
1781
TY_(ParseEmpty)1782 void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
1783 {
1784 Lexer* lexer = doc->lexer;
1785 if ( lexer->isvoyager )
1786 {
1787 Node *node = TY_(GetToken)( doc, mode);
1788 if ( node )
1789 {
1790 if ( !(node->type == EndTag && node->tag == element->tag) )
1791 {
1792 TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
1793 TY_(UngetToken)( doc );
1794 }
1795 else
1796 {
1797 TY_(FreeNode)( doc, node );
1798 }
1799 }
1800 }
1801 }
1802
/*
  Parses the content of "list", reading tokens with IgnoreWhitespace
  until the list's end tag is seen, has to be inferred, or the input
  runs out.

  doc  - document being parsed
  list - node whose children are collected; nothing is done when its tag
         has CM_EMPTY. Note that "list" is reassigned to a newly inferred
         DL after a <center> has been handled.
  mode - only passed on to ParseTag for a <center> element

  Text makes the parser infer a <dt>; other content that is neither
  <dt> nor <dd> makes it infer a <dd> (or return, see below). In both
  cases the original token is pushed back first, so it is read again by
  the parser of the inferred element.
*/
void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        /* end tag for the list itself */
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(list, node))
            continue;

        /* text directly inside the list: push it back and continue with
           an inferred <dt> in its place */
        if (TY_(nodeIsText)(node))
        {
            TY_(UngetToken)( doc );
            node = TY_(InferredTag)(doc, TidyTag_DT);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        /* discard unknown tags */
        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            Bool discardIt = no;
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            for (parent = list->parent;
                    parent != NULL; parent = parent->parent)
            {
                /* Do not match across BODY to avoid infinite loop
                   between ParseBody and this parser,
                   See http://tidy.sf.net/bug/1098012. */
                if (nodeIsBODY(parent))
                {
                    discardIt = yes;
                    break;
                }
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);

                    /* leave the end tag for the ancestor's parser */
                    TY_(UngetToken)( doc );
                    return;
                }
            }
            /* BODY was reached without finding a matching ancestor */
            if (discardIt)
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
        }

        /* center in a dt or a dl breaks the dl list in two */
        if ( nodeIsCENTER(node) )
        {
            if (list->content)
                TY_(InsertNodeAfterElement)(list, node);
            else /* trim empty dl list */
            {
                TY_(InsertNodeBeforeElement)(list, node);

            }

            /* #426885 - fix by Glenn Carroll 19 Apr 00, and
                         Gary Dechaines 11 Aug 00 */
            /* ParseTag can destroy node, if it finds that
             * this <center> is followed immediately by </center>.
             * It's awkward but necessary to determine if this
             * has happened.
             */
            parent = node->parent;

            /* and parse contents of center */
            lexer->excludeBlocks = no;
            ParseTag( doc, node, mode);
            lexer->excludeBlocks = yes;

            /* now create a new dl element,
             * unless node has been blown away because the
             * center was empty, as above.
             */
            if (parent->last == node)
            {
                /* later items are collected in this new list */
                list = TY_(InferredTag)(doc, TidyTag_DL);
                TY_(InsertNodeAfterElement)(node, list);
            }
            continue;
        }

        if ( !(nodeIsDT(node) || nodeIsDD(node)) )
        {
            /* the token is pushed back on every path of this branch */
            TY_(UngetToken)( doc );

            /* neither block nor inline: cannot be wrapped, so the list ends */
            if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
            {
                TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
                return;
            }

            /* if DD appeared directly in BODY then exclude blocks */
            if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
                return;

            /* wrap the content in an inferred <dd> */
            node = TY_(InferredTag)(doc, TidyTag_DD);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        /* any end tag that gets this far is discarded */
        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <DT> or <DD>*/
        TY_(InsertNodeAtEnd)(list, node);
        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* only reached when TY_(GetToken) returned NULL, so "node" is NULL
       in the call below */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
}
1951
FindLastLI(Node * list,Node ** lastli)1952 static Bool FindLastLI( Node *list, Node **lastli )
1953 {
1954 Node *node;
1955
1956 *lastli = NULL;
1957 for ( node = list->content; node ; node = node->next )
1958 if ( nodeIsLI(node) && node->type == StartTag )
1959 *lastli=node;
1960 return *lastli ? yes:no;
1961 }
1962
/*
  Parses the content of the list element "list", reading tokens with
  IgnoreWhitespace until the list's end tag is seen, has to be
  inferred, or the input runs out.

  doc  - document being parsed
  list - node whose children are collected; nothing is done when its tag
         has CM_EMPTY
  mode - unused

  <li> tokens are appended to "list" and parsed. Any other content is
  pushed back and then parsed as part of either the last existing <li>
  (ordered lists only) or a newly inferred <li>.
*/
void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent, *lastli;
    Bool wasblock;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        /* end tag for the list itself */
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(list, node))
            continue;

        /* discard unknown tags (text nodes also have no tag, keep them) */
        if (node->type != TextNode && node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* stray inline end tag: discard it */
            if (TY_(nodeHasCM)(node,CM_INLINE))
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(PopInline)( doc, node );
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = list->parent;
                    parent != NULL; parent = parent->parent )
            {
               /* Do not match across BODY to avoid infinite loop
                  between ParseBody and this parser,
                  See http://tidy.sf.net/bug/1053626. */
                if (nodeIsBODY(parent))
                    break;
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
                    /* leave the end tag for the ancestor's parser */
                    TY_(UngetToken)( doc );
                    return;
                }
            }

            /* no matching ancestor below BODY: discard the end tag */
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        if ( !nodeIsLI(node) )
        {
            /* the token is pushed back on every path of this branch; it
               is read again by whichever parser runs next */
            TY_(UngetToken)( doc );

            if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
            {
                TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
                return;
            }
            /* http://tidy.sf.net/issue/1316307 */
            /* In exiled mode, return so table processing can continue. */
            else if ( lexer->exiled
                      && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
                          || nodeIsTABLE(node)) )
                return;

            /* http://tidy.sf.net/issue/836462
               If "list" is an ordered list, insert the next tag within
               the last <li> to preserve the numbering to match the visual
               rendering of most browsers. */
            if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
            {
                /* Create a node for error reporting */
                node = TY_(InferredTag)(doc, TidyTag_LI);
                TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
                TY_(FreeNode)( doc, node);
                /* parsing resumes inside the existing <li>, which is
                   already a child of "list" */
                node = lastli;
            }
            else
            {
                /* Add an inferred <li> */
                wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
                node = TY_(InferredTag)(doc, TidyTag_LI);
                /* Add "display: inline" to avoid a blank line after <li> with
                   Internet Explorer. See http://tidy.sf.net/issue/836462 */
                TY_(AddStyleProperty)( doc, node,
                                       wasblock
                                       ? "list-style: none; display: inline"
                                       : "list-style: none"
                                       );
                TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
                TY_(InsertNodeAtEnd)(list,node);
            }
        }
        else
            /* node is <LI> */
            TY_(InsertNodeAtEnd)(list,node);

        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* only reached when TY_(GetToken) returned NULL, so "node" is NULL
       in the call below */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
}
2090
2091 /*
2092 unexpected content in table row is moved to just before
2093 the table in accordance with Netscape and IE. This code
2094 assumes that node hasn't been inserted into the row.
2095 */
MoveBeforeTable(TidyDocImpl * ARG_UNUSED (doc),Node * row,Node * node)2096 static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2097 Node *node )
2098 {
2099 Node *table;
2100
2101 /* first find the table element */
2102 for (table = row->parent; table; table = table->parent)
2103 {
2104 if ( nodeIsTABLE(table) )
2105 {
2106 TY_(InsertNodeBeforeElement)( table, node );
2107 return;
2108 }
2109 }
2110 /* No table element */
2111 TY_(InsertNodeBeforeElement)( row->parent, node );
2112 }
2113
2114 /*
2115 if a table row is empty then insert an empty cell
2116 this practice is consistent with browser behavior
2117 and avoids potential problems with row spanning cells
2118 */
FixEmptyRow(TidyDocImpl * doc,Node * row)2119 static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2120 {
2121 Node *cell;
2122
2123 if (row->content == NULL)
2124 {
2125 cell = TY_(InferredTag)(doc, TidyTag_TD);
2126 TY_(InsertNodeAtEnd)(row, cell);
2127 TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
2128 }
2129 }
2130
/*
  Parses the content of the table row "row", reading tokens with
  IgnoreWhitespace until the row's end is seen, has to be inferred, or
  the input runs out.

  doc  - document being parsed
  row  - node whose cells are collected; nothing is done when its tag
         has CM_EMPTY
  mode - unused

  <td>/<th> tokens are appended to "row" and parsed. Text, block and
  inline content is moved in front of the table and parsed there with
  lexer->exiled set. FixEmptyRow is called when the row ends because of
  its own end tag or a new start tag with the same tag as "row".
*/
void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool exclude_state;

    if (row->tag->model & CM_EMPTY)
        return;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == row->tag)
        {
            if (node->type == EndTag)
            {
                TY_(FreeNode)( doc, node);
                row->closed = yes;
                FixEmptyRow( doc, row);
                return;
            }

            /* New row start implies end of current row */
            TY_(UngetToken)( doc );
            FixEmptyRow( doc, row);
            return;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if ( node->type == EndTag )
        {
            if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
                 && DescendantOf(row, TagId(node)) )
            {
                /* leave the end tag for the ancestor's parser */
                TY_(UngetToken)( doc );
                return;
            }

            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* stray </td> or </th> */
            if ( nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
        }

        /* deal with comments etc. */
        if (InsertMisc(row, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* discard unexpected <table> element */
        if ( nodeIsTABLE(node) )
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* THEAD, TFOOT or TBODY */
        if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
        {
            TY_(UngetToken)( doc );
            return;
        }

        /* any end tag that gets this far is discarded */
        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if text or inline or block move before table
          if head content move to head
        */

        /* NOTE: always true here, end tags were discarded just above */
        if (node->type != EndTag)
        {
            /* <form>: push it back and continue with an inferred <td>
               in its place */
            if ( nodeIsFORM(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TD);
                TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node)
                      || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
            {
                MoveBeforeTable( doc, row, node );
                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
                /* parse the moved content with "exiled" set and blocks
                   allowed, then restore the previous excludeBlocks */
                lexer->exiled = yes;
                exclude_state = lexer->excludeBlocks;
                lexer->excludeBlocks = no;

                if (node->type != TextNode)
                    ParseTag( doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                lexer->excludeBlocks = exclude_state;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
                MoveToHead( doc, row, node);
                continue;
            }
        }

        /* whatever is left and is not a cell is dropped */
        if ( !(nodeIsTD(node) || nodeIsTH(node)) )
        {
            TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <TD> or <TH> */
        TY_(InsertNodeAtEnd)(row, node);
        /* blocks are allowed while the cell is parsed; the previous
           setting is restored afterwards */
        exclude_state = lexer->excludeBlocks;
        lexer->excludeBlocks = no;
        ParseTag( doc, node, IgnoreWhitespace);
        lexer->excludeBlocks = exclude_state;

        /* pop inline stack */

        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
    }

}
2281
/*
  Parses the content of the row group "rowgroup", reading tokens with
  IgnoreWhitespace until the group's end is seen, has to be inferred,
  or the input runs out.

  doc      - document being parsed
  rowgroup - node whose rows are collected; nothing is done when its
             tag has CM_EMPTY
  mode     - unused

  <tr> tokens are appended to "rowgroup" and parsed; a <tr> is inferred
  for <td>/<th> and for other start tags that reach the end of the
  loop. Text, block and inline content is moved in front of the table
  and parsed there with lexer->exiled set.
*/
void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (rowgroup->tag->model & CM_EMPTY)
        return;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == rowgroup->tag)
        {
            if (node->type == EndTag)
            {
                rowgroup->closed = yes;
                TY_(FreeNode)( doc, node);
                return;
            }

            /* a new start tag with the same tag ends this group; it is
               pushed back for the caller */
            TY_(UngetToken)( doc );
            return;
        }

        /* if </table> infer end tag */
        if ( nodeIsTABLE(node) && node->type == EndTag )
        {
            TY_(UngetToken)( doc );
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(rowgroup, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if TD or TH then infer <TR>
          if text or inline or block move before table
          if head content move to head
        */

        if (node->type != EndTag)
        {
            /* cell without a row: push it back and continue with an
               inferred <tr> in its place */
            if ( nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TR);
                TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node)
                      || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                MoveBeforeTable( doc, rowgroup, node );
                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
                /* the moved content is parsed with "exiled" set */
                lexer->exiled = yes;

                if (node->type != TextNode)
                    ParseTag(doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
                MoveToHead(doc, rowgroup, node);
                continue;
            }
        }

        /*
          if this is the end tag for ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* stray </tr>, </td> or </th> */
            if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = rowgroup->parent;
                  parent != NULL;
                  parent = parent->parent )
            {
                if (node->tag == parent->tag)
                {
                    /* leave the end tag for the ancestor's parser */
                    TY_(UngetToken)( doc );
                    return;
                }
            }
        }

        /*
          if THEAD, TFOOT or TBODY then implied end tag

        */
        if (node->tag->model & CM_ROWGRP)
        {
            if (node->type != EndTag)
            {
                TY_(UngetToken)( doc );
                return;
            }
        }

        /* any end tag that gets this far is discarded */
        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* anything else: push the token back and continue with an
           inferred <tr> in its place */
        if ( !nodeIsTR(node) )
        {
            node = TY_(InferredTag)(doc, TidyTag_TR);
            TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
            TY_(UngetToken)( doc );
        }

        /* node should be <TR> */
        TY_(InsertNodeAtEnd)(rowgroup, node);
        ParseTag(doc, node, IgnoreWhitespace);
    }

}
2427
TY_(ParseColGroup)2428 void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
2429 {
2430 Node *node, *parent;
2431
2432 if (colgroup->tag->model & CM_EMPTY)
2433 return;
2434
2435 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2436 {
2437 if (node->tag == colgroup->tag && node->type == EndTag)
2438 {
2439 TY_(FreeNode)( doc, node);
2440 colgroup->closed = yes;
2441 return;
2442 }
2443
2444 /*
2445 if this is the end tag for an ancestor element
2446 then infer end tag for this element
2447 */
2448 if (node->type == EndTag)
2449 {
2450 if ( nodeIsFORM(node) )
2451 {
2452 BadForm( doc );
2453 TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2454 TY_(FreeNode)( doc, node);
2455 continue;
2456 }
2457
2458 for ( parent = colgroup->parent;
2459 parent != NULL;
2460 parent = parent->parent )
2461 {
2462 if (node->tag == parent->tag)
2463 {
2464 TY_(UngetToken)( doc );
2465 return;
2466 }
2467 }
2468 }
2469
2470 if (TY_(nodeIsText)(node))
2471 {
2472 TY_(UngetToken)( doc );
2473 return;
2474 }
2475
2476 /* deal with comments etc. */
2477 if (InsertMisc(colgroup, node))
2478 continue;
2479
2480 /* discard unknown tags */
2481 if (node->tag == NULL)
2482 {
2483 TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2484 TY_(FreeNode)( doc, node);
2485 continue;
2486 }
2487
2488 if ( !nodeIsCOL(node) )
2489 {
2490 TY_(UngetToken)( doc );
2491 return;
2492 }
2493
2494 if (node->type == EndTag)
2495 {
2496 TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2497 TY_(FreeNode)( doc, node);
2498 continue;
2499 }
2500
2501 /* node should be <COL> */
2502 TY_(InsertNodeAtEnd)(colgroup, node);
2503 ParseTag(doc, node, IgnoreWhitespace);
2504 }
2505 }
2506
/*
  Parse the children of a <table> element.

  doc   - document whose lexer supplies the token stream
  table - the table node being filled
  mode  - unused; every token is fetched with IgnoreWhitespace

  The lexer's istackbase is saved on entry, replaced by the current
  istacksize, and put back on every path that leaves the function.
  If the token stream ends before </table> is seen, MISSING_ENDTAG_FOR
  is reported.
*/
void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;
    uint istackbase;

    TY_(DeferDup)( doc );
    /* remember the caller's stack base so it can be restored on exit */
    istackbase = lexer->istackbase;
    lexer->istackbase = lexer->istacksize;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        /* matching end tag: the table is complete */
        if (node->tag == table->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            lexer->istackbase = istackbase;
            table->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(table, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          Non-end-tag tokens:
            - TD, TH or a nested TABLE: infer a <TR> in their place
            - text, block or inline content: insert it via
              InsertNodeBeforeElement(table, ...) and parse it there
            - elements whose model has CM_HEAD: hand to MoveToHead
        */

        if (node->type != EndTag)
        {
            if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
            {
                /* push the token back and carry on with an inferred <tr> */
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TR);
                TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
            {
                TY_(InsertNodeBeforeElement)(table, node);
                TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
                /* lexer->exiled is set only while the displaced node is parsed */
                lexer->exiled = yes;

                if (node->type != TextNode) 
                    ParseTag(doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, table, node);
                continue;
            }
        }

        /* 
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* best to discard unexpected block/inline end tags */
            if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
                 TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* walk up from the table looking for an ancestor with this tag */
            for ( parent = table->parent;
                  parent != NULL; 
                  parent = parent->parent )
            {
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
                    TY_(UngetToken)( doc );
                    lexer->istackbase = istackbase;
                    return;
                }
            }
        }

        /* anything without CM_TABLE ends the table; the token is pushed back */
        if (!(node->tag->model & CM_TABLE))
        {
            TY_(UngetToken)( doc );
            TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
            lexer->istackbase = istackbase;
            return;
        }

        if (TY_(nodeIsElement)(node))
        {
            TY_(InsertNodeAtEnd)(table, node);
            ParseTag(doc, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected text nodes and end tags */
        TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }

    /* input exhausted without </table>; node is NULL at this point */
    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
    lexer->istackbase = istackbase;
}
2628
2629 /* acceptable content for pre elements */
PreContent(TidyDocImpl * ARG_UNUSED (doc),Node * node)2630 static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2631 {
2632 /* p is coerced to br's, Text OK too */
2633 if ( nodeIsP(node) || TY_(nodeIsText)(node) )
2634 return yes;
2635
2636 if ( node->tag == NULL ||
2637 nodeIsPARAM(node) ||
2638 !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
2639 return no;
2640
2641 return yes;
2642 }
2643
/*
  Parse the content of a <pre> element.

  doc  - document being parsed; tokens come from its lexer
  pre  - the element being filled.  NOTE: this local is re-pointed at a
         freshly inferred <pre> whenever a disallowed element forces
         the current one to be ended (see the end of the
         !PreContent branch)
  mode - unused; tokens are always fetched in Preformatted mode

  If the token stream ends without a closing tag, MISSING_ENDTAG_FOR
  is reported.
*/
void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
{
    Node *node;

    if (pre->tag->model & CM_EMPTY)
        return;

    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */

    while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
    {
        /*
          an end tag that either matches <pre> itself or satisfies
          DescendantOf(pre, ...) ends this element
        */
        if ( node->type == EndTag && 
             (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
        {
            /* ...except </body> and </html>, which are dropped */
            if (nodeIsBODY(node) || nodeIsHTML(node))
            {
                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
            if (node->tag == pre->tag)
            {
                TY_(FreeNode)(doc, node);
            }
            else
            {
                /* leave the other element's end tag for the caller */
                TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }
            pre->closed = yes;
            TrimSpaces(doc, pre);
            return;
        }

        if (TY_(nodeIsText)(node))
        {
            TY_(InsertNodeAtEnd)(pre, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(pre, node))
            continue;

        /* unknown tags are reported and dropped */
        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* strip unexpected tags */
        if ( !PreContent(doc, node) )
        {
            Node *newnode;

            /* fix for http://tidy.sf.net/bug/772205 */
            if (node->type == EndTag)
            {
                /* http://tidy.sf.net/issue/1590220 */ 
                if ( doc->lexer->exiled
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
                {
                    TY_(UngetToken)(doc);
                    TrimSpaces(doc, pre);
                    return;
                }

                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
            /* http://tidy.sf.net/issue/1590220 */
            else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
                     || nodeIsTABLE(node) )
            {
                if (!doc->lexer->exiled)
                    /* No missing close warning if exiled. */
                    TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);

                TY_(UngetToken)(doc);
                return;
            }

            /*
              This is basically what Tidy 04 August 2000 did and far more accurate
              with respect to browser behaviour than the code commented out above.
              Tidy could try to propagate the <pre> into each disallowed child where
              <pre> is allowed in order to replicate some browsers' behaviour, but
              there are a lot of exceptions, e.g. Internet Explorer does not propagate
              <pre> into table cells while Mozilla does. Opera 6 never propagates
              <pre> into block-level elements while Opera 7 behaves much like Mozilla.

              Tidy thus behaves mostly like Opera 6 except for nested <pre> elements,
              which are handled the way Mozilla takes them (Opera 6 closes all <pre>
              after the first </pre>).

              There are similar issues like replacing <p> in <pre> with <br>, for
              example

                <pre>...<p>...</pre>                 (Input)
                <pre>...<br>...</pre>                (Tidy)
                <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
                <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)

                <pre>...<p>...</p>...</pre>          (Input)
                <pre>...<br>......</pre>             (Tidy, BUG!)
                <pre>...<br>...<br>...</pre>         (Internet Explorer)
                <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
                <pre>...<br>...<br><br>...</pre>     (Opera 7)
                
              or something similar; they could also be closing the <pre> and
              propagating the <pre> into the newly opened <p>.

              Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
              disallowed in <pre>; Tidy neither detects this nor performs any
              cleanup operation. Tidy should at least issue a warning if it encounters
              such constructs.

              Todo: discarding </p> is obviously a bug; it should be replaced by <br>.
            */
            /*
              place the disallowed element after the current <pre>, parse
              it there, then continue into a newly inferred <pre> that is
              inserted after that element
            */
            TY_(InsertNodeAfterElement)(pre, node);
            TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
            ParseTag(doc, node, IgnoreWhitespace);

            newnode = TY_(InferredTag)(doc, TidyTag_PRE);
            TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
            pre = newnode;
            TY_(InsertNodeAfterElement)(node, pre);

            continue;
        }

        if ( nodeIsP(node) )
        {
            if (node->type == StartTag)
            {
                TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);

                /* trim white space before <p> in <pre>*/
                TrimSpaces(doc, pre);
            
                /* coerce <p> to <br> (a </p> is discarded in the else branch) */
                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
                TY_(InsertNodeAtEnd)( pre, node );
            }
            else
            {
                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
            }
            continue;
        }

        if ( TY_(nodeIsElement)(node) )
        {
            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces(doc, pre);
            
            TY_(InsertNodeAtEnd)(pre, node);
            ParseTag(doc, node, Preformatted);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }

    /* input exhausted without an end tag; node is NULL at this point */
    TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR);
}
2817
TY_(ParseOptGroup)2818 void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2819 {
2820 Lexer* lexer = doc->lexer;
2821 Node *node;
2822
2823 lexer->insert = NULL; /* defer implicit inline start tags */
2824
2825 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2826 {
2827 if (node->tag == field->tag && node->type == EndTag)
2828 {
2829 TY_(FreeNode)( doc, node);
2830 field->closed = yes;
2831 TrimSpaces(doc, field);
2832 return;
2833 }
2834
2835 /* deal with comments etc. */
2836 if (InsertMisc(field, node))
2837 continue;
2838
2839 if ( node->type == StartTag &&
2840 (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2841 {
2842 if ( nodeIsOPTGROUP(node) )
2843 TY_(ReportError)(doc, field, node, CANT_BE_NESTED);
2844
2845 TY_(InsertNodeAtEnd)(field, node);
2846 ParseTag(doc, node, MixedContent);
2847 continue;
2848 }
2849
2850 /* discard unexpected tags */
2851 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED );
2852 TY_(FreeNode)( doc, node);
2853 }
2854 }
2855
2856
TY_(ParseSelect)2857 void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2858 {
2859 Lexer* lexer = doc->lexer;
2860 Node *node;
2861
2862 lexer->insert = NULL; /* defer implicit inline start tags */
2863
2864 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2865 {
2866 if (node->tag == field->tag && node->type == EndTag)
2867 {
2868 TY_(FreeNode)( doc, node);
2869 field->closed = yes;
2870 TrimSpaces(doc, field);
2871 return;
2872 }
2873
2874 /* deal with comments etc. */
2875 if (InsertMisc(field, node))
2876 continue;
2877
2878 if ( node->type == StartTag &&
2879 ( nodeIsOPTION(node) ||
2880 nodeIsOPTGROUP(node) ||
2881 nodeIsSCRIPT(node))
2882 )
2883 {
2884 TY_(InsertNodeAtEnd)(field, node);
2885 ParseTag(doc, node, IgnoreWhitespace);
2886 continue;
2887 }
2888
2889 /* discard unexpected tags */
2890 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2891 TY_(FreeNode)( doc, node);
2892 }
2893
2894 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2895 }
2896
TY_(ParseText)2897 void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
2898 {
2899 Lexer* lexer = doc->lexer;
2900 Node *node;
2901
2902 lexer->insert = NULL; /* defer implicit inline start tags */
2903
2904 if ( nodeIsTEXTAREA(field) )
2905 mode = Preformatted;
2906 else
2907 mode = MixedContent; /* kludge for font tags */
2908
2909 while ((node = TY_(GetToken)(doc, mode)) != NULL)
2910 {
2911 if (node->tag == field->tag && node->type == EndTag)
2912 {
2913 TY_(FreeNode)( doc, node);
2914 field->closed = yes;
2915 TrimSpaces(doc, field);
2916 return;
2917 }
2918
2919 /* deal with comments etc. */
2920 if (InsertMisc(field, node))
2921 continue;
2922
2923 if (TY_(nodeIsText)(node))
2924 {
2925 /* only called for 1st child */
2926 if (field->content == NULL && !(mode & Preformatted))
2927 TrimSpaces(doc, field);
2928
2929 if (node->start >= node->end)
2930 {
2931 TY_(FreeNode)( doc, node);
2932 continue;
2933 }
2934
2935 TY_(InsertNodeAtEnd)(field, node);
2936 continue;
2937 }
2938
2939 /* for textarea should all cases of < and & be escaped? */
2940
2941 /* discard inline tags e.g. font */
2942 if ( node->tag
2943 && node->tag->model & CM_INLINE
2944 && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2945 {
2946 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2947 TY_(FreeNode)( doc, node);
2948 continue;
2949 }
2950
2951 /* terminate element on other tags */
2952 if (!(field->tag->model & CM_OPT))
2953 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE);
2954
2955 TY_(UngetToken)( doc );
2956 TrimSpaces(doc, field);
2957 return;
2958 }
2959
2960 if (!(field->tag->model & CM_OPT))
2961 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2962 }
2963
2964
TY_(ParseTitle)2965 void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
2966 {
2967 Node *node;
2968 while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
2969 {
2970 if (node->tag == title->tag && node->type == StartTag)
2971 {
2972 TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
2973 node->type = EndTag;
2974 TY_(UngetToken)( doc );
2975 continue;
2976 }
2977 else if (node->tag == title->tag && node->type == EndTag)
2978 {
2979 TY_(FreeNode)( doc, node);
2980 title->closed = yes;
2981 TrimSpaces(doc, title);
2982 return;
2983 }
2984
2985 if (TY_(nodeIsText)(node))
2986 {
2987 /* only called for 1st child */
2988 if (title->content == NULL)
2989 TrimInitialSpace(doc, title, node);
2990
2991 if (node->start >= node->end)
2992 {
2993 TY_(FreeNode)( doc, node);
2994 continue;
2995 }
2996
2997 TY_(InsertNodeAtEnd)(title, node);
2998 continue;
2999 }
3000
3001 /* deal with comments etc. */
3002 if (InsertMisc(title, node))
3003 continue;
3004
3005 /* discard unknown tags */
3006 if (node->tag == NULL)
3007 {
3008 TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED);
3009 TY_(FreeNode)( doc, node);
3010 continue;
3011 }
3012
3013 /* pushback unexpected tokens */
3014 TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE);
3015 TY_(UngetToken)( doc );
3016 TrimSpaces(doc, title);
3017 return;
3018 }
3019
3020 TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR);
3021 }
3022
3023 /*
3024 This isn't quite right for CDATA content as it recognises
3025 tags within the content and parses them accordingly.
3026 This will unfortunately screw up scripts which include
3027 < + letter, < + !, < + ? or < + / + letter
3028 */
3029
TY_(ParseScript)3030 void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
3031 {
3032 Node *node;
3033
3034 doc->lexer->parent = script;
3035 node = TY_(GetToken)(doc, CdataContent);
3036 doc->lexer->parent = NULL;
3037
3038 if (node)
3039 {
3040 TY_(InsertNodeAtEnd)(script, node);
3041 }
3042 else
3043 {
3044 /* handle e.g. a document like "<script>" */
3045 TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR);
3046 return;
3047 }
3048
3049 node = TY_(GetToken)(doc, IgnoreWhitespace);
3050
3051 if (!(node && node->type == EndTag && node->tag &&
3052 node->tag->id == script->tag->id))
3053 {
3054 TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR);
3055
3056 if (node)
3057 TY_(UngetToken)(doc);
3058 }
3059 else
3060 {
3061 TY_(FreeNode)(doc, node);
3062 }
3063 }
3064
TY_(IsJavaScript)3065 Bool TY_(IsJavaScript)(Node *node)
3066 {
3067 Bool result = no;
3068 AttVal *attr;
3069
3070 if (node->attributes == NULL)
3071 return yes;
3072
3073 for (attr = node->attributes; attr; attr = attr->next)
3074 {
3075 if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3076 && AttrContains(attr, "javascript") )
3077 {
3078 result = yes;
3079 break;
3080 }
3081 }
3082
3083 return result;
3084 }
3085
TY_(ParseHead)3086 void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
3087 {
3088 Lexer* lexer = doc->lexer;
3089 Node *node;
3090 int HasTitle = 0;
3091 int HasBase = 0;
3092
3093 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3094 {
3095 if (node->tag == head->tag && node->type == EndTag)
3096 {
3097 TY_(FreeNode)( doc, node);
3098 head->closed = yes;
3099 break;
3100 }
3101
3102 /* find and discard multiple <head> elements */
3103 /* find and discard <html> in <head> elements */
3104 if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3105 {
3106 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3107 TY_(FreeNode)(doc, node);
3108 continue;
3109 }
3110
3111 if (TY_(nodeIsText)(node))
3112 {
3113 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3114 TY_(UngetToken)( doc );
3115 break;
3116 }
3117
3118 if (node->type == ProcInsTag && node->element &&
3119 TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
3120 {
3121 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3122 TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
3123 continue;
3124 }
3125
3126 /* deal with comments etc. */
3127 if (InsertMisc(head, node))
3128 continue;
3129
3130 if (node->type == DocTypeTag)
3131 {
3132 InsertDocType(doc, head, node);
3133 continue;
3134 }
3135
3136 /* discard unknown tags */
3137 if (node->tag == NULL)
3138 {
3139 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3140 TY_(FreeNode)( doc, node);
3141 continue;
3142 }
3143
3144 /*
3145 if it doesn't belong in the head then
3146 treat as implicit end of head and deal
3147 with as part of the body
3148 */
3149 if (!(node->tag->model & CM_HEAD))
3150 {
3151 /* #545067 Implicit closing of head broken - warn only for XHTML input */
3152 if ( lexer->isvoyager )
3153 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN );
3154 TY_(UngetToken)( doc );
3155 break;
3156 }
3157
3158 if (TY_(nodeIsElement)(node))
3159 {
3160 if ( nodeIsTITLE(node) )
3161 {
3162 ++HasTitle;
3163
3164 if (HasTitle > 1)
3165 TY_(ReportError)(doc, head, node,
3166 head ?
3167 TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3168 }
3169 else if ( nodeIsBASE(node) )
3170 {
3171 ++HasBase;
3172
3173 if (HasBase > 1)
3174 TY_(ReportError)(doc, head, node,
3175 head ?
3176 TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3177 }
3178 else if ( nodeIsNOSCRIPT(node) )
3179 {
3180 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3181 }
3182
3183 #ifdef AUTO_INPUT_ENCODING
3184 else if (nodeIsMETA(node))
3185 {
3186 AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3187 AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3188 if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3189 {
3190 tmbstr val, charset;
3191 uint end = 0;
3192 val = charset = TY_(tmbstrdup)(doc->allocator, content->value);
3193 val = TY_(tmbstrtolower)(val);
3194 val = strstr(content->value, "charset");
3195
3196 if (val)
3197 val += 7;
3198
3199 while(val && *val && (TY_(IsWhite)((tchar)*val) ||
3200 *val == '=' || *val == '"' || *val == '\''))
3201 ++val;
3202
3203 while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) ||
3204 val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3205 ++end;
3206
3207 if (val && end)
3208 {
3209 tmbstr encoding = TY_(tmbstrndup)(doc->allocator,val, end);
3210 uint id = TY_(GetEncodingIdFromName)(encoding);
3211
3212 /* todo: detect mismatch with BOM/XMLDecl/declared */
3213 /* todo: error for unsupported encodings */
3214 /* todo: try to re-init transcoder */
3215 /* todo: change input/output encoding settings */
3216 /* todo: store id in StreamIn */
3217
3218 TidyDocFree(doc, encoding);
3219 }
3220
3221 TidyDocFree(doc, charset);
3222 }
3223 }
3224 #endif /* AUTO_INPUT_ENCODING */
3225
3226 TY_(InsertNodeAtEnd)(head, node);
3227 ParseTag(doc, node, IgnoreWhitespace);
3228 continue;
3229 }
3230
3231 /* discard unexpected text nodes and end tags */
3232 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3233 TY_(FreeNode)( doc, node);
3234 }
3235 }
3236
/*
  Parse the content of the <body> element.

  doc  - document being parsed; tokens come from its lexer
  body - the body node that receives the children
  mode - the incoming value is ignored; the function starts in
         IgnoreWhitespace and switches between IgnoreWhitespace and
         MixedContent as it goes

  Local state:
    mode       - token mode for the next GetToken call; set to
                 MixedContent after text or a pure inline element,
                 back to IgnoreWhitespace after other elements and
                 after </body>
    checkstack - when yes, the next text node or non-implicit inline
                 element triggers a call to InlineDup
*/
void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool checkstack, iswhitenode;

    /* the caller's mode is overwritten */
    mode = IgnoreWhitespace;
    checkstack = yes;

    TY_(BumpObject)( doc, body->parent );

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        /* find and discard multiple <body> elements */
        if (node->tag == body->tag && node->type == StartTag)
        {
            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* #538536 Extra endtags not detected */
        if ( nodeIsHTML(node) )
        {
            /* the first </html> is noted silently; <html> start tags
               and repeated </html> are reported */
            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml) 
                TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            else
                lexer->seenEndHtml = 1;

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* tags after </body> are reported but still processed below */
        if ( lexer->seenEndBody && 
             ( node->type == StartTag ||
               node->type == EndTag   ||
               node->type == StartEndTag ) )
        {
            TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY );
        }

        /* </body>: mark the body closed but keep reading, unless this
           body is the child of a NOFRAMES element */
        if ( node->tag == body->tag && node->type == EndTag )
        {
            body->closed = yes;
            TrimSpaces(doc, body);
            TY_(FreeNode)( doc, node);
            lexer->seenEndBody = 1;
            mode = IgnoreWhitespace;

            if ( nodeIsNOFRAMES(body->parent) )
                break;

            continue;
        }

        if ( nodeIsNOFRAMES(node) )
        {
            if (node->type == StartTag)
            {
                TY_(InsertNodeAtEnd)(body, node);
                TY_(ParseBlock)(doc, node, mode);
                continue;
            }

            /* </noframes> ends a body nested in NOFRAMES; the token
               is pushed back */
            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
            {
                TrimSpaces(doc, body);
                TY_(UngetToken)( doc );
                break;
            }
        }

        /* FRAME/FRAMESET likewise end a body nested in NOFRAMES */
        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
             && nodeIsNOFRAMES(body->parent) )
        {
            TrimSpaces(doc, body);
            TY_(UngetToken)( doc );
            break;
        }
        
        iswhitenode = no;

        /* a text node of at most one character whose first character
           is a space */
        if ( TY_(nodeIsText)(node) &&
             node->end <= node->start + 1 &&
             lexer->lexbuf[node->start] == ' ' )
            iswhitenode = yes;

        /* deal with comments etc. */
        if (InsertMisc(body, node))
            continue;

        /* mixed content model permits text */
        if (TY_(nodeIsText)(node))
        {
            if (iswhitenode && mode == IgnoreWhitespace)
            {
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* HTML 2 and HTML4 strict don't allow text here */
            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));

            if (checkstack)
            {
                checkstack = no;

                if ( TY_(InlineDup)(doc, node) > 0 )
                    continue;
            }

            TY_(InsertNodeAtEnd)(body, node);
            mode = MixedContent;
            continue;
        }

        if (node->type == DocTypeTag)
        {
            InsertDocType(doc, body, node);
            continue;
        }
        /* discard unknown  and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          Netscape allows LI and DD directly in BODY
          We infer UL or DL respectively and use this
          Bool to exclude block-level elements so as
          to match Netscape's observed behaviour.
        */
        lexer->excludeBlocks = no;
        
        /* INPUT, or an element that is neither block nor inline */
        if ( nodeIsINPUT(node) ||
             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
           )
        {
            /* avoid this error message being issued twice */
            if (!(node->tag->model & CM_HEAD))
                TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN);

            if (node->tag->model & CM_HTML)
            {
                /* copy body attributes if current body was inferred */
                if ( nodeIsBODY(node) && body->implicit 
                     && body->attributes == NULL )
                {
                    /* ownership of the list moves to body */
                    body->attributes = node->attributes;
                    node->attributes = NULL;
                }

                TY_(FreeNode)( doc, node);
                continue;
            }

            if (node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, body, node);
                continue;
            }

            /* the branches below push the token back and continue
               with an inferred container element in its place */
            if (node->tag->model & CM_LIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_UL);
                AddClassNoIndent(doc, node);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & CM_DEFLIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_DL);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
            {
                /* http://tidy.sf.net/issue/2855621 */
                if (node->type != EndTag) {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
                }
                lexer->excludeBlocks = yes;
            }
            else if ( nodeIsINPUT(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_FORM);
                lexer->excludeBlocks = yes;
            }
            else
            {
                /* not a row/field element: push it back and leave
                   ParseBody altogether */
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
                {
                    TY_(UngetToken)( doc );
                    return;
                }

                /* ignore </td> </th> <option> etc. */
                TY_(FreeNode)( doc, node );
                continue;
            }
        }

        if (node->type == EndTag)
        {
            /* </br> is treated as <br>; </p> becomes an implicit
               empty <p/> */
            if ( nodeIsBR(node) )
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                node->type = StartEndTag;
                node->implicit = yes;
#if OBSOLETE
                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                FreeAttrs( doc, node ); /* discard align attribute etc. */
                TY_(InsertNodeAtEnd)(body, node);
                node = TY_(InferredTag)(doc, TidyTag_BR);
#endif
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
                TY_(PopInline)( doc, node );
        }

        if (TY_(nodeIsElement)(node))
        {
            if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
            {
                /* HTML4 strict doesn't allow inline content here */
                /* but HTML2 does allow img elements as children of body */
                if ( nodeIsIMG(node) )
                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
                else
                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));

                if (checkstack && !node->implicit)
                {
                    checkstack = no;

                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }

                mode = MixedContent;
            }
            else
            {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            if (node->implicit)
                TY_(ReportError)(doc, body, node, INSERTING_TAG);

            TY_(InsertNodeAtEnd)(body, node);
            ParseTag(doc, node, mode);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }
}
3503
/*
  Parse the content of a <noframes> element.

  doc      - document being parsed; tokens come from its lexer
  noframes - the element being filled
  mode     - the incoming value is ignored and overwritten with
             IgnoreWhitespace

  Sets BA_USING_NOFRAMES in doc->badAccess when the
  TidyAccessibilityCheckLevel option is 0.  Reports
  MISSING_ENDTAG_FOR when the token stream ends before </noframes>.
*/
void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;

    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
    {
        doc->badAccess |=  BA_USING_NOFRAMES;
    }
    mode = IgnoreWhitespace;

    while ( (node = TY_(GetToken)(doc, mode)) != NULL )
    {
        /* matching end tag: done */
        if ( node->tag == noframes->tag && node->type == EndTag )
        {
            TY_(FreeNode)( doc, node);
            noframes->closed = yes;
            TrimSpaces(doc, noframes);
            return;
        }

        /* FRAME or FRAMESET ends the element: an end tag is dropped,
           anything else is pushed back for the caller */
        if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
        {
            TrimSpaces(doc, noframes);
            if (node->type == EndTag)
            {
                TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);       /* Throw it away */
            }
            else
            {
                TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
                TY_(UngetToken)( doc );
            }
            return;
        }

        /* <html> start tags are reported; </html> is dropped silently */
        if ( nodeIsHTML(node) )
        {
            if (TY_(nodeIsElement)(node))
                TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(noframes, node))
            continue;

        if ( nodeIsBODY(node) && node->type == StartTag )
        {
            /* sample seenEndBody before ParseTag, which may change it */
            Bool seen_body = lexer->seenEndBody;
            TY_(InsertNodeAtEnd)(noframes, node);
            ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);

            /* fix for bug http://tidy.sf.net/bug/887259 */
            /* a <body> met after a body was already closed, and which
               is not the one FindBody returns, is coerced to <div>
               and handed to MoveNodeToBody */
            if (seen_body && TY_(FindBody)(doc) != node)
            {
                TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
                MoveNodeToBody(doc, node);
            }
            continue;
        }

        /* implicit body element inferred */
        if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
        {
            Node *body = TY_(FindBody)( doc );
            if ( body || lexer->seenEndBody )
            {
                /* a body was closed earlier but FindBody finds none:
                   nowhere to put the content */
                if ( body == NULL )
                {
                    TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }
                /* text is pushed back and an inferred <p> takes its place */
                if ( TY_(nodeIsText)(node) )
                {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)(doc, TidyTag_P);
                    TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY );
                }
                TY_(InsertNodeAtEnd)( body, node );
            }
            else
            {
                /* no body yet: push the token back and infer one */
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_BODY);
                if ( cfgBool(doc, TidyXmlOut) )
                    TY_(ReportError)(doc, noframes, node, INSERTING_TAG);
                TY_(InsertNodeAtEnd)( noframes, node );
            }

            ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
            continue;
        }

        /* discard unexpected end tags */
        TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }

    /* input exhausted without </noframes>; node is NULL at this point */
    TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR);
}
3609
TY_(ParseFrameSet)3610 void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode))
3611 {
3612 Lexer* lexer = doc->lexer;
3613 Node *node;
3614
3615 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3616 {
3617 doc->badAccess |= BA_USING_FRAMES;
3618 }
3619
3620 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3621 {
3622 if (node->tag == frameset->tag && node->type == EndTag)
3623 {
3624 TY_(FreeNode)( doc, node);
3625 frameset->closed = yes;
3626 TrimSpaces(doc, frameset);
3627 return;
3628 }
3629
3630 /* deal with comments etc. */
3631 if (InsertMisc(frameset, node))
3632 continue;
3633
3634 if (node->tag == NULL)
3635 {
3636 TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3637 TY_(FreeNode)( doc, node);
3638 continue;
3639 }
3640
3641 if (TY_(nodeIsElement)(node))
3642 {
3643 if (node->tag && node->tag->model & CM_HEAD)
3644 {
3645 MoveToHead(doc, frameset, node);
3646 continue;
3647 }
3648 }
3649
3650 if ( nodeIsBODY(node) )
3651 {
3652 TY_(UngetToken)( doc );
3653 node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3654 TY_(ReportError)(doc, frameset, node, INSERTING_TAG);
3655 }
3656
3657 if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3658 {
3659 TY_(InsertNodeAtEnd)(frameset, node);
3660 lexer->excludeBlocks = no;
3661 ParseTag(doc, node, MixedContent);
3662 continue;
3663 }
3664 else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3665 {
3666 TY_(InsertNodeAtEnd)(frameset, node);
3667 continue;
3668 }
3669
3670 /* discard unexpected tags */
3671 #if SUPPORT_ACCESSIBILITY_CHECKS
3672 /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
3673 if ( nodeIsA(node) )
3674 doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
3675 #endif
3676
3677 TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3678 TY_(FreeNode)( doc, node);
3679 }
3680
3681 TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR);
3682 }
3683
/*
  Parses the content of the <html> element.

  doc  - document being parsed; tokens are pulled from it with GetToken
  html - the html element node that receives the parsed children
  mode - token mode handed on to ParseHead, ParseBody and ParseTag

  The work is done in two phases.  The first loop locates the <head>
  element (inferring one when some other token shows up first) and
  parses it.  The second loop inspects the tokens that follow the head
  and either hands over to a <body> (explicit or inferred) or builds
  a <frameset>/<noframes> structure.
*/
void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
{
    Node *node, *head;
    Node *frameset = NULL;  /* first frameset element accepted, if any */
    Node *noframes = NULL;  /* noframes element that collects other content */

    TY_(SetOptionBool)( doc, TidyXmlTags, no );

    /* phase 1: find the head element or infer it */
    for (;;)
    {
        node = TY_(GetToken)(doc, IgnoreWhitespace);

        if (node == NULL)
        {
            /* no more tokens: carry on with an inferred head */
            node = TY_(InferredTag)(doc, TidyTag_HEAD);
            break;
        }

        if ( nodeIsHEAD(node) )
            break;

        /* an </html> ahead of the head is discarded */
        if (node->tag == html->tag && node->type == EndTag)
        {
            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* find and discard multiple <html> elements */
        if (node->tag == html->tag && node->type == StartTag)
        {
            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(html, node))
            continue;

        /* anything else is pushed back and an inferred head is used */
        TY_(UngetToken)( doc );
        node = TY_(InferredTag)(doc, TidyTag_HEAD);
        break;
    }

    head = node;
    TY_(InsertNodeAtEnd)(html, head);
    TY_(ParseHead)(doc, head, mode);

    /* phase 2: decide between body and frameset/noframes content */
    for (;;)
    {
        node = TY_(GetToken)(doc, IgnoreWhitespace);

        if (node == NULL)
        {
            if (frameset == NULL) /* implied body */
            {
                node = TY_(InferredTag)(doc, TidyTag_BODY);
                TY_(InsertNodeAtEnd)(html, node);
                TY_(ParseBody)(doc, node, mode);
            }

            return;
        }

        /* robustly handle html tags */
        if (node->tag == html->tag)
        {
            /* only non-start html tags in a non-frameset document are reported */
            if (node->type != StartTag && frameset == NULL)
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(html, node))
            continue;

        /* if frameset document coerce <body> to <noframes> */
        if ( nodeIsBODY(node) )
        {
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* the coercion is only applied when accessibility checks are off */
            if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
            {
                if (frameset != NULL)
                {
                    /* push <body> back so it is read again while parsing noframes */
                    TY_(UngetToken)( doc );

                    if (noframes == NULL)
                    {
                        noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
                        TY_(InsertNodeAtEnd)(frameset, noframes);
                        TY_(ReportError)(doc, html, noframes, INSERTING_TAG);
                    }
                    else
                    {
                        /* reopen an empty-element noframes so it can take content */
                        if (noframes->type == StartEndTag)
                            noframes->type = StartTag;
                    }

                    ParseTag(doc, noframes, mode);
                    continue;
                }
            }

            TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
            break;  /* to parse body */
        }

        /* flag an error if we see more than one frameset */
        if ( nodeIsFRAMESET(node) )
        {
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if (frameset != NULL)
                TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET);
            else
                frameset = node;

            /* a duplicate frameset is still inserted and parsed after the report */
            TY_(InsertNodeAtEnd)(html, node);
            ParseTag(doc, node, mode);

            /*
              see if it includes a noframes element so
              that we can merge subsequent noframes elements
            */

            /* the scan covers the first frameset and keeps the last match */
            for (node = frameset->content; node; node = node->next)
            {
                if ( nodeIsNOFRAMES(node) )
                    noframes = node;
            }
            continue;
        }

        /* if not a frameset document coerce <noframes> to <body> */
        if ( nodeIsNOFRAMES(node) )
        {
            if (node->type != StartTag)
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if (frameset == NULL)
            {
                /* drop the noframes tag itself and go on with an inferred body */
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                node = TY_(InferredTag)(doc, TidyTag_BODY);
                break;
            }

            if (noframes == NULL)
            {
                noframes = node;
                TY_(InsertNodeAtEnd)(frameset, noframes);
            }
            else
                /*
                  NOTE(review): unlike the other merge paths in this
                  function, an existing noframes of type StartEndTag is
                  not switched to StartTag here - confirm this is intended
                */
                TY_(FreeNode)( doc, node);

            ParseTag(doc, noframes, mode);
            continue;
        }

        if (TY_(nodeIsElement)(node))
        {
            /* elements whose content model includes CM_HEAD go to the head */
            if (node->tag && node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, html, node);
                continue;
            }

            /* discard illegal frame element following a frameset */
            if ( frameset != NULL && nodeIsFRAME(node) )
            {
                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
        }

        /* the token is pushed back for whichever parser runs next */
        TY_(UngetToken)( doc );

        /* insert other content into noframes element */

        if (frameset)
        {
            if (noframes == NULL)
            {
                noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
                TY_(InsertNodeAtEnd)(frameset, noframes);
            }
            else
            {
                TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT);
                if (noframes->type == StartEndTag)
                    noframes->type = StartTag;
            }

            TY_(ConstrainVersion)(doc, VERS_FRAMESET);
            ParseTag(doc, noframes, mode);
            continue;
        }

        /* no frameset: the content belongs to an inferred body */
        node = TY_(InferredTag)(doc, TidyTag_BODY);
        TY_(ReportError)(doc, html, node, INSERTING_TAG );
        TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
        break;
    }

    /* node must be body */

    TY_(InsertNodeAtEnd)(html, node);
    ParseTag(doc, node, mode);
}
3912
nodeCMIsOnlyInline(Node * node)3913 static Bool nodeCMIsOnlyInline( Node* node )
3914 {
3915 return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
3916 }
3917
/*
  Wraps loose content found directly inside <body> in inferred
  <p> elements.

  A run starts at a non-blank text node or at an inline-only
  element and extends over every following sibling that is not
  an element or is an inline-only element.  Each run is moved
  into a new <p> placed where the run began, and the paragraph
  is then passed to TrimSpaces.  Nothing is done when the
  document has no body.
*/
static void EncloseBodyText(TidyDocImpl* doc)
{
    Node* body = TY_(FindBody)(doc);
    Node* cur;

    if (body == NULL)
        return;

    cur = body->content;

    while (cur != NULL)
    {
        Bool startsRun =
            (TY_(nodeIsText)(cur) && !TY_(IsBlank)(doc->lexer, cur)) ||
            (TY_(nodeIsElement)(cur) && nodeCMIsOnlyInline(cur));

        if (!startsRun)
        {
            cur = cur->next;
            continue;
        }

        {
            Node* para = TY_(InferredTag)(doc, TidyTag_P);
            TY_(InsertNodeBeforeElement)(cur, para);

            /* move the whole run into the paragraph */
            while (cur != NULL &&
                   (!TY_(nodeIsElement)(cur) || nodeCMIsOnlyInline(cur)))
            {
                Node* following = cur->next;
                TY_(RemoveNode)(cur);
                TY_(InsertNodeAtEnd)(para, cur);
                cur = following;
            }

            TrimSpaces(doc, para);
            /* cur now names the node that ended the run (or NULL) */
        }
    }
}
3948
3949 /* <form>, <blockquote> and <noscript> do not allow #PCDATA in
3950 HTML 4.01 Strict (%block; model instead of %flow;).
3951 When requested, text nodes in these elements are wrapped in <p>. */
EncloseBlockText(TidyDocImpl * doc,Node * node)3952 static void EncloseBlockText(TidyDocImpl* doc, Node* node)
3953 {
3954 Node *next;
3955 Node *block;
3956
3957 while (node)
3958 {
3959 next = node->next;
3960
3961 if (node->content)
3962 EncloseBlockText(doc, node->content);
3963
3964 if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
3965 nodeIsBLOCKQUOTE(node))
3966 || !node->content)
3967 {
3968 node = next;
3969 continue;
3970 }
3971
3972 block = node->content;
3973
3974 if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
3975 (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
3976 {
3977 Node* p = TY_(InferredTag)(doc, TidyTag_P);
3978 TY_(InsertNodeBeforeElement)(block, p);
3979 while (block &&
3980 (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
3981 {
3982 Node* tempNext = block->next;
3983 TY_(RemoveNode)(block);
3984 TY_(InsertNodeAtEnd)(p, block);
3985 block = tempNext;
3986 }
3987 TrimSpaces(doc, p);
3988 continue;
3989 }
3990
3991 node = next;
3992 }
3993 }
3994
/*
  Walks the sibling list starting at node, and recursively the
  content of each node, coercing <dir> and <menu> to <ul> and
  <xmp>, <listing> and <plaintext> to <pre>.  Every coercion is
  requested with both the obsolete and unexpected flags set.
*/
static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
{
    Node *cur;
    Node *following;

    for (cur = node; cur != NULL; cur = following)
    {
        /* remember the successor before the node is touched */
        following = cur->next;

        if (nodeIsDIR(cur) || nodeIsMENU(cur))
            TY_(CoerceNode)(doc, cur, TidyTag_UL, yes, yes);

        if (nodeIsXMP(cur) || nodeIsLISTING(cur) ||
            (cur->tag && cur->tag->id == TidyTag_PLAINTEXT))
            TY_(CoerceNode)(doc, cur, TidyTag_PRE, yes, yes);

        if (cur->content)
            ReplaceObsoleteElements(doc, cur->content);
    }
}
4016
/*
  Runs the attribute checks for every element in the sibling list
  starting at node, and recursively for the content of each node.

  An element whose tag definition provides a chkattrs callback is
  checked by that callback; every other element goes through the
  generic CheckAttributes.  Elements without a tag definition
  (node->tag == NULL) also take the generic path rather than being
  dereferenced through a NULL tag pointer.
*/
static void AttributeChecks(TidyDocImpl* doc, Node* node)
{
    Node *next;

    while (node)
    {
        next = node->next;

        if (TY_(nodeIsElement)(node))
        {
            /* node->tag may be NULL for elements without a definition */
            if (node->tag && node->tag->chkattrs)
                node->tag->chkattrs(doc, node);
            else
                TY_(CheckAttributes)(doc, node);
        }

        if (node->content)
            AttributeChecks(doc, node->content);

        assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
        node = next;
    }
}
4040
4041 /*
4042 HTML is the top level element
4043 */
TY_(ParseDocument)4044 void TY_(ParseDocument)(TidyDocImpl* doc)
4045 {
4046 Node *node, *html, *doctype = NULL;
4047
4048 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4049 {
4050 if (node->type == XmlDecl)
4051 {
4052 if (TY_(FindXmlDecl)(doc) && doc->root.content)
4053 {
4054 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4055 TY_(FreeNode)(doc, node);
4056 continue;
4057 }
4058 if (node->line != 1 || (node->line == 1 && node->column != 1))
4059 {
4060 TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
4061 }
4062 }
4063 #ifdef AUTO_INPUT_ENCODING
4064 if (node->type == XmlDecl)
4065 {
4066 AttVal* encoding = GetAttrByName(node, "encoding");
4067 if (AttrHasValue(encoding))
4068 {
4069 uint id = TY_(GetEncodingIdFromName)(encoding->value);
4070
4071 /* todo: detect mismatch with BOM/XMLDecl/declared */
4072 /* todo: error for unsupported encodings */
4073 /* todo: try to re-init transcoder */
4074 /* todo: change input/output encoding settings */
4075 /* todo: store id in StreamIn */
4076 }
4077 }
4078 #endif /* AUTO_INPUT_ENCODING */
4079
4080 /* deal with comments etc. */
4081 if (InsertMisc( &doc->root, node ))
4082 continue;
4083
4084 if (node->type == DocTypeTag)
4085 {
4086 if (doctype == NULL)
4087 {
4088 TY_(InsertNodeAtEnd)( &doc->root, node);
4089 doctype = node;
4090 }
4091 else
4092 {
4093 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4094 TY_(FreeNode)( doc, node);
4095 }
4096 continue;
4097 }
4098
4099 if (node->type == EndTag)
4100 {
4101 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4102 TY_(FreeNode)( doc, node);
4103 continue;
4104 }
4105
4106 if (node->type == StartTag && nodeIsHTML(node))
4107 {
4108 AttVal *xmlns;
4109
4110 xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
4111
4112 if (AttrValueIs(xmlns, XHTML_NAMESPACE))
4113 {
4114 Bool htmlOut = cfgBool( doc, TidyHtmlOut );
4115 doc->lexer->isvoyager = yes; /* Unless plain HTML */
4116 TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
4117 TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */
4118
4119 /* adjust other config options, just as in config.c */
4120 if ( !htmlOut )
4121 {
4122 TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
4123 TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no );
4124 }
4125 }
4126 }
4127
4128 if ( node->type != StartTag || !nodeIsHTML(node) )
4129 {
4130 TY_(UngetToken)( doc );
4131 html = TY_(InferredTag)(doc, TidyTag_HTML);
4132 }
4133 else
4134 html = node;
4135
4136 if (!TY_(FindDocType)(doc))
4137 TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE);
4138
4139 TY_(InsertNodeAtEnd)( &doc->root, html);
4140 TY_(ParseHTML)( doc, html, IgnoreWhitespace );
4141 break;
4142 }
4143
4144 #if SUPPORT_ACCESSIBILITY_CHECKS
4145 /* do this before any more document fixes */
4146 if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
4147 TY_(AccessibilityChecks)( doc );
4148 #endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */
4149
4150 if (!TY_(FindHTML)(doc))
4151 {
4152 /* a later check should complain if <body> is empty */
4153 html = TY_(InferredTag)(doc, TidyTag_HTML);
4154 TY_(InsertNodeAtEnd)( &doc->root, html);
4155 TY_(ParseHTML)(doc, html, IgnoreWhitespace);
4156 }
4157
4158 if (!TY_(FindTITLE)(doc))
4159 {
4160 Node* head = TY_(FindHEAD)(doc);
4161 TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT);
4162 TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
4163 }
4164
4165 AttributeChecks(doc, &doc->root);
4166 ReplaceObsoleteElements(doc, &doc->root);
4167 TY_(DropEmptyElements)(doc, &doc->root);
4168 CleanSpaces(doc, &doc->root);
4169
4170 if (cfgBool(doc, TidyEncloseBodyText))
4171 EncloseBodyText(doc);
4172 if (cfgBool(doc, TidyEncloseBlockText))
4173 EncloseBlockText(doc, &doc->root);
4174 }
4175
TY_(XMLPreserveWhiteSpace)4176 Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
4177 {
4178 AttVal *attribute;
4179
4180 /* search attributes for xml:space */
4181 for (attribute = element->attributes; attribute; attribute = attribute->next)
4182 {
4183 if (attrIsXML_SPACE(attribute))
4184 {
4185 if (AttrValueIs(attribute, "preserve"))
4186 return yes;
4187
4188 return no;
4189 }
4190 }
4191
4192 if (element->element == NULL)
4193 return no;
4194
4195 /* kludge for html docs without explicit xml:space attribute */
4196 if (nodeIsPRE(element) ||
4197 nodeIsSCRIPT(element) ||
4198 nodeIsSTYLE(element) ||
4199 TY_(FindParser)(doc, element) == TY_(ParsePre))
4200 return yes;
4201
4202 /* kludge for XSL docs */
4203 if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
4204 return yes;
4205
4206 return no;
4207 }
4208
4209 /*
4210 XML documents
4211 */
/*
  Parses the content of one XML element.

  doc     - document being parsed
  element - element whose children are collected; must not be NULL
  mode    - token mode; replaced by Preformatted when
            XMLPreserveWhiteSpace says so for this element

  Tokens are read until the matching end tag (same name, compared
  case-sensitively) or the end of input.  Other end tags are
  reported as fatal and discarded.  Start tags are parsed
  recursively before the node is appended to element.  Unless the
  mode is Preformatted, one leading space of a first text child and
  one trailing space of a last text child are trimmed, and a text
  node emptied that way is discarded.
*/
static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;

    /* if node is pre or has xml:space="preserve" then do so */

    if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
        mode = Preformatted;

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        if (node->type == EndTag)
        {
            /* the matching end tag closes this element */
            if (node->element && element->element &&
                TY_(tmbstrcmp)(node->element, element->element) == 0)
            {
                TY_(FreeNode)( doc, node);
                element->closed = yes;
                break;
            }

            /*
              discard unexpected end tags; element has already been
              dereferenced above, so it cannot be NULL at this point
            */
            TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* parse content on seeing start tag */
        if (node->type == StartTag)
            ParseXMLElement( doc, node, mode );

        TY_(InsertNodeAtEnd)(element, node);
    }

    /*
     if first child is text then trim initial space and
     delete text node if it is empty.
    */

    node = element->content;

    if (TY_(nodeIsText)(node) && mode != Preformatted)
    {
        /*
          NOTE(review): the index is not checked against node->end;
          presumably the lexer never yields an empty text node - confirm
        */
        if ( lexer->lexbuf[node->start] == ' ' )
        {
            node->start++;

            if (node->start >= node->end)
                TY_(DiscardElement)( doc, node );
        }
    }

    /*
     if last child is text then trim final space and
     delete the text node if it is empty
    */

    /* read again: the first child may just have been discarded */
    node = element->last;

    if (TY_(nodeIsText)(node) && mode != Preformatted)
    {
        if ( lexer->lexbuf[node->end - 1] == ' ' )
        {
            node->end--;

            if (node->start >= node->end)
                TY_(DiscardElement)( doc, node );
        }
    }
}
4288
TY_(ParseXMLDocument)4289 void TY_(ParseXMLDocument)(TidyDocImpl* doc)
4290 {
4291 Node *node, *doctype = NULL;
4292
4293 TY_(SetOptionBool)( doc, TidyXmlTags, yes );
4294
4295 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4296 {
4297 /* discard unexpected end tags */
4298 if (node->type == EndTag)
4299 {
4300 TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG);
4301 TY_(FreeNode)( doc, node);
4302 continue;
4303 }
4304
4305 /* deal with comments etc. */
4306 if (InsertMisc( &doc->root, node))
4307 continue;
4308
4309 if (node->type == DocTypeTag)
4310 {
4311 if (doctype == NULL)
4312 {
4313 TY_(InsertNodeAtEnd)( &doc->root, node);
4314 doctype = node;
4315 }
4316 else
4317 {
4318 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4319 TY_(FreeNode)( doc, node);
4320 }
4321 continue;
4322 }
4323
4324 if (node->type == StartEndTag)
4325 {
4326 TY_(InsertNodeAtEnd)( &doc->root, node);
4327 continue;
4328 }
4329
4330 /* if start tag then parse element's content */
4331 if (node->type == StartTag)
4332 {
4333 TY_(InsertNodeAtEnd)( &doc->root, node );
4334 ParseXMLElement( doc, node, IgnoreWhitespace );
4335 continue;
4336 }
4337
4338 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4339 TY_(FreeNode)( doc, node);
4340 }
4341
4342 /* ensure presence of initial <?xml version="1.0"?> */
4343 if ( cfgBool(doc, TidyXmlDecl) )
4344 TY_(FixXmlDecl)( doc );
4345 }
4346
4347 /*
4348 * local variables:
4349 * mode: c
4350 * indent-tabs-mode: nil
4351 * c-basic-offset: 4
4352 * eval: (c-set-offset 'substatement-open 0)
4353 * end:
4354 */
4355