1 /*
2   clean.c -- clean up misuse of presentation markup
3 
4   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5   See tidy.h for the copyright notice.
6 
7   Filters from other formats such as Microsoft Word
8   often make excessive use of presentation markup such
  as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
  transform this to use CSS.
12 
13   Some rules replace some of the children of an element by
14   style properties on the element, e.g.
15 
16   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17 
  Such rules are applied to the element's content and then
  to the element itself until no more of the rules apply.
20   Having applied all the rules to an element, it will have
21   a style attribute with one or more properties.
22 
23   Other rules strip the element they apply to, replacing
24   it by style properties on the contents, e.g.
25 
  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
27 
28   These rules are applied to an element before processing
29   its content and replace the current element by the first
30   element in the exposed content.
31 
32   After applying both sets of rules, you can replace the
33   style attribute by a class value and style rule in the
34   document head. To support this, an association of styles
35   and class names is built.
36 
37   A naive approach is to rely on string matching to test
38   when two property lists are the same. A better approach
39   would be to first sort the properties before matching.
40 
41 */
42 
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 
47 #include "tidy-int.h"
48 #include "clean.h"
49 #include "lexer.h"
50 #include "parser.h"
51 #include "attrs.h"
52 #include "message.h"
53 #include "tmbstr.h"
54 #include "utf8.h"
55 
56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
57 
/*
 Retag "node" as the element identified by "tid": the old element
 name string is released, a copy of the dictionary entry's name is
 stored in its place, and node->tag is pointed at that entry.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* def = TY_(LookupTagDef)( tid );

    TidyDocFree( doc, node->element );
    node->tag = def;
    node->element = TY_(tmbstrdup)( doc->allocator, def->name );
}
65 
/*
 Release a whole property list: for each entry the name string,
 the value string and then the entry itself are freed.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *cur, *following;

    for (cur = props; cur != NULL; cur = following)
    {
        /* remember the successor before the entry goes away */
        following = cur->next;

        TidyDocFree(doc, cur->name);
        TidyDocFree(doc, cur->value);
        TidyDocFree(doc, cur);
    }
}
79 
/*
 Insert a name/value pair into a property list that is kept in
 ascending name order (as ordered by tmbstrcmp).

 If an entry with the same name is already present the list is
 left untouched and the new value is ignored. Otherwise a new
 entry holding copies of "name" and "value" is linked in.

 Returns the (possibly new) head of the list.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    StyleProp *head = props;
    StyleProp *before = NULL;   /* entry the new one is linked after */
    StyleProp *after = props;   /* entry the new one is linked before */
    StyleProp *entry;

    /* walk forward until we hit a name that sorts after ours */
    for ( ; after != NULL; before = after, after = after->next)
    {
        int order = TY_(tmbstrcmp)(after->name, name);

        if (order == 0)
            return head;   /* already defined: first value wins */

        if (order > 0)
            break;
    }

    entry = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
    entry->name = TY_(tmbstrdup)(doc->allocator, name);
    entry->value = TY_(tmbstrdup)(doc->allocator, value);
    entry->next = after;

    if (before != NULL)
        before->next = entry;
    else
        head = entry;

    return head;
}
131 
132 /*
133  Create sorted linked list of properties from style string
134  It temporarily places nulls in place of ':' and ';' to
135  delimit the strings for the property name and value.
136  Some systems don't allow you to NULL literal strings,
137  so to avoid this, a copy is made first.
138 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)139 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
140 {
141     tmbstr name, value = NULL, name_end, value_end, line;
142     Bool more;
143 
144     line = TY_(tmbstrdup)(doc->allocator, style);
145     name = line;
146 
147     while (*name)
148     {
149         while (*name == ' ')
150             ++name;
151 
152         name_end = name;
153 
154         while (*name_end)
155         {
156             if (*name_end == ':')
157             {
158                 value = name_end + 1;
159                 break;
160             }
161 
162             ++name_end;
163         }
164 
165         if (*name_end != ':')
166             break;
167 
168         while ( value && *value == ' ')
169             ++value;
170 
171         value_end = value;
172         more = no;
173 
174         while (*value_end)
175         {
176             if (*value_end == ';')
177             {
178                 more = yes;
179                 break;
180             }
181 
182             ++value_end;
183         }
184 
185         *name_end = '\0';
186         *value_end = '\0';
187 
188         prop = InsertProperty(doc, prop, name, value);
189         *name_end = ':';
190 
191         if (more)
192         {
193             *value_end = ';';
194             name = value_end + 1;
195             continue;
196         }
197 
198         break;
199     }
200 
201     TidyDocFree(doc, line);  /* free temporary copy */
202     return prop;
203 }
204 
CreatePropString(TidyDocImpl * doc,StyleProp * props)205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
206 {
207     tmbstr style, p, s;
208     uint len;
209     StyleProp *prop;
210 
211     /* compute length */
212 
213     for (len = 0, prop = props; prop; prop = prop->next)
214     {
215         len += TY_(tmbstrlen)(prop->name) + 2;
216         if (prop->value)
217             len += TY_(tmbstrlen)(prop->value) + 2;
218     }
219 
220     style = (tmbstr) TidyDocAlloc(doc, len+1);
221     style[0] = '\0';
222 
223     for (p = style, prop = props; prop; prop = prop->next)
224     {
225         s = prop->name;
226 
227         while((*p++ = *s++))
228             continue;
229 
230         if (prop->value)
231         {
232             *--p = ':';
233             *++p = ' ';
234             ++p;
235 
236             s = prop->value;
237             while((*p++ = *s++))
238                 continue;
239         }
240         if (prop->next == NULL)
241             break;
242 
243         *--p = ';';
244         *++p = ' ';
245         ++p;
246     }
247 
248     return style;
249 }
250 
251 /*
252   create string with merged properties
253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
254 {
255     tmbstr line;
256     StyleProp *prop;
257 
258     prop = CreateProps(doc, NULL, style);
259     prop = CreateProps(doc, prop, property);
260     line = CreatePropString(doc, prop);
261     FreeStyleProps(doc, prop);
262     return line;
263 }
264 */
265 
TY_(FreeStyles)266 void TY_(FreeStyles)( TidyDocImpl* doc )
267 {
268     Lexer* lexer = doc->lexer;
269     if ( lexer )
270     {
271         TagStyle *style, *next;
272         for ( style = lexer->styles; style; style = next )
273         {
274             next = style->next;
275             TidyDocFree( doc, style->tag );
276             TidyDocFree( doc, style->tag_class );
277             TidyDocFree( doc, style->properties );
278             TidyDocFree( doc, style );
279         }
280     }
281 }
282 
GensymClass(TidyDocImpl * doc)283 static tmbstr GensymClass( TidyDocImpl* doc )
284 {
285     tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
286     ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287     if ( pfx == NULL || *pfx == 0 )
288       pfx = "c";
289 
290     TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291     return TY_(tmbstrdup)(doc->allocator, buf);
292 }
293 
/*
 Look up the class name associated with a tag name and property
 string in the lexer's style list. When no entry matches both
 strings exactly, a new entry with a generated class name is
 pushed onto the front of the list.

 Returns the class name stored in the list entry (not a copy).
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0 &&
            TY_(tmbstrcmp)(entry->properties, properties) == 0)
        {
            return entry->tag_class;
        }

        entry = entry->next;
    }

    /* not known yet: record it under a new class name */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );

    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
314 
315 /*
316  Add class="foo" to node
317 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
319 {
320     AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
321 
322     /*
323      if there already is a class attribute
324      then append class name after a space.
325     */
326     if (classattr)
327         TY_(AppendToClassAttr)( doc, classattr, classname );
328     else /* create new class attribute */
329         TY_(AddAttribute)( doc, node, "class", classname );
330 }
331 
TY_(AddStyleAsClass)332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
333 {
334     ctmbstr classname;
335 
336     classname = FindStyle( doc, node->element, stylevalue );
337     AddClass( doc, node, classname);
338 }
339 
340 /*
341  Find style attribute in node, and replace it
342  by corresponding class attribute. Search for
343  class in style dictionary otherwise gensym
344  new class and add to dictionary.
345 
346  Assumes that node doesn't have a class attribute
347 */
Style2Rule(TidyDocImpl * doc,Node * node)348 static void Style2Rule( TidyDocImpl* doc, Node *node)
349 {
350     AttVal *styleattr, *classattr;
351     ctmbstr classname;
352 
353     styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
354 
355     if (styleattr)
356     {
357         /* fix for http://tidy.sf.net/bug/850215 */
358         if (!styleattr->value)
359         {
360             TY_(RemoveAttribute)(doc, node, styleattr);
361             return;
362         }
363 
364         classname = FindStyle( doc, node->element, styleattr->value );
365         classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
366 
367         /*
368          if there already is a class attribute
369          then append class name after an underscore
370         */
371         if (classattr)
372         {
373             TY_(AppendToClassAttr)( doc, classattr, classname );
374             TY_(RemoveAttribute)( doc, node, styleattr );
375         }
376         else /* reuse style attribute for class attribute */
377         {
378             TidyDocFree(doc, styleattr->attribute);
379             TidyDocFree(doc, styleattr->value);
380             styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381             styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
382         }
383     }
384 }
385 
/*
 Append the rule "<selector> { color: <color> }" plus a newline
 to the lexer buffer. Nothing is written when either string is
 NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
396 
397 /*
398  move presentation attribs from body to style element
399 
400  background="foo" ->  body { background-image: url(foo) }
401  bgcolor="foo"    ->  body { background-color: foo }
402  text="foo"       ->  body { color: foo }
403  link="foo"       ->  :link { color: foo }
404  vlink="foo"      ->  :visited { color: foo }
405  alink="foo"      ->  :active { color: foo }
406 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
408 {
409     Lexer* lexer  = doc->lexer;
410     tmbstr bgurl   = NULL;
411     tmbstr bgcolor = NULL;
412     tmbstr color   = NULL;
413     AttVal* attr;
414 
415     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
416     {
417         bgurl = attr->value;
418         attr->value = NULL;
419         TY_(RemoveAttribute)( doc, body, attr );
420     }
421 
422     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
423     {
424         bgcolor = attr->value;
425         attr->value = NULL;
426         TY_(RemoveAttribute)( doc, body, attr );
427     }
428 
429     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
430     {
431         color = attr->value;
432         attr->value = NULL;
433         TY_(RemoveAttribute)( doc, body, attr );
434     }
435 
436     if ( bgurl || bgcolor || color )
437     {
438         TY_(AddStringLiteral)(lexer, " body {\n");
439         if (bgurl)
440         {
441             TY_(AddStringLiteral)(lexer, "  background-image: url(");
442             TY_(AddStringLiteral)(lexer, bgurl);
443             TY_(AddStringLiteral)(lexer, ");\n");
444             TidyDocFree(doc, bgurl);
445         }
446         if (bgcolor)
447         {
448             TY_(AddStringLiteral)(lexer, "  background-color: ");
449             TY_(AddStringLiteral)(lexer, bgcolor);
450             TY_(AddStringLiteral)(lexer, ";\n");
451             TidyDocFree(doc, bgcolor);
452         }
453         if (color)
454         {
455             TY_(AddStringLiteral)(lexer, "  color: ");
456             TY_(AddStringLiteral)(lexer, color);
457             TY_(AddStringLiteral)(lexer, ";\n");
458             TidyDocFree(doc, color);
459         }
460 
461         TY_(AddStringLiteral)(lexer, " }\n");
462     }
463 
464     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
465     {
466         AddColorRule(lexer, " :link", attr->value);
467         TY_(RemoveAttribute)( doc, body, attr );
468     }
469 
470     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
471     {
472         AddColorRule(lexer, " :visited", attr->value);
473         TY_(RemoveAttribute)( doc, body, attr );
474     }
475 
476     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
477     {
478         AddColorRule(lexer, " :active", attr->value);
479         TY_(RemoveAttribute)( doc, body, attr );
480     }
481 }
482 
NiceBody(TidyDocImpl * doc)483 static Bool NiceBody( TidyDocImpl* doc )
484 {
485     Node* node = TY_(FindBody)(doc);
486     if (node)
487     {
488         if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
489             TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
490             TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
491             TY_(AttrGetById)(node, TidyAttr_LINK)       ||
492             TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
493             TY_(AttrGetById)(node, TidyAttr_ALINK))
494         {
495             doc->badLayout |= USING_BODY;
496             return no;
497         }
498     }
499 
500     return yes;
501 }
502 
503 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)504 static void CreateStyleElement( TidyDocImpl* doc )
505 {
506     Lexer* lexer = doc->lexer;
507     Node *node, *head, *body;
508     TagStyle *style;
509     AttVal *av;
510 
511     if ( lexer->styles == NULL && NiceBody(doc) )
512         return;
513 
514     node = TY_(NewNode)( doc->allocator, lexer );
515     node->type = StartTag;
516     node->implicit = yes;
517     node->element = TY_(tmbstrdup)(doc->allocator, "style");
518     TY_(FindTag)( doc, node );
519 
520     /* insert type attribute */
521     av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
522     TY_(InsertAttributeAtStart)( node, av );
523 
524     body = TY_(FindBody)( doc );
525     lexer->txtstart = lexer->lexsize;
526     if ( body )
527         CleanBodyAttrs( doc, body );
528 
529     for (style = lexer->styles; style; style = style->next)
530     {
531         TY_(AddCharToLexer)(lexer, ' ');
532         TY_(AddStringLiteral)(lexer, style->tag);
533         TY_(AddCharToLexer)(lexer, '.');
534         TY_(AddStringLiteral)(lexer, style->tag_class);
535         TY_(AddCharToLexer)(lexer, ' ');
536         TY_(AddCharToLexer)(lexer, '{');
537         TY_(AddStringLiteral)(lexer, style->properties);
538         TY_(AddCharToLexer)(lexer, '}');
539         TY_(AddCharToLexer)(lexer, '\n');
540     }
541 
542     lexer->txtend = lexer->lexsize;
543 
544     TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
545 
546     /*
547      now insert style element into document head
548 
549      doc is root node. search its children for html node
550      the head node should be first child of html node
551     */
552     if ( NULL != (head = TY_(FindHEAD)( doc )) )
553         TY_(InsertNodeAtEnd)( head, node );
554 }
555 
556 
557 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)558 void TY_(FixNodeLinks)(Node *node)
559 {
560     Node *child;
561 
562     if (node->prev)
563         node->prev->next = node;
564     else
565         node->parent->content = node;
566 
567     if (node->next)
568         node->next->prev = node;
569     else
570         node->parent->last = node;
571 
572     for (child = node->content; child; child = child->next)
573         child->parent = node;
574 }
575 
576 /*
577  used to strip child of node when
578  the node has one and only one child
579 */
StripOnlyChild(TidyDocImpl * doc,Node * node)580 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
581 {
582     Node *child;
583 
584     child = node->content;
585     node->content = child->content;
586     node->last = child->last;
587     child->content = NULL;
588     TY_(FreeNode)(doc, child);
589 
590     for (child = node->content; child; child = child->next)
591         child->parent = node;
592 }
593 
594 /*
595   used to strip font start and end tags.
596   Extricate "element", replace it by its content and delete it.
597 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)598 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
599 {
600     if (element->content)
601     {
602         Node *node, *parent = element->parent;
603 
604         element->last->next = element->next;
605 
606         if (element->next)
607         {
608             element->next->prev = element->last;
609         }
610         else
611             parent->last = element->last;
612 
613         if (element->prev)
614         {
615             element->content->prev = element->prev;
616             element->prev->next = element->content;
617         }
618         else
619             parent->content = element->content;
620 
621         for (node = element->content; node; node = node->next)
622             node->parent = parent;
623 
624         *pnode = element->content;
625 
626         element->next = element->content = NULL;
627         TY_(FreeNode)(doc, element);
628     }
629     else
630     {
631         *pnode = TY_(DiscardElement)(doc, element);
632     }
633 }
634 
635 /*
636   Create new string that consists of the
637   combined style properties in s1 and s2
638 
639   To merge property lists, we build a linked
640   list of property/values and insert properties
641   into the list in order, merging values for
642   the same property name.
643 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)644 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
645 {
646     tmbstr s;
647     StyleProp *prop;
648 
649     prop = CreateProps(doc, NULL, s1);
650     prop = CreateProps(doc, prop, s2);
651     s = CreatePropString(doc, prop);
652     FreeStyleProps(doc, prop);
653     return s;
654 }
655 
656 /*
657  Add style property to element, creating style
658  attribute as needed and adding ; delimiter
659 */
TY_(AddStyleProperty)660 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
661 {
662     AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
663 
664     /* if style attribute already exists then insert property */
665 
666     if ( av )
667     {
668         if (av->value != NULL)
669         {
670             tmbstr s = MergeProperties( doc, av->value, property );
671             TidyDocFree( doc, av->value );
672             av->value = s;
673         }
674         else
675         {
676             av->value = TY_(tmbstrdup)( doc->allocator, property );
677         }
678     }
679     else /* else create new style attribute */
680     {
681         av = TY_(NewAttributeEx)( doc, "style", property, '"' );
682         TY_(InsertAttributeAtStart)( node, av );
683     }
684 }
685 
/*
 Merge the class names of child into node.

 - child has no class value: nothing to do
 - node has no class attribute: a new one is created from the
   child's class value
 - node has a class attribute without a value: it receives a copy
   of the child's class value (rather than adding a second,
   duplicate class attribute to the node)
 - both have values: node's value becomes "<node> <child>"
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeClass = NULL;
    tmbstr childNames = NULL, names;
    uint l1, l2;

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            childNames = av->value;
            break;
        }
    }

    if (childNames == NULL)
        return;

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            nodeClass = av;
            break;
        }
    }

    if (nodeClass == NULL)  /* copy class names from child */
    {
        av = TY_(NewAttributeEx)( doc, "class", childNames, '"' );
        TY_(InsertAttributeAtStart)( node, av );
        return;
    }

    if (nodeClass->value == NULL)  /* fill in the empty attribute */
    {
        nodeClass->value = TY_(tmbstrdup)( doc->allocator, childNames );
        return;
    }

    /* merge class names from both, separated by a space */
    l1 = TY_(tmbstrlen)(nodeClass->value);
    l2 = TY_(tmbstrlen)(childNames);
    names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
    TY_(tmbstrcpy)(names, nodeClass->value);
    names[l1] = ' ';
    TY_(tmbstrcpy)(names+l1+1, childNames);
    TidyDocFree(doc, nodeClass->value);
    nodeClass->value = names;
}
730 
/*
 Merge the class and style attributes of child into node.

 Class names are handled by MergeClasses. For style:

 - child has no style value: nothing to do
 - node has no style attribute: a new one is created from the
   child's style value
 - node has a style attribute without a value: it receives a copy
   of the child's style value (rather than adding a second,
   duplicate style attribute to the node)
 - both have values: the property lists are merged, with node's
   own properties taking precedence
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeStyle = NULL;
    tmbstr childStyle = NULL, style;

    /*
       the child may have a class attribute used
       for attaching styles, if so the class name
       needs to be copied to node's class
    */
    MergeClasses(doc, node, child);

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            childStyle = av->value;
            break;
        }
    }

    if (childStyle == NULL)
        return;

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            nodeStyle = av;
            break;
        }
    }

    if (nodeStyle == NULL)  /* copy style of child */
    {
        av = TY_(NewAttributeEx)( doc, "style", childStyle, '"' );
        TY_(InsertAttributeAtStart)( node, av );
        return;
    }

    if (nodeStyle->value == NULL)  /* fill in the empty attribute */
    {
        nodeStyle->value = TY_(tmbstrdup)( doc->allocator, childStyle );
        return;
    }

    /* merge styles from both */
    style = MergeProperties(doc, nodeStyle->value, childStyle);
    TidyDocFree(doc, nodeStyle->value);
    nodeStyle->value = style;
}
776 
/*
 Translate the value of a font size attribute into a CSS
 font-size value.

 - ""                    -> NULL
 - leading digit 0..6    -> entry of the absolute table (NULL for 3)
 - '-' followed by 0..6  -> entry of the 0.8-step table,
   any other '-' value   -> "smaller"
 - anything else: if the second character is 0..6 the entry of
   the 1.2-step table is used, otherwise "larger"

 Only the first two characters of size are examined.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr absolute[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr shrink[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr grow[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    tmbchar lead = size[0];
    Bool stepIsDigit;

    if (lead == '\0')
        return NULL;

    if (lead >= '0' && lead <= '6')
        return absolute[lead - '0'];

    /* relative size: the step, if any, is the second character */
    stepIsDigit = (size[1] >= '0' && size[1] <= '6') ? yes : no;

    if (lead == '-')
    {
        if (stepIsDigit)
            return shrink[size[1] - '0'];
        return "smaller"; /*"70%"; */
    }

    if (stepIsDigit)
        return grow[size[1] - '0'];

    return "larger"; /* "140%" */
}
826 
/*
 Add "font-family: <face>" to the node's style. The property is
 formatted into a 256 character buffer.
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    tmbchar property[256];

    TY_(tmbsnprintf)(property, sizeof(property), "font-family: %s", face );
    TY_(AddStyleProperty)( doc, node, property );
}
833 
/*
 Apply a font size to node.

 A <p> with size "6", "5" or "4" is turned into <h1>, <h2> or
 <h3> respectively and gets no style property. In every other
 case the size is translated by FontSize2Name and, if that
 yields a value, added as a font-size style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    static const ctmbstr bigSizes[3] = { "6", "5", "4" };
    static const ctmbstr headings[3] = { "h1", "h2", "h3" };
    ctmbstr value;
    uint ix;

    if (nodeIsP(node))
    {
        for (ix = 0; ix < 3; ++ix)
        {
            if (TY_(tmbstrcmp)(size, bigSizes[ix]) == 0)
            {
                /* rename the paragraph to the matching heading */
                TidyDocFree(doc, node->element);
                node->element = TY_(tmbstrdup)(doc->allocator, headings[ix]);
                TY_(FindTag)(doc, node);
                return;
            }
        }
    }

    value = FontSize2Name(size);

    if (value != NULL)
    {
        tmbchar property[64];
        TY_(tmbsnprintf)(property, sizeof(property), "font-size: %s", value);
        TY_(AddStyleProperty)( doc, node, property );
    }
}
865 
/*
 Add "color: <color>" to the node's style. The property is
 formatted into a 128 character buffer.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar property[128];

    TY_(tmbsnprintf)(property, sizeof(property), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, property );
}
872 
873 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)874 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
875 {
876     uint i;
877     tmbchar buf[128];
878 
879     TY_(tmbstrcpy)( buf, "text-align: " );
880     for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
881     {
882         if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
883             break;
884     }
885     buf[i] = '\0';
886     TY_(AddStyleProperty)( doc, node, buf );
887 }
888 
889 /*
890  add style properties to node corresponding to
891  the font face, size and color attributes
892 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)893 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
894 {
895     while (av)
896     {
897         if (AttrHasValue(av))
898         {
899             if (attrIsFACE(av))
900                 AddFontFace( doc, node, av->value );
901             else if (attrIsSIZE(av))
902                 AddFontSize( doc, node, av->value );
903             else if (attrIsCOLOR(av))
904                 AddFontColor( doc, node, av->value );
905         }
906         av = av->next;
907     }
908 }
909 
910 /*
911     Symptom: <p align=center>
912     Action: <p style="text-align: center">
913 */
TextAlign(TidyDocImpl * doc,Node * node)914 static void TextAlign( TidyDocImpl* doc, Node* node )
915 {
916     AttVal *av, *prev;
917 
918     prev = NULL;
919 
920     for (av = node->attributes; av; av = av->next)
921     {
922         if (attrIsALIGN(av))
923         {
924             if (prev)
925                 prev->next = av->next;
926             else
927                 node->attributes = av->next;
928 
929             if (av->value)
930                 AddAlign( doc, node, av->value );
931 
932             TY_(FreeAttribute)(doc, av);
933             break;
934         }
935 
936         prev = av;
937     }
938 }
939 
940 /*
941     Symptom: <table bgcolor="red">
942     Action: <table style="background-color: red">
943 */
TableBgColor(TidyDocImpl * doc,Node * node)944 static void TableBgColor( TidyDocImpl* doc, Node* node )
945 {
946     AttVal* attr;
947     tmbchar buf[256];
948 
949     if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
950     {
951         TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
952         TY_(RemoveAttribute)( doc, node, attr );
953         TY_(AddStyleProperty)( doc, node, buf );
954     }
955 }
956 
957 /*
958    The clean up rules use the pnode argument to return the
959    next node when the original node has been deleted
960 */
961 
962 /*
963     Symptom: <dir> <li> where <li> is only child
964     Action: coerce <dir> <li> to <div> with indent.
965 */
966 
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))967 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
968 {
969     Node *child;
970 
971     if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
972     {
973         child = node->content;
974 
975         if (child == NULL)
976             return no;
977 
978         /* check child has no peers */
979 
980         if (child->next)
981             return no;
982 
983         if ( !nodeIsLI(child) )
984             return no;
985 
986         if ( !child->implicit )
987             return no;
988 
989         /* coerce dir to div */
990         node->tag = TY_(LookupTagDef)( TidyTag_DIV );
991         TidyDocFree( doc, node->element );
992         node->element = TY_(tmbstrdup)(doc->allocator, "div");
993         TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
994         StripOnlyChild( doc, node );
995         return yes;
996     }
997 
998     return no;
999 }
1000 
1001 /*
1002     Symptom: <center>
1003     Action: replace <center> by <div style="text-align: center">
1004 */
1005 
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1006 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1007 {
1008     if ( nodeIsCENTER(node) )
1009     {
1010         RenameElem( doc, node, TidyTag_DIV );
1011         TY_(AddStyleProperty)( doc, node, "text-align: center" );
1012         return yes;
1013     }
1014 
1015     return no;
1016 }
1017 
1018 /* Copy child attributes to node. Duplicate attributes are overwritten.
1019    Unique attributes (such as ID) disable the action.
1020    Attributes style and class are not dealt with. A call to MergeStyles
1021    will do that.
1022 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1023 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1024 {
1025     AttVal *av1, *av2;
1026     TidyAttrId id;
1027 
1028     /* Detect attributes that cannot be merged or overwritten. */
1029     if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1030         && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1031         return no;
1032 
1033     /* Move child attributes to node. Attributes in node
1034      can be overwritten or merged. */
1035     for (av2 = child->attributes; av2; )
1036     {
1037         /* Dealt by MergeStyles. */
1038         if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1039         {
1040             av2 = av2->next;
1041             continue;
1042         }
1043         /* Avoid duplicates in node */
1044         if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1045             && (av1=TY_(AttrGetById)(node, id))!= NULL)
1046             TY_(RemoveAttribute)( doc, node, av1 );
1047 
1048         /* Move attribute from child to node */
1049         TY_(DetachAttribute)( child, av2 );
1050         av1 = av2;
1051         av2 = av2->next;
1052         av1->next = NULL;
1053         TY_(InsertAttributeAtEnd)( node, av1 );
1054     }
1055 
1056     return yes;
1057 }
1058 
1059 /*
1060     Symptom <XX><XX>...</XX></XX>
1061     Action: merge the two XXs
1062 
1063   For instance, this is useful after nested <dir>s used by Word
1064   for indenting have been converted to <div>s
1065 
1066   If state is "no", no merging.
1067   If state is "yes", inner element is discarded. Only Style and Class
1068   attributes are merged using MergeStyles().
1069   If state is "auto", atttibutes are merged as described in CopyAttrs().
1070   Style and Class attributes are merged using MergeStyles().
1071 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1072 static Bool MergeNestedElements( TidyDocImpl* doc,
1073                                  TidyTagId Id, TidyTriState state, Node *node,
1074                                  Node **ARG_UNUSED(pnode))
1075 {
1076     Node *child;
1077 
1078     if ( state == TidyNoState
1079          || !TagIsId(node, Id) )
1080         return no;
1081 
1082     child = node->content;
1083 
1084     if ( child == NULL
1085          || child->next != NULL
1086          || !TagIsId(child, Id) )
1087         return no;
1088 
1089     if ( state == TidyAutoState
1090          && CopyAttrs(doc, node, child) == no )
1091         return no;
1092 
1093     MergeStyles( doc, node, child );
1094     StripOnlyChild( doc, node );
1095     return yes;
1096 }
1097 
/*
    Symptom: <ul><li><ul>...</ul></li></ul>
    Action: discard outer list

    The rule fires when node is a UL or OL whose only child has, as its
    own only child, an element with the same tag as node.  The inner
    list takes the place of node among node's siblings; node and its
    only child are freed.  If the inner list's new previous sibling is
    a UL/OL with at least one child, the inner list is then moved to
    the end of that sibling's last child and CleanNode() is run on it.

    On success *pnode is set to the inner list and yes is returned;
    node must not be used by the caller afterwards.  Returns no (and
    changes nothing) when the pattern does not match.
*/

static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
{
    Node *child, *list;

    if ( nodeIsUL(node) || nodeIsOL(node) )
    {
        child = node->content;

        if (child == NULL)
            return no;

        /* check child has no peers */

        if (child->next)
            return no;

        list = child->content;

        if (!list)
            return no;

        /* inner element must be the same kind of element as the outer one */
        if (list->tag != node->tag)
            return no;

        /* check list has no peers */
        if (list->next)
            return no;

        *pnode = list;  /* Set node to resume iteration */

        /* move inner list node into position of outer node */
        list->prev = node->prev;
        list->next = node->next;
        list->parent = node->parent;
        TY_(FixNodeLinks)(list);

        /* get rid of outer ul and its li */
        /* NOTE(review): content/next are cleared before each FreeNode,
           presumably so that freeing does not reach the inner list or
           the following siblings - confirm against FreeNode. */
        child->content = NULL;
        TY_(FreeNode)( doc, child ); /* See test #427841. */
        child = NULL;
        node->content = NULL;
        node->next = NULL;
        TY_(FreeNode)( doc, node );
        node = NULL;

        /*
          If prev node was a list the chances are this node
          should be appended to that list. Word has no way of
          recognizing nested lists and just uses indents
        */

        if (list->prev)
        {
            if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
                 && list->prev->last )
            {
                /* from here on: node = the inner list being moved,
                   list = the preceding list that receives it */
                node = list;
                list = node->prev;

                child = list->last;  /* <li> */

                /* unhook node from the sibling chain */
                list->next = node->next;
                TY_(FixNodeLinks)(list);

                /* append node as the last child of that <li> */
                node->parent = child;
                node->next = NULL;
                node->prev = child->last;
                TY_(FixNodeLinks)(node);
                /* the return value of CleanNode is not used here */
                CleanNode( doc, node );
            }
        }

        return yes;
    }

    return no;
}
1180 
1181 /* Find CSS equivalent in a SPAN element */
1182 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1183 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1184 {
1185     struct
1186     {
1187         TidyTagId id;
1188         ctmbstr CSSeq;
1189         Bool deprecated;
1190     }
1191     const CSS_SpanEq[] =
1192         {
1193             { TidyTag_B, "font-weight: bold", no },
1194             { TidyTag_I, "font-style: italic", no },
1195             { TidyTag_S, "text-decoration: line-through", yes},
1196             { TidyTag_STRIKE, "text-decoration: line-through", yes},
1197             { TidyTag_U, "text-decoration: underline", yes},
1198             { TidyTag_UNKNOWN, NULL, no }
1199         };
1200     uint i;
1201 
1202     for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1203         if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1204              && TagIsId(node, CSS_SpanEq[i].id) )
1205         {
1206             *s = CSS_SpanEq[i].CSSeq;
1207             return yes;
1208         }
1209     return no;
1210 }
1211 
1212 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1213 static Bool CanApplyBlockStyle( Node *node )
1214 {
1215     if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1216         && !nodeIsDIV(node) && !nodeIsP(node)
1217         && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218     {
1219         return yes;
1220     }
1221     return no;
1222 }
1223 
1224 /*
1225   Symptom: the only child of a block-level element is a
1226   presentation element such as B, I or FONT
1227 
1228   Action: add style "font-weight: bold" to the block and
1229   strip the <b> element, leaving its children.
1230 
1231   example:
1232 
1233     <p>
1234       <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235     </p>
1236 
1237   becomes:
1238 
1239       <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240         Draft Recommended Practice
1241       </p>
1242 
1243   This code also replaces the align attribute by a style attribute.
1244   However, to avoid CSS problems with Navigator 4, this isn't done
1245   for the elements: caption, tr and table
1246 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248 {
1249     Node *child;
1250     ctmbstr CSSeq;
1251 
1252     /* check for bgcolor */
1253     if (   nodeIsTABLE(node)
1254         || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1255         TableBgColor( doc, node );
1256 
1257     if (CanApplyBlockStyle(node))
1258     {
1259         /* check for align attribute */
1260         if ( !nodeIsCAPTION(node) )
1261             TextAlign( doc, node );
1262 
1263         child = node->content;
1264         if (child == NULL)
1265             return no;
1266 
1267         /* check child has no peers */
1268         if (child->next)
1269             return no;
1270 
1271         if ( FindCSSSpanEq(child, &CSSeq, no) )
1272         {
1273             MergeStyles( doc, node, child );
1274             TY_(AddStyleProperty)( doc, node, CSSeq );
1275             StripOnlyChild( doc, node );
1276             return yes;
1277         }
1278         else if ( nodeIsFONT(child) )
1279         {
1280             MergeStyles( doc, node, child );
1281             AddFontStyles( doc, node, child->attributes );
1282             StripOnlyChild( doc, node );
1283             return yes;
1284         }
1285     }
1286 
1287     return no;
1288 }
1289 
1290 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1291 static Bool CanApplyInlineStyle( Node *node )
1292 {
1293     return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1294 }
1295 
1296 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1297 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1298 {
1299     Node *child;
1300     ctmbstr CSSeq;
1301 
1302     if ( CanApplyInlineStyle(node) )
1303     {
1304         child = node->content;
1305 
1306         if (child == NULL)
1307             return no;
1308 
1309         /* check child has no peers */
1310 
1311         if (child->next)
1312             return no;
1313 
1314         if ( FindCSSSpanEq(child, &CSSeq, no) )
1315         {
1316             MergeStyles( doc, node, child );
1317             TY_(AddStyleProperty)( doc, node, CSSeq );
1318             StripOnlyChild( doc, node );
1319             return yes;
1320         }
1321         else if ( nodeIsFONT(child) )
1322         {
1323             MergeStyles( doc, node, child );
1324             AddFontStyles( doc, node, child->attributes );
1325             StripOnlyChild( doc, node );
1326             return yes;
1327         }
1328     }
1329 
1330     return no;
1331 }
1332 
1333 /*
1334     Transform element to equivalent CSS
1335 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1336 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1337                                 Node **ARG_UNUSED(pnode)  )
1338 {
1339     ctmbstr CSSeq;
1340 
1341     /* if node is the only child of parent element then leave alone
1342           Do so only if BlockStyle may be succesful. */
1343     if ( node->parent->content == node && node->next == NULL &&
1344          (CanApplyBlockStyle(node->parent)
1345           || CanApplyInlineStyle(node->parent)) )
1346         return no;
1347 
1348     if ( FindCSSSpanEq(node, &CSSeq, yes) )
1349     {
1350         RenameElem( doc, node, TidyTag_SPAN );
1351         TY_(AddStyleProperty)( doc, node, CSSeq );
1352         return yes;
1353     }
1354     return no;
1355 }
1356 
1357 /*
1358   Replace font elements by span elements, deleting
1359   the font element's attributes and replacing them
1360   by a single style attribute.
1361 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1362 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1363 {
1364     AttVal *av, *style, *next;
1365 
1366     if ( nodeIsFONT(node) )
1367     {
1368         /* if node is the only child of parent element then leave alone
1369           Do so only if BlockStyle may be succesful. */
1370         if ( node->parent->content == node && node->next == NULL &&
1371              CanApplyBlockStyle(node->parent) )
1372             return no;
1373 
1374         AddFontStyles( doc, node, node->attributes );
1375 
1376         /* extract style attribute and free the rest */
1377         av = node->attributes;
1378         style = NULL;
1379 
1380         while (av)
1381         {
1382             next = av->next;
1383 
1384             if (attrIsSTYLE(av))
1385             {
1386                 av->next = NULL;
1387                 style = av;
1388             }
1389             else
1390             {
1391                 TY_(FreeAttribute)( doc, av );
1392             }
1393             av = next;
1394         }
1395 
1396         node->attributes = style;
1397         RenameElem( doc, node, TidyTag_SPAN );
1398         return yes;
1399     }
1400 
1401     return no;
1402 }
1403 
1404 /*
1405   Applies all matching rules to a node.
1406 */
CleanNode(TidyDocImpl * doc,Node * node)1407 Node* CleanNode( TidyDocImpl* doc, Node *node )
1408 {
1409     Node *next = NULL;
1410     TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1411     TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1412 
1413     for (next = node; TY_(nodeIsElement)(node); node = next)
1414     {
1415         if ( Dir2Div(doc, node, &next) )
1416             continue;
1417 
1418         /* Special case: true result means
1419         ** that arg node and its parent no longer exist.
1420         ** So we must jump back up the CreateStyleProperties()
1421         ** call stack until we have a valid node reference.
1422         */
1423         if ( NestedList(doc, node, &next) )
1424             return next;
1425 
1426         if ( Center2Div(doc, node, &next) )
1427             continue;
1428 
1429         if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1430             continue;
1431 
1432         if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1433             continue;
1434 
1435         if ( BlockStyle(doc, node, &next) )
1436             continue;
1437 
1438         if ( InlineStyle(doc, node, &next) )
1439             continue;
1440 
1441         if ( InlineElementToCSS(doc, node, &next) )
1442             continue;
1443 
1444         if ( Font2Span(doc, node, &next) )
1445             continue;
1446 
1447         break;
1448     }
1449 
1450     return next;
1451 }
1452 
1453 /* Special case: if the current node is destroyed by
1454 ** CleanNode() lower in the tree, this node and its parent
1455 ** no longer exist.  So we must jump back up the CleanTree()
1456 ** call stack until we have a valid node reference.
1457 */
1458 
CleanTree(TidyDocImpl * doc,Node * node)1459 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1460 {
1461     if (node->content)
1462     {
1463         Node *child;
1464         for (child = node->content; child != NULL; child = child->next)
1465         {
1466             child = CleanTree( doc, child );
1467             if ( !child )
1468                 break;
1469         }
1470     }
1471 
1472     return CleanNode( doc, node );
1473 }
1474 
/* Post-order walk of the tree rooted at node: Style2Rule() is applied
   to every descendant first and to node itself last. */
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while ( kid != NULL )
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1490 
TY_(CleanDocument)1491 void TY_(CleanDocument)( TidyDocImpl* doc )
1492 {
1493     /* placeholder.  CleanTree()/CleanNode() will not
1494     ** zap root element
1495     */
1496     CleanTree( doc, &doc->root );
1497 
1498     if ( cfgBool(doc, TidyMakeClean) )
1499     {
1500         DefineStyleRules( doc, &doc->root );
1501         CreateStyleElement( doc );
1502     }
1503 }
1504 
1505 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1506 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1507 {
1508     Node *next;
1509 
1510     while (node)
1511     {
1512         next = node->next;
1513 
1514         if ( (nodeIsB(node) || nodeIsI(node))
1515              && node->parent && node->parent->tag == node->tag)
1516         {
1517             /* strip redundant inner element */
1518             DiscardContainer( doc, node, &next );
1519             node = next;
1520             continue;
1521         }
1522 
1523         if ( node->content )
1524             TY_(NestedEmphasis)( doc, node->content );
1525 
1526         node = next;
1527     }
1528 }
1529 
1530 
1531 
1532 /* replace i by em and b by strong */
TY_(EmFromI)1533 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1534 {
1535     while (node)
1536     {
1537         if ( nodeIsI(node) )
1538             RenameElem( doc, node, TidyTag_EM );
1539         else if ( nodeIsB(node) )
1540             RenameElem( doc, node, TidyTag_STRONG );
1541 
1542         if ( node->content )
1543             TY_(EmFromI)( doc, node->content );
1544 
1545         node = node->next;
1546     }
1547 }
1548 
HasOneChild(Node * node)1549 static Bool HasOneChild(Node *node)
1550 {
1551     return (node->content && node->content->next == NULL);
1552 }
1553 
1554 /*
1555  Some people use dir or ul without an li
1556  to indent the content. The pattern to
1557  look for is a list with a single implicit
1558  li. This is recursively replaced by an
1559  implicit blockquote.
1560 */
TY_(List2BQ)1561 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1562 {
1563     while (node)
1564     {
1565         if (node->content)
1566             TY_(List2BQ)( doc, node->content );
1567 
1568         if ( node->tag && node->tag->parser == TY_(ParseList) &&
1569              HasOneChild(node) && node->content->implicit )
1570         {
1571             StripOnlyChild( doc, node );
1572             RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1573             node->implicit = yes;
1574         }
1575 
1576         node = node->next;
1577     }
1578 }
1579 
1580 
1581 /*
1582  Replace implicit blockquote by div with an indent
1583  taking care to reduce nested blockquotes to a single
1584  div with the indent set to match the nesting depth
1585 */
TY_(BQ2Div)1586 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1587 {
1588     tmbchar indent_buf[ 32 ];
1589     uint indent;
1590 
1591     while (node)
1592     {
1593         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1594         {
1595             indent = 1;
1596 
1597             while( HasOneChild(node) &&
1598                    nodeIsBLOCKQUOTE(node->content) &&
1599                    node->implicit)
1600             {
1601                 ++indent;
1602                 StripOnlyChild( doc, node );
1603             }
1604 
1605             if (node->content)
1606                 TY_(BQ2Div)( doc, node->content );
1607 
1608             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1609                              2*indent);
1610 
1611             RenameElem( doc, node, TidyTag_DIV );
1612             TY_(AddStyleProperty)(doc, node, indent_buf );
1613         }
1614         else if (node->content)
1615             TY_(BQ2Div)( doc, node->content );
1616 
1617         node = node->next;
1618     }
1619 }
1620 
1621 
/* Starting at node itself and following the parent links, return the
   first TD element encountered, or NULL when there is none. */
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cur = node;

    while ( cur != NULL && !nodeIsTD(cur) )
        cur = cur->parent;

    return cur;
}
1633 
1634 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1635 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1636 {
1637     Lexer* lexer = doc->lexer;
1638 
1639     for (;;)
1640     {
1641         if (node == NULL)
1642             return NULL;
1643 
1644         ctmbstr lexbuf = lexer->lexbuf + node->start;
1645         if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1646         {
1647           Node* cell = FindEnclosingCell( doc, node );
1648           if ( cell )
1649           {
1650             /* Need to put &nbsp; into cell so it doesn't look weird
1651             */
1652             Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1653             assert( (byte)'\240' == (byte)160 );
1654             TY_(InsertNodeBeforeElement)( node, nbsp );
1655           }
1656         }
1657 
1658         /* discard node and returns next, unless it is a text node */
1659         if ( node->type == TextNode )
1660             node = node->next;
1661         else
1662             node = TY_(DiscardElement)( doc, node );
1663 
1664         if (node == NULL)
1665             return NULL;
1666 
1667         if (node->type == SectionTag)
1668         {
1669             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1670             {
1671                 node = PruneSection( doc, node );
1672                 continue;
1673             }
1674 
1675             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1676             {
1677                 node = TY_(DiscardElement)( doc, node );
1678                 break;
1679             }
1680         }
1681     }
1682 
1683     return node;
1684 }
1685 
TY_(DropSections)1686 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1687 {
1688     Lexer* lexer = doc->lexer;
1689     while (node)
1690     {
1691         if (node->type == SectionTag)
1692         {
1693             /* prune up to matching endif */
1694             if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1695                 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1696             {
1697                 node = PruneSection( doc, node );
1698                 continue;
1699             }
1700 
1701             /* discard others as well */
1702             node = TY_(DiscardElement)( doc, node );
1703             continue;
1704         }
1705 
1706         if (node->content)
1707             TY_(DropSections)( doc, node->content );
1708 
1709         node = node->next;
1710     }
1711 }
1712 
/* Remove the attributes Word 2000 adds to an element.

   Dropped: class values starting with "Mso" (except the value "Code"),
   style, lang, height/width on TD, TR and TH, and every attribute
   whose name starts with "x:".  All other attributes - including
   class="Code" and class values not starting with "Mso" - are kept. */
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    AttVal *cur = node->attributes;
    AttVal *kept = NULL;      /* last attribute left in the list so far */
    AttVal *following;
    Bool drop;

    while ( cur != NULL )
    {
        following = cur->next;

        if (attrIsCLASS(cur))
        {
            /* special check for class="Code" denoting pre text */
            /* Pass thru user defined styles as HTML class names */
            drop = ( !AttrValueIs(cur, "Code")
                     && TY_(tmbstrncmp)(cur->value, "Mso", 3) == 0 ) ? yes : no;
        }
        else
        {
            drop = ( attrIsSTYLE(cur) ||
                     attrIsLANG(cur)  ||
                     ( (attrIsHEIGHT(cur) || attrIsWIDTH(cur)) &&
                       (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
                     (cur->attribute &&
                      TY_(tmbstrncmp)(cur->attribute, "x:", 2) == 0) ) ? yes : no;
        }

        if ( drop )
        {
            /* unlink from the list, then free */
            if (kept)
                kept->next = following;
            else
                node->attributes = following;

            TY_(FreeAttribute)( doc, cur );
        }
        else
            kept = cur;

        cur = following;
    }
}
1751 
/* Word2000 uses span excessively, so we strip span out.

   The content of span is first processed by CleanWord2000(), then each
   child is moved out of span and placed, in order, directly in front
   of it.  Finally span itself is discarded.

   Returns the node that followed span (NULL if there was none).
*/
static Node* StripSpan( TidyDocImpl* doc, Node* span )
{
    Node *node, *prev = NULL, *content;

    /*
     deal with span elements that have content
     by splicing the content in place of the span
     after having processed it
    */

    TY_(CleanWord2000)( doc, span->content );
    content = span->content;

    /* prev is the node after which the next child gets inserted */
    if (span->prev)
        prev = span->prev;
    else if (content)
    {
        /* span is the first sibling: put its first child in front of it
           so that there is a node to insert the others after */
        node = content;
        content = content->next;
        TY_(RemoveNode)(node);
        TY_(InsertNodeBeforeElement)(span, node);
        prev = node;
    }

    /* move the remaining children, keeping their order */
    while (content)
    {
        node = content;
        content = content->next;
        TY_(RemoveNode)(node);
        TY_(InsertNodeAfterElement)(prev, node);
        prev = node;
    }

    /* span was the last sibling: the parent's last child is now prev */
    if (span->next == NULL)
        span->parent->last = prev;

    /* remember the successor before span is discarded */
    node = span->next;
    span->content = NULL;
    TY_(DiscardElement)( doc, span );
    return node;
}
1794 
1795 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1796 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1797 {
1798     while ( node )
1799     {
1800         if ( node->content )
1801             TY_(NormalizeSpaces)( lexer, node->content );
1802 
1803         if (TY_(nodeIsText)(node))
1804         {
1805             uint i, c;
1806             tmbstr p = lexer->lexbuf + node->start;
1807 
1808             for (i = node->start; i < node->end; ++i)
1809             {
1810                 c = (byte) lexer->lexbuf[i];
1811 
1812                 /* look for UTF-8 multibyte character */
1813                 if ( c > 0x7F )
1814                     i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1815 
1816                 if ( c == 160 )
1817                     c = ' ';
1818 
1819                 p = TY_(PutUTF8)(p, c);
1820             }
1821             node->end = p - lexer->lexbuf;
1822         }
1823 
1824         node = node->next;
1825     }
1826 }
1827 
1828 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1829 static Bool NoMargins(Node *node)
1830 {
1831     AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1832 
1833     if ( !AttrHasValue(attval) )
1834         return no;
1835 
1836     /* search for substring "margin-top: 0" */
1837     if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1838         return no;
1839 
1840     /* search for substring "margin-bottom: 0" */
1841     if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1842         return no;
1843 
1844     return yes;
1845 }
1846 
1847 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1848 static Bool SingleSpace( Lexer* lexer, Node* node )
1849 {
1850     if ( node->content )
1851     {
1852         node = node->content;
1853 
1854         if ( node->next != NULL )
1855             return no;
1856 
1857         if ( node->type != TextNode )
1858             return no;
1859 
1860         if ( (node->end - node->start) == 1 &&
1861              lexer->lexbuf[node->start] == ' ' )
1862             return yes;
1863 
1864         if ( (node->end - node->start) == 2 )
1865         {
1866             uint c = 0;
1867             TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1868             if ( c == 160 )
1869                 return yes;
1870         }
1871     }
1872 
1873     return no;
1874 }
1875 
1876 /*
1877  This is a major clean up to strip out all the extra stuff you get
1878  when you save as web page from Word 2000. It doesn't yet know what
1879  to do with VML tags, but these will appear as errors unless you
1880  declare them as new tags, such as o:p which needs to be declared
1881  as inline.
1882 */
TY_(CleanWord2000)1883 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1884 {
    /* used to build a list from a sequence of bulleted p's */
1886     Lexer* lexer = doc->lexer;
1887     Node* list = NULL;
1888     AttVal *next_attr, *attval;
1889 
1890     while ( node )
1891     {
1892         /* get rid of Word's xmlns attributes */
1893         if ( nodeIsHTML(node) )
1894         {
1895             /* check that it's a Word 2000 document */
1896             if ( !TY_(IsWord2000) (doc) ) /* Is. #896 */
1897                 return;
1898 
            /* Output proprietary attributes to maintain errout compatibility
1900              * with traditional Tidy. This is a result of moving all of the
1901              * proprietary checks to near the end of the cleanup process,
1902              * meaning this result would not ordinarily be displayed.
1903              */
1904             attval = node->attributes;
1905             while ( attval ) {
1906                 next_attr = attval->next;
1907 
1908                 /* Issue #591 - take care of a NULL attribute, too. */
1909                 if ( !attval->attribute || ( strcmp(attval->attribute, "xmlns") != 0 ))
1910                     TY_(ReportAttrError)(doc, node, attval, PROPRIETARY_ATTRIBUTE);
1911                 attval = next_attr;
1912             }
1913 
1914             TY_(FreeAttrs)( doc, node );
1915         }
1916 
1917         /* fix up preformatted sections by looking for a
1918         ** sequence of paragraphs with zero top/bottom margin
1919         */
1920         if ( nodeIsP(node) )
1921         {
1922             if (NoMargins(node))
1923             {
1924                 Node *pre, *next;
1925                 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1926 
1927                 PurgeWord2000Attributes( doc, node );
1928 
1929                 if (node->content)
1930                     TY_(CleanWord2000)( doc, node->content );
1931 
1932                 pre = node;
1933                 node = node->next;
1934 
1935                 /* continue to strip p's */
1936 
1937                 while ( nodeIsP(node) && NoMargins(node) )
1938                 {
1939                     next = node->next;
1940                     TY_(RemoveNode)(node);
1941                     TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1942                     TY_(InsertNodeAtEnd)(pre, node);
1943                     StripSpan( doc, node );
1944                     node = next;
1945                 }
1946 
1947                 if (node == NULL)
1948                     break;
1949             }
1950         }
1951 
1952         if (node->tag && (node->tag->model & CM_BLOCK)
1953             && SingleSpace(lexer, node))
1954         {
1955             node = StripSpan( doc, node );
1956             continue;
1957         }
1958         /* discard Word's style verbiage */
1959         if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1960              node->type == CommentTag )
1961         {
1962             node = TY_(DiscardElement)( doc, node );
1963             continue;
1964         }
1965 
1966         /* strip out all span and font tags Word scatters so liberally! */
1967         if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1968         {
1969             node = StripSpan( doc, node );
1970             continue;
1971         }
1972 
1973         if ( nodeIsLINK(node) )
1974         {
1975             AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1976 
1977             if (AttrValueIs(attr, "File-List"))
1978             {
1979                 node = TY_(DiscardElement)( doc, node );
1980                 continue;
1981             }
1982         }
1983 
1984         /* discards <o:p> which encodes the paragraph mark */
1985         if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
1986         {
            /* Output proprietary elements to maintain errout compatibility
1988              * with traditional Tidy. This is a result of moving all of the
1989              * proprietary checks to near the end of the cleanup process,
1990              * meaning this result would not ordinarily be displayed.
1991              */
1992             Node* next;
1993             TY_(Report)(doc, NULL, node, PROPRIETARY_ELEMENT);
1994             DiscardContainer( doc, node, &next );
1995             node = next;
1996             continue;
1997         }
1998 
1999         /* discard empty paragraphs */
2000 
2001         if ( node->content == NULL && nodeIsP(node) )
2002         {
2003             /*  Use the existing function to ensure consistency */
2004             Node *next = TY_(TrimEmptyElement)( doc, node );
2005             node = next;
2006             continue;
2007         }
2008 
2009         if ( nodeIsP(node) )
2010         {
2011             AttVal *attr, *atrStyle;
2012 
2013             attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2014             atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2015             /*
2016                (JES) Sometimes Word marks a list item with the following hokie syntax
2017                <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2018                 translate these into <li>
2019             */
2020             /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2021             /* map <p class="MsoListNumber"> to <ol>...</ol> */
2022             if ( AttrValueIs(attr, "MsoListBullet") ||
2023                  AttrValueIs(attr, "MsoListNumber") ||
2024                  AttrContains(atrStyle, "mso-list:") )
2025             {
2026                 TidyTagId listType = TidyTag_UL;
2027                 if (AttrValueIs(attr, "MsoListNumber"))
2028                     listType = TidyTag_OL;
2029 
2030                 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2031 
2032                 if ( !list || TagId(list) != listType )
2033                 {
2034                     const Dict* tag = TY_(LookupTagDef)( listType );
2035                     list = TY_(InferredTag)(doc, tag->id);
2036                     TY_(InsertNodeBeforeElement)(node, list);
2037                 }
2038 
2039                 PurgeWord2000Attributes( doc, node );
2040 
2041                 if ( node->content )
2042                     TY_(CleanWord2000)( doc, node->content );
2043 
2044                 /* remove node and append to contents of list */
2045                 TY_(RemoveNode)(node);
2046                 TY_(InsertNodeAtEnd)(list, node);
2047                 node = list;
2048             }
2049             /* map sequence of <p class="Code"> to <pre>...</pre> */
2050             else if (AttrValueIs(attr, "Code"))
2051             {
2052                 Node *br = TY_(NewLineNode)(lexer);
2053                 TY_(NormalizeSpaces)(lexer, node->content);
2054 
2055                 if ( !list || TagId(list) != TidyTag_PRE )
2056                 {
2057                     list = TY_(InferredTag)(doc, TidyTag_PRE);
2058                     TY_(InsertNodeBeforeElement)(node, list);
2059                 }
2060 
2061                 /* remove node and append to contents of list */
2062                 TY_(RemoveNode)(node);
2063                 TY_(InsertNodeAtEnd)(list, node);
2064                 StripSpan( doc, node );
2065                 TY_(InsertNodeAtEnd)(list, br);
2066                 node = list->next;
2067             }
2068             else
2069                 list = NULL;
2070         }
2071         else
2072             list = NULL;
2073 
2074         if (!node)
2075             return;
2076 
2077         /* strip out style and class attributes */
2078         if (TY_(nodeIsElement)(node))
2079             PurgeWord2000Attributes( doc, node );
2080 
2081         if (node->content)
2082             TY_(CleanWord2000)( doc, node->content );
2083 
2084         node = node->next;
2085     }
2086 }
2087 
TY_(IsWord2000)2088 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2089 {
2090     AttVal *attval;
2091     Node *node, *head;
2092     Node *html = TY_(FindHTML)( doc );
2093 
2094     if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2095         return yes;
2096 
2097     /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2098     head = TY_(FindHEAD)( doc );
2099 
2100     if (head)
2101     {
2102         for (node = head->content; node; node = node->next)
2103         {
2104             if ( !nodeIsMETA(node) )
2105                 continue;
2106 
2107             attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2108 
2109             if ( !AttrValueIs(attval, "generator") )
2110                 continue;
2111 
2112             attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
2113 
2114             if ( AttrContains(attval, "Microsoft") )
2115                 return yes;
2116         }
2117     }
2118 
2119     return no;
2120 }
2121 
2122 /* where appropriate move object elements from head to body */
TY_(BumpObject)2123 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2124 {
2125     Node *node, *next, *head = NULL, *body = NULL;
2126 
2127     if (!html)
2128         return;
2129 
2130     for ( node = html->content; node != NULL; node = node->next )
2131     {
2132         if ( nodeIsHEAD(node) )
2133             head = node;
2134 
2135         if ( nodeIsBODY(node) )
2136             body = node;
2137     }
2138 
2139     if ( head != NULL && body != NULL )
2140     {
2141         for (node = head->content; node != NULL; node = next)
2142         {
2143             next = node->next;
2144 
2145             if ( nodeIsOBJECT(node) )
2146             {
2147                 Node *child;
2148                 Bool bump = no;
2149 
2150                 for (child = node->content; child != NULL; child = child->next)
2151                 {
2152                     /* bump to body unless content is param */
2153                     if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2154                          || !nodeIsPARAM(child) )
2155                     {
2156                             bump = yes;
2157                             break;
2158                     }
2159                 }
2160 
2161                 if ( bump )
2162                 {
2163                     TY_(RemoveNode)( node );
2164                     TY_(InsertNodeAtStart)( body, node );
2165                 }
2166             }
2167         }
2168     }
2169 }
2170 
2171 
2172 /*\
2173 *  Issue #456 - Check meta charset
2174 *  1. if there is no meta charset, it adds one, according to doctype, no warning.
2175 *  2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
2176 *  3. if it doesn't match the output encoding, and fix. Naybe no warning?
2177 *  4. if there are duplicates, discard them, with warning.
2178 \*/
TY_(TidyMetaCharset)2179 Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
2180 {
2181     AttVal *charsetAttr;
2182     AttVal *contentAttr;
2183     AttVal *httpEquivAttr;
2184     Bool charsetFound = no;
2185     uint outenc = cfg(doc, TidyOutCharEncoding);
2186     ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
2187     Node *currentNode;
2188     Node *head = TY_(FindHEAD)(doc);
2189     Node *metaTag;
2190     Node *prevNode;
2191     TidyBuffer buf;
2192     TidyBuffer charsetString;
2193     /* tmbstr httpEquivAttrValue; */
2194     /* tmbstr lcontent; */
2195     tmbstr newValue;
2196     Bool add_meta = cfgBool(doc, TidyMetaCharset);
2197 
2198     /* We can't do anything we don't have a head or encoding is NULL */
2199     if (!head || !enc || !TY_(tmbstrlen)(enc))
2200         return no;
2201     if (outenc == RAW)
2202         return no;
2203 #ifndef NO_NATIVE_ISO2022_SUPPORT
2204     if (outenc == ISO2022)
2205         return no;
2206 #endif
2207     if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2208         return no; /* nothing to do here if showing body only */
2209 
2210     tidyBufInit(&charsetString);
2211     /* Set up the content test 'charset=value' */
2212     tidyBufClear(&charsetString);
2213     tidyBufAppend(&charsetString, "charset=", 8);
2214     tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
2215     tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
2216     /* process the children of the head */
2217     /* Issue #656 - guard against 'currentNode' being set NULL in loop */
2218     for (currentNode = head->content; currentNode;
2219         currentNode = (currentNode ? currentNode->next : NULL))
2220     {
2221         if (!nodeIsMETA(currentNode))
2222             continue;   /* not a meta node */
2223         charsetAttr = attrGetCHARSET(currentNode);
2224         httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
2225         if (!charsetAttr && !httpEquivAttr)
2226             continue;   /* has no charset attribute */
2227                         /*
2228                         Meta charset comes in quite a few flavors:
2229                         1. <meta charset="value"> - expected for (X)HTML5.
2230                         */
2231         if (charsetAttr && !httpEquivAttr)
2232         {
2233             /* we already found one, so remove the rest. */
2234             if (charsetFound || !charsetAttr->value)
2235             {
2236                 prevNode = currentNode->prev;
2237                 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2238                 TY_(DiscardElement)(doc, currentNode);
2239                 currentNode = prevNode;
2240                 continue;
2241             }
2242             charsetFound = yes;
2243             /* Fix mismatched attribute value */
2244             if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
2245             {
2246                 newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1);   /* allocate + 1 for 0 */
2247                 TY_(tmbstrcpy)(newValue, enc);
2248                 /* Note: previously http-equiv had been modified, without warning
2249                 in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2250                 */
2251                 TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
2252                 TidyDocFree(doc, charsetAttr->value);   /* free current value */
2253                 charsetAttr->value = newValue;
2254             }
2255             /* Make sure it's the first element. */
2256             if (currentNode != head->content->next) {
2257                 TY_(RemoveNode)(currentNode);
2258                 TY_(InsertNodeAtStart)(head, currentNode);
2259             }
2260             continue;
2261         }
2262         /*
2263         2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
2264         expected for HTML4. This is normally ok - but can clash.
2265         */
2266         if (httpEquivAttr && !charsetAttr)
2267         {
2268             contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
2269             if (!contentAttr)
2270                 continue;   /* has no 'content' attribute */
2271             if (!httpEquivAttr->value)
2272             {
2273                 prevNode = currentNode->prev;
2274                 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2275                 TY_(DiscardElement)(doc, currentNode);
2276                 currentNode = prevNode;
2277                 continue;
2278             }
2279             /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
2280             if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
2281                 continue;   /* is not 'content-type' */
2282             if (!contentAttr->value)
2283             {
2284                 continue; /* has no 'content' attribute has NO VALUE! */
2285             }
2286             /* check encoding matches
2287             If a miss-match found here, fix it. previous silently done
2288             in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2289             lcontent = TY_(tmbstrtolower)(contentAttr->value);
2290             */
2291             if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
2292             {
2293                 /* we already found one, so remove the rest. */
2294                 if (charsetFound)
2295                 {
2296                     prevNode = currentNode->prev;
2297                     TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2298                     TY_(DiscardElement)(doc, currentNode);
2299                     currentNode = prevNode;
2300                     continue;
2301                 }
2302                 charsetFound = yes;
2303             }
2304             else
2305             {
2306                 /* fix a mis-match */
2307                 if (charsetFound)
2308                 {
2309                     prevNode = currentNode->prev;
2310                     TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2311                     TY_(DiscardElement)(doc, currentNode);
2312                     currentNode = prevNode;
2313                 }
2314                 else
2315                 {
2316                     /* correct the content */
2317                     newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
2318                     TY_(tmbstrcpy)(newValue, "text/html; charset=");
2319                     TY_(tmbstrcpy)(newValue + 19, enc);
2320                     if (cfgBool(doc, TidyShowMetaChange))   /* Issue #456 - backward compatibility only */
2321                         TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
2322                     TidyDocFree(doc, contentAttr->value);
2323                     contentAttr->value = newValue;
2324                     charsetFound = yes;
2325                 }
2326             }
2327             continue;
2328         }
2329         /*
2330         3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
2331         This is generally bad. Discard and warn.
2332         */
2333         if (httpEquivAttr && charsetAttr)
2334         {
2335             /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
2336             prevNode = currentNode->prev;
2337             TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2338             TY_(DiscardElement)(doc, currentNode);
2339             currentNode = prevNode;
2340         }
2341     }
2342 
2343     /* completed head scan - add appropriate meta - if 'yes' and none exists */
2344     if (add_meta && !charsetFound)
2345     {
2346         /* add appropriate meta charset tag - no warning */
2347         metaTag = TY_(InferredTag)(doc, TidyTag_META);
2348         switch (TY_(HTMLVersion)(doc))
2349         {
2350         case HT50:
2351         case XH50:
2352             TY_(AddAttribute)(doc, metaTag, "charset", enc);
2353             break;
2354         default:
2355             tidyBufInit(&buf);
2356             tidyBufAppend(&buf, "text/html; ", 11);
2357             tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
2358             tidyBufAppend(&buf, "\0", 1);   /* zero terminate the buffer */
2359             TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
2360             TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);  /* add 'content="<enc>"' */
2361             tidyBufFree(&buf);
2362         }
2363         TY_(InsertNodeAtStart)(head, metaTag);
2364         TY_(Report)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
2365     }
2366     tidyBufFree(&charsetString);
2367     return yes;
2368 }
2369 
2370 
TY_(DropComments)2371 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2372 {
2373     Node* next;
2374 
2375     while (node)
2376     {
2377         next = node->next;
2378 
2379         if (node->type == CommentTag)
2380         {
2381             TY_(RemoveNode)(node);
2382             TY_(FreeNode)(doc, node);
2383             node = next;
2384             continue;
2385         }
2386 
2387         if (node->content)
2388             TY_(DropComments)(doc, node->content);
2389 
2390         node = next;
2391     }
2392 }
2393 
TY_(DropFontElements)2394 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2395 {
2396     Node* next;
2397 
2398     while (node)
2399     {
2400         next = node->next;
2401 
2402         if (nodeIsFONT(node))
2403         {
2404             DiscardContainer(doc, node, &next);
2405             node = next;
2406             continue;
2407         }
2408 
2409         if (node->content)
2410             TY_(DropFontElements)(doc, node->content, &next);
2411 
2412         node = next;
2413     }
2414 }
2415 
TY_(WbrToSpace)2416 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2417 {
2418     Node* next;
2419 
2420     while (node)
2421     {
2422         next = node->next;
2423 
2424         if (nodeIsWBR(node))
2425         {
2426             Node* text;
2427             text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2428             TY_(InsertNodeAfterElement)(node, text);
2429             TY_(RemoveNode)(node);
2430             TY_(FreeNode)(doc, node);
2431             node = next;
2432             continue;
2433         }
2434 
2435         if (node->content)
2436             TY_(WbrToSpace)(doc, node->content);
2437 
2438         node = next;
2439    }
2440 }
2441 
2442 /*
2443   Filters from Word and PowerPoint often use smart
2444   quotes resulting in character codes between 128
2445   and 159. Unfortunately, the corresponding HTML 4.0
2446   entities for these are not widely supported. The
2447   following converts dashes and quotation marks to
2448   the nearest ASCII equivalent. My thanks to
2449   Andrzej Novosiolov for his help with this code.
2450 
2451   Note: The old code in the pretty printer applied
2452   this to all node types and attribute values while
2453   this routine applies it only to text nodes. First,
2454   Microsoft Office products rarely put the relevant
2455   characters into these tokens, second support for
2456   them is much better now and last but not least, it
2457   can be harmful to replace these characters since
2458   US-ASCII quote marks are often used as syntax
2459   characters, a simple
2460 
2461     <a onmouseover="alert('&#x2018;')">...</a>
2462 
2463   would be broken if the U+2018 is replaced by "'".
2464   The old code would neither take care whether the
2465   quote mark is already used as delimiter,
2466 
2467     <p title='&#x2018;'>...</p>
2468 
2469   got
2470 
2471     <p title='''>...</p>
2472 
2473   Since browser support is much better nowadays and
2474   high-quality typography is better than ASCII it'd
2475   be probably a good idea to drop the feature...
2476 */
TY_(DowngradeTypography)2477 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2478 {
2479     Node* next;
2480     Lexer* lexer = doc->lexer;
2481 
2482     while (node)
2483     {
2484         next = node->next;
2485 
2486         if (TY_(nodeIsText)(node))
2487         {
2488             uint i, c;
2489             tmbstr p = lexer->lexbuf + node->start;
2490 
2491             for (i = node->start; i < node->end; ++i)
2492             {
2493                 c = (unsigned char) lexer->lexbuf[i];
2494 
2495                 if (c > 0x7F)
2496                     i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2497 
2498                 if (c >= 0x2013 && c <= 0x201E)
2499                 {
2500                     switch (c)
2501                     {
2502                     case 0x2013: /* en dash */
2503                     case 0x2014: /* em dash */
2504                         c = '-';
2505                         break;
2506                     case 0x2018: /* left single  quotation mark */
2507                     case 0x2019: /* right single quotation mark */
2508                     case 0x201A: /* single low-9 quotation mark */
2509                         c = '\'';
2510                         break;
2511                     case 0x201C: /* left double  quotation mark */
2512                     case 0x201D: /* right double quotation mark */
2513                     case 0x201E: /* double low-9 quotation mark */
2514                         c = '"';
2515                         break;
2516                     }
2517                 }
2518 
2519                 p = TY_(PutUTF8)(p, c);
2520             }
2521 
2522             node->end = p - lexer->lexbuf;
2523         }
2524 
2525         if (node->content)
2526             TY_(DowngradeTypography)(doc, node->content);
2527 
2528         node = next;
2529     }
2530 }
2531 
TY_(ReplacePreformattedSpaces)2532 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2533 {
2534     Node* next;
2535 
2536     while (node)
2537     {
2538         next = node->next;
2539 
2540         if (node->tag && node->tag->parser == TY_(ParsePre))
2541         {
2542             TY_(NormalizeSpaces)(doc->lexer, node->content);
2543             node = next;
2544             continue;
2545         }
2546 
2547         if (node->content)
2548             TY_(ReplacePreformattedSpaces)(doc, node->content);
2549 
2550         node = next;
2551     }
2552 }
2553 
TY_(ConvertCDATANodes)2554 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2555 {
2556     Node* next;
2557 
2558     while (node)
2559     {
2560         next = node->next;
2561 
2562         if (node->type == CDATATag)
2563             node->type = TextNode;
2564 
2565         if (node->content)
2566             TY_(ConvertCDATANodes)(doc, node->content);
2567 
2568         node = next;
2569     }
2570 }
2571 
2572 /*
2573   FixLanguageInformation ensures that the document contains (only)
2574   the attributes for language information desired by the output
2575   document type. For example, for XHTML 1.0 documents both
2576   'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2577   is desired and for HTML 4.01 only 'lang' is desired.
2578 */
TY_(FixLanguageInformation)2579 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2580 {
2581     Node* next;
2582 
2583     while (node)
2584     {
2585         next = node->next;
2586 
2587         /* todo: report modifications made here to the report system */
2588 
2589         if (TY_(nodeIsElement)(node))
2590         {
2591             AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2592             AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2593 
2594             if (lang && xmlLang)
2595             {
2596                 /*
2597                   todo: check whether both attributes are in sync,
2598                   here or elsewhere, where elsewhere is probably
2599                   preferable.
2600                   AD - March 2005: not mandatory according the standards.
2601                 */
2602             }
2603             else if (lang && wantXmlLang)
2604             {
2605                 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2606                     & doc->lexer->versionEmitted)
2607                     TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2608             }
2609             else if (xmlLang && wantLang)
2610             {
2611                 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2612                     & doc->lexer->versionEmitted)
2613                     TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2614             }
2615 
2616             if (lang && !wantLang)
2617                 TY_(RemoveAttribute)(doc, node, lang);
2618 
2619             if (xmlLang && !wantXmlLang)
2620                 TY_(RemoveAttribute)(doc, node, xmlLang);
2621         }
2622 
2623         if (node->content)
2624             TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2625 
2626         node = next;
2627     }
2628 }
2629 
2630 /*
2631   Set/fix/remove <html xmlns='...'>
2632 */
TY_(FixXhtmlNamespace)2633 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2634 {
2635     Node* html = TY_(FindHTML)(doc);
2636     AttVal* xmlns;
2637 
2638     if (!html)
2639         return;
2640 
2641     xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2642 
2643     if (wantXmlns)
2644     {
2645         if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2646             TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2647     }
2648     else if (xmlns)
2649     {
2650         TY_(RemoveAttribute)(doc, html, xmlns);
2651     }
2652 }
2653 
2654 /*
2655   ...
2656 */
TY_(FixAnchors)2657 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2658 {
2659     Node* next;
2660 
2661     while (node)
2662     {
2663         next = node->next;
2664 
2665         if (TY_(IsAnchorElement)(doc, node))
2666         {
2667             AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2668             AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2669             Bool hadName = name!=NULL;
2670             Bool hadId = id!=NULL;
2671             Bool IdEmitted = no;
2672             Bool NameEmitted = no;
2673 
2674             /* todo: how are empty name/id attributes handled? */
2675 
2676             if (name && id)
2677             {
2678                 Bool NameHasValue = AttrHasValue(name);
2679                 Bool IdHasValue = AttrHasValue(id);
2680                 if ( (NameHasValue != IdHasValue) ||
2681                      (NameHasValue && IdHasValue &&
2682                      TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2683                     TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2684             }
2685             else if (name && wantId)
2686             {
2687                 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2688                     & doc->lexer->versionEmitted)
2689                 {
2690                     if (TY_(IsValidHTMLID)(name->value))
2691                     {
2692                         TY_(RepairAttrValue)(doc, node, "id", name->value);
2693                         IdEmitted = yes;
2694                     }
2695                     else
2696                         TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2697                  }
2698             }
2699             else if (id && wantName)
2700             {
2701                 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2702                     & doc->lexer->versionEmitted)
2703                 {
2704                     /* todo: do not assume id is valid */
2705                     TY_(RepairAttrValue)(doc, node, "name", id->value);
2706                     NameEmitted = yes;
2707                 }
2708             }
2709 
2710             if (id && !wantId
2711                 /* make sure that Name has been emitted if requested */
2712                 && (hadName || !wantName || NameEmitted) ) {
2713                 if (!wantId && !wantName)
2714                     TY_(RemoveAnchorByNode)(doc, id->value, node);
2715                 TY_(RemoveAttribute)(doc, node, id);
2716             }
2717 
2718             if (name && !wantName
2719                 /* make sure that Id has been emitted if requested */
2720                 && (hadId || !wantId || IdEmitted) ) {
2721                 if (!wantId && !wantName)
2722                     TY_(RemoveAnchorByNode)(doc, name->value, node);
2723                 TY_(RemoveAttribute)(doc, node, name);
2724             }
2725         }
2726 
2727         if (node->content)
2728             TY_(FixAnchors)(doc, node->content, wantName, wantId);
2729 
2730         node = next;
2731     }
2732 }
2733 
2734 /* Issue #567 - move style elements from body to head
2735  * ==================================================
2736  */
/*
  Walks the sibling list starting at node. Each <style> element found is
  either moved to the end of head and reported as MOVED_STYLE_TO_HEAD
  (fix set) or merely reported as FOUND_STYLE_IN_BODY (fix clear). The
  content of every other node is searched recursively. indent is only
  passed along, incremented per nesting level.
*/
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
{
    Node *following;

    for ( ; node != NULL; node = following)
    {
        /* taken up front, in case the node is moved below */
        following = node->next;

        if (!nodeIsSTYLE(node))
        {
            if (node->content)
                StyleToHead(doc, head, node->content, fix, indent + 1);
            continue;
        }

        if (!fix)
        {
            TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
            continue;
        }

        TY_(RemoveNode)(node);                              /* unhook style node */
        TY_(InsertNodeAtEnd)(head, node);                   /* add to end of head */
        TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD);  /* report move */
    }
}
2764 
2765 
TY_(CleanStyle)2766 void TY_(CleanStyle)(TidyDocImpl* doc, Node *html)
2767 {
2768     Node *head = NULL, *body = NULL;
2769     Bool fix = cfgBool(doc, TidyStyleTags);
2770 
2771     if (!html)
2772         return; /* oops, not given a start node */
2773 
2774     head = TY_(FindHEAD)( doc );
2775     body = TY_(FindBody)( doc );
2776 
2777     if ((head != NULL) && (body != NULL))
2778     {
2779 		StyleToHead(doc, head, body, fix, 0); /* found head and body */
2780     }
2781 }
2782 /* ==================================================
2783  */
2784 
2785 /*
2786  * CleanHead - clean the head node, if it exists, and we
2787  * are going to show it in the output.
2788  * Issue #692 - Remove multiple title elements
2789  */
TY_(CleanHead)2790 void TY_(CleanHead)(TidyDocImpl* doc)
2791 {
2792     Node *head, *node, *next;
2793     uint titles = 0;
2794     if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2795         return; /* not going to show head, so forget it */
2796     head = TY_(FindHEAD)(doc);
2797     if (!head)
2798         return;
2799     node = head->content;
2800     while (node)
2801     {
2802         next = node->next;  /* get any 'next' */
2803         if (nodeIsTITLE(node))
2804         {
2805             titles++;
2806             if (titles > 1)
2807             {
2808                 TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2809                 TY_(DiscardElement)(doc, node); /* delete this node */
2810             }
2811         }
2812         node = next;
2813     }
2814 }
2815 
2816 /*
2817  * local variables:
2818  * mode: c
2819  * indent-tabs-mode: nil
2820  * c-basic-offset: 4
2821  * eval: (c-set-offset 'substatement-open 0)
2822  * end:
2823  */
2824