1 /*
2   clean.c -- clean up misuse of presentation markup
3 
4   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5   See tidy.h for the copyright notice.
6 
7   CVS Info :
8 
9     $Author: arnaud02 $
10     $Date: 2008/10/14 12:18:10 $
11     $Revision: 1.111 $
12 
13   Filters from other formats such as Microsoft Word
14   often make excessive use of presentation markup such
15   as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
17   transform this to use CSS.
18 
19   Some rules replace some of the children of an element by
20   style properties on the element, e.g.
21 
22   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23 
24   Such rules are applied to the element's content and then
  to the element itself until no more of the rules apply.
26   Having applied all the rules to an element, it will have
27   a style attribute with one or more properties.
28 
29   Other rules strip the element they apply to, replacing
30   it by style properties on the contents, e.g.
31 
32   <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
33 
34   These rules are applied to an element before processing
35   its content and replace the current element by the first
36   element in the exposed content.
37 
38   After applying both sets of rules, you can replace the
39   style attribute by a class value and style rule in the
40   document head. To support this, an association of styles
41   and class names is built.
42 
43   A naive approach is to rely on string matching to test
44   when two property lists are the same. A better approach
45   would be to first sort the properties before matching.
46 
47 */
48 
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 
53 #include "tidy-int.h"
54 #include "clean.h"
55 #include "lexer.h"
56 #include "parser.h"
57 #include "attrs.h"
58 #include "message.h"
59 #include "tmbstr.h"
60 #include "utf8.h"
61 
62 static Node* CleanNode( TidyDocImpl* doc, Node *node );
63 
/*
 Give "node" the tag identified by "tid": the old element name
 string is released, replaced by a copy of the tag definition's
 name, and node->tag is pointed at that definition.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* tagdef = TY_(LookupTagDef)( tid );
    tmbstr newname;

    TidyDocFree( doc, node->element );
    newname = TY_(tmbstrdup)( doc->allocator, tagdef->name );

    node->element = newname;
    node->tag = tagdef;
}
71 
/*
 Release every entry of a StyleProp list: the name string, the
 value string and then the list cell itself.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *cur = props;

    while (cur != NULL)
    {
        /* remember the successor before the cell is released */
        StyleProp *following = cur->next;

        TidyDocFree(doc, cur->name);
        TidyDocFree(doc, cur->value);
        TidyDocFree(doc, cur);

        cur = following;
    }
}
85 
/*
 Insert a copy of name/value into the list "props", which is kept
 ordered by property name (as compared by tmbstrcmp).

 If a property with the same name is already present the list is
 returned untouched: the first value seen wins.

 Returns the (possibly new) head of the list.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    StyleProp *head = props;
    StyleProp **link = &head;   /* the pointer that will receive the new cell */
    StyleProp *fresh;

    for ( ; *link != NULL; link = &(*link)->next )
    {
        int order = TY_(tmbstrcmp)((*link)->name, name);

        if (order == 0)
        {
            /* this property is already defined, ignore new value */
            return head;
        }

        if (order > 0)
        {
            /* the new cell belongs in front of this entry */
            break;
        }
    }

    fresh = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
    fresh->name = TY_(tmbstrdup)(doc->allocator, name);
    fresh->value = TY_(tmbstrdup)(doc->allocator, value);

    /* splice in: *link is either the first greater entry or NULL (tail) */
    fresh->next = *link;
    *link = fresh;

    return head;
}
137 
138 /*
139  Create sorted linked list of properties from style string
140  It temporarily places nulls in place of ':' and ';' to
141  delimit the strings for the property name and value.
142  Some systems don't allow you to NULL literal strings,
143  so to avoid this, a copy is made first.
144 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)145 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
146 {
147     tmbstr name, value = NULL, name_end, value_end, line;
148     Bool more;
149 
150     line = TY_(tmbstrdup)(doc->allocator, style);
151     name = line;
152 
153     while (*name)
154     {
155         while (*name == ' ')
156             ++name;
157 
158         name_end = name;
159 
160         while (*name_end)
161         {
162             if (*name_end == ':')
163             {
164                 value = name_end + 1;
165                 break;
166             }
167 
168             ++name_end;
169         }
170 
171         if (*name_end != ':')
172             break;
173 
174         while ( value && *value == ' ')
175             ++value;
176 
177         value_end = value;
178         more = no;
179 
180         while (*value_end)
181         {
182             if (*value_end == ';')
183             {
184                 more = yes;
185                 break;
186             }
187 
188             ++value_end;
189         }
190 
191         *name_end = '\0';
192         *value_end = '\0';
193 
194         prop = InsertProperty(doc, prop, name, value);
195         *name_end = ':';
196 
197         if (more)
198         {
199             *value_end = ';';
200             name = value_end + 1;
201             continue;
202         }
203 
204         break;
205     }
206 
207     TidyDocFree(doc, line);  /* free temporary copy */
208     return prop;
209 }
210 
CreatePropString(TidyDocImpl * doc,StyleProp * props)211 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
212 {
213     tmbstr style, p, s;
214     uint len;
215     StyleProp *prop;
216 
217     /* compute length */
218 
219     for (len = 0, prop = props; prop; prop = prop->next)
220     {
221         len += TY_(tmbstrlen)(prop->name) + 2;
222         if (prop->value)
223             len += TY_(tmbstrlen)(prop->value) + 2;
224     }
225 
226     style = (tmbstr) TidyDocAlloc(doc, len+1);
227     style[0] = '\0';
228 
229     for (p = style, prop = props; prop; prop = prop->next)
230     {
231         s = prop->name;
232 
233         while((*p++ = *s++))
234             continue;
235 
236         if (prop->value)
237         {
238             *--p = ':';
239             *++p = ' ';
240             ++p;
241 
242             s = prop->value;
243             while((*p++ = *s++))
244                 continue;
245         }
246         if (prop->next == NULL)
247             break;
248 
249         *--p = ';';
250         *++p = ' ';
251         ++p;
252     }
253 
254     return style;
255 }
256 
257 /*
258   create string with merged properties
259 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
260 {
261     tmbstr line;
262     StyleProp *prop;
263 
264     prop = CreateProps(doc, NULL, style);
265     prop = CreateProps(doc, prop, property);
266     line = CreatePropString(doc, prop);
267     FreeStyleProps(doc, prop);
268     return line;
269 }
270 */
271 
TY_(FreeStyles)272 void TY_(FreeStyles)( TidyDocImpl* doc )
273 {
274     Lexer* lexer = doc->lexer;
275     if ( lexer )
276     {
277         TagStyle *style, *next;
278         for ( style = lexer->styles; style; style = next )
279         {
280             next = style->next;
281             TidyDocFree( doc, style->tag );
282             TidyDocFree( doc, style->tag_class );
283             TidyDocFree( doc, style->properties );
284             TidyDocFree( doc, style );
285         }
286     }
287 }
288 
GensymClass(TidyDocImpl * doc)289 static tmbstr GensymClass( TidyDocImpl* doc )
290 {
291     tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
292     ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
293     if ( pfx == NULL || *pfx == 0 )
294       pfx = "c";
295 
296     TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
297     return TY_(tmbstrdup)(doc->allocator, buf);
298 }
299 
/*
 Look up the class name associated with the pair (tag, properties)
 in the lexer's style dictionary. When no entry matches, a new one
 with a generated class name is pushed onto the front of the list.

 Matching is a plain string comparison of the property text.
 The returned string is owned by the dictionary entry.
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0)
        {
            if (TY_(tmbstrcmp)(entry->properties, properties) == 0)
                return entry->tag_class;
        }

        entry = entry->next;
    }

    /* not known yet: create an entry and make it the new list head */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );

    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
320 
321 /*
322  Add class="foo" to node
323 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)324 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
325 {
326     AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
327 
328     /*
329      if there already is a class attribute
330      then append class name after a space.
331     */
332     if (classattr)
333         TY_(AppendToClassAttr)( doc, classattr, classname );
334     else /* create new class attribute */
335         TY_(AddAttribute)( doc, node, "class", classname );
336 }
337 
TY_(AddStyleAsClass)338 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
339 {
340     ctmbstr classname;
341 
342     classname = FindStyle( doc, node->element, stylevalue );
343     AddClass( doc, node, classname);
344 }
345 
346 /*
347  Find style attribute in node, and replace it
348  by corresponding class attribute. Search for
349  class in style dictionary otherwise gensym
350  new class and add to dictionary.
351 
352  Assumes that node doesn't have a class attribute
353 */
Style2Rule(TidyDocImpl * doc,Node * node)354 static void Style2Rule( TidyDocImpl* doc, Node *node)
355 {
356     AttVal *styleattr, *classattr;
357     ctmbstr classname;
358 
359     styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
360 
361     if (styleattr)
362     {
363         /* fix for http://tidy.sf.net/bug/850215 */
364         if (!styleattr->value)
365         {
366             TY_(RemoveAttribute)(doc, node, styleattr);
367             return;
368         }
369 
370         classname = FindStyle( doc, node->element, styleattr->value );
371         classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
372 
373         /*
374          if there already is a class attribute
375          then append class name after an underscore
376         */
377         if (classattr)
378         {
379             TY_(AppendToClassAttr)( doc, classattr, classname );
380             TY_(RemoveAttribute)( doc, node, styleattr );
381         }
382         else /* reuse style attribute for class attribute */
383         {
384             TidyDocFree(doc, styleattr->attribute);
385             TidyDocFree(doc, styleattr->value);
386             styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
387             styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
388         }
389     }
390 }
391 
/*
 Append the CSS rule "selector { color: color }" followed by a
 newline to the lexer buffer. Nothing is written when either
 argument is NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
402 
403 /*
404  move presentation attribs from body to style element
405 
406  background="foo" ->  body { background-image: url(foo) }
407  bgcolor="foo"    ->  body { background-color: foo }
408  text="foo"       ->  body { color: foo }
409  link="foo"       ->  :link { color: foo }
410  vlink="foo"      ->  :visited { color: foo }
411  alink="foo"      ->  :active { color: foo }
412 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)413 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
414 {
415     Lexer* lexer  = doc->lexer;
416     tmbstr bgurl   = NULL;
417     tmbstr bgcolor = NULL;
418     tmbstr color   = NULL;
419     AttVal* attr;
420 
421     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
422     {
423         bgurl = attr->value;
424         attr->value = NULL;
425         TY_(RemoveAttribute)( doc, body, attr );
426     }
427 
428     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
429     {
430         bgcolor = attr->value;
431         attr->value = NULL;
432         TY_(RemoveAttribute)( doc, body, attr );
433     }
434 
435     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
436     {
437         color = attr->value;
438         attr->value = NULL;
439         TY_(RemoveAttribute)( doc, body, attr );
440     }
441 
442     if ( bgurl || bgcolor || color )
443     {
444         TY_(AddStringLiteral)(lexer, " body {\n");
445         if (bgurl)
446         {
447             TY_(AddStringLiteral)(lexer, "  background-image: url(");
448             TY_(AddStringLiteral)(lexer, bgurl);
449             TY_(AddStringLiteral)(lexer, ");\n");
450             TidyDocFree(doc, bgurl);
451         }
452         if (bgcolor)
453         {
454             TY_(AddStringLiteral)(lexer, "  background-color: ");
455             TY_(AddStringLiteral)(lexer, bgcolor);
456             TY_(AddStringLiteral)(lexer, ";\n");
457             TidyDocFree(doc, bgcolor);
458         }
459         if (color)
460         {
461             TY_(AddStringLiteral)(lexer, "  color: ");
462             TY_(AddStringLiteral)(lexer, color);
463             TY_(AddStringLiteral)(lexer, ";\n");
464             TidyDocFree(doc, color);
465         }
466 
467         TY_(AddStringLiteral)(lexer, " }\n");
468     }
469 
470     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
471     {
472         AddColorRule(lexer, " :link", attr->value);
473         TY_(RemoveAttribute)( doc, body, attr );
474     }
475 
476     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
477     {
478         AddColorRule(lexer, " :visited", attr->value);
479         TY_(RemoveAttribute)( doc, body, attr );
480     }
481 
482     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
483     {
484         AddColorRule(lexer, " :active", attr->value);
485         TY_(RemoveAttribute)( doc, body, attr );
486     }
487 }
488 
NiceBody(TidyDocImpl * doc)489 static Bool NiceBody( TidyDocImpl* doc )
490 {
491     Node* node = TY_(FindBody)(doc);
492     if (node)
493     {
494         if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
495             TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
496             TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
497             TY_(AttrGetById)(node, TidyAttr_LINK)       ||
498             TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
499             TY_(AttrGetById)(node, TidyAttr_ALINK))
500         {
501             doc->badLayout |= USING_BODY;
502             return no;
503         }
504     }
505 
506     return yes;
507 }
508 
509 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)510 static void CreateStyleElement( TidyDocImpl* doc )
511 {
512     Lexer* lexer = doc->lexer;
513     Node *node, *head, *body;
514     TagStyle *style;
515     AttVal *av;
516 
517     if ( lexer->styles == NULL && NiceBody(doc) )
518         return;
519 
520     node = TY_(NewNode)( doc->allocator, lexer );
521     node->type = StartTag;
522     node->implicit = yes;
523     node->element = TY_(tmbstrdup)(doc->allocator, "style");
524     TY_(FindTag)( doc, node );
525 
526     /* insert type attribute */
527     av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
528     TY_(InsertAttributeAtStart)( node, av );
529 
530     body = TY_(FindBody)( doc );
531     lexer->txtstart = lexer->lexsize;
532     if ( body )
533         CleanBodyAttrs( doc, body );
534 
535     for (style = lexer->styles; style; style = style->next)
536     {
537         TY_(AddCharToLexer)(lexer, ' ');
538         TY_(AddStringLiteral)(lexer, style->tag);
539         TY_(AddCharToLexer)(lexer, '.');
540         TY_(AddStringLiteral)(lexer, style->tag_class);
541         TY_(AddCharToLexer)(lexer, ' ');
542         TY_(AddCharToLexer)(lexer, '{');
543         TY_(AddStringLiteral)(lexer, style->properties);
544         TY_(AddCharToLexer)(lexer, '}');
545         TY_(AddCharToLexer)(lexer, '\n');
546     }
547 
548     lexer->txtend = lexer->lexsize;
549 
550     TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
551 
552     /*
553      now insert style element into document head
554 
555      doc is root node. search its children for html node
556      the head node should be first child of html node
557     */
558     if ( NULL != (head = TY_(FindHEAD)( doc )) )
559         TY_(InsertNodeAtEnd)( head, node );
560 }
561 
562 
563 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)564 void TY_(FixNodeLinks)(Node *node)
565 {
566     Node *child;
567 
568     if (node->prev)
569         node->prev->next = node;
570     else
571         node->parent->content = node;
572 
573     if (node->next)
574         node->next->prev = node;
575     else
576         node->parent->last = node;
577 
578     for (child = node->content; child; child = child->next)
579         child->parent = node;
580 }
581 
582 /*
583  used to strip child of node when
584  the node has one and only one child
585 */
StripOnlyChild(TidyDocImpl * doc,Node * node)586 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
587 {
588     Node *child;
589 
590     child = node->content;
591     node->content = child->content;
592     node->last = child->last;
593     child->content = NULL;
594     TY_(FreeNode)(doc, child);
595 
596     for (child = node->content; child; child = child->next)
597         child->parent = node;
598 }
599 
600 /*
601   used to strip font start and end tags.
602   Extricate "element", replace it by its content and delete it.
603 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)604 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
605 {
606     if (element->content)
607     {
608         Node *node, *parent = element->parent;
609 
610         element->last->next = element->next;
611 
612         if (element->next)
613         {
614             element->next->prev = element->last;
615         }
616         else
617             parent->last = element->last;
618 
619         if (element->prev)
620         {
621             element->content->prev = element->prev;
622             element->prev->next = element->content;
623         }
624         else
625             parent->content = element->content;
626 
627         for (node = element->content; node; node = node->next)
628             node->parent = parent;
629 
630         *pnode = element->content;
631 
632         element->next = element->content = NULL;
633         TY_(FreeNode)(doc, element);
634     }
635     else
636     {
637         *pnode = TY_(DiscardElement)(doc, element);
638     }
639 }
640 
641 /*
642   Create new string that consists of the
643   combined style properties in s1 and s2
644 
645   To merge property lists, we build a linked
646   list of property/values and insert properties
647   into the list in order, merging values for
648   the same property name.
649 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)650 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
651 {
652     tmbstr s;
653     StyleProp *prop;
654 
655     prop = CreateProps(doc, NULL, s1);
656     prop = CreateProps(doc, prop, s2);
657     s = CreatePropString(doc, prop);
658     FreeStyleProps(doc, prop);
659     return s;
660 }
661 
662 /*
663  Add style property to element, creating style
664  attribute as needed and adding ; delimiter
665 */
TY_(AddStyleProperty)666 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
667 {
668     AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
669 
670     /* if style attribute already exists then insert property */
671 
672     if ( av )
673     {
674         if (av->value != NULL)
675         {
676             tmbstr s = MergeProperties( doc, av->value, property );
677             TidyDocFree( doc, av->value );
678             av->value = s;
679         }
680         else
681         {
682             av->value = TY_(tmbstrdup)( doc->allocator, property );
683         }
684     }
685     else /* else create new style attribute */
686     {
687         av = TY_(NewAttributeEx)( doc, "style", property, '"' );
688         TY_(InsertAttributeAtStart)( node, av );
689     }
690 }
691 
/*
 Copy the class names of "child" onto "node".

 - child has no class attribute, or one without a value: no change
 - node has no class attribute: a new one holding the child's
   names is inserted at the start of node's attributes
 - node has a class attribute without a value: it receives a copy
   of the child's names (rather than a second class attribute
   being added to the node)
 - otherwise node's value becomes "<node names> <child names>"

 Only the first class attribute of each node is considered.
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeClass = NULL;
    tmbstr childNames = NULL;
    tmbstr names;
    uint l1, l2;

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            childNames = av->value;
            break;
        }
    }

    if (childNames == NULL)
        return;  /* nothing to carry over */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            nodeClass = av;
            break;
        }
    }

    if (nodeClass == NULL)  /* copy class names from child */
    {
        av = TY_(NewAttributeEx)( doc, "class", childNames, '"' );
        TY_(InsertAttributeAtStart)( node, av );
        return;
    }

    if (nodeClass->value == NULL)  /* fill the valueless attribute */
    {
        nodeClass->value = TY_(tmbstrdup)( doc->allocator, childNames );
        return;
    }

    /* merge class names from both, separated by a single space */
    l1 = TY_(tmbstrlen)(nodeClass->value);
    l2 = TY_(tmbstrlen)(childNames);
    names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
    TY_(tmbstrcpy)(names, nodeClass->value);
    names[l1] = ' ';
    TY_(tmbstrcpy)(names+l1+1, childNames);
    TidyDocFree(doc, nodeClass->value);
    nodeClass->value = names;
}
736 
/*
 Merge the class and style attributes of "child" into "node".

 Class names are handled first by MergeClasses. For the style:

 - child has no style attribute, or one without a value: no change
 - node has no style attribute: a new one holding the child's
   style is inserted at the start of node's attributes
 - node has a style attribute without a value: it receives a copy
   of the child's style (rather than a second style attribute
   being added to the node)
 - otherwise both property lists are combined with MergeProperties

 Only the first style attribute of each node is considered.
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeStyle = NULL;
    tmbstr childStyle = NULL;
    tmbstr style;

    /*
       the child may have a class attribute used
       for attaching styles, if so the class name
       needs to be copied to node's class
    */
    MergeClasses(doc, node, child);

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            childStyle = av->value;
            break;
        }
    }

    if (childStyle == NULL)
        return;  /* nothing to carry over */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            nodeStyle = av;
            break;
        }
    }

    if (nodeStyle == NULL)  /* copy style of child */
    {
        av = TY_(NewAttributeEx)( doc, "style", childStyle, '"' );
        TY_(InsertAttributeAtStart)( node, av );
        return;
    }

    if (nodeStyle->value == NULL)  /* fill the valueless attribute */
    {
        nodeStyle->value = TY_(tmbstrdup)( doc->allocator, childStyle );
        return;
    }

    /* merge styles from both */
    style = MergeProperties(doc, nodeStyle->value, childStyle);
    TidyDocFree(doc, nodeStyle->value);
    nodeStyle->value = style;
}
782 
/*
 Translate the value of a font size attribute into a CSS
 font-size value.

 - ""            -> NULL
 - "0" .. "6"    -> entry of the absolute table (NULL for "3")
 - "-0" .. "-6"  -> entry of the shrinking table, else "smaller"
 - anything else -> if the second character is '0'..'6' the
                    entry of the growing table, else "larger"

 Only the first two characters of the string are inspected.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr sizes[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr minussizes[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr plussizes[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    tmbchar lead = size[0];
    int step;

    if (lead == '\0')
        return NULL;

    if (lead >= '0' && lead <= '6')
        return sizes[lead - '0'];

    /* relative size: the digit after the sign selects the step */
    step = (size[1] >= '0' && size[1] <= '6') ? size[1] - '0' : -1;

    if (lead == '-')
        return step >= 0 ? minussizes[step] : "smaller"; /*"70%"; */

    return step >= 0 ? plussizes[step] : "larger"; /* "140%" */
}
832 
/*
 Add "font-family: <face>" to the style of node. The property is
 formatted into a fixed 256 character buffer.
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    tmbchar property[256];

    TY_(tmbsnprintf)(property, sizeof(property), "font-family: %s", face );
    TY_(AddStyleProperty)( doc, node, property );
}
839 
/*
 Apply a font size to node.

 A <p> with size "6", "5" or "4" is turned into <h1>, <h2> or <h3>
 respectively and gets no style property. In every other case the
 size is translated by FontSize2Name and, when that yields a
 value, added as a "font-size" style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    static const struct { ctmbstr size; ctmbstr heading; } promote[] =
    {
        { "6", "h1" },
        { "5", "h2" },
        { "4", "h3" }
    };
    ctmbstr cssvalue;
    uint ix;

    if (nodeIsP(node))
    {
        for (ix = 0; ix < sizeof(promote)/sizeof(promote[0]); ++ix)
        {
            if (TY_(tmbstrcmp)(size, promote[ix].size) != 0)
                continue;

            /* rename the paragraph to the matching heading */
            TidyDocFree(doc, node->element);
            node->element = TY_(tmbstrdup)(doc->allocator, promote[ix].heading);
            TY_(FindTag)(doc, node);
            return;
        }
    }

    cssvalue = FontSize2Name(size);

    if (cssvalue != NULL)
    {
        tmbchar property[64];
        TY_(tmbsnprintf)(property, sizeof(property), "font-size: %s", cssvalue);
        TY_(AddStyleProperty)( doc, node, property );
    }
}
871 
/*
 Add "color: <color>" to the style of node. The property is
 formatted into a fixed 128 character buffer.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar property[128];

    TY_(tmbsnprintf)(property, sizeof(property), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, property );
}
878 
879 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)880 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
881 {
882     uint i;
883     tmbchar buf[128];
884 
885     TY_(tmbstrcpy)( buf, "text-align: " );
886     for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
887     {
888         if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
889             break;
890     }
891     buf[i] = '\0';
892     TY_(AddStyleProperty)( doc, node, buf );
893 }
894 
895 /*
896  add style properties to node corresponding to
897  the font face, size and color attributes
898 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)899 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
900 {
901     while (av)
902     {
903         if (AttrHasValue(av))
904         {
905             if (attrIsFACE(av))
906                 AddFontFace( doc, node, av->value );
907             else if (attrIsSIZE(av))
908                 AddFontSize( doc, node, av->value );
909             else if (attrIsCOLOR(av))
910                 AddFontColor( doc, node, av->value );
911         }
912         av = av->next;
913     }
914 }
915 
916 /*
917     Symptom: <p align=center>
918     Action: <p style="text-align: center">
919 */
TextAlign(TidyDocImpl * doc,Node * node)920 static void TextAlign( TidyDocImpl* doc, Node* node )
921 {
922     AttVal *av, *prev;
923 
924     prev = NULL;
925 
926     for (av = node->attributes; av; av = av->next)
927     {
928         if (attrIsALIGN(av))
929         {
930             if (prev)
931                 prev->next = av->next;
932             else
933                 node->attributes = av->next;
934 
935             if (av->value)
936                 AddAlign( doc, node, av->value );
937 
938             TY_(FreeAttribute)(doc, av);
939             break;
940         }
941 
942         prev = av;
943     }
944 }
945 
946 /*
947     Symptom: <table bgcolor="red">
948     Action: <table style="background-color: red">
949 */
TableBgColor(TidyDocImpl * doc,Node * node)950 static void TableBgColor( TidyDocImpl* doc, Node* node )
951 {
952     AttVal* attr;
953     tmbchar buf[256];
954 
955     if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
956     {
957         TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
958         TY_(RemoveAttribute)( doc, node, attr );
959         TY_(AddStyleProperty)( doc, node, buf );
960     }
961 }
962 
963 /*
964    The clean up rules use the pnode argument to return the
965    next node when the original node has been deleted
966 */
967 
968 /*
969     Symptom: <dir> <li> where <li> is only child
970     Action: coerce <dir> <li> to <div> with indent.
971 */
972 
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))973 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
974 {
975     Node *child;
976 
977     if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
978     {
979         child = node->content;
980 
981         if (child == NULL)
982             return no;
983 
984         /* check child has no peers */
985 
986         if (child->next)
987             return no;
988 
989         if ( !nodeIsLI(child) )
990             return no;
991 
992         if ( !child->implicit )
993             return no;
994 
995         /* coerce dir to div */
996         node->tag = TY_(LookupTagDef)( TidyTag_DIV );
997         TidyDocFree( doc, node->element );
998         node->element = TY_(tmbstrdup)(doc->allocator, "div");
999         TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
1000         StripOnlyChild( doc, node );
1001         return yes;
1002     }
1003 
1004     return no;
1005 }
1006 
1007 /*
1008     Symptom: <center>
1009     Action: replace <center> by <div style="text-align: center">
1010 */
1011 
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1012 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1013 {
1014     if ( nodeIsCENTER(node) )
1015     {
1016         if ( cfgBool(doc, TidyDropFontTags) )
1017         {
1018             if (node->content)
1019             {
1020                 Node *last = node->last;
1021                 DiscardContainer( doc, node, pnode );
1022 
1023                 node = TY_(InferredTag)(doc, TidyTag_BR);
1024                 TY_(InsertNodeAfterElement)(last, node);
1025             }
1026             else
1027             {
1028                 Node *prev = node->prev, *next = node->next,
1029                      *parent = node->parent;
1030                 DiscardContainer( doc, node, pnode );
1031 
1032                 node = TY_(InferredTag)(doc, TidyTag_BR);
1033                 if (next)
1034                     TY_(InsertNodeBeforeElement)(next, node);
1035                 else if (prev)
1036                     TY_(InsertNodeAfterElement)(prev, node);
1037                 else
1038                     TY_(InsertNodeAtStart)(parent, node);
1039             }
1040 
1041             return yes;
1042         }
1043 
1044         RenameElem( doc, node, TidyTag_DIV );
1045         TY_(AddStyleProperty)( doc, node, "text-align: center" );
1046         return yes;
1047     }
1048 
1049     return no;
1050 }
1051 
1052 /* Copy child attributes to node. Duplicate attributes are overwritten.
1053    Unique attributes (such as ID) disable the action.
1054    Attributes style and class are not dealt with. A call to MergeStyles
1055    will do that.
1056 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1057 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1058 {
1059     AttVal *av1, *av2;
1060     TidyAttrId id;
1061 
1062     /* Detect attributes that cannot be merged or overwritten. */
1063     if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1064         && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1065         return no;
1066 
1067     /* Move child attributes to node. Attributes in node
1068      can be overwritten or merged. */
1069     for (av2 = child->attributes; av2; )
1070     {
1071         /* Dealt by MergeStyles. */
1072         if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1073         {
1074             av2 = av2->next;
1075             continue;
1076         }
1077         /* Avoid duplicates in node */
1078         if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1079             && (av1=TY_(AttrGetById)(node, id))!= NULL)
1080             TY_(RemoveAttribute)( doc, node, av1 );
1081 
1082         /* Move attribute from child to node */
1083         TY_(DetachAttribute)( child, av2 );
1084         av1 = av2;
1085         av2 = av2->next;
1086         av1->next = NULL;
1087         TY_(InsertAttributeAtEnd)( node, av1 );
1088     }
1089 
1090     return yes;
1091 }
1092 
1093 /*
1094     Symptom <XX><XX>...</XX></XX>
1095     Action: merge the two XXs
1096 
1097   For instance, this is useful after nested <dir>s used by Word
1098   for indenting have been converted to <div>s
1099 
1100   If state is "no", no merging.
1101   If state is "yes", inner element is discarded. Only Style and Class
1102   attributes are merged using MergeStyles().
1103   If state is "auto", atttibutes are merged as described in CopyAttrs().
1104   Style and Class attributes are merged using MergeStyles().
1105 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1106 static Bool MergeNestedElements( TidyDocImpl* doc,
1107                                  TidyTagId Id, TidyTriState state, Node *node,
1108                                  Node **ARG_UNUSED(pnode))
1109 {
1110     Node *child;
1111 
1112     if ( state == TidyNoState
1113          || !TagIsId(node, Id) )
1114         return no;
1115 
1116     child = node->content;
1117 
1118     if ( child == NULL
1119          || child->next != NULL
1120          || !TagIsId(child, Id) )
1121         return no;
1122 
1123     if ( state == TidyAutoState
1124          && CopyAttrs(doc, node, child) == no )
1125         return no;
1126 
1127     MergeStyles( doc, node, child );
1128     StripOnlyChild( doc, node );
1129     return yes;
1130 }
1131 
1132 /*
1133     Symptom: <ul><li><ul>...</ul></li></ul>
1134     Action: discard outer list
1135 */
1136 
NestedList(TidyDocImpl * doc,Node * node,Node ** pnode)1137 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1138 {
1139     Node *child, *list;
1140 
1141     if ( nodeIsUL(node) || nodeIsOL(node) )
1142     {
1143         child = node->content;
1144 
1145         if (child == NULL)
1146             return no;
1147 
1148         /* check child has no peers */
1149 
1150         if (child->next)
1151             return no;
1152 
1153         list = child->content;
1154 
1155         if (!list)
1156             return no;
1157 
1158         if (list->tag != node->tag)
1159             return no;
1160 
1161         /* check list has no peers */
1162         if (list->next)
1163             return no;
1164 
1165         *pnode = list;  /* Set node to resume iteration */
1166 
1167         /* move inner list node into position of outer node */
1168         list->prev = node->prev;
1169         list->next = node->next;
1170         list->parent = node->parent;
1171         TY_(FixNodeLinks)(list);
1172 
1173         /* get rid of outer ul and its li */
1174         child->content = NULL;
1175         TY_(FreeNode)( doc, child ); /* See test #427841. */
1176         child = NULL;
1177         node->content = NULL;
1178         node->next = NULL;
1179         TY_(FreeNode)( doc, node );
1180         node = NULL;
1181 
1182         /*
1183           If prev node was a list the chances are this node
1184           should be appended to that list. Word has no way of
1185           recognizing nested lists and just uses indents
1186         */
1187 
1188         if (list->prev)
1189         {
1190             if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1191                  && list->prev->last )
1192             {
1193                 node = list;
1194                 list = node->prev;
1195 
1196                 child = list->last;  /* <li> */
1197 
1198                 list->next = node->next;
1199                 TY_(FixNodeLinks)(list);
1200 
1201                 node->parent = child;
1202                 node->next = NULL;
1203                 node->prev = child->last;
1204                 TY_(FixNodeLinks)(node);
1205                 CleanNode( doc, node );
1206             }
1207         }
1208 
1209         return yes;
1210     }
1211 
1212     return no;
1213 }
1214 
1215 /* Find CSS equivalent in a SPAN element */
1216 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1217 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1218 {
1219     struct
1220     {
1221         TidyTagId id;
1222         ctmbstr CSSeq;
1223         Bool deprecated;
1224     }
1225     const CSS_SpanEq[] =
1226         {
1227             { TidyTag_B, "font-weight: bold", no },
1228             { TidyTag_I, "font-style: italic", no },
1229             { TidyTag_S, "text-decoration: line-through", yes},
1230             { TidyTag_STRIKE, "text-decoration: line-through", yes},
1231             { TidyTag_U, "text-decoration: underline", yes},
1232             { TidyTag_UNKNOWN, NULL, no }
1233         };
1234     uint i;
1235 
1236     for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1237         if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1238              && TagIsId(node, CSS_SpanEq[i].id) )
1239         {
1240             *s = CSS_SpanEq[i].CSSeq;
1241             return yes;
1242         }
1243     return no;
1244 }
1245 
1246 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1247 static Bool CanApplyBlockStyle( Node *node )
1248 {
1249     if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1250         && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1251     {
1252         return yes;
1253     }
1254     return no;
1255 }
1256 
1257 /*
1258   Symptom: the only child of a block-level element is a
1259   presentation element such as B, I or FONT
1260 
1261   Action: add style "font-weight: bold" to the block and
1262   strip the <b> element, leaving its children.
1263 
1264   example:
1265 
1266     <p>
1267       <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1268     </p>
1269 
1270   becomes:
1271 
1272       <p style="font-weight: bold; font-family: Arial; font-size: 6">
1273         Draft Recommended Practice
1274       </p>
1275 
1276   This code also replaces the align attribute by a style attribute.
1277   However, to avoid CSS problems with Navigator 4, this isn't done
1278   for the elements: caption, tr and table
1279 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1280 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1281 {
1282     Node *child;
1283     ctmbstr CSSeq;
1284 
1285     /* check for bgcolor */
1286     if (   nodeIsTABLE(node)
1287         || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1288         TableBgColor( doc, node );
1289 
1290     if (CanApplyBlockStyle(node))
1291     {
1292         /* check for align attribute */
1293         if ( !nodeIsCAPTION(node) )
1294             TextAlign( doc, node );
1295 
1296         child = node->content;
1297         if (child == NULL)
1298             return no;
1299 
1300         /* check child has no peers */
1301         if (child->next)
1302             return no;
1303 
1304         if ( FindCSSSpanEq(child, &CSSeq, no) )
1305         {
1306             MergeStyles( doc, node, child );
1307             TY_(AddStyleProperty)( doc, node, CSSeq );
1308             StripOnlyChild( doc, node );
1309             return yes;
1310         }
1311         else if ( nodeIsFONT(child) )
1312         {
1313             MergeStyles( doc, node, child );
1314             AddFontStyles( doc, node, child->attributes );
1315             StripOnlyChild( doc, node );
1316             return yes;
1317         }
1318     }
1319 
1320     return no;
1321 }
1322 
1323 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1324 static Bool CanApplyInlineStyle( Node *node )
1325 {
1326     return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1327 }
1328 
1329 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1330 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1331 {
1332     Node *child;
1333     ctmbstr CSSeq;
1334 
1335     if ( CanApplyInlineStyle(node) )
1336     {
1337         child = node->content;
1338 
1339         if (child == NULL)
1340             return no;
1341 
1342         /* check child has no peers */
1343 
1344         if (child->next)
1345             return no;
1346 
1347         if ( FindCSSSpanEq(child, &CSSeq, no) )
1348         {
1349             MergeStyles( doc, node, child );
1350             TY_(AddStyleProperty)( doc, node, CSSeq );
1351             StripOnlyChild( doc, node );
1352             return yes;
1353         }
1354         else if ( nodeIsFONT(child) )
1355         {
1356             MergeStyles( doc, node, child );
1357             AddFontStyles( doc, node, child->attributes );
1358             StripOnlyChild( doc, node );
1359             return yes;
1360         }
1361     }
1362 
1363     return no;
1364 }
1365 
1366 /*
1367     Transform element to equivalent CSS
1368 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1369 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1370                                 Node **ARG_UNUSED(pnode)  )
1371 {
1372     ctmbstr CSSeq;
1373 
1374     /* if node is the only child of parent element then leave alone
1375           Do so only if BlockStyle may be succesful. */
1376     if ( node->parent->content == node && node->next == NULL &&
1377          (CanApplyBlockStyle(node->parent)
1378           || CanApplyInlineStyle(node->parent)) )
1379         return no;
1380 
1381     if ( FindCSSSpanEq(node, &CSSeq, yes) )
1382     {
1383         RenameElem( doc, node, TidyTag_SPAN );
1384         TY_(AddStyleProperty)( doc, node, CSSeq );
1385         return yes;
1386     }
1387     return no;
1388 }
1389 
1390 /*
1391   Replace font elements by span elements, deleting
1392   the font element's attributes and replacing them
1393   by a single style attribute.
1394 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1395 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1396 {
1397     AttVal *av, *style, *next;
1398 
1399     if ( nodeIsFONT(node) )
1400     {
1401         if ( cfgBool(doc, TidyDropFontTags) )
1402         {
1403             DiscardContainer( doc, node, pnode );
1404             return yes;
1405         }
1406 
1407         /* if node is the only child of parent element then leave alone
1408           Do so only if BlockStyle may be succesful. */
1409         if ( node->parent->content == node && node->next == NULL &&
1410              CanApplyBlockStyle(node->parent) )
1411             return no;
1412 
1413         AddFontStyles( doc, node, node->attributes );
1414 
1415         /* extract style attribute and free the rest */
1416         av = node->attributes;
1417         style = NULL;
1418 
1419         while (av)
1420         {
1421             next = av->next;
1422 
1423             if (attrIsSTYLE(av))
1424             {
1425                 av->next = NULL;
1426                 style = av;
1427             }
1428             else
1429             {
1430                 TY_(FreeAttribute)( doc, av );
1431             }
1432             av = next;
1433         }
1434 
1435         node->attributes = style;
1436         RenameElem( doc, node, TidyTag_SPAN );
1437         return yes;
1438     }
1439 
1440     return no;
1441 }
1442 
1443 /*
1444   Applies all matching rules to a node.
1445 */
CleanNode(TidyDocImpl * doc,Node * node)1446 Node* CleanNode( TidyDocImpl* doc, Node *node )
1447 {
1448     Node *next = NULL;
1449     TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1450     TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1451 
1452     for (next = node; TY_(nodeIsElement)(node); node = next)
1453     {
1454         if ( Dir2Div(doc, node, &next) )
1455             continue;
1456 
1457         /* Special case: true result means
1458         ** that arg node and its parent no longer exist.
1459         ** So we must jump back up the CreateStyleProperties()
1460         ** call stack until we have a valid node reference.
1461         */
1462         if ( NestedList(doc, node, &next) )
1463             return next;
1464 
1465         if ( Center2Div(doc, node, &next) )
1466             continue;
1467 
1468         if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1469             continue;
1470 
1471         if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1472             continue;
1473 
1474         if ( BlockStyle(doc, node, &next) )
1475             continue;
1476 
1477         if ( InlineStyle(doc, node, &next) )
1478             continue;
1479 
1480         if ( InlineElementToCSS(doc, node, &next) )
1481             continue;
1482 
1483         if ( Font2Span(doc, node, &next) )
1484             continue;
1485 
1486         break;
1487     }
1488 
1489     return next;
1490 }
1491 
1492 /* Special case: if the current node is destroyed by
1493 ** CleanNode() lower in the tree, this node and its parent
1494 ** no longer exist.  So we must jump back up the CleanTree()
1495 ** call stack until we have a valid node reference.
1496 */
1497 
CleanTree(TidyDocImpl * doc,Node * node)1498 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1499 {
1500     if (node->content)
1501     {
1502         Node *child;
1503         for (child = node->content; child != NULL; child = child->next)
1504         {
1505             child = CleanTree( doc, child );
1506             if ( !child )
1507                 break;
1508         }
1509     }
1510 
1511     return CleanNode( doc, node );
1512 }
1513 
/* Calls Style2Rule() on every node of the subtree rooted at node,
   children before their parent. */
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while (kid != NULL)
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1529 
TY_(CleanDocument)1530 void TY_(CleanDocument)( TidyDocImpl* doc )
1531 {
1532     /* placeholder.  CleanTree()/CleanNode() will not
1533     ** zap root element
1534     */
1535     CleanTree( doc, &doc->root );
1536 
1537     if ( cfgBool(doc, TidyMakeClean) )
1538     {
1539         DefineStyleRules( doc, &doc->root );
1540         CreateStyleElement( doc );
1541     }
1542 }
1543 
1544 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1545 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1546 {
1547     Node *next;
1548 
1549     while (node)
1550     {
1551         next = node->next;
1552 
1553         if ( (nodeIsB(node) || nodeIsI(node))
1554              && node->parent && node->parent->tag == node->tag)
1555         {
1556             /* strip redundant inner element */
1557             DiscardContainer( doc, node, &next );
1558             node = next;
1559             continue;
1560         }
1561 
1562         if ( node->content )
1563             TY_(NestedEmphasis)( doc, node->content );
1564 
1565         node = next;
1566     }
1567 }
1568 
1569 
1570 
1571 /* replace i by em and b by strong */
TY_(EmFromI)1572 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1573 {
1574     while (node)
1575     {
1576         if ( nodeIsI(node) )
1577             RenameElem( doc, node, TidyTag_EM );
1578         else if ( nodeIsB(node) )
1579             RenameElem( doc, node, TidyTag_STRONG );
1580 
1581         if ( node->content )
1582             TY_(EmFromI)( doc, node->content );
1583 
1584         node = node->next;
1585     }
1586 }
1587 
HasOneChild(Node * node)1588 static Bool HasOneChild(Node *node)
1589 {
1590     return (node->content && node->content->next == NULL);
1591 }
1592 
1593 /*
1594  Some people use dir or ul without an li
1595  to indent the content. The pattern to
1596  look for is a list with a single implicit
1597  li. This is recursively replaced by an
1598  implicit blockquote.
1599 */
TY_(List2BQ)1600 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1601 {
1602     while (node)
1603     {
1604         if (node->content)
1605             TY_(List2BQ)( doc, node->content );
1606 
1607         if ( node->tag && node->tag->parser == TY_(ParseList) &&
1608              HasOneChild(node) && node->content->implicit )
1609         {
1610             StripOnlyChild( doc, node );
1611             RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1612             node->implicit = yes;
1613         }
1614 
1615         node = node->next;
1616     }
1617 }
1618 
1619 
1620 /*
1621  Replace implicit blockquote by div with an indent
1622  taking care to reduce nested blockquotes to a single
1623  div with the indent set to match the nesting depth
1624 */
TY_(BQ2Div)1625 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1626 {
1627     tmbchar indent_buf[ 32 ];
1628     uint indent;
1629 
1630     while (node)
1631     {
1632         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1633         {
1634             indent = 1;
1635 
1636             while( HasOneChild(node) &&
1637                    nodeIsBLOCKQUOTE(node->content) &&
1638                    node->implicit)
1639             {
1640                 ++indent;
1641                 StripOnlyChild( doc, node );
1642             }
1643 
1644             if (node->content)
1645                 TY_(BQ2Div)( doc, node->content );
1646 
1647             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1648                              2*indent);
1649 
1650             RenameElem( doc, node, TidyTag_DIV );
1651             TY_(AddStyleProperty)(doc, node, indent_buf );
1652         }
1653         else if (node->content)
1654             TY_(BQ2Div)( doc, node->content );
1655 
1656         node = node->next;
1657     }
1658 }
1659 
1660 
/* Returns the nearest td element found by walking from node (node
   itself included) up through its parents, or NULL if there is none. */
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cur = node;

    while ( cur != NULL && !nodeIsTD(cur) )
        cur = cur->parent;

    return cur;
}
1672 
1673 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1674 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1675 {
1676     Lexer* lexer = doc->lexer;
1677 
1678     for (;;)
1679     {
1680         ctmbstr lexbuf = lexer->lexbuf + node->start;
1681         if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1682         {
1683           Node* cell = FindEnclosingCell( doc, node );
1684           if ( cell )
1685           {
1686             /* Need to put &nbsp; into cell so it doesn't look weird
1687             */
1688             Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1689             assert( (byte)'\240' == (byte)160 );
1690             TY_(InsertNodeBeforeElement)( node, nbsp );
1691           }
1692         }
1693 
1694         /* discard node and returns next, unless it is a text node */
1695         if ( node->type == TextNode )
1696             node = node->next;
1697         else
1698             node = TY_(DiscardElement)( doc, node );
1699 
1700         if (node == NULL)
1701             return NULL;
1702 
1703         if (node->type == SectionTag)
1704         {
1705             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1706             {
1707                 node = PruneSection( doc, node );
1708                 continue;
1709             }
1710 
1711             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1712             {
1713                 node = TY_(DiscardElement)( doc, node );
1714                 break;
1715             }
1716         }
1717     }
1718 
1719     return node;
1720 }
1721 
TY_(DropSections)1722 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1723 {
1724     Lexer* lexer = doc->lexer;
1725     while (node)
1726     {
1727         if (node->type == SectionTag)
1728         {
1729             /* prune up to matching endif */
1730             if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1731                 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1732             {
1733                 node = PruneSection( doc, node );
1734                 continue;
1735             }
1736 
1737             /* discard others as well */
1738             node = TY_(DiscardElement)( doc, node );
1739             continue;
1740         }
1741 
1742         if (node->content)
1743             TY_(DropSections)( doc, node->content );
1744 
1745         node = node->next;
1746     }
1747 }
1748 
/* Removes from node the attributes Word 2000 adds: class, style and
   lang, height and width on td, tr and th, and any attribute whose
   name starts with "x:". A class attribute is kept, however, when its
   value is "Code" or does not start with "Mso". */
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    AttVal *attr, *next, *prev = NULL;
    Bool drop;

    for ( attr = node->attributes; attr; attr = next )
    {
        /* fetched up front: attr may be freed below */
        next = attr->next;

        /* special check for class="Code" denoting pre text */
        /* Pass thru user defined styles as HTML class names */
        if ( attrIsCLASS(attr)
             && ( AttrValueIs(attr, "Code") ||
                  TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 ) )
        {
            prev = attr;
            continue;
        }

        if ( attrIsCLASS(attr) || attrIsSTYLE(attr) || attrIsLANG(attr) )
            drop = yes;
        else if ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
                  (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) )
            drop = yes;
        else if ( attr->attribute &&
                  TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0 )
            drop = yes;
        else
            drop = no;

        if ( !drop )
        {
            prev = attr;
            continue;
        }

        /* unlink attr from the list before freeing it */
        if (prev)
            prev->next = next;
        else
            node->attributes = next;

        TY_(FreeAttribute)( doc, attr );
    }
}
1787 
1788 /* Word2000 uses span excessively, so we strip span out */
StripSpan(TidyDocImpl * doc,Node * span)1789 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1790 {
1791     Node *node, *prev = NULL, *content;
1792 
1793     /*
1794      deal with span elements that have content
1795      by splicing the content in place of the span
1796      after having processed it
1797     */
1798 
1799     TY_(CleanWord2000)( doc, span->content );
1800     content = span->content;
1801 
1802     if (span->prev)
1803         prev = span->prev;
1804     else if (content)
1805     {
1806         node = content;
1807         content = content->next;
1808         TY_(RemoveNode)(node);
1809         TY_(InsertNodeBeforeElement)(span, node);
1810         prev = node;
1811     }
1812 
1813     while (content)
1814     {
1815         node = content;
1816         content = content->next;
1817         TY_(RemoveNode)(node);
1818         TY_(InsertNodeAfterElement)(prev, node);
1819         prev = node;
1820     }
1821 
1822     if (span->next == NULL)
1823         span->parent->last = prev;
1824 
1825     node = span->next;
1826     span->content = NULL;
1827     TY_(DiscardElement)( doc, span );
1828     return node;
1829 }
1830 
1831 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1832 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1833 {
1834     while ( node )
1835     {
1836         if ( node->content )
1837             TY_(NormalizeSpaces)( lexer, node->content );
1838 
1839         if (TY_(nodeIsText)(node))
1840         {
1841             uint i, c;
1842             tmbstr p = lexer->lexbuf + node->start;
1843 
1844             for (i = node->start; i < node->end; ++i)
1845             {
1846                 c = (byte) lexer->lexbuf[i];
1847 
1848                 /* look for UTF-8 multibyte character */
1849                 if ( c > 0x7F )
1850                     i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1851 
1852                 if ( c == 160 )
1853                     c = ' ';
1854 
1855                 p = TY_(PutUTF8)(p, c);
1856             }
1857             node->end = p - lexer->lexbuf;
1858         }
1859 
1860         node = node->next;
1861     }
1862 }
1863 
1864 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1865 static Bool NoMargins(Node *node)
1866 {
1867     AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1868 
1869     if ( !AttrHasValue(attval) )
1870         return no;
1871 
1872     /* search for substring "margin-top: 0" */
1873     if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1874         return no;
1875 
1876     /* search for substring "margin-bottom: 0" */
1877     if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1878         return no;
1879 
1880     return yes;
1881 }
1882 
1883 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1884 static Bool SingleSpace( Lexer* lexer, Node* node )
1885 {
1886     if ( node->content )
1887     {
1888         node = node->content;
1889 
1890         if ( node->next != NULL )
1891             return no;
1892 
1893         if ( node->type != TextNode )
1894             return no;
1895 
1896         if ( (node->end - node->start) == 1 &&
1897              lexer->lexbuf[node->start] == ' ' )
1898             return yes;
1899 
1900         if ( (node->end - node->start) == 2 )
1901         {
1902             uint c = 0;
1903             TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1904             if ( c == 160 )
1905                 return yes;
1906         }
1907     }
1908 
1909     return no;
1910 }
1911 
1912 /*
1913  This is a major clean up to strip out all the extra stuff you get
1914  when you save as web page from Word 2000. It doesn't yet know what
1915  to do with VML tags, but these will appear as errors unless you
1916  declare them as new tags, such as o:p which needs to be declared
1917  as inline.
1918 */
TY_(CleanWord2000)1919 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1920 {
1921     /* used to a list from a sequence of bulletted p's */
1922     Lexer* lexer = doc->lexer;
1923     Node* list = NULL;
1924 
1925     while ( node )
1926     {
1927         /* get rid of Word's xmlns attributes */
1928         if ( nodeIsHTML(node) )
1929         {
1930             /* check that it's a Word 2000 document */
1931             if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1932                  !cfgBool(doc, TidyMakeBare) )
1933                 return;
1934 
1935             TY_(FreeAttrs)( doc, node );
1936         }
1937 
1938         /* fix up preformatted sections by looking for a
1939         ** sequence of paragraphs with zero top/bottom margin
1940         */
1941         if ( nodeIsP(node) )
1942         {
1943             if (NoMargins(node))
1944             {
1945                 Node *pre, *next;
1946                 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1947 
1948                 PurgeWord2000Attributes( doc, node );
1949 
1950                 if (node->content)
1951                     TY_(CleanWord2000)( doc, node->content );
1952 
1953                 pre = node;
1954                 node = node->next;
1955 
1956                 /* continue to strip p's */
1957 
1958                 while ( nodeIsP(node) && NoMargins(node) )
1959                 {
1960                     next = node->next;
1961                     TY_(RemoveNode)(node);
1962                     TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1963                     TY_(InsertNodeAtEnd)(pre, node);
1964                     StripSpan( doc, node );
1965                     node = next;
1966                 }
1967 
1968                 if (node == NULL)
1969                     break;
1970             }
1971         }
1972 
1973         if (node->tag && (node->tag->model & CM_BLOCK)
1974             && SingleSpace(lexer, node))
1975         {
1976             node = StripSpan( doc, node );
1977             continue;
1978         }
1979         /* discard Word's style verbiage */
1980         if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1981              node->type == CommentTag )
1982         {
1983             node = TY_(DiscardElement)( doc, node );
1984             continue;
1985         }
1986 
1987         /* strip out all span and font tags Word scatters so liberally! */
1988         if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1989         {
1990             node = StripSpan( doc, node );
1991             continue;
1992         }
1993 
1994         if ( nodeIsLINK(node) )
1995         {
1996             AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1997 
1998             if (AttrValueIs(attr, "File-List"))
1999             {
2000                 node = TY_(DiscardElement)( doc, node );
2001                 continue;
2002             }
2003         }
2004 
2005         /* discards <o:p> which encodes the paragraph mark */
2006         if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2007         {
2008             Node* next;
2009             DiscardContainer( doc, node, &next );
2010             node = next;
2011             continue;
2012         }
2013 
2014         /* discard empty paragraphs */
2015 
2016         if ( node->content == NULL && nodeIsP(node) )
2017         {
2018             /*  Use the existing function to ensure consistency */
2019             Node *next = TY_(TrimEmptyElement)( doc, node );
2020             node = next;
2021             continue;
2022         }
2023 
2024         if ( nodeIsP(node) )
2025         {
2026             AttVal *attr, *atrStyle;
2027 
2028             attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2029             atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2030             /*
2031                (JES) Sometimes Word marks a list item with the following hokie syntax
2032                <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2033                 translate these into <li>
2034             */
2035             /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2036             /* map <p class="MsoListNumber"> to <ol>...</ol> */
2037             if ( AttrValueIs(attr, "MsoListBullet") ||
2038                  AttrValueIs(attr, "MsoListNumber") ||
2039                  AttrContains(atrStyle, "mso-list:") )
2040             {
2041                 TidyTagId listType = TidyTag_UL;
2042                 if (AttrValueIs(attr, "MsoListNumber"))
2043                     listType = TidyTag_OL;
2044 
2045                 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2046 
2047                 if ( !list || TagId(list) != listType )
2048                 {
2049                     const Dict* tag = TY_(LookupTagDef)( listType );
2050                     list = TY_(InferredTag)(doc, tag->id);
2051                     TY_(InsertNodeBeforeElement)(node, list);
2052                 }
2053 
2054                 PurgeWord2000Attributes( doc, node );
2055 
2056                 if ( node->content )
2057                     TY_(CleanWord2000)( doc, node->content );
2058 
2059                 /* remove node and append to contents of list */
2060                 TY_(RemoveNode)(node);
2061                 TY_(InsertNodeAtEnd)(list, node);
2062                 node = list;
2063             }
2064             /* map sequence of <p class="Code"> to <pre>...</pre> */
2065             else if (AttrValueIs(attr, "Code"))
2066             {
2067                 Node *br = TY_(NewLineNode)(lexer);
2068                 TY_(NormalizeSpaces)(lexer, node->content);
2069 
2070                 if ( !list || TagId(list) != TidyTag_PRE )
2071                 {
2072                     list = TY_(InferredTag)(doc, TidyTag_PRE);
2073                     TY_(InsertNodeBeforeElement)(node, list);
2074                 }
2075 
2076                 /* remove node and append to contents of list */
2077                 TY_(RemoveNode)(node);
2078                 TY_(InsertNodeAtEnd)(list, node);
2079                 StripSpan( doc, node );
2080                 TY_(InsertNodeAtEnd)(list, br);
2081                 node = list->next;
2082             }
2083             else
2084                 list = NULL;
2085         }
2086         else
2087             list = NULL;
2088 
2089         if (!node)
2090             return;
2091 
2092         /* strip out style and class attributes */
2093         if (TY_(nodeIsElement)(node))
2094             PurgeWord2000Attributes( doc, node );
2095 
2096         if (node->content)
2097             TY_(CleanWord2000)( doc, node->content );
2098 
2099         node = node->next;
2100     }
2101 }
2102 
TY_(IsWord2000)2103 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2104 {
2105     AttVal *attval;
2106     Node *node, *head;
2107     Node *html = TY_(FindHTML)( doc );
2108 
2109     if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2110         return yes;
2111 
2112     /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2113     head = TY_(FindHEAD)( doc );
2114 
2115     if (head)
2116     {
2117         for (node = head->content; node; node = node->next)
2118         {
2119             if ( !nodeIsMETA(node) )
2120                 continue;
2121 
2122             attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2123 
2124             if ( !AttrValueIs(attval, "generator") )
2125                 continue;
2126 
2127             attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
2128 
2129             if ( AttrContains(attval, "Microsoft") )
2130                 return yes;
2131         }
2132     }
2133 
2134     return no;
2135 }
2136 
2137 /* where appropriate move object elements from head to body */
TY_(BumpObject)2138 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2139 {
2140     Node *node, *next, *head = NULL, *body = NULL;
2141 
2142     if (!html)
2143         return;
2144 
2145     for ( node = html->content; node != NULL; node = node->next )
2146     {
2147         if ( nodeIsHEAD(node) )
2148             head = node;
2149 
2150         if ( nodeIsBODY(node) )
2151             body = node;
2152     }
2153 
2154     if ( head != NULL && body != NULL )
2155     {
2156         for (node = head->content; node != NULL; node = next)
2157         {
2158             next = node->next;
2159 
2160             if ( nodeIsOBJECT(node) )
2161             {
2162                 Node *child;
2163                 Bool bump = no;
2164 
2165                 for (child = node->content; child != NULL; child = child->next)
2166                 {
2167                     /* bump to body unless content is param */
2168                     if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2169                          || !nodeIsPARAM(child) )
2170                     {
2171                             bump = yes;
2172                             break;
2173                     }
2174                 }
2175 
2176                 if ( bump )
2177                 {
2178                     TY_(RemoveNode)( node );
2179                     TY_(InsertNodeAtStart)( body, node );
2180                 }
2181             }
2182         }
2183     }
2184 }
2185 
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/*
  Post-order pass over the subtree rooted at pParent. After all
  children have been processed, and only if pParent is a block-level
  node, trailing BR children are taken off the end of pParent: the
  first attribute-less BR encountered is discarded, every other one is
  re-inserted after pParent. Finally pParent is handed to
  TY_(TrimEmptyElement).
*/
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *child, *following, *tail;
    Bool droppedOne = no;

    if ( !pParent )
        return;

    /* Children first; a child may get trimmed, so grab its successor
       before recursing into it. */
    for ( child = pParent->content; child != NULL; child = following )
    {
        following = child->next;
        FixBrakes( pDoc, child );
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /* Peel <br /> elements off the end of the parent, one at a time */
    while ( (tail = pParent->last) != NULL && nodeIsBR( tail ) )
    {
        if ( tail->attributes == NULL && !droppedOne )
        {
            TY_(DiscardElement)( pDoc, tail );
            droppedOne = yes;
        }
        else
        {
            TY_(RemoveNode)( tail );
            TY_(InsertNodeAfterElement)( pParent, tail );
        }
    }

    TY_(TrimEmptyElement)( pDoc, pParent );
}
#endif
2229 
TY_(VerifyHTTPEquiv)2230 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2231 {
2232     Node *pNode;
2233     StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2234     tmbstr s, pszBegin, pszEnd;
2235     ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2236 
2237     if (!enc)
2238         return;
2239 
2240     if (!nodeIsHEAD(head))
2241         head = TY_(FindHEAD)(doc);
2242 
2243     if (!head)
2244         return;
2245 
2246     /* Find any <meta http-equiv='Content-Type' content='...' /> */
2247     for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2248     {
2249         AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2250         AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2251 
2252         if ( !nodeIsMETA(pNode) || !metaContent ||
2253              !AttrValueIs(httpEquiv, "Content-Type") )
2254             continue;
2255 
2256         pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2257         while (pszBegin && *pszBegin)
2258         {
2259             while (isspace( *pszBegin ))
2260                 pszBegin++;
2261             pszEnd = pszBegin;
2262             while ('\0' != *pszEnd && ';' != *pszEnd)
2263                 pszEnd++;
2264             if (';' == *pszEnd )
2265                 *(pszEnd++) = '\0';
2266             if (pszEnd > pszBegin)
2267             {
2268                 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2269                 prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2270                 prop->value = NULL;
2271                 prop->next = NULL;
2272 
2273                 if (NULL != pLastProp)
2274                     pLastProp->next = prop;
2275                 else
2276                     pFirstProp = prop;
2277 
2278                 pLastProp = prop;
2279                 pszBegin = pszEnd;
2280             }
2281         }
2282         TidyDocFree( doc, s );
2283 
2284         /*  find the charset property */
2285         for (prop = pFirstProp; NULL != prop; prop = prop->next)
2286         {
2287             if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2288                 continue;
2289 
2290             TidyDocFree( doc, prop->name );
2291             prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2292             TY_(tmbstrcpy)(prop->name, "charset=");
2293             TY_(tmbstrcpy)(prop->name+8, enc);
2294             s = CreatePropString( doc, pFirstProp );
2295             TidyDocFree( doc, metaContent->value );
2296             metaContent->value = s;
2297             break;
2298         }
2299         /* #718127, prevent memory leakage */
2300         FreeStyleProps(doc, pFirstProp);
2301         pFirstProp = NULL;
2302         pLastProp = NULL;
2303     }
2304 }
2305 
TY_(DropComments)2306 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2307 {
2308     Node* next;
2309 
2310     while (node)
2311     {
2312         next = node->next;
2313 
2314         if (node->type == CommentTag)
2315         {
2316             TY_(RemoveNode)(node);
2317             TY_(FreeNode)(doc, node);
2318             node = next;
2319             continue;
2320         }
2321 
2322         if (node->content)
2323             TY_(DropComments)(doc, node->content);
2324 
2325         node = next;
2326     }
2327 }
2328 
TY_(DropFontElements)2329 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2330 {
2331     Node* next;
2332 
2333     while (node)
2334     {
2335         next = node->next;
2336 
2337         if (nodeIsFONT(node))
2338         {
2339             DiscardContainer(doc, node, &next);
2340             node = next;
2341             continue;
2342         }
2343 
2344         if (node->content)
2345             TY_(DropFontElements)(doc, node->content, &next);
2346 
2347         node = next;
2348     }
2349 }
2350 
TY_(WbrToSpace)2351 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2352 {
2353     Node* next;
2354 
2355     while (node)
2356     {
2357         next = node->next;
2358 
2359         if (nodeIsWBR(node))
2360         {
2361             Node* text;
2362             text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2363             TY_(InsertNodeAfterElement)(node, text);
2364             TY_(RemoveNode)(node);
2365             TY_(FreeNode)(doc, node);
2366             node = next;
2367             continue;
2368         }
2369 
2370         if (node->content)
2371             TY_(WbrToSpace)(doc, node->content);
2372 
2373         node = next;
2374    }
2375 }
2376 
2377 /*
2378   Filters from Word and PowerPoint often use smart
2379   quotes resulting in character codes between 128
2380   and 159. Unfortunately, the corresponding HTML 4.0
2381   entities for these are not widely supported. The
2382   following converts dashes and quotation marks to
2383   the nearest ASCII equivalent. My thanks to
2384   Andrzej Novosiolov for his help with this code.
2385 
2386   Note: The old code in the pretty printer applied
2387   this to all node types and attribute values while
2388   this routine applies it only to text nodes. First,
2389   Microsoft Office products rarely put the relevant
2390   characters into these tokens, second support for
2391   them is much better now and last but not least, it
2392   can be harmful to replace these characters since
2393   US-ASCII quote marks are often used as syntax
2394   characters, a simple
2395 
2396     <a onmouseover="alert('&#x2018;')">...</a>
2397 
2398   would be broken if the U+2018 is replaced by "'".
  Nor did the old code take care of whether the
  quote mark was already in use as a delimiter,
2401 
2402     <p title='&#x2018;'>...</p>
2403 
2404   got
2405 
2406     <p title='''>...</p>
2407 
2408   Since browser support is much better nowadays and
2409   high-quality typography is better than ASCII it'd
2410   be probably a good idea to drop the feature...
2411 */
TY_(DowngradeTypography)2412 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2413 {
2414     Node* next;
2415     Lexer* lexer = doc->lexer;
2416 
2417     while (node)
2418     {
2419         next = node->next;
2420 
2421         if (TY_(nodeIsText)(node))
2422         {
2423             uint i, c;
2424             tmbstr p = lexer->lexbuf + node->start;
2425 
2426             for (i = node->start; i < node->end; ++i)
2427             {
2428                 c = (unsigned char) lexer->lexbuf[i];
2429 
2430                 if (c > 0x7F)
2431                     i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2432 
2433                 if (c >= 0x2013 && c <= 0x201E)
2434                 {
2435                     switch (c)
2436                     {
2437                     case 0x2013: /* en dash */
2438                     case 0x2014: /* em dash */
2439                         c = '-';
2440                         break;
2441                     case 0x2018: /* left single  quotation mark */
2442                     case 0x2019: /* right single quotation mark */
2443                     case 0x201A: /* single low-9 quotation mark */
2444                         c = '\'';
2445                         break;
2446                     case 0x201C: /* left double  quotation mark */
2447                     case 0x201D: /* right double quotation mark */
2448                     case 0x201E: /* double low-9 quotation mark */
2449                         c = '"';
2450                         break;
2451                     }
2452                 }
2453 
2454                 p = TY_(PutUTF8)(p, c);
2455             }
2456 
2457             node->end = p - lexer->lexbuf;
2458         }
2459 
2460         if (node->content)
2461             TY_(DowngradeTypography)(doc, node->content);
2462 
2463         node = next;
2464     }
2465 }
2466 
TY_(ReplacePreformattedSpaces)2467 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2468 {
2469     Node* next;
2470 
2471     while (node)
2472     {
2473         next = node->next;
2474 
2475         if (node->tag && node->tag->parser == TY_(ParsePre))
2476         {
2477             TY_(NormalizeSpaces)(doc->lexer, node->content);
2478             node = next;
2479             continue;
2480         }
2481 
2482         if (node->content)
2483             TY_(ReplacePreformattedSpaces)(doc, node->content);
2484 
2485         node = next;
2486     }
2487 }
2488 
TY_(ConvertCDATANodes)2489 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2490 {
2491     Node* next;
2492 
2493     while (node)
2494     {
2495         next = node->next;
2496 
2497         if (node->type == CDATATag)
2498             node->type = TextNode;
2499 
2500         if (node->content)
2501             TY_(ConvertCDATANodes)(doc, node->content);
2502 
2503         node = next;
2504     }
2505 }
2506 
2507 /*
2508   FixLanguageInformation ensures that the document contains (only)
2509   the attributes for language information desired by the output
2510   document type. For example, for XHTML 1.0 documents both
2511   'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2512   is desired and for HTML 4.01 only 'lang' is desired.
2513 */
TY_(FixLanguageInformation)2514 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2515 {
2516     Node* next;
2517 
2518     while (node)
2519     {
2520         next = node->next;
2521 
2522         /* todo: report modifications made here to the report system */
2523 
2524         if (TY_(nodeIsElement)(node))
2525         {
2526             AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2527             AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2528 
2529             if (lang && xmlLang)
2530             {
2531                 /*
2532                   todo: check whether both attributes are in sync,
2533                   here or elsewhere, where elsewhere is probably
2534                   preferable.
2535                   AD - March 2005: not mandatory according the standards.
2536                 */
2537             }
2538             else if (lang && wantXmlLang)
2539             {
2540                 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2541                     & doc->lexer->versionEmitted)
2542                     TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2543             }
2544             else if (xmlLang && wantLang)
2545             {
2546                 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2547                     & doc->lexer->versionEmitted)
2548                     TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2549             }
2550 
2551             if (lang && !wantLang)
2552                 TY_(RemoveAttribute)(doc, node, lang);
2553 
2554             if (xmlLang && !wantXmlLang)
2555                 TY_(RemoveAttribute)(doc, node, xmlLang);
2556         }
2557 
2558         if (node->content)
2559             TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2560 
2561         node = next;
2562     }
2563 }
2564 
2565 /*
2566   Set/fix/remove <html xmlns='...'>
2567 */
TY_(FixXhtmlNamespace)2568 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2569 {
2570     Node* html = TY_(FindHTML)(doc);
2571     AttVal* xmlns;
2572 
2573     if (!html)
2574         return;
2575 
2576     xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2577 
2578     if (wantXmlns)
2579     {
2580         if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2581             TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2582     }
2583     else if (xmlns)
2584     {
2585         TY_(RemoveAttribute)(doc, html, xmlns);
2586     }
2587 }
2588 
2589 /*
2590   ...
2591 */
TY_(FixAnchors)2592 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2593 {
2594     Node* next;
2595 
2596     while (node)
2597     {
2598         next = node->next;
2599 
2600         if (TY_(IsAnchorElement)(doc, node))
2601         {
2602             AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2603             AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2604             Bool hadName = name!=NULL;
2605             Bool hadId = id!=NULL;
2606             Bool IdEmitted = no;
2607             Bool NameEmitted = no;
2608 
2609             /* todo: how are empty name/id attributes handled? */
2610 
2611             if (name && id)
2612             {
2613                 Bool NameHasValue = AttrHasValue(name);
2614                 Bool IdHasValue = AttrHasValue(id);
2615                 if ( (NameHasValue != IdHasValue) ||
2616                      (NameHasValue && IdHasValue &&
2617                      TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2618                     TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2619             }
2620             else if (name && wantId)
2621             {
2622                 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2623                     & doc->lexer->versionEmitted)
2624                 {
2625                     if (TY_(IsValidHTMLID)(name->value))
2626                     {
2627                         TY_(RepairAttrValue)(doc, node, "id", name->value);
2628                         IdEmitted = yes;
2629                     }
2630                     else
2631                         TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2632                  }
2633             }
2634             else if (id && wantName)
2635             {
2636                 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2637                     & doc->lexer->versionEmitted)
2638                 {
2639                     /* todo: do not assume id is valid */
2640                     TY_(RepairAttrValue)(doc, node, "name", id->value);
2641                     NameEmitted = yes;
2642                 }
2643             }
2644 
2645             if (id && !wantId
2646                 /* make sure that Name has been emitted if requested */
2647                 && (hadName || !wantName || NameEmitted) )
2648                 TY_(RemoveAttribute)(doc, node, id);
2649 
2650             if (name && !wantName
2651                 /* make sure that Id has been emitted if requested */
2652                 && (hadId || !wantId || IdEmitted) )
2653                 TY_(RemoveAttribute)(doc, node, name);
2654 
2655             if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2656                 TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2657                 TY_(RemoveAnchorByNode)(doc, node);
2658         }
2659 
2660         if (node->content)
2661             TY_(FixAnchors)(doc, node->content, wantName, wantId);
2662 
2663         node = next;
2664     }
2665 }
2666 
2667 /*
2668  * local variables:
2669  * mode: c
2670  * indent-tabs-mode: nil
2671  * c-basic-offset: 4
2672  * eval: (c-set-offset 'substatement-open 0)
2673  * end:
2674  */
2675