1 /*
2   clean.c -- clean up misuse of presentation markup
3 
4   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5   See tidyp.h for the copyright notice.
6 
7   Filters from other formats such as Microsoft Word
8   often make excessive use of presentation markup such
9   as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
11   transform this to use CSS.
12 
13   Some rules replace some of the children of an element by
14   style properties on the element, e.g.
15 
16   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17 
18   Such rules are applied to the element's content and then
  to the element itself until no more rules apply.
20   Having applied all the rules to an element, it will have
21   a style attribute with one or more properties.
22 
23   Other rules strip the element they apply to, replacing
24   it by style properties on the contents, e.g.
25 
  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
27 
28   These rules are applied to an element before processing
29   its content and replace the current element by the first
30   element in the exposed content.
31 
32   After applying both sets of rules, you can replace the
33   style attribute by a class value and style rule in the
34   document head. To support this, an association of styles
35   and class names is built.
36 
37   A naive approach is to rely on string matching to test
38   when two property lists are the same. A better approach
39   would be to first sort the properties before matching.
40 
41 */
42 
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 
47 #include "tidy-int.h"
48 #include "clean.h"
49 #include "lexer.h"
50 #include "parser.h"
51 #include "attrs.h"
52 #include "message.h"
53 #include "tmbstr.h"
54 #include "utf8.h"
55 
56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
57 
/*
 Rename an element in place.  The tag definition for tid is looked
 up, the node's current element name is freed and replaced by a copy
 of the definition's name, and node->tag is pointed at the definition.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* tagdef = TY_(LookupTagDef)( tid );

    /* release the old name before installing the new one */
    TidyDocFree( doc, node->element );

    node->tag = tagdef;
    node->element = TY_(tmbstrdup)( doc->allocator, tagdef->name );
}
65 
/*
 Release a whole StyleProp list: for every entry its name, its
 value and finally the entry itself.  A NULL list is a no-op.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *cur;

    for (cur = props; cur != NULL; )
    {
        /* fetch the link first, cur is gone after the last free */
        StyleProp *following = cur->next;

        TidyDocFree(doc, cur->name);
        TidyDocFree(doc, cur->value);
        TidyDocFree(doc, cur);

        cur = following;
    }
}
79 
/*
 Insert a name/value pair into a property list that is kept sorted
 by name (ordering as reported by tmbstrcmp).

 If an entry with the same name already exists the list is left
 untouched, i.e. the value seen first wins.  Otherwise a new entry
 holding copies of name and value is linked in front of the first
 entry that sorts after name, or at the end of the list.

 Returns the head of the list, which changes when the new entry
 becomes the first one.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    StyleProp *head = props;
    StyleProp *before = NULL;   /* entry the new one is linked after */
    StyleProp *after = props;   /* entry the new one is linked in front of */
    StyleProp *entry;

    /* locate the insertion point */
    for ( ; after != NULL; before = after, after = after->next)
    {
        int order = TY_(tmbstrcmp)(after->name, name);

        if (order == 0)
            return head;    /* already defined, ignore the new value */

        if (order > 0)
            break;          /* insert before this one */
    }

    entry = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
    entry->name = TY_(tmbstrdup)(doc->allocator, name);
    entry->value = TY_(tmbstrdup)(doc->allocator, value);
    entry->next = after;    /* NULL when appending at the end */

    if (before != NULL)
        before->next = entry;
    else
        head = entry;

    return head;
}
131 
132 /*
133  Create sorted linked list of properties from style string
134  It temporarily places nulls in place of ':' and ';' to
135  delimit the strings for the property name and value.
136  Some systems don't allow you to NULL literal strings,
137  so to avoid this, a copy is made first.
138 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)139 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
140 {
141     tmbstr name, value = NULL, name_end, value_end, line;
142     Bool more;
143 
144     line = TY_(tmbstrdup)(doc->allocator, style);
145     name = line;
146 
147     while (*name)
148     {
149         while (*name == ' ')
150             ++name;
151 
152         name_end = name;
153 
154         while (*name_end)
155         {
156             if (*name_end == ':')
157             {
158                 value = name_end + 1;
159                 break;
160             }
161 
162             ++name_end;
163         }
164 
165         if (*name_end != ':')
166             break;
167 
168         while ( value && *value == ' ')
169             ++value;
170 
171         value_end = value;
172         more = no;
173 
174         while (*value_end)
175         {
176             if (*value_end == ';')
177             {
178                 more = yes;
179                 break;
180             }
181 
182             ++value_end;
183         }
184 
185         *name_end = '\0';
186         *value_end = '\0';
187 
188         prop = InsertProperty(doc, prop, name, value);
189         *name_end = ':';
190 
191         if (more)
192         {
193             *value_end = ';';
194             name = value_end + 1;
195             continue;
196         }
197 
198         break;
199     }
200 
201     TidyDocFree(doc, line);  /* free temporary copy */
202     return prop;
203 }
204 
CreatePropString(TidyDocImpl * doc,StyleProp * props)205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
206 {
207     tmbstr style, p, s;
208     uint len;
209     StyleProp *prop;
210 
211     /* compute length */
212 
213     for (len = 0, prop = props; prop; prop = prop->next)
214     {
215         len += TY_(tmbstrlen)(prop->name) + 2;
216         if (prop->value)
217             len += TY_(tmbstrlen)(prop->value) + 2;
218     }
219 
220     style = (tmbstr) TidyDocAlloc(doc, len+1);
221     style[0] = '\0';
222 
223     for (p = style, prop = props; prop; prop = prop->next)
224     {
225         s = prop->name;
226 
227         while((*p++ = *s++))
228             continue;
229 
230         if (prop->value)
231         {
232             *--p = ':';
233             *++p = ' ';
234             ++p;
235 
236             s = prop->value;
237             while((*p++ = *s++))
238                 continue;
239         }
240         if (prop->next == NULL)
241             break;
242 
243         *--p = ';';
244         *++p = ' ';
245         ++p;
246     }
247 
248     return style;
249 }
250 
251 /*
252   create string with merged properties
253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
254 {
255     tmbstr line;
256     StyleProp *prop;
257 
258     prop = CreateProps(doc, NULL, style);
259     prop = CreateProps(doc, prop, property);
260     line = CreatePropString(doc, prop);
261     FreeStyleProps(doc, prop);
262     return line;
263 }
264 */
265 
TY_(FreeStyles)266 void TY_(FreeStyles)( TidyDocImpl* doc )
267 {
268     Lexer* lexer = doc->lexer;
269     if ( lexer )
270     {
271         TagStyle *style, *next;
272         for ( style = lexer->styles; style; style = next )
273         {
274             next = style->next;
275             TidyDocFree( doc, style->tag );
276             TidyDocFree( doc, style->tag_class );
277             TidyDocFree( doc, style->properties );
278             TidyDocFree( doc, style );
279         }
280     }
281 }
282 
GensymClass(TidyDocImpl * doc)283 static tmbstr GensymClass( TidyDocImpl* doc )
284 {
285     tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
286     ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287     if ( pfx == NULL || *pfx == 0 )
288       pfx = "c";
289 
290     TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291     return TY_(tmbstrdup)(doc->allocator, buf);
292 }
293 
/*
 Return the class name associated with the given tag name and
 property string in the lexer's style dictionary.  Both strings
 must match exactly.  When there is no such entry yet, one is
 created with a newly generated class name and pushed on the front
 of the dictionary.  The returned string is owned by the dictionary.
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0 &&
            TY_(tmbstrcmp)(entry->properties, properties) == 0)
        {
            return entry->tag_class;
        }

        entry = entry->next;
    }

    /* not known yet: add a new dictionary entry */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );

    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
314 
315 /*
316  Add class="foo" to node
317 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
319 {
320     AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
321 
322     /*
323      if there already is a class attribute
324      then append class name after a space.
325     */
326     if (classattr)
327         TY_(AppendToClassAttr)( doc, classattr, classname );
328     else /* create new class attribute */
329         TY_(AddAttribute)( doc, node, "class", classname );
330 }
331 
TY_(AddStyleAsClass)332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
333 {
334     ctmbstr classname;
335 
336     classname = FindStyle( doc, node->element, stylevalue );
337     AddClass( doc, node, classname);
338 }
339 
340 /*
341  Find style attribute in node, and replace it
342  by corresponding class attribute. Search for
343  class in style dictionary otherwise gensym
344  new class and add to dictionary.
345 
346  Assumes that node doesn't have a class attribute
347 */
Style2Rule(TidyDocImpl * doc,Node * node)348 static void Style2Rule( TidyDocImpl* doc, Node *node)
349 {
350     AttVal *styleattr, *classattr;
351     ctmbstr classname;
352 
353     styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
354 
355     if (styleattr)
356     {
357         /* fix for http://tidy.sf.net/bug/850215 */
358         if (!styleattr->value)
359         {
360             TY_(RemoveAttribute)(doc, node, styleattr);
361             return;
362         }
363 
364         classname = FindStyle( doc, node->element, styleattr->value );
365         classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
366 
367         /*
368          if there already is a class attribute
369          then append class name after an underscore
370         */
371         if (classattr)
372         {
373             TY_(AppendToClassAttr)( doc, classattr, classname );
374             TY_(RemoveAttribute)( doc, node, styleattr );
375         }
376         else /* reuse style attribute for class attribute */
377         {
378             TidyDocFree(doc, styleattr->attribute);
379             TidyDocFree(doc, styleattr->value);
380             styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381             styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
382         }
383     }
384 }
385 
/*
 Append the rule "selector { color: color }" followed by a newline
 to the lexer buffer.  Nothing is written when either the selector
 or the color is NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
396 
397 /*
398  move presentation attribs from body to style element
399 
400  background="foo" ->  body { background-image: url(foo) }
401  bgcolor="foo"    ->  body { background-color: foo }
402  text="foo"       ->  body { color: foo }
403  link="foo"       ->  :link { color: foo }
404  vlink="foo"      ->  :visited { color: foo }
405  alink="foo"      ->  :active { color: foo }
406 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
408 {
409     Lexer* lexer  = doc->lexer;
410     tmbstr bgurl   = NULL;
411     tmbstr bgcolor = NULL;
412     tmbstr color   = NULL;
413     AttVal* attr;
414 
415     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
416     {
417         bgurl = attr->value;
418         attr->value = NULL;
419         TY_(RemoveAttribute)( doc, body, attr );
420     }
421 
422     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
423     {
424         bgcolor = attr->value;
425         attr->value = NULL;
426         TY_(RemoveAttribute)( doc, body, attr );
427     }
428 
429     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
430     {
431         color = attr->value;
432         attr->value = NULL;
433         TY_(RemoveAttribute)( doc, body, attr );
434     }
435 
436     if ( bgurl || bgcolor || color )
437     {
438         TY_(AddStringLiteral)(lexer, " body {\n");
439         if (bgurl)
440         {
441             TY_(AddStringLiteral)(lexer, "  background-image: url(");
442             TY_(AddStringLiteral)(lexer, bgurl);
443             TY_(AddStringLiteral)(lexer, ");\n");
444             TidyDocFree(doc, bgurl);
445         }
446         if (bgcolor)
447         {
448             TY_(AddStringLiteral)(lexer, "  background-color: ");
449             TY_(AddStringLiteral)(lexer, bgcolor);
450             TY_(AddStringLiteral)(lexer, ";\n");
451             TidyDocFree(doc, bgcolor);
452         }
453         if (color)
454         {
455             TY_(AddStringLiteral)(lexer, "  color: ");
456             TY_(AddStringLiteral)(lexer, color);
457             TY_(AddStringLiteral)(lexer, ";\n");
458             TidyDocFree(doc, color);
459         }
460 
461         TY_(AddStringLiteral)(lexer, " }\n");
462     }
463 
464     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
465     {
466         AddColorRule(lexer, " :link", attr->value);
467         TY_(RemoveAttribute)( doc, body, attr );
468     }
469 
470     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
471     {
472         AddColorRule(lexer, " :visited", attr->value);
473         TY_(RemoveAttribute)( doc, body, attr );
474     }
475 
476     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
477     {
478         AddColorRule(lexer, " :active", attr->value);
479         TY_(RemoveAttribute)( doc, body, attr );
480     }
481 }
482 
NiceBody(TidyDocImpl * doc)483 static Bool NiceBody( TidyDocImpl* doc )
484 {
485     Node* const node = TY_(FindBody)(doc);
486     if (node) {
487         if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
488             TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
489             TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
490             TY_(AttrGetById)(node, TidyAttr_LINK)       ||
491             TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
492             TY_(AttrGetById)(node, TidyAttr_ALINK))
493         {
494             doc->badLayout |= USING_BODY;
495             return no;
496         }
497     }
498 
499     return yes;
500 }
501 
502 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)503 static void CreateStyleElement( TidyDocImpl* doc )
504 {
505     Lexer* lexer = doc->lexer;
506     Node *node, *head, *body;
507     TagStyle *style;
508     AttVal *av;
509 
510     if ( lexer->styles == NULL && NiceBody(doc) )
511         return;
512 
513     node = TY_(NewNode)( doc->allocator, lexer );
514     node->type = StartTag;
515     node->implicit = yes;
516     node->element = TY_(tmbstrdup)(doc->allocator, "style");
517     TY_(FindTag)( doc, node );
518 
519     /* insert type attribute */
520     av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
521     TY_(InsertAttributeAtStart)( node, av );
522 
523     body = TY_(FindBody)( doc );
524     lexer->txtstart = lexer->lexsize;
525     if ( body )
526         CleanBodyAttrs( doc, body );
527 
528     for (style = lexer->styles; style; style = style->next)
529     {
530         TY_(AddCharToLexer)(lexer, ' ');
531         TY_(AddStringLiteral)(lexer, style->tag);
532         TY_(AddCharToLexer)(lexer, '.');
533         TY_(AddStringLiteral)(lexer, style->tag_class);
534         TY_(AddCharToLexer)(lexer, ' ');
535         TY_(AddCharToLexer)(lexer, '{');
536         TY_(AddStringLiteral)(lexer, style->properties);
537         TY_(AddCharToLexer)(lexer, '}');
538         TY_(AddCharToLexer)(lexer, '\n');
539     }
540 
541     lexer->txtend = lexer->lexsize;
542 
543     TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
544 
545     /*
546      now insert style element into document head
547 
548      doc is root node. search its children for html node
549      the head node should be first child of html node
550     */
551     if ( NULL != (head = TY_(FindHEAD)( doc )) )
552         TY_(InsertNodeAtEnd)( head, node );
553 }
554 
555 
556 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)557 void TY_(FixNodeLinks)(Node *node)
558 {
559     Node *child;
560 
561     if (node->prev)
562         node->prev->next = node;
563     else
564         node->parent->content = node;
565 
566     if (node->next)
567         node->next->prev = node;
568     else
569         node->parent->last = node;
570 
571     for (child = node->content; child; child = child->next)
572         child->parent = node;
573 }
574 
575 /*
576  used to strip child of node when
577  the node has one and only one child
578 */
StripOnlyChild(TidyDocImpl * doc,Node * node)579 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
580 {
581     Node *child;
582 
583     child = node->content;
584     node->content = child->content;
585     node->last = child->last;
586     child->content = NULL;
587     TY_(FreeNode)(doc, child);
588 
589     for (child = node->content; child; child = child->next)
590         child->parent = node;
591 }
592 
593 /*
594   used to strip font start and end tags.
595   Extricate "element", replace it by its content and delete it.
596 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)597 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
598 {
599     if (element->content)
600     {
601         Node *node, *parent = element->parent;
602 
603         element->last->next = element->next;
604 
605         if (element->next)
606         {
607             element->next->prev = element->last;
608         }
609         else
610             parent->last = element->last;
611 
612         if (element->prev)
613         {
614             element->content->prev = element->prev;
615             element->prev->next = element->content;
616         }
617         else
618             parent->content = element->content;
619 
620         for (node = element->content; node; node = node->next)
621             node->parent = parent;
622 
623         *pnode = element->content;
624 
625         element->next = element->content = NULL;
626         TY_(FreeNode)(doc, element);
627     }
628     else
629     {
630         *pnode = TY_(DiscardElement)(doc, element);
631     }
632 }
633 
634 /*
635   Create new string that consists of the
636   combined style properties in s1 and s2
637 
638   To merge property lists, we build a linked
639   list of property/values and insert properties
640   into the list in order, merging values for
641   the same property name.
642 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)643 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
644 {
645     tmbstr s;
646     StyleProp *prop;
647 
648     prop = CreateProps(doc, NULL, s1);
649     prop = CreateProps(doc, prop, s2);
650     s = CreatePropString(doc, prop);
651     FreeStyleProps(doc, prop);
652     return s;
653 }
654 
655 /*
656  Add style property to element, creating style
657  attribute as needed and adding ; delimiter
658 */
TY_(AddStyleProperty)659 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
660 {
661     AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
662 
663     /* if style attribute already exists then insert property */
664 
665     if ( av )
666     {
667         if (av->value != NULL)
668         {
669             tmbstr s = MergeProperties( doc, av->value, property );
670             TidyDocFree( doc, av->value );
671             av->value = s;
672         }
673         else
674         {
675             av->value = TY_(tmbstrdup)( doc->allocator, property );
676         }
677     }
678     else /* else create new style attribute */
679     {
680         av = TY_(NewAttributeEx)( doc, "style", property, '"' );
681         TY_(InsertAttributeAtStart)( node, av );
682     }
683 }
684 
/*
 Merge the class names of child into the class attribute of node.

 - child has no class attribute, or one without a value: nothing
   is changed.
 - node has no class attribute: a new one holding a copy of the
   child's class names is inserted at the start of node's
   attribute list.
 - node has a class attribute without a value: that attribute
   receives a copy of the child's class names.  Inserting a new
   attribute here would leave node with two class attributes.
 - otherwise node's value becomes "<node names> <child names>".
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av;
    AttVal *nodeattr = NULL;
    ctmbstr childnames = NULL;

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            childnames = av->value;
            break;
        }
    }

    if (childnames == NULL)
        return;     /* nothing to merge */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            nodeattr = av;
            break;
        }
    }

    if (nodeattr == NULL)
    {
        /* copy class names from child */
        av = TY_(NewAttributeEx)( doc, "class", childnames, '"' );
        TY_(InsertAttributeAtStart)( node, av );
    }
    else if (nodeattr->value == NULL)
    {
        /* reuse the valueless attribute rather than adding a duplicate */
        nodeattr->value = TY_(tmbstrdup)( doc->allocator, childnames );
    }
    else
    {
        /* merge class names from both, separated by a space */
        uint l1 = TY_(tmbstrlen)(nodeattr->value);
        uint l2 = TY_(tmbstrlen)(childnames);
        tmbstr names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);

        TY_(tmbstrcpy)(names, nodeattr->value);
        names[l1] = ' ';
        TY_(tmbstrcpy)(names+l1+1, childnames);
        TidyDocFree(doc, nodeattr->value);
        nodeattr->value = names;
    }
}
729 
/*
 Merge the class and style attributes of child into node.

 The child may have a class attribute used for attaching styles,
 so the class names are merged first (see MergeClasses).

 The child's style value, if any, is then handed to
 AddStyleProperty.  That merges it into an existing style value on
 node (node's value wins for properties defined by both), fills in
 a style attribute that has no value, or creates the style
 attribute.  Going through AddStyleProperty avoids adding a second
 style attribute when node's existing one has no value.
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av;
    ctmbstr childstyle = NULL;

    MergeClasses(doc, node, child);

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            childstyle = av->value;
            break;
        }
    }

    /* a missing or valueless style on the child leaves node as it is */
    if (childstyle != NULL)
        TY_(AddStyleProperty)( doc, node, childstyle );
}
775 
/*
 Map the value of a font size attribute to a CSS font-size value.

 Only the first one or two characters are looked at:
   ""            -> NULL
   "0" .. "6"    -> entry of the absolute table (NULL for "3")
   "-0" .. "-6"  -> entry of the table for steps of 0.8
   "-" + other   -> "smaller"
   any other first character followed by "0" .. "6"
                 -> entry of the table for steps of 1.2
   anything else -> "larger"

 Returns a pointer to a string constant, or NULL when no font-size
 should be emitted.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr absolute[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr shrink[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr grow[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    const ctmbstr *relative;
    ctmbstr fallback;

    if (size[0] == '\0')
        return NULL;

    if (size[0] >= '0' && size[0] <= '6')
        return absolute[size[0] - '0'];

    /* relative size: the second character selects the step */
    if (size[0] == '-')
    {
        relative = shrink;
        fallback = "smaller"; /*"70%"; */
    }
    else
    {
        relative = grow;
        fallback = "larger"; /* "140%" */
    }

    if (size[1] >= '0' && size[1] <= '6')
        return relative[size[1] - '0'];

    return fallback;
}
825 
/*
 Add "font-family: <face>" to the style of node.

 The property string is built in a buffer sized to fit, so a long
 list of font names is passed on complete instead of being cut off
 at a fixed buffer length.  The temporary buffer is released once
 AddStyleProperty is done with it.
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    static const tmbchar prefix[] = "font-family: ";
    const uint prefixlen = sizeof(prefix)/sizeof(prefix[0]) - 1;
    tmbstr prop;

    /* room for prefix, face and the terminating null */
    prop = (tmbstr) TidyDocAlloc( doc, prefixlen + TY_(tmbstrlen)(face) + 1 );
    TY_(tmbstrcpy)( prop, prefix );
    TY_(tmbstrcpy)( prop + prefixlen, face );

    TY_(AddStyleProperty)( doc, node, prop );
    TidyDocFree( doc, prop );
}
832 
/*
 Apply a font size to node.

 A <p> with size "6", "5" or "4" is renamed to h1, h2 or h3
 respectively and gets no style property.  In all other cases the
 size is translated by FontSize2Name and, unless that yields NULL,
 added as a font-size style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    ctmbstr css;
    tmbchar prop[64];

    if (nodeIsP(node))
    {
        ctmbstr heading = NULL;

        if (TY_(tmbstrcmp)(size, "6") == 0)
            heading = "h1";
        else if (TY_(tmbstrcmp)(size, "5") == 0)
            heading = "h2";
        else if (TY_(tmbstrcmp)(size, "4") == 0)
            heading = "h3";

        if (heading != NULL)
        {
            /* turn the paragraph into the heading */
            TidyDocFree(doc, node->element);
            node->element = TY_(tmbstrdup)(doc->allocator, heading);
            TY_(FindTag)(doc, node);
            return;
        }
    }

    css = FontSize2Name(size);
    if (css == NULL)
        return;

    TY_(tmbsnprintf)(prop, sizeof(prop), "font-size: %s", css);
    TY_(AddStyleProperty)( doc, node, prop );
}
864 
/*
 Add "color: <color>" to the style of node.  The property is
 formatted into a buffer of 128 characters.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar prop[128];

    TY_(tmbsnprintf)(prop, sizeof(prop), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, prop );
}
871 
872 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)873 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
874 {
875     uint i;
876     tmbchar buf[128];
877 
878     TY_(tmbstrcpy)( buf, "text-align: " );
879     for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
880     {
881         if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
882             break;
883     }
884     buf[i] = '\0';
885     TY_(AddStyleProperty)( doc, node, buf );
886 }
887 
888 /*
889  add style properties to node corresponding to
890  the font face, size and color attributes
891 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)892 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
893 {
894     while (av)
895     {
896         if (AttrHasValue(av))
897         {
898             if (attrIsFACE(av))
899                 AddFontFace( doc, node, av->value );
900             else if (attrIsSIZE(av))
901                 AddFontSize( doc, node, av->value );
902             else if (attrIsCOLOR(av))
903                 AddFontColor( doc, node, av->value );
904         }
905         av = av->next;
906     }
907 }
908 
909 /*
910     Symptom: <p align=center>
911     Action: <p style="text-align: center">
912 */
TextAlign(TidyDocImpl * doc,Node * node)913 static void TextAlign( TidyDocImpl* doc, Node* node )
914 {
915     AttVal *av, *prev;
916 
917     prev = NULL;
918 
919     for (av = node->attributes; av; av = av->next)
920     {
921         if (attrIsALIGN(av))
922         {
923             if (prev)
924                 prev->next = av->next;
925             else
926                 node->attributes = av->next;
927 
928             if (av->value)
929                 AddAlign( doc, node, av->value );
930 
931             TY_(FreeAttribute)(doc, av);
932             break;
933         }
934 
935         prev = av;
936     }
937 }
938 
939 /*
940     Symptom: <table bgcolor="red">
941     Action: <table style="background-color: red">
942 */
TableBgColor(TidyDocImpl * doc,Node * node)943 static void TableBgColor( TidyDocImpl* doc, Node* node )
944 {
945     AttVal* attr;
946     tmbchar buf[256];
947 
948     if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
949     {
950         TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
951         TY_(RemoveAttribute)( doc, node, attr );
952         TY_(AddStyleProperty)( doc, node, buf );
953     }
954 }
955 
956 /*
957    The clean up rules use the pnode argument to return the
958    next node when the original node has been deleted
959 */
960 
961 /*
962     Symptom: <dir> <li> where <li> is only child
963     Action: coerce <dir> <li> to <div> with indent.
964 */
965 
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))966 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
967 {
968     Node *child;
969 
970     if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
971     {
972         child = node->content;
973 
974         if (child == NULL)
975             return no;
976 
977         /* check child has no peers */
978 
979         if (child->next)
980             return no;
981 
982         if ( !nodeIsLI(child) )
983             return no;
984 
985         if ( !child->implicit )
986             return no;
987 
988         /* coerce dir to div */
989         node->tag = TY_(LookupTagDef)( TidyTag_DIV );
990         TidyDocFree( doc, node->element );
991         node->element = TY_(tmbstrdup)(doc->allocator, "div");
992         TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
993         StripOnlyChild( doc, node );
994         return yes;
995     }
996 
997     return no;
998 }
999 
1000 /*
1001     Symptom: <center>
1002     Action: replace <center> by <div style="text-align: center">
1003 */
1004 
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1005 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1006 {
1007     if ( nodeIsCENTER(node) )
1008     {
1009         if ( cfgBool(doc, TidyDropFontTags) )
1010         {
1011             if (node->content)
1012             {
1013                 Node *last = node->last;
1014                 DiscardContainer( doc, node, pnode );
1015 
1016                 node = TY_(InferredTag)(doc, TidyTag_BR);
1017                 TY_(InsertNodeAfterElement)(last, node);
1018             }
1019             else
1020             {
1021                 Node * const prev   = node->prev;
1022                 Node * const next   = node->next;
1023                 Node * const parent = node->parent;
1024 
1025                 DiscardContainer( doc, node, pnode );
1026 
1027                 node = TY_(InferredTag)(doc, TidyTag_BR);
1028                 if (next)
1029                     TY_(InsertNodeBeforeElement)(next, node);
1030                 else if (prev)
1031                     TY_(InsertNodeAfterElement)(prev, node);
1032                 else
1033                     TY_(InsertNodeAtStart)(parent, node);
1034             }
1035 
1036             return yes;
1037         }
1038 
1039         RenameElem( doc, node, TidyTag_DIV );
1040         TY_(AddStyleProperty)( doc, node, "text-align: center" );
1041         return yes;
1042     }
1043 
1044     return no;
1045 }
1046 
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048    Unique attributes (such as ID) disable the action.
1049    Attributes style and class are not dealt with. A call to MergeStyles
1050    will do that.
1051 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1053 {
1054     AttVal *av1, *av2;
1055     TidyAttrId id;
1056 
1057     /* Detect attributes that cannot be merged or overwritten. */
1058     if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1059         && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1060         return no;
1061 
1062     /* Move child attributes to node. Attributes in node
1063      can be overwritten or merged. */
1064     for (av2 = child->attributes; av2; )
1065     {
1066         /* Dealt by MergeStyles. */
1067         if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1068         {
1069             av2 = av2->next;
1070             continue;
1071         }
1072         /* Avoid duplicates in node */
1073         if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074             && (av1=TY_(AttrGetById)(node, id))!= NULL)
1075             TY_(RemoveAttribute)( doc, node, av1 );
1076 
1077         /* Move attribute from child to node */
1078         TY_(DetachAttribute)( child, av2 );
1079         av1 = av2;
1080         av2 = av2->next;
1081         av1->next = NULL;
1082         TY_(InsertAttributeAtEnd)( node, av1 );
1083     }
1084 
1085     return yes;
1086 }
1087 
1088 /*
1089     Symptom <XX><XX>...</XX></XX>
1090     Action: merge the two XXs
1091 
1092   For instance, this is useful after nested <dir>s used by Word
1093   for indenting have been converted to <div>s
1094 
1095   If state is "no", no merging.
1096   If state is "yes", inner element is discarded. Only Style and Class
1097   attributes are merged using MergeStyles().
1098   If state is "auto", atttibutes are merged as described in CopyAttrs().
1099   Style and Class attributes are merged using MergeStyles().
1100 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102                                  TidyTagId Id, TidyTriState state, Node *node,
1103                                  Node **ARG_UNUSED(pnode))
1104 {
1105     Node *child;
1106 
1107     if ( state == TidyNoState
1108          || !TagIsId(node, Id) )
1109         return no;
1110 
1111     child = node->content;
1112 
1113     if ( child == NULL
1114          || child->next != NULL
1115          || !TagIsId(child, Id) )
1116         return no;
1117 
1118     if ( state == TidyAutoState && !CopyAttrs(doc, node, child) )
1119         return no;
1120 
1121     MergeStyles( doc, node, child );
1122     StripOnlyChild( doc, node );
1123     return yes;
1124 }
1125 
1126 /*
1127     Symptom: <ul><li><ul>...</ul></li></ul>
1128     Action: discard outer list
1129 */
1130 
NestedList(TidyDocImpl * doc,Node * node,Node ** pnode)1131 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1132 {
1133     Node *list;
1134 
1135     if ( nodeIsUL(node) || nodeIsOL(node) )
1136     {
1137         Node *child = node->content;
1138 
1139         if (child == NULL)
1140             return no;
1141 
1142         /* check child has no peers */
1143 
1144         if (child->next)
1145             return no;
1146 
1147         list = child->content;
1148 
1149         if (!list)
1150             return no;
1151 
1152         if (list->tag != node->tag)
1153             return no;
1154 
1155         /* check list has no peers */
1156         if (list->next)
1157             return no;
1158 
1159         *pnode = list;  /* Set node to resume iteration */
1160 
1161         /* move inner list node into position of outer node */
1162         list->prev = node->prev;
1163         list->next = node->next;
1164         list->parent = node->parent;
1165         TY_(FixNodeLinks)(list);
1166 
1167         /* get rid of outer ul and its li */
1168         child->content = NULL;
1169         TY_(FreeNode)( doc, child ); /* See test #427841. */
1170         child = NULL;
1171         node->content = NULL;
1172         node->next = NULL;
1173         TY_(FreeNode)( doc, node );
1174         node = NULL;
1175 
1176         /*
1177           If prev node was a list the chances are this node
1178           should be appended to that list. Word has no way of
1179           recognizing nested lists and just uses indents
1180         */
1181 
1182         if (list->prev)
1183         {
1184             if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1185                  && list->prev->last )
1186             {
1187                 node = list;
1188                 list = node->prev;
1189 
1190                 child = list->last;  /* <li> */
1191 
1192                 list->next = node->next;
1193                 TY_(FixNodeLinks)(list);
1194 
1195                 node->parent = child;
1196                 node->next = NULL;
1197                 node->prev = child->last;
1198                 TY_(FixNodeLinks)(node);
1199                 CleanNode( doc, node );
1200             }
1201         }
1202 
1203         return yes;
1204     }
1205 
1206     return no;
1207 }
1208 
1209 /* Find CSS equivalent in a SPAN element */
1210 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1211 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1212 {
1213     struct
1214     {
1215         TidyTagId id;
1216         ctmbstr CSSeq;
1217         Bool deprecated;
1218     }
1219     const CSS_SpanEq[] =
1220         {
1221             { TidyTag_B, "font-weight: bold", no },
1222             { TidyTag_I, "font-style: italic", no },
1223             { TidyTag_S, "text-decoration: line-through", yes},
1224             { TidyTag_STRIKE, "text-decoration: line-through", yes},
1225             { TidyTag_U, "text-decoration: underline", yes},
1226             { TidyTag_UNKNOWN, NULL, no }
1227         };
1228     uint i;
1229 
1230     for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1231         if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1232              && TagIsId(node, CSS_SpanEq[i].id) )
1233         {
1234             *s = CSS_SpanEq[i].CSSeq;
1235             return yes;
1236         }
1237     return no;
1238 }
1239 
1240 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1241 static Bool CanApplyBlockStyle( Node *node )
1242 {
1243     if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1244         && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1245     {
1246         return yes;
1247     }
1248     return no;
1249 }
1250 
1251 /*
1252   Symptom: the only child of a block-level element is a
1253   presentation element such as B, I or FONT
1254 
1255   Action: add style "font-weight: bold" to the block and
1256   strip the <b> element, leaving its children.
1257 
1258   example:
1259 
1260     <p>
1261       <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1262     </p>
1263 
1264   becomes:
1265 
1266       <p style="font-weight: bold; font-family: Arial; font-size: 6">
1267         Draft Recommended Practice
1268       </p>
1269 
1270   This code also replaces the align attribute by a style attribute.
1271   However, to avoid CSS problems with Navigator 4, this isn't done
1272   for the elements: caption, tr and table
1273 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1274 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1275 {
1276     Node *child;
1277     ctmbstr CSSeq;
1278 
1279     /* check for bgcolor */
1280     if (   nodeIsTABLE(node)
1281         || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1282         TableBgColor( doc, node );
1283 
1284     if (CanApplyBlockStyle(node))
1285     {
1286         /* check for align attribute */
1287         if ( !nodeIsCAPTION(node) )
1288             TextAlign( doc, node );
1289 
1290         child = node->content;
1291         if (child == NULL)
1292             return no;
1293 
1294         /* check child has no peers */
1295         if (child->next)
1296             return no;
1297 
1298         if ( FindCSSSpanEq(child, &CSSeq, no) )
1299         {
1300             MergeStyles( doc, node, child );
1301             TY_(AddStyleProperty)( doc, node, CSSeq );
1302             StripOnlyChild( doc, node );
1303             return yes;
1304         }
1305         else if ( nodeIsFONT(child) )
1306         {
1307             MergeStyles( doc, node, child );
1308             AddFontStyles( doc, node, child->attributes );
1309             StripOnlyChild( doc, node );
1310             return yes;
1311         }
1312     }
1313 
1314     return no;
1315 }
1316 
1317 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1318 static Bool CanApplyInlineStyle( Node *node )
1319 {
1320     return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1321 }
1322 
1323 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1324 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1325 {
1326     Node *child;
1327     ctmbstr CSSeq;
1328 
1329     if ( CanApplyInlineStyle(node) )
1330     {
1331         child = node->content;
1332 
1333         if (child == NULL)
1334             return no;
1335 
1336         /* check child has no peers */
1337 
1338         if (child->next)
1339             return no;
1340 
1341         if ( FindCSSSpanEq(child, &CSSeq, no) )
1342         {
1343             MergeStyles( doc, node, child );
1344             TY_(AddStyleProperty)( doc, node, CSSeq );
1345             StripOnlyChild( doc, node );
1346             return yes;
1347         }
1348         else if ( nodeIsFONT(child) )
1349         {
1350             MergeStyles( doc, node, child );
1351             AddFontStyles( doc, node, child->attributes );
1352             StripOnlyChild( doc, node );
1353             return yes;
1354         }
1355     }
1356 
1357     return no;
1358 }
1359 
1360 /*
1361     Transform element to equivalent CSS
1362 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1363 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1364                                 Node **ARG_UNUSED(pnode)  )
1365 {
1366     ctmbstr CSSeq;
1367 
1368     /* if node is the only child of parent element then leave alone
1369           Do so only if BlockStyle may be succesful. */
1370     if ( node->parent->content == node && node->next == NULL &&
1371          (CanApplyBlockStyle(node->parent)
1372           || CanApplyInlineStyle(node->parent)) )
1373         return no;
1374 
1375     if ( FindCSSSpanEq(node, &CSSeq, yes) )
1376     {
1377         RenameElem( doc, node, TidyTag_SPAN );
1378         TY_(AddStyleProperty)( doc, node, CSSeq );
1379         return yes;
1380     }
1381     return no;
1382 }
1383 
1384 /*
1385   Replace font elements by span elements, deleting
1386   the font element's attributes and replacing them
1387   by a single style attribute.
1388 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1389 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1390 {
1391     AttVal *av, *style, *next;
1392 
1393     if ( nodeIsFONT(node) )
1394     {
1395         if ( cfgBool(doc, TidyDropFontTags) )
1396         {
1397             DiscardContainer( doc, node, pnode );
1398             return yes;
1399         }
1400 
1401         /* if node is the only child of parent element then leave alone
1402           Do so only if BlockStyle may be succesful. */
1403         if ( node->parent->content == node && node->next == NULL &&
1404              CanApplyBlockStyle(node->parent) )
1405             return no;
1406 
1407         AddFontStyles( doc, node, node->attributes );
1408 
1409         /* extract style attribute and free the rest */
1410         av = node->attributes;
1411         style = NULL;
1412 
1413         while (av)
1414         {
1415             next = av->next;
1416 
1417             if (attrIsSTYLE(av))
1418             {
1419                 av->next = NULL;
1420                 style = av;
1421             }
1422             else
1423             {
1424                 TY_(FreeAttribute)( doc, av );
1425             }
1426             av = next;
1427         }
1428 
1429         node->attributes = style;
1430         RenameElem( doc, node, TidyTag_SPAN );
1431         return yes;
1432     }
1433 
1434     return no;
1435 }
1436 
1437 /*
1438   Applies all matching rules to a node.
1439 */
CleanNode(TidyDocImpl * doc,Node * node)1440 Node* CleanNode( TidyDocImpl* doc, Node *node )
1441 {
1442     Node *next = NULL;
1443     TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1444     TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1445 
1446     for (next = node; TY_(nodeIsElement)(node); node = next)
1447     {
1448         if ( Dir2Div(doc, node, &next) )
1449             continue;
1450 
1451         /* Special case: true result means
1452         ** that arg node and its parent no longer exist.
1453         ** So we must jump back up the CreateStyleProperties()
1454         ** call stack until we have a valid node reference.
1455         */
1456         if ( NestedList(doc, node, &next) )
1457             return next;
1458 
1459         if ( Center2Div(doc, node, &next) )
1460             continue;
1461 
1462         if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1463             continue;
1464 
1465         if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1466             continue;
1467 
1468         if ( BlockStyle(doc, node, &next) )
1469             continue;
1470 
1471         if ( InlineStyle(doc, node, &next) )
1472             continue;
1473 
1474         if ( InlineElementToCSS(doc, node, &next) )
1475             continue;
1476 
1477         if ( Font2Span(doc, node, &next) )
1478             continue;
1479 
1480         break;
1481     }
1482 
1483     return next;
1484 }
1485 
1486 /* Special case: if the current node is destroyed by
1487 ** CleanNode() lower in the tree, this node and its parent
1488 ** no longer exist.  So we must jump back up the CleanTree()
1489 ** call stack until we have a valid node reference.
1490 */
1491 
CleanTree(TidyDocImpl * doc,Node * node)1492 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1493 {
1494     if (node->content)
1495     {
1496         Node *child;
1497         for (child = node->content; child != NULL; child = child->next)
1498         {
1499             child = CleanTree( doc, child );
1500             if ( !child )
1501                 break;
1502         }
1503     }
1504 
1505     return CleanNode( doc, node );
1506 }
1507 
/* Post-order walk over the subtree rooted at node: Style2Rule() is
** applied to every descendant first and to node itself last.
*/
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while ( kid != NULL )
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1523 
TY_(CleanDocument)1524 void TY_(CleanDocument)( TidyDocImpl* doc )
1525 {
1526     /* placeholder.  CleanTree()/CleanNode() will not
1527     ** zap root element
1528     */
1529     CleanTree( doc, &doc->root );
1530 
1531     if ( cfgBool(doc, TidyMakeClean) )
1532     {
1533         DefineStyleRules( doc, &doc->root );
1534         CreateStyleElement( doc );
1535     }
1536 }
1537 
1538 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1539 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1540 {
1541     Node *next;
1542 
1543     while (node)
1544     {
1545         next = node->next;
1546 
1547         if ( (nodeIsB(node) || nodeIsI(node))
1548              && node->parent && node->parent->tag == node->tag)
1549         {
1550             /* strip redundant inner element */
1551             DiscardContainer( doc, node, &next );
1552             node = next;
1553             continue;
1554         }
1555 
1556         if ( node->content )
1557             TY_(NestedEmphasis)( doc, node->content );
1558 
1559         node = next;
1560     }
1561 }
1562 
1563 
1564 
1565 /* replace i by em and b by strong */
TY_(EmFromI)1566 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1567 {
1568     while (node)
1569     {
1570         if ( nodeIsI(node) )
1571             RenameElem( doc, node, TidyTag_EM );
1572         else if ( nodeIsB(node) )
1573             RenameElem( doc, node, TidyTag_STRONG );
1574 
1575         if ( node->content )
1576             TY_(EmFromI)( doc, node->content );
1577 
1578         node = node->next;
1579     }
1580 }
1581 
HasOneChild(Node * node)1582 static Bool HasOneChild(Node *node)
1583 {
1584     return (node->content && node->content->next == NULL);
1585 }
1586 
1587 /*
1588  Some people use dir or ul without an li
1589  to indent the content. The pattern to
1590  look for is a list with a single implicit
1591  li. This is recursively replaced by an
1592  implicit blockquote.
1593 */
TY_(List2BQ)1594 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1595 {
1596     while (node)
1597     {
1598         if (node->content)
1599             TY_(List2BQ)( doc, node->content );
1600 
1601         if ( node->tag && node->tag->parser == TY_(ParseList) &&
1602              HasOneChild(node) && node->content->implicit )
1603         {
1604             StripOnlyChild( doc, node );
1605             RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1606             node->implicit = yes;
1607         }
1608 
1609         node = node->next;
1610     }
1611 }
1612 
1613 
1614 /*
1615  Replace implicit blockquote by div with an indent
1616  taking care to reduce nested blockquotes to a single
1617  div with the indent set to match the nesting depth
1618 */
TY_(BQ2Div)1619 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1620 {
1621     tmbchar indent_buf[ 32 ];
1622     uint indent;
1623 
1624     while (node)
1625     {
1626         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1627         {
1628             indent = 1;
1629 
1630             while( HasOneChild(node) &&
1631                    nodeIsBLOCKQUOTE(node->content) &&
1632                    node->implicit)
1633             {
1634                 ++indent;
1635                 StripOnlyChild( doc, node );
1636             }
1637 
1638             if (node->content)
1639                 TY_(BQ2Div)( doc, node->content );
1640 
1641             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1642                              2*indent);
1643 
1644             RenameElem( doc, node, TidyTag_DIV );
1645             TY_(AddStyleProperty)(doc, node, indent_buf );
1646         }
1647         else if (node->content)
1648             TY_(BQ2Div)( doc, node->content );
1649 
1650         node = node->next;
1651     }
1652 }
1653 
1654 
/* Walk from node towards the root and return the first TD element met
** (node itself included), or NULL when there is none. Only TD is
** tested for.
*/
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cur = node;

    while ( cur != NULL && !nodeIsTD(cur) )
        cur = cur->parent;

    return cur;
}
1666 
1667 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1668 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1669 {
1670     Lexer* lexer = doc->lexer;
1671 
1672     for (;;)
1673     {
1674         ctmbstr lexbuf = lexer->lexbuf + node->start;
1675         if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1676         {
1677           Node* cell = FindEnclosingCell( doc, node );
1678           if ( cell )
1679           {
1680             /* Need to put &nbsp; into cell so it doesn't look weird
1681             */
1682             Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1683             assert( (byte)'\240' == (byte)160 );
1684             TY_(InsertNodeBeforeElement)( node, nbsp );
1685           }
1686         }
1687 
1688         /* discard node and returns next, unless it is a text node */
1689         if ( node->type == TextNode )
1690             node = node->next;
1691         else
1692             node = TY_(DiscardElement)( doc, node );
1693 
1694         if (node == NULL)
1695             return NULL;
1696 
1697         if (node->type == SectionTag)
1698         {
1699             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1700             {
1701                 node = PruneSection( doc, node );
1702                 continue;
1703             }
1704 
1705             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1706             {
1707                 node = TY_(DiscardElement)( doc, node );
1708                 break;
1709             }
1710         }
1711     }
1712 
1713     return node;
1714 }
1715 
TY_(DropSections)1716 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1717 {
1718     Lexer* lexer = doc->lexer;
1719     while (node)
1720     {
1721         if (node->type == SectionTag)
1722         {
1723             /* prune up to matching endif */
1724             if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1725                 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1726             {
1727                 node = PruneSection( doc, node );
1728                 continue;
1729             }
1730 
1731             /* discard others as well */
1732             node = TY_(DiscardElement)( doc, node );
1733             continue;
1734         }
1735 
1736         if (node->content)
1737             TY_(DropSections)( doc, node->content );
1738 
1739         node = node->next;
1740     }
1741 }
1742 
/* Remove Word 2000 specific attributes from node.

   Kept:    class attributes whose value is "Code" or does not start
            with "Mso" (a class attribute without a value is kept too).
   Removed: remaining class attributes, style, lang, height/width on
            TD, TR and TH, and attributes whose name starts with "x:".
   All other attributes are left in place.
*/
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    AttVal *attr, *next, *prev = NULL;

    for ( attr = node->attributes; attr; attr = next )
    {
        next = attr->next;

        /* special check for class="Code" denoting pre text */
        /* Pass thru user defined styles as HTML class names */
        if (attrIsCLASS(attr))
        {
            /* attr->value is checked before the prefix comparison, in
            ** the same way attr->attribute is checked below.
            ** NOTE(review): whether tmbstrncmp tolerates a NULL argument
            ** is not visible here; the guard keeps a valueless class
            ** attribute without relying on that.
            */
            if (AttrValueIs(attr, "Code") ||
                 attr->value == NULL ||
                 TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
            {
                prev = attr;
                continue;
            }
        }

        if (attrIsCLASS(attr) ||
            attrIsSTYLE(attr) ||
            attrIsLANG(attr)  ||
             ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
               (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
             (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
        {
            /* unlink from the singly linked list, then free */
            if (prev)
                prev->next = next;
            else
                node->attributes = next;

            TY_(FreeAttribute)( doc, attr );
        }
        else
            prev = attr;
    }
}
1781 
1782 /* Word2000 uses span excessively, so we strip span out */
StripSpan(TidyDocImpl * doc,Node * span)1783 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1784 {
1785     Node *node, *prev = NULL, *content;
1786 
1787     /*
1788      deal with span elements that have content
1789      by splicing the content in place of the span
1790      after having processed it
1791     */
1792 
1793     TY_(CleanWord2000)( doc, span->content );
1794     content = span->content;
1795 
1796     if (span->prev)
1797         prev = span->prev;
1798     else if (content)
1799     {
1800         node = content;
1801         content = content->next;
1802         TY_(RemoveNode)(node);
1803         TY_(InsertNodeBeforeElement)(span, node);
1804         prev = node;
1805     }
1806 
1807     while (content)
1808     {
1809         node = content;
1810         content = content->next;
1811         TY_(RemoveNode)(node);
1812         TY_(InsertNodeAfterElement)(prev, node);
1813         prev = node;
1814     }
1815 
1816     if (span->next == NULL)
1817         span->parent->last = prev;
1818 
1819     node = span->next;
1820     span->content = NULL;
1821     TY_(DiscardElement)( doc, span );
1822     return node;
1823 }
1824 
1825 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1826 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1827 {
1828     while ( node )
1829     {
1830         if ( node->content )
1831             TY_(NormalizeSpaces)( lexer, node->content );
1832 
1833         if (TY_(nodeIsText)(node))
1834         {
1835             uint i, c;
1836             tmbstr p = lexer->lexbuf + node->start;
1837 
1838             for (i = node->start; i < node->end; ++i)
1839             {
1840                 c = (byte) lexer->lexbuf[i];
1841 
1842                 /* look for UTF-8 multibyte character */
1843                 if ( c > 0x7F )
1844                     i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1845 
1846                 if ( c == 160 )
1847                     c = ' ';
1848 
1849                 p = TY_(PutUTF8)(p, c);
1850             }
1851             node->end = p - lexer->lexbuf;
1852         }
1853 
1854         node = node->next;
1855     }
1856 }
1857 
1858 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1859 static Bool NoMargins(Node *node)
1860 {
1861     AttVal * const attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1862 
1863     if ( !AttrHasValue(attval) )
1864         return no;
1865 
1866     /* search for substring "margin-top: 0" */
1867     if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1868         return no;
1869 
1870     /* search for substring "margin-bottom: 0" */
1871     if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1872         return no;
1873 
1874     return yes;
1875 }
1876 
1877 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1878 static Bool SingleSpace( Lexer* lexer, Node* node )
1879 {
1880     if ( node->content )
1881     {
1882         node = node->content;
1883 
1884         if ( node->next != NULL )
1885             return no;
1886 
1887         if ( node->type != TextNode )
1888             return no;
1889 
1890         if ( (node->end - node->start) == 1 &&
1891              lexer->lexbuf[node->start] == ' ' )
1892             return yes;
1893 
1894         if ( (node->end - node->start) == 2 )
1895         {
1896             uint c = 0;
1897             TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1898             if ( c == 160 )
1899                 return yes;
1900         }
1901     }
1902 
1903     return no;
1904 }
1905 
1906 /*
1907  This is a major clean up to strip out all the extra stuff you get
1908  when you save as web page from Word 2000. It doesn't yet know what
1909  to do with VML tags, but these will appear as errors unless you
1910  declare them as new tags, such as o:p which needs to be declared
1911  as inline.
1912 */
TY_(CleanWord2000)1913 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1914 {
1915     /* used to a list from a sequence of bulletted p's */
1916     Lexer* lexer = doc->lexer;
1917     Node* list = NULL;
1918 
1919     while ( node )
1920     {
1921         /* get rid of Word's xmlns attributes */
1922         if ( nodeIsHTML(node) )
1923         {
1924             /* check that it's a Word 2000 document */
1925             if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1926                  !cfgBool(doc, TidyMakeBare) )
1927                 return;
1928 
1929             TY_(FreeAttrs)( doc, node );
1930         }
1931 
1932         /* fix up preformatted sections by looking for a
1933         ** sequence of paragraphs with zero top/bottom margin
1934         */
1935         if ( nodeIsP(node) )
1936         {
1937             if (NoMargins(node))
1938             {
1939                 Node *pre, *next;
1940                 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1941 
1942                 PurgeWord2000Attributes( doc, node );
1943 
1944                 if (node->content)
1945                     TY_(CleanWord2000)( doc, node->content );
1946 
1947                 pre = node;
1948                 node = node->next;
1949 
1950                 /* continue to strip p's */
1951 
1952                 while ( nodeIsP(node) && NoMargins(node) )
1953                 {
1954                     next = node->next;
1955                     TY_(RemoveNode)(node);
1956                     TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1957                     TY_(InsertNodeAtEnd)(pre, node);
1958                     StripSpan( doc, node );
1959                     node = next;
1960                 }
1961 
1962                 if (node == NULL)
1963                     break;
1964             }
1965         }
1966 
1967         if (node->tag && (node->tag->model & CM_BLOCK)
1968             && SingleSpace(lexer, node))
1969         {
1970             node = StripSpan( doc, node );
1971             continue;
1972         }
1973         /* discard Word's style verbiage */
1974         if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1975              node->type == CommentTag )
1976         {
1977             node = TY_(DiscardElement)( doc, node );
1978             continue;
1979         }
1980 
1981         /* strip out all span and font tags Word scatters so liberally! */
1982         if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1983         {
1984             node = StripSpan( doc, node );
1985             continue;
1986         }
1987 
1988         if ( nodeIsLINK(node) )
1989         {
1990             AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1991 
1992             if (AttrValueIs(attr, "File-List"))
1993             {
1994                 node = TY_(DiscardElement)( doc, node );
1995                 continue;
1996             }
1997         }
1998 
1999         /* discards <o:p> which encodes the paragraph mark */
2000         if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2001         {
2002             Node* next;
2003             DiscardContainer( doc, node, &next );
2004             node = next;
2005             continue;
2006         }
2007 
2008         /* discard empty paragraphs */
2009 
2010         if ( node->content == NULL && nodeIsP(node) )
2011         {
2012             /*  Use the existing function to ensure consistency */
2013             Node *next = TY_(TrimEmptyElement)( doc, node );
2014             node = next;
2015             continue;
2016         }
2017 
2018         if ( nodeIsP(node) )
2019         {
2020             AttVal *attr, *atrStyle;
2021 
2022             attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2023             atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2024             /*
2025                (JES) Sometimes Word marks a list item with the following hokie syntax
2026                <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2027                 translate these into <li>
2028             */
2029             /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2030             /* map <p class="MsoListNumber"> to <ol>...</ol> */
2031             if ( AttrValueIs(attr, "MsoListBullet") ||
2032                  AttrValueIs(attr, "MsoListNumber") ||
2033                  AttrContains(atrStyle, "mso-list:") )
2034             {
2035                 TidyTagId listType = TidyTag_UL;
2036                 if (AttrValueIs(attr, "MsoListNumber"))
2037                     listType = TidyTag_OL;
2038 
2039                 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2040 
2041                 if ( !list || TagId(list) != listType )
2042                 {
2043                     const Dict* tag = TY_(LookupTagDef)( listType );
2044                     list = TY_(InferredTag)(doc, tag->id);
2045                     TY_(InsertNodeBeforeElement)(node, list);
2046                 }
2047 
2048                 PurgeWord2000Attributes( doc, node );
2049 
2050                 if ( node->content )
2051                     TY_(CleanWord2000)( doc, node->content );
2052 
2053                 /* remove node and append to contents of list */
2054                 TY_(RemoveNode)(node);
2055                 TY_(InsertNodeAtEnd)(list, node);
2056                 node = list;
2057             }
2058             /* map sequence of <p class="Code"> to <pre>...</pre> */
2059             else if (AttrValueIs(attr, "Code"))
2060             {
2061                 Node *br = TY_(NewLineNode)(lexer);
2062                 TY_(NormalizeSpaces)(lexer, node->content);
2063 
2064                 if ( !list || TagId(list) != TidyTag_PRE )
2065                 {
2066                     list = TY_(InferredTag)(doc, TidyTag_PRE);
2067                     TY_(InsertNodeBeforeElement)(node, list);
2068                 }
2069 
2070                 /* remove node and append to contents of list */
2071                 TY_(RemoveNode)(node);
2072                 TY_(InsertNodeAtEnd)(list, node);
2073                 StripSpan( doc, node );
2074                 TY_(InsertNodeAtEnd)(list, br);
2075                 node = list->next;
2076             }
2077             else
2078                 list = NULL;
2079         }
2080         else
2081             list = NULL;
2082 
2083         if (!node)
2084             return;
2085 
2086         /* strip out style and class attributes */
2087         if (TY_(nodeIsElement)(node))
2088             PurgeWord2000Attributes( doc, node );
2089 
2090         if (node->content)
2091             TY_(CleanWord2000)( doc, node->content );
2092 
2093         node = node->next;
2094     }
2095 }
2096 
TY_(IsWord2000)2097 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2098 {
2099     AttVal *attval;
2100     Node *node, *head;
2101     Node *html = TY_(FindHTML)( doc );
2102 
2103     if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2104         return yes;
2105 
2106     /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2107     head = TY_(FindHEAD)( doc );
2108 
2109     if (head)
2110     {
2111         for (node = head->content; node; node = node->next)
2112         {
2113             if ( !nodeIsMETA(node) )
2114                 continue;
2115 
2116             attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2117 
2118             if ( !AttrValueIs(attval, "generator") )
2119                 continue;
2120 
2121             attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
2122 
2123             if ( AttrContains(attval, "Microsoft") )
2124                 return yes;
2125         }
2126     }
2127 
2128     return no;
2129 }
2130 
2131 /* where appropriate move object elements from head to body */
TY_(BumpObject)2132 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2133 {
2134     Node *node, *next, *head = NULL, *body = NULL;
2135 
2136     if (!html)
2137         return;
2138 
2139     for ( node = html->content; node != NULL; node = node->next )
2140     {
2141         if ( nodeIsHEAD(node) )
2142             head = node;
2143 
2144         if ( nodeIsBODY(node) )
2145             body = node;
2146     }
2147 
2148     if ( head != NULL && body != NULL )
2149     {
2150         for (node = head->content; node != NULL; node = next)
2151         {
2152             next = node->next;
2153 
2154             if ( nodeIsOBJECT(node) )
2155             {
2156                 Node *child;
2157                 Bool bump = no;
2158 
2159                 for (child = node->content; child != NULL; child = child->next)
2160                 {
2161                     /* bump to body unless content is param */
2162                     if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2163                          || !nodeIsPARAM(child) )
2164                     {
2165                             bump = yes;
2166                             break;
2167                     }
2168                 }
2169 
2170                 if ( bump )
2171                 {
2172                     TY_(RemoveNode)( node );
2173                     TY_(InsertNodeAtStart)( body, node );
2174                 }
2175             }
2176         }
2177     }
2178 }
2179 
TY_(VerifyHTTPEquiv)2180 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2181 {
2182     Node *pNode;
2183     StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2184     tmbstr s, pszBegin, pszEnd;
2185     ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2186 
2187     if (!enc)
2188         return;
2189 
2190     if (!nodeIsHEAD(head))
2191         head = TY_(FindHEAD)(doc);
2192 
2193     if (!head)
2194         return;
2195 
2196     /* Find any <meta http-equiv='Content-Type' content='...' /> */
2197     for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2198     {
2199         AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2200         AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2201 
2202         if ( !nodeIsMETA(pNode) || !metaContent ||
2203              !AttrValueIs(httpEquiv, "Content-Type") )
2204             continue;
2205 
2206         pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2207         while (pszBegin && *pszBegin)
2208         {
2209             while (isspace( *pszBegin ))
2210                 pszBegin++;
2211             pszEnd = pszBegin;
2212             while ('\0' != *pszEnd && ';' != *pszEnd)
2213                 pszEnd++;
2214             if (';' == *pszEnd )
2215                 *(pszEnd++) = '\0';
2216             if (pszEnd > pszBegin)
2217             {
2218                 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2219                 prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2220                 prop->value = NULL;
2221                 prop->next = NULL;
2222 
2223                 if (NULL != pLastProp)
2224                     pLastProp->next = prop;
2225                 else
2226                     pFirstProp = prop;
2227 
2228                 pLastProp = prop;
2229                 pszBegin = pszEnd;
2230             }
2231         }
2232         TidyDocFree( doc, s );
2233 
2234         /*  find the charset property */
2235         for (prop = pFirstProp; NULL != prop; prop = prop->next)
2236         {
2237             if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2238                 continue;
2239 
2240             TidyDocFree( doc, prop->name );
2241             prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2242             TY_(tmbstrcpy)(prop->name, "charset=");
2243             TY_(tmbstrcpy)(prop->name+8, enc);
2244             s = CreatePropString( doc, pFirstProp );
2245             TidyDocFree( doc, metaContent->value );
2246             metaContent->value = s;
2247             break;
2248         }
2249         /* #718127, prevent memory leakage */
2250         FreeStyleProps(doc, pFirstProp);
2251         pFirstProp = NULL;
2252         pLastProp = NULL;
2253     }
2254 }
2255 
TY_(DropComments)2256 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2257 {
2258     Node* next;
2259 
2260     while (node)
2261     {
2262         next = node->next;
2263 
2264         if (node->type == CommentTag)
2265         {
2266             TY_(RemoveNode)(node);
2267             TY_(FreeNode)(doc, node);
2268             node = next;
2269             continue;
2270         }
2271 
2272         if (node->content)
2273             TY_(DropComments)(doc, node->content);
2274 
2275         node = next;
2276     }
2277 }
2278 
TY_(DropFontElements)2279 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2280 {
2281     Node* next;
2282 
2283     while (node)
2284     {
2285         next = node->next;
2286 
2287         if (nodeIsFONT(node))
2288         {
2289             DiscardContainer(doc, node, &next);
2290             node = next;
2291             continue;
2292         }
2293 
2294         if (node->content)
2295             TY_(DropFontElements)(doc, node->content, &next);
2296 
2297         node = next;
2298     }
2299 }
2300 
TY_(WbrToSpace)2301 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2302 {
2303     Node* next;
2304 
2305     while (node)
2306     {
2307         next = node->next;
2308 
2309         if (nodeIsWBR(node))
2310         {
2311             Node* text;
2312             text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2313             TY_(InsertNodeAfterElement)(node, text);
2314             TY_(RemoveNode)(node);
2315             TY_(FreeNode)(doc, node);
2316             node = next;
2317             continue;
2318         }
2319 
2320         if (node->content)
2321             TY_(WbrToSpace)(doc, node->content);
2322 
2323         node = next;
2324    }
2325 }
2326 
2327 /*
2328   Filters from Word and PowerPoint often use smart
2329   quotes resulting in character codes between 128
2330   and 159. Unfortunately, the corresponding HTML 4.0
2331   entities for these are not widely supported. The
2332   following converts dashes and quotation marks to
2333   the nearest ASCII equivalent. My thanks to
2334   Andrzej Novosiolov for his help with this code.
2335 
2336   Note: The old code in the pretty printer applied
2337   this to all node types and attribute values while
2338   this routine applies it only to text nodes. First,
2339   Microsoft Office products rarely put the relevant
2340   characters into these tokens, second support for
2341   them is much better now and last but not least, it
2342   can be harmful to replace these characters since
2343   US-ASCII quote marks are often used as syntax
2344   characters, a simple
2345 
2346     <a onmouseover="alert('&#x2018;')">...</a>
2347 
2348   would be broken if the U+2018 is replaced by "'".
  Nor did the old code check whether the quote mark
  was already in use as the attribute delimiter:
2351 
2352     <p title='&#x2018;'>...</p>
2353 
2354   got
2355 
2356     <p title='''>...</p>
2357 
2358   Since browser support is much better nowadays and
2359   high-quality typography is better than ASCII it'd
2360   be probably a good idea to drop the feature...
2361 */
TY_(DowngradeTypography)2362 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2363 {
2364     Node* next;
2365     Lexer* lexer = doc->lexer;
2366 
2367     while (node)
2368     {
2369         next = node->next;
2370 
2371         if (TY_(nodeIsText)(node))
2372         {
2373             uint i, c;
2374             tmbstr p = lexer->lexbuf + node->start;
2375 
2376             for (i = node->start; i < node->end; ++i)
2377             {
2378                 c = (unsigned char) lexer->lexbuf[i];
2379 
2380                 if (c > 0x7F)
2381                     i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2382 
2383                 if (c >= 0x2013 && c <= 0x201E)
2384                 {
2385                     switch (c)
2386                     {
2387                     case 0x2013: /* en dash */
2388                     case 0x2014: /* em dash */
2389                         c = '-';
2390                         break;
2391                     case 0x2018: /* left single  quotation mark */
2392                     case 0x2019: /* right single quotation mark */
2393                     case 0x201A: /* single low-9 quotation mark */
2394                         c = '\'';
2395                         break;
2396                     case 0x201C: /* left double  quotation mark */
2397                     case 0x201D: /* right double quotation mark */
2398                     case 0x201E: /* double low-9 quotation mark */
2399                         c = '"';
2400                         break;
2401                     }
2402                 }
2403 
2404                 p = TY_(PutUTF8)(p, c);
2405             }
2406 
2407             node->end = p - lexer->lexbuf;
2408         }
2409 
2410         if (node->content)
2411             TY_(DowngradeTypography)(doc, node->content);
2412 
2413         node = next;
2414     }
2415 }
2416 
TY_(ReplacePreformattedSpaces)2417 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2418 {
2419     Node* next;
2420 
2421     while (node)
2422     {
2423         next = node->next;
2424 
2425         if (node->tag && node->tag->parser == TY_(ParsePre))
2426         {
2427             TY_(NormalizeSpaces)(doc->lexer, node->content);
2428             node = next;
2429             continue;
2430         }
2431 
2432         if (node->content)
2433             TY_(ReplacePreformattedSpaces)(doc, node->content);
2434 
2435         node = next;
2436     }
2437 }
2438 
TY_(ConvertCDATANodes)2439 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2440 {
2441     Node* next;
2442 
2443     while (node)
2444     {
2445         next = node->next;
2446 
2447         if (node->type == CDATATag)
2448             node->type = TextNode;
2449 
2450         if (node->content)
2451             TY_(ConvertCDATANodes)(doc, node->content);
2452 
2453         node = next;
2454     }
2455 }
2456 
2457 /*
2458   FixLanguageInformation ensures that the document contains (only)
2459   the attributes for language information desired by the output
2460   document type. For example, for XHTML 1.0 documents both
2461   'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2462   is desired and for HTML 4.01 only 'lang' is desired.
2463 */
TY_(FixLanguageInformation)2464 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2465 {
2466     Node* next;
2467 
2468     while (node)
2469     {
2470         next = node->next;
2471 
2472         /* todo: report modifications made here to the report system */
2473 
2474         if (TY_(nodeIsElement)(node))
2475         {
2476             AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2477             AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2478 
2479             if (lang && xmlLang)
2480             {
2481                 /*
2482                   todo: check whether both attributes are in sync,
2483                   here or elsewhere, where elsewhere is probably
2484                   preferable.
2485                   AD - March 2005: not mandatory according the standards.
2486                 */
2487             }
2488             else if (lang && wantXmlLang)
2489             {
2490                 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2491                     & doc->lexer->versionEmitted)
2492                     TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2493             }
2494             else if (xmlLang && wantLang)
2495             {
2496                 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2497                     & doc->lexer->versionEmitted)
2498                     TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2499             }
2500 
2501             if (lang && !wantLang)
2502                 TY_(RemoveAttribute)(doc, node, lang);
2503 
2504             if (xmlLang && !wantXmlLang)
2505                 TY_(RemoveAttribute)(doc, node, xmlLang);
2506         }
2507 
2508         if (node->content)
2509             TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2510 
2511         node = next;
2512     }
2513 }
2514 
2515 /*
2516   Set/fix/remove <html xmlns='...'>
2517 */
TY_(FixXhtmlNamespace)2518 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2519 {
2520     Node* html = TY_(FindHTML)(doc);
2521     AttVal* xmlns;
2522 
2523     if (!html)
2524         return;
2525 
2526     xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2527 
2528     if (wantXmlns)
2529     {
2530         if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2531             TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2532     }
2533     else if (xmlns)
2534     {
2535         TY_(RemoveAttribute)(doc, html, xmlns);
2536     }
2537 }
2538 
2539 /*
2540   ...
2541 */
TY_(FixAnchors)2542 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2543 {
2544     Node* next;
2545 
2546     while (node)
2547     {
2548         next = node->next;
2549 
2550         if (TY_(IsAnchorElement)(doc, node))
2551         {
2552             AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2553             AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2554             Bool hadName = name!=NULL;
2555             Bool hadId = id!=NULL;
2556             Bool IdEmitted = no;
2557             Bool NameEmitted = no;
2558 
2559             /* todo: how are empty name/id attributes handled? */
2560 
2561             if (name && id)
2562             {
2563                 Bool NameHasValue = AttrHasValue(name);
2564                 Bool IdHasValue = AttrHasValue(id);
2565                 if ( (NameHasValue != IdHasValue) ||
2566                      (NameHasValue && IdHasValue &&
2567                      TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2568                     TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2569             }
2570             else if (name && wantId)
2571             {
2572                 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2573                     & doc->lexer->versionEmitted)
2574                 {
2575                     if (TY_(IsValidHTMLID)(name->value))
2576                     {
2577                         TY_(RepairAttrValue)(doc, node, "id", name->value);
2578                         IdEmitted = yes;
2579                     }
2580                     else
2581                         TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2582                  }
2583             }
2584             else if (id && wantName)
2585             {
2586                 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2587                     & doc->lexer->versionEmitted)
2588                 {
2589                     /* todo: do not assume id is valid */
2590                     TY_(RepairAttrValue)(doc, node, "name", id->value);
2591                     NameEmitted = yes;
2592                 }
2593             }
2594 
2595             if (id && !wantId
2596                 /* make sure that Name has been emitted if requested */
2597                 && (hadName || !wantName || NameEmitted) )
2598                 TY_(RemoveAttribute)(doc, node, id);
2599 
2600             if (name && !wantName
2601                 /* make sure that Id has been emitted if requested */
2602                 && (hadId || !wantId || IdEmitted) )
2603                 TY_(RemoveAttribute)(doc, node, name);
2604 
2605             if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2606                 TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2607                 TY_(RemoveAnchorByNode)(doc, node);
2608         }
2609 
2610         if (node->content)
2611             TY_(FixAnchors)(doc, node->content, wantName, wantId);
2612 
2613         node = next;
2614     }
2615 }
2616 
2617 /*
2618  * local variables:
2619  * mode: c
2620  * indent-tabs-mode: nil
2621  * c-basic-offset: 4
2622  * eval: (c-set-offset 'substatement-open 0)
2623  * end:
2624  */
2625