1 /*
2 clean.c -- clean up misuse of presentation markup
3
4 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
6
7 Filters from other formats such as Microsoft Word
8 often make excessive use of presentation markup such
9 as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
11 transform this to use CSS.
12
13 Some rules replace some of the children of an element by
14 style properties on the element, e.g.
15
16 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17
18 Such rules are applied to the element's content and then
  to the element itself until no more rules apply.
20 Having applied all the rules to an element, it will have
21 a style attribute with one or more properties.
22
23 Other rules strip the element they apply to, replacing
24 it by style properties on the contents, e.g.
25
    <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
27
28 These rules are applied to an element before processing
29 its content and replace the current element by the first
30 element in the exposed content.
31
32 After applying both sets of rules, you can replace the
33 style attribute by a class value and style rule in the
34 document head. To support this, an association of styles
35 and class names is built.
36
37 A naive approach is to rely on string matching to test
38 when two property lists are the same. A better approach
39 would be to first sort the properties before matching.
40
41 */
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46
47 #include "tidy-int.h"
48 #include "clean.h"
49 #include "lexer.h"
50 #include "parser.h"
51 #include "attrs.h"
52 #include "message.h"
53 #include "tmbstr.h"
54 #include "utf8.h"
55
56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
57
/*
  Turn node into an element of the tag identified by tid:
  the old element name is freed and replaced by a copy of the
  name found in the tag definition, and node->tag is set to
  that definition.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* tagdef = TY_(LookupTagDef)( tid );

    TidyDocFree( doc, node->element );
    node->tag = tagdef;
    node->element = TY_(tmbstrdup)( doc->allocator, tagdef->name );
}
65
/*
  Release a whole list of style properties: for each entry the
  name string, the value string and the entry itself are freed.
  A NULL list is accepted and does nothing.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *entry = props;

    while (entry != NULL)
    {
        /* remember the successor before the entry goes away */
        StyleProp *following = entry->next;

        TidyDocFree(doc, entry->name);
        TidyDocFree(doc, entry->value);
        TidyDocFree(doc, entry);

        entry = following;
    }
}
79
/*
  Insert the property name/value into the list props, which is
  kept ordered by name as compared with tmbstrcmp.

  If a property with the same name is already present the list
  is left untouched (the existing value wins). Otherwise a new
  entry holding copies of name and value is linked in front of
  the first entry whose name compares greater, or at the end.

  Returns the (possibly new) head of the list.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    /* link always addresses the pointer that leads to the entry examined */
    StyleProp **link = &props;
    StyleProp *entry;

    while ( *link != NULL )
    {
        int order = TY_(tmbstrcmp)( (*link)->name, name );

        if ( order == 0 )
        {
            /* this property is already defined, ignore new value */
            return props;
        }

        if ( order > 0 )
            break;  /* insert before this */

        link = &(*link)->next;
    }

    entry = (StyleProp *)TidyDocAlloc( doc, sizeof(StyleProp) );
    entry->name = TY_(tmbstrdup)( doc->allocator, name );
    entry->value = TY_(tmbstrdup)( doc->allocator, value );
    entry->next = *link;
    *link = entry;

    return props;
}
131
132 /*
133 Create sorted linked list of properties from style string
134 It temporarily places nulls in place of ':' and ';' to
135 delimit the strings for the property name and value.
136 Some systems don't allow you to NULL literal strings,
137 so to avoid this, a copy is made first.
138 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)139 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
140 {
141 tmbstr name, value = NULL, name_end, value_end, line;
142 Bool more;
143
144 line = TY_(tmbstrdup)(doc->allocator, style);
145 name = line;
146
147 while (*name)
148 {
149 while (*name == ' ')
150 ++name;
151
152 name_end = name;
153
154 while (*name_end)
155 {
156 if (*name_end == ':')
157 {
158 value = name_end + 1;
159 break;
160 }
161
162 ++name_end;
163 }
164
165 if (*name_end != ':')
166 break;
167
168 while ( value && *value == ' ')
169 ++value;
170
171 value_end = value;
172 more = no;
173
174 while (*value_end)
175 {
176 if (*value_end == ';')
177 {
178 more = yes;
179 break;
180 }
181
182 ++value_end;
183 }
184
185 *name_end = '\0';
186 *value_end = '\0';
187
188 prop = InsertProperty(doc, prop, name, value);
189 *name_end = ':';
190
191 if (more)
192 {
193 *value_end = ';';
194 name = value_end + 1;
195 continue;
196 }
197
198 break;
199 }
200
201 TidyDocFree(doc, line); /* free temporary copy */
202 return prop;
203 }
204
CreatePropString(TidyDocImpl * doc,StyleProp * props)205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
206 {
207 tmbstr style, p, s;
208 uint len;
209 StyleProp *prop;
210
211 /* compute length */
212
213 for (len = 0, prop = props; prop; prop = prop->next)
214 {
215 len += TY_(tmbstrlen)(prop->name) + 2;
216 if (prop->value)
217 len += TY_(tmbstrlen)(prop->value) + 2;
218 }
219
220 style = (tmbstr) TidyDocAlloc(doc, len+1);
221 style[0] = '\0';
222
223 for (p = style, prop = props; prop; prop = prop->next)
224 {
225 s = prop->name;
226
227 while((*p++ = *s++))
228 continue;
229
230 if (prop->value)
231 {
232 *--p = ':';
233 *++p = ' ';
234 ++p;
235
236 s = prop->value;
237 while((*p++ = *s++))
238 continue;
239 }
240 if (prop->next == NULL)
241 break;
242
243 *--p = ';';
244 *++p = ' ';
245 ++p;
246 }
247
248 return style;
249 }
250
251 /*
252 create string with merged properties
253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
254 {
255 tmbstr line;
256 StyleProp *prop;
257
258 prop = CreateProps(doc, NULL, style);
259 prop = CreateProps(doc, prop, property);
260 line = CreatePropString(doc, prop);
261 FreeStyleProps(doc, prop);
262 return line;
263 }
264 */
265
TY_(FreeStyles)266 void TY_(FreeStyles)( TidyDocImpl* doc )
267 {
268 Lexer* lexer = doc->lexer;
269 if ( lexer )
270 {
271 TagStyle *style, *next;
272 for ( style = lexer->styles; style; style = next )
273 {
274 next = style->next;
275 TidyDocFree( doc, style->tag );
276 TidyDocFree( doc, style->tag_class );
277 TidyDocFree( doc, style->properties );
278 TidyDocFree( doc, style );
279 }
280 }
281 }
282
GensymClass(TidyDocImpl * doc)283 static tmbstr GensymClass( TidyDocImpl* doc )
284 {
285 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
286 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287 if ( pfx == NULL || *pfx == 0 )
288 pfx = "c";
289
290 TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291 return TY_(tmbstrdup)(doc->allocator, buf);
292 }
293
/*
  Look up the class name associated with the pair (tag, properties)
  in the lexer's style dictionary. Both strings are compared with
  tmbstrcmp, so the property strings must match exactly.

  When no entry matches, a new one is created with copies of tag
  and properties and a generated class name, and is pushed onto
  the front of the dictionary.

  The returned class name is owned by the dictionary.
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0 &&
            TY_(tmbstrcmp)(entry->properties, properties) == 0)
            return entry->tag_class;

        entry = entry->next;
    }

    /* not seen before: record it under a new class name */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );
    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
314
315 /*
316 Add class="foo" to node
317 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
319 {
320 AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
321
322 /*
323 if there already is a class attribute
324 then append class name after a space.
325 */
326 if (classattr)
327 TY_(AppendToClassAttr)( doc, classattr, classname );
328 else /* create new class attribute */
329 TY_(AddAttribute)( doc, node, "class", classname );
330 }
331
TY_(AddStyleAsClass)332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
333 {
334 ctmbstr classname;
335
336 classname = FindStyle( doc, node->element, stylevalue );
337 AddClass( doc, node, classname);
338 }
339
340 /*
341 Find style attribute in node, and replace it
342 by corresponding class attribute. Search for
343 class in style dictionary otherwise gensym
344 new class and add to dictionary.
345
346 Assumes that node doesn't have a class attribute
347 */
Style2Rule(TidyDocImpl * doc,Node * node)348 static void Style2Rule( TidyDocImpl* doc, Node *node)
349 {
350 AttVal *styleattr, *classattr;
351 ctmbstr classname;
352
353 styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
354
355 if (styleattr)
356 {
357 /* fix for http://tidy.sf.net/bug/850215 */
358 if (!styleattr->value)
359 {
360 TY_(RemoveAttribute)(doc, node, styleattr);
361 return;
362 }
363
364 classname = FindStyle( doc, node->element, styleattr->value );
365 classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
366
367 /*
368 if there already is a class attribute
369 then append class name after an underscore
370 */
371 if (classattr)
372 {
373 TY_(AppendToClassAttr)( doc, classattr, classname );
374 TY_(RemoveAttribute)( doc, node, styleattr );
375 }
376 else /* reuse style attribute for class attribute */
377 {
378 TidyDocFree(doc, styleattr->attribute);
379 TidyDocFree(doc, styleattr->value);
380 styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381 styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
382 }
383 }
384 }
385
/*
  Append the rule "<selector> { color: <color> }" and a newline
  to the lexer buffer. Nothing is written when either the
  selector or the color is NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
396
397 /*
398 move presentation attribs from body to style element
399
400 background="foo" -> body { background-image: url(foo) }
401 bgcolor="foo" -> body { background-color: foo }
402 text="foo" -> body { color: foo }
403 link="foo" -> :link { color: foo }
404 vlink="foo" -> :visited { color: foo }
405 alink="foo" -> :active { color: foo }
406 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
408 {
409 Lexer* lexer = doc->lexer;
410 tmbstr bgurl = NULL;
411 tmbstr bgcolor = NULL;
412 tmbstr color = NULL;
413 AttVal* attr;
414
415 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
416 {
417 bgurl = attr->value;
418 attr->value = NULL;
419 TY_(RemoveAttribute)( doc, body, attr );
420 }
421
422 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
423 {
424 bgcolor = attr->value;
425 attr->value = NULL;
426 TY_(RemoveAttribute)( doc, body, attr );
427 }
428
429 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
430 {
431 color = attr->value;
432 attr->value = NULL;
433 TY_(RemoveAttribute)( doc, body, attr );
434 }
435
436 if ( bgurl || bgcolor || color )
437 {
438 TY_(AddStringLiteral)(lexer, " body {\n");
439 if (bgurl)
440 {
441 TY_(AddStringLiteral)(lexer, " background-image: url(");
442 TY_(AddStringLiteral)(lexer, bgurl);
443 TY_(AddStringLiteral)(lexer, ");\n");
444 TidyDocFree(doc, bgurl);
445 }
446 if (bgcolor)
447 {
448 TY_(AddStringLiteral)(lexer, " background-color: ");
449 TY_(AddStringLiteral)(lexer, bgcolor);
450 TY_(AddStringLiteral)(lexer, ";\n");
451 TidyDocFree(doc, bgcolor);
452 }
453 if (color)
454 {
455 TY_(AddStringLiteral)(lexer, " color: ");
456 TY_(AddStringLiteral)(lexer, color);
457 TY_(AddStringLiteral)(lexer, ";\n");
458 TidyDocFree(doc, color);
459 }
460
461 TY_(AddStringLiteral)(lexer, " }\n");
462 }
463
464 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
465 {
466 AddColorRule(lexer, " :link", attr->value);
467 TY_(RemoveAttribute)( doc, body, attr );
468 }
469
470 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
471 {
472 AddColorRule(lexer, " :visited", attr->value);
473 TY_(RemoveAttribute)( doc, body, attr );
474 }
475
476 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
477 {
478 AddColorRule(lexer, " :active", attr->value);
479 TY_(RemoveAttribute)( doc, body, attr );
480 }
481 }
482
NiceBody(TidyDocImpl * doc)483 static Bool NiceBody( TidyDocImpl* doc )
484 {
485 Node* node = TY_(FindBody)(doc);
486 if (node)
487 {
488 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
489 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
490 TY_(AttrGetById)(node, TidyAttr_TEXT) ||
491 TY_(AttrGetById)(node, TidyAttr_LINK) ||
492 TY_(AttrGetById)(node, TidyAttr_VLINK) ||
493 TY_(AttrGetById)(node, TidyAttr_ALINK))
494 {
495 doc->badLayout |= USING_BODY;
496 return no;
497 }
498 }
499
500 return yes;
501 }
502
503 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)504 static void CreateStyleElement( TidyDocImpl* doc )
505 {
506 Lexer* lexer = doc->lexer;
507 Node *node, *head, *body;
508 TagStyle *style;
509 AttVal *av;
510
511 if ( lexer->styles == NULL && NiceBody(doc) )
512 return;
513
514 node = TY_(NewNode)( doc->allocator, lexer );
515 node->type = StartTag;
516 node->implicit = yes;
517 node->element = TY_(tmbstrdup)(doc->allocator, "style");
518 TY_(FindTag)( doc, node );
519
520 /* insert type attribute */
521 av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
522 TY_(InsertAttributeAtStart)( node, av );
523
524 body = TY_(FindBody)( doc );
525 lexer->txtstart = lexer->lexsize;
526 if ( body )
527 CleanBodyAttrs( doc, body );
528
529 for (style = lexer->styles; style; style = style->next)
530 {
531 TY_(AddCharToLexer)(lexer, ' ');
532 TY_(AddStringLiteral)(lexer, style->tag);
533 TY_(AddCharToLexer)(lexer, '.');
534 TY_(AddStringLiteral)(lexer, style->tag_class);
535 TY_(AddCharToLexer)(lexer, ' ');
536 TY_(AddCharToLexer)(lexer, '{');
537 TY_(AddStringLiteral)(lexer, style->properties);
538 TY_(AddCharToLexer)(lexer, '}');
539 TY_(AddCharToLexer)(lexer, '\n');
540 }
541
542 lexer->txtend = lexer->lexsize;
543
544 TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
545
546 /*
547 now insert style element into document head
548
549 doc is root node. search its children for html node
550 the head node should be first child of html node
551 */
552 if ( NULL != (head = TY_(FindHEAD)( doc )) )
553 TY_(InsertNodeAtEnd)( head, node );
554 }
555
556
557 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)558 void TY_(FixNodeLinks)(Node *node)
559 {
560 Node *child;
561
562 if (node->prev)
563 node->prev->next = node;
564 else
565 node->parent->content = node;
566
567 if (node->next)
568 node->next->prev = node;
569 else
570 node->parent->last = node;
571
572 for (child = node->content; child; child = child->next)
573 child->parent = node;
574 }
575
576 /*
577 used to strip child of node when
578 the node has one and only one child
579 */
StripOnlyChild(TidyDocImpl * doc,Node * node)580 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
581 {
582 Node *child;
583
584 child = node->content;
585 node->content = child->content;
586 node->last = child->last;
587 child->content = NULL;
588 TY_(FreeNode)(doc, child);
589
590 for (child = node->content; child; child = child->next)
591 child->parent = node;
592 }
593
594 /*
595 used to strip font start and end tags.
596 Extricate "element", replace it by its content and delete it.
597 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)598 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
599 {
600 if (element->content)
601 {
602 Node *node, *parent = element->parent;
603
604 element->last->next = element->next;
605
606 if (element->next)
607 {
608 element->next->prev = element->last;
609 }
610 else
611 parent->last = element->last;
612
613 if (element->prev)
614 {
615 element->content->prev = element->prev;
616 element->prev->next = element->content;
617 }
618 else
619 parent->content = element->content;
620
621 for (node = element->content; node; node = node->next)
622 node->parent = parent;
623
624 *pnode = element->content;
625
626 element->next = element->content = NULL;
627 TY_(FreeNode)(doc, element);
628 }
629 else
630 {
631 *pnode = TY_(DiscardElement)(doc, element);
632 }
633 }
634
635 /*
636 Create new string that consists of the
637 combined style properties in s1 and s2
638
639 To merge property lists, we build a linked
640 list of property/values and insert properties
641 into the list in order, merging values for
642 the same property name.
643 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)644 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
645 {
646 tmbstr s;
647 StyleProp *prop;
648
649 prop = CreateProps(doc, NULL, s1);
650 prop = CreateProps(doc, prop, s2);
651 s = CreatePropString(doc, prop);
652 FreeStyleProps(doc, prop);
653 return s;
654 }
655
656 /*
657 Add style property to element, creating style
658 attribute as needed and adding ; delimiter
659 */
TY_(AddStyleProperty)660 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
661 {
662 AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
663
664 /* if style attribute already exists then insert property */
665
666 if ( av )
667 {
668 if (av->value != NULL)
669 {
670 tmbstr s = MergeProperties( doc, av->value, property );
671 TidyDocFree( doc, av->value );
672 av->value = s;
673 }
674 else
675 {
676 av->value = TY_(tmbstrdup)( doc->allocator, property );
677 }
678 }
679 else /* else create new style attribute */
680 {
681 av = TY_(NewAttributeEx)( doc, "style", property, '"' );
682 TY_(InsertAttributeAtStart)( node, av );
683 }
684 }
685
/*
  Make sure node carries the class names of child.

  The value of the first class attribute of child is used; when
  child has no class attribute, or one without a value, node is
  left unchanged. Otherwise:

  - node has no class attribute: a new one with the child's
    names is inserted at the start of node's attribute list
  - node has a class attribute without a value: the child's
    names become its value (instead of adding a second class
    attribute to the same element)
  - node has class names: the child's names are appended to
    them, separated by a space

  child itself is not modified.
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeClass = NULL;
    ctmbstr childNames = NULL;

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            childNames = av->value;
            break;
        }
    }

    if (childNames == NULL)
        return; /* nothing to merge */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            nodeClass = av;
            break;
        }
    }

    if (nodeClass == NULL) /* copy class names from child */
    {
        av = TY_(NewAttributeEx)( doc, "class", childNames, '"' );
        TY_(InsertAttributeAtStart)( node, av );
    }
    else if (nodeClass->value == NULL) /* fill in the valueless attribute */
    {
        nodeClass->value = TY_(tmbstrdup)( doc->allocator, childNames );
    }
    else /* merge class names from both */
    {
        uint l1 = TY_(tmbstrlen)(nodeClass->value);
        uint l2 = TY_(tmbstrlen)(childNames);
        tmbstr names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);

        TY_(tmbstrcpy)(names, nodeClass->value);
        names[l1] = ' ';
        TY_(tmbstrcpy)(names+l1+1, childNames);

        TidyDocFree(doc, nodeClass->value);
        nodeClass->value = names;
    }
}
730
/*
  Merge the class and style attributes of child into node.

  The child may have a class attribute used for attaching
  styles, if so the class name needs to be copied to node's
  class; MergeClasses() takes care of that.

  For the style attribute the value of child's first style
  attribute is used; when there is none, or it has no value,
  node's style is left unchanged. Otherwise:

  - node has no style attribute: a new one with the child's
    style is inserted at the start of node's attribute list
  - node has a style attribute without a value: the child's
    style becomes its value, as TY_(AddStyleProperty) does
    (instead of adding a second style attribute)
  - node has a style value: it is replaced by the merge of both,
    node's properties taking precedence on a name clash

  child itself is not modified.
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av, *nodeStyle = NULL;
    ctmbstr childStyle = NULL;

    MergeClasses(doc, node, child);

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            childStyle = av->value;
            break;
        }
    }

    if (childStyle == NULL)
        return; /* nothing to merge */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            nodeStyle = av;
            break;
        }
    }

    if (nodeStyle == NULL) /* copy style of child */
    {
        av = TY_(NewAttributeEx)( doc, "style", childStyle, '"' );
        TY_(InsertAttributeAtStart)( node, av );
    }
    else if (nodeStyle->value == NULL) /* fill in the valueless attribute */
    {
        nodeStyle->value = TY_(tmbstrdup)( doc->allocator, childStyle );
    }
    else /* merge styles from both */
    {
        tmbstr merged = MergeProperties(doc, nodeStyle->value, childStyle);

        TidyDocFree(doc, nodeStyle->value);
        nodeStyle->value = merged;
    }
}
776
/*
  Map the value of a font size attribute to a CSS font-size value.

  - ""            -> NULL
  - "0".."6"      -> entry of the absolute table; "3" maps to NULL,
                     i.e. no font-size is to be emitted
  - "-N", N 0..6  -> scaled down by 0.8 per step; any other string
                     starting with '-' gives "smaller"
  - anything else -> if the second character is 0..6 it is taken as
                     a step and scaled up by 1.2 per step, otherwise
                     "larger"

  Only the first two characters are examined. The returned string
  is a constant and must not be freed.

  NOTE(review): the last case is not restricted to a leading '+',
  so e.g. "7" yields "larger" and "x3" is treated like "+3".
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr absolute[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr shrink[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr grow[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    int step;

    if (size[0] == '\0')
        return NULL;

    if ('0' <= size[0] && size[0] <= '6')
        return absolute[size[0] - '0'];

    /* relative size: the step, if any, is the second character */
    step = ('0' <= size[1] && size[1] <= '6') ? size[1] - '0' : -1;

    if (size[0] == '-')
        return step >= 0 ? shrink[step] : "smaller"; /*"70%"; */

    return step >= 0 ? grow[step] : "larger"; /* "140%" */
}
826
/*
  Add the style property "font-family: <face>" to node.

  The property string is built in a buffer sized to fit the
  whole face value, so that long font lists are not cut off in
  the middle as they would be with a fixed-size buffer. The
  buffer is released after TY_(AddStyleProperty) has been called;
  that function stores its own copy (it was previously handed a
  stack buffer here).
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    /* room for the prefix, the face value and the terminating NUL */
    uint size = TY_(tmbstrlen)("font-family: ") + TY_(tmbstrlen)(face) + 1;
    tmbstr buf = (tmbstr) TidyDocAlloc( doc, size );

    TY_(tmbsnprintf)(buf, size, "font-family: %s", face );
    TY_(AddStyleProperty)( doc, node, buf );
    TidyDocFree( doc, buf );
}
833
/*
  Apply a font size attribute value to node.

  A <p> with size "6", "5" or "4" is turned into an h1, h2 or h3
  element respectively and gets no font-size property. In every
  other case the size is translated by FontSize2Name() and, unless
  that yields NULL, added as a "font-size: ..." style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    static const struct
    {
        ctmbstr size;
        ctmbstr element;
    }
    promote[] =
    {
        { "6", "h1" },
        { "5", "h2" },
        { "4", "h3" }
    };
    ctmbstr cssSize;
    tmbchar decl[64];
    uint i;

    if (nodeIsP(node))
    {
        for (i = 0; i < sizeof(promote)/sizeof(promote[0]); ++i)
        {
            if (TY_(tmbstrcmp)(size, promote[i].size) == 0)
            {
                /* large paragraph text: make it a heading instead */
                TidyDocFree(doc, node->element);
                node->element = TY_(tmbstrdup)(doc->allocator, promote[i].element);
                TY_(FindTag)(doc, node);
                return;
            }
        }
    }

    cssSize = FontSize2Name(size);
    if (cssSize == NULL)
        return;

    TY_(tmbsnprintf)(decl, sizeof(decl), "font-size: %s", cssSize);
    TY_(AddStyleProperty)( doc, node, decl );
}
865
/*
  Add the style property "color: <color>" to node.
  The property is formatted into a 128 character buffer.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar decl[128];

    TY_(tmbsnprintf)(decl, sizeof(decl), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, decl );
}
872
873 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)874 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
875 {
876 uint i;
877 tmbchar buf[128];
878
879 TY_(tmbstrcpy)( buf, "text-align: " );
880 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
881 {
882 if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
883 break;
884 }
885 buf[i] = '\0';
886 TY_(AddStyleProperty)( doc, node, buf );
887 }
888
889 /*
890 add style properties to node corresponding to
891 the font face, size and color attributes
892 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)893 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
894 {
895 while (av)
896 {
897 if (AttrHasValue(av))
898 {
899 if (attrIsFACE(av))
900 AddFontFace( doc, node, av->value );
901 else if (attrIsSIZE(av))
902 AddFontSize( doc, node, av->value );
903 else if (attrIsCOLOR(av))
904 AddFontColor( doc, node, av->value );
905 }
906 av = av->next;
907 }
908 }
909
910 /*
911 Symptom: <p align=center>
912 Action: <p style="text-align: center">
913 */
TextAlign(TidyDocImpl * doc,Node * node)914 static void TextAlign( TidyDocImpl* doc, Node* node )
915 {
916 AttVal *av, *prev;
917
918 prev = NULL;
919
920 for (av = node->attributes; av; av = av->next)
921 {
922 if (attrIsALIGN(av))
923 {
924 if (prev)
925 prev->next = av->next;
926 else
927 node->attributes = av->next;
928
929 if (av->value)
930 AddAlign( doc, node, av->value );
931
932 TY_(FreeAttribute)(doc, av);
933 break;
934 }
935
936 prev = av;
937 }
938 }
939
940 /*
941 Symptom: <table bgcolor="red">
942 Action: <table style="background-color: red">
943 */
TableBgColor(TidyDocImpl * doc,Node * node)944 static void TableBgColor( TidyDocImpl* doc, Node* node )
945 {
946 AttVal* attr;
947 tmbchar buf[256];
948
949 if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
950 {
951 TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
952 TY_(RemoveAttribute)( doc, node, attr );
953 TY_(AddStyleProperty)( doc, node, buf );
954 }
955 }
956
957 /*
958 The clean up rules use the pnode argument to return the
959 next node when the original node has been deleted
960 */
961
962 /*
963 Symptom: <dir> <li> where <li> is only child
964 Action: coerce <dir> <li> to <div> with indent.
965 */
966
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))967 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
968 {
969 Node *child;
970
971 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
972 {
973 child = node->content;
974
975 if (child == NULL)
976 return no;
977
978 /* check child has no peers */
979
980 if (child->next)
981 return no;
982
983 if ( !nodeIsLI(child) )
984 return no;
985
986 if ( !child->implicit )
987 return no;
988
989 /* coerce dir to div */
990 node->tag = TY_(LookupTagDef)( TidyTag_DIV );
991 TidyDocFree( doc, node->element );
992 node->element = TY_(tmbstrdup)(doc->allocator, "div");
993 TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
994 StripOnlyChild( doc, node );
995 return yes;
996 }
997
998 return no;
999 }
1000
1001 /*
1002 Symptom: <center>
1003 Action: replace <center> by <div style="text-align: center">
1004 */
1005
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1006 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1007 {
1008 if ( nodeIsCENTER(node) )
1009 {
1010 RenameElem( doc, node, TidyTag_DIV );
1011 TY_(AddStyleProperty)( doc, node, "text-align: center" );
1012 return yes;
1013 }
1014
1015 return no;
1016 }
1017
1018 /* Copy child attributes to node. Duplicate attributes are overwritten.
1019 Unique attributes (such as ID) disable the action.
1020 Attributes style and class are not dealt with. A call to MergeStyles
1021 will do that.
1022 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1023 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1024 {
1025 AttVal *av1, *av2;
1026 TidyAttrId id;
1027
1028 /* Detect attributes that cannot be merged or overwritten. */
1029 if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1030 && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1031 return no;
1032
1033 /* Move child attributes to node. Attributes in node
1034 can be overwritten or merged. */
1035 for (av2 = child->attributes; av2; )
1036 {
1037 /* Dealt by MergeStyles. */
1038 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1039 {
1040 av2 = av2->next;
1041 continue;
1042 }
1043 /* Avoid duplicates in node */
1044 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1045 && (av1=TY_(AttrGetById)(node, id))!= NULL)
1046 TY_(RemoveAttribute)( doc, node, av1 );
1047
1048 /* Move attribute from child to node */
1049 TY_(DetachAttribute)( child, av2 );
1050 av1 = av2;
1051 av2 = av2->next;
1052 av1->next = NULL;
1053 TY_(InsertAttributeAtEnd)( node, av1 );
1054 }
1055
1056 return yes;
1057 }
1058
1059 /*
1060 Symptom <XX><XX>...</XX></XX>
1061 Action: merge the two XXs
1062
1063 For instance, this is useful after nested <dir>s used by Word
1064 for indenting have been converted to <div>s
1065
1066 If state is "no", no merging.
1067 If state is "yes", inner element is discarded. Only Style and Class
1068 attributes are merged using MergeStyles().
1069 If state is "auto", atttibutes are merged as described in CopyAttrs().
1070 Style and Class attributes are merged using MergeStyles().
1071 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1072 static Bool MergeNestedElements( TidyDocImpl* doc,
1073 TidyTagId Id, TidyTriState state, Node *node,
1074 Node **ARG_UNUSED(pnode))
1075 {
1076 Node *child;
1077
1078 if ( state == TidyNoState
1079 || !TagIsId(node, Id) )
1080 return no;
1081
1082 child = node->content;
1083
1084 if ( child == NULL
1085 || child->next != NULL
1086 || !TagIsId(child, Id) )
1087 return no;
1088
1089 if ( state == TidyAutoState
1090 && CopyAttrs(doc, node, child) == no )
1091 return no;
1092
1093 MergeStyles( doc, node, child );
1094 StripOnlyChild( doc, node );
1095 return yes;
1096 }
1097
1098 /*
1099 Symptom: <ul><li><ul>...</ul></li></ul>
1100 Action: discard outer list
1101 */
1102
NestedList(TidyDocImpl * doc,Node * node,Node ** pnode)1103 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1104 {
1105 Node *child, *list;
1106
1107 if ( nodeIsUL(node) || nodeIsOL(node) )
1108 {
1109 child = node->content;
1110
1111 if (child == NULL)
1112 return no;
1113
1114 /* check child has no peers */
1115
1116 if (child->next)
1117 return no;
1118
1119 list = child->content;
1120
1121 if (!list)
1122 return no;
1123
1124 if (list->tag != node->tag)
1125 return no;
1126
1127 /* check list has no peers */
1128 if (list->next)
1129 return no;
1130
1131 *pnode = list; /* Set node to resume iteration */
1132
1133 /* move inner list node into position of outer node */
1134 list->prev = node->prev;
1135 list->next = node->next;
1136 list->parent = node->parent;
1137 TY_(FixNodeLinks)(list);
1138
1139 /* get rid of outer ul and its li */
1140 child->content = NULL;
1141 TY_(FreeNode)( doc, child ); /* See test #427841. */
1142 child = NULL;
1143 node->content = NULL;
1144 node->next = NULL;
1145 TY_(FreeNode)( doc, node );
1146 node = NULL;
1147
1148 /*
1149 If prev node was a list the chances are this node
1150 should be appended to that list. Word has no way of
1151 recognizing nested lists and just uses indents
1152 */
1153
1154 if (list->prev)
1155 {
1156 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1157 && list->prev->last )
1158 {
1159 node = list;
1160 list = node->prev;
1161
1162 child = list->last; /* <li> */
1163
1164 list->next = node->next;
1165 TY_(FixNodeLinks)(list);
1166
1167 node->parent = child;
1168 node->next = NULL;
1169 node->prev = child->last;
1170 TY_(FixNodeLinks)(node);
1171 CleanNode( doc, node );
1172 }
1173 }
1174
1175 return yes;
1176 }
1177
1178 return no;
1179 }
1180
1181 /* Find CSS equivalent in a SPAN element */
1182 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1183 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1184 {
1185 struct
1186 {
1187 TidyTagId id;
1188 ctmbstr CSSeq;
1189 Bool deprecated;
1190 }
1191 const CSS_SpanEq[] =
1192 {
1193 { TidyTag_B, "font-weight: bold", no },
1194 { TidyTag_I, "font-style: italic", no },
1195 { TidyTag_S, "text-decoration: line-through", yes},
1196 { TidyTag_STRIKE, "text-decoration: line-through", yes},
1197 { TidyTag_U, "text-decoration: underline", yes},
1198 { TidyTag_UNKNOWN, NULL, no }
1199 };
1200 uint i;
1201
1202 for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1203 if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1204 && TagIsId(node, CSS_SpanEq[i].id) )
1205 {
1206 *s = CSS_SpanEq[i].CSSeq;
1207 return yes;
1208 }
1209 return no;
1210 }
1211
1212 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1213 static Bool CanApplyBlockStyle( Node *node )
1214 {
1215 if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1216 && !nodeIsDIV(node) && !nodeIsP(node)
1217 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218 {
1219 return yes;
1220 }
1221 return no;
1222 }
1223
1224 /*
1225 Symptom: the only child of a block-level element is a
1226 presentation element such as B, I or FONT
1227
1228 Action: add style "font-weight: bold" to the block and
1229 strip the <b> element, leaving its children.
1230
1231 example:
1232
1233 <p>
1234 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235 </p>
1236
1237 becomes:
1238
1239 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240 Draft Recommended Practice
1241 </p>
1242
1243 This code also replaces the align attribute by a style attribute.
1244 However, to avoid CSS problems with Navigator 4, this isn't done
1245 for the elements: caption, tr and table
1246 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248 {
1249 Node *child;
1250 ctmbstr CSSeq;
1251
1252 /* check for bgcolor */
1253 if ( nodeIsTABLE(node)
1254 || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1255 TableBgColor( doc, node );
1256
1257 if (CanApplyBlockStyle(node))
1258 {
1259 /* check for align attribute */
1260 if ( !nodeIsCAPTION(node) )
1261 TextAlign( doc, node );
1262
1263 child = node->content;
1264 if (child == NULL)
1265 return no;
1266
1267 /* check child has no peers */
1268 if (child->next)
1269 return no;
1270
1271 if ( FindCSSSpanEq(child, &CSSeq, no) )
1272 {
1273 MergeStyles( doc, node, child );
1274 TY_(AddStyleProperty)( doc, node, CSSeq );
1275 StripOnlyChild( doc, node );
1276 return yes;
1277 }
1278 else if ( nodeIsFONT(child) )
1279 {
1280 MergeStyles( doc, node, child );
1281 AddFontStyles( doc, node, child->attributes );
1282 StripOnlyChild( doc, node );
1283 return yes;
1284 }
1285 }
1286
1287 return no;
1288 }
1289
1290 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1291 static Bool CanApplyInlineStyle( Node *node )
1292 {
1293 return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1294 }
1295
1296 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1297 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1298 {
1299 Node *child;
1300 ctmbstr CSSeq;
1301
1302 if ( CanApplyInlineStyle(node) )
1303 {
1304 child = node->content;
1305
1306 if (child == NULL)
1307 return no;
1308
1309 /* check child has no peers */
1310
1311 if (child->next)
1312 return no;
1313
1314 if ( FindCSSSpanEq(child, &CSSeq, no) )
1315 {
1316 MergeStyles( doc, node, child );
1317 TY_(AddStyleProperty)( doc, node, CSSeq );
1318 StripOnlyChild( doc, node );
1319 return yes;
1320 }
1321 else if ( nodeIsFONT(child) )
1322 {
1323 MergeStyles( doc, node, child );
1324 AddFontStyles( doc, node, child->attributes );
1325 StripOnlyChild( doc, node );
1326 return yes;
1327 }
1328 }
1329
1330 return no;
1331 }
1332
1333 /*
1334 Transform element to equivalent CSS
1335 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1336 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1337 Node **ARG_UNUSED(pnode) )
1338 {
1339 ctmbstr CSSeq;
1340
1341 /* if node is the only child of parent element then leave alone
1342 Do so only if BlockStyle may be succesful. */
1343 if ( node->parent->content == node && node->next == NULL &&
1344 (CanApplyBlockStyle(node->parent)
1345 || CanApplyInlineStyle(node->parent)) )
1346 return no;
1347
1348 if ( FindCSSSpanEq(node, &CSSeq, yes) )
1349 {
1350 RenameElem( doc, node, TidyTag_SPAN );
1351 TY_(AddStyleProperty)( doc, node, CSSeq );
1352 return yes;
1353 }
1354 return no;
1355 }
1356
1357 /*
1358 Replace font elements by span elements, deleting
1359 the font element's attributes and replacing them
1360 by a single style attribute.
1361 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1362 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1363 {
1364 AttVal *av, *style, *next;
1365
1366 if ( nodeIsFONT(node) )
1367 {
1368 /* if node is the only child of parent element then leave alone
1369 Do so only if BlockStyle may be succesful. */
1370 if ( node->parent->content == node && node->next == NULL &&
1371 CanApplyBlockStyle(node->parent) )
1372 return no;
1373
1374 AddFontStyles( doc, node, node->attributes );
1375
1376 /* extract style attribute and free the rest */
1377 av = node->attributes;
1378 style = NULL;
1379
1380 while (av)
1381 {
1382 next = av->next;
1383
1384 if (attrIsSTYLE(av))
1385 {
1386 av->next = NULL;
1387 style = av;
1388 }
1389 else
1390 {
1391 TY_(FreeAttribute)( doc, av );
1392 }
1393 av = next;
1394 }
1395
1396 node->attributes = style;
1397 RenameElem( doc, node, TidyTag_SPAN );
1398 return yes;
1399 }
1400
1401 return no;
1402 }
1403
1404 /*
1405 Applies all matching rules to a node.
1406 */
CleanNode(TidyDocImpl * doc,Node * node)1407 Node* CleanNode( TidyDocImpl* doc, Node *node )
1408 {
1409 Node *next = NULL;
1410 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1411 TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1412
1413 for (next = node; TY_(nodeIsElement)(node); node = next)
1414 {
1415 if ( Dir2Div(doc, node, &next) )
1416 continue;
1417
1418 /* Special case: true result means
1419 ** that arg node and its parent no longer exist.
1420 ** So we must jump back up the CreateStyleProperties()
1421 ** call stack until we have a valid node reference.
1422 */
1423 if ( NestedList(doc, node, &next) )
1424 return next;
1425
1426 if ( Center2Div(doc, node, &next) )
1427 continue;
1428
1429 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1430 continue;
1431
1432 if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1433 continue;
1434
1435 if ( BlockStyle(doc, node, &next) )
1436 continue;
1437
1438 if ( InlineStyle(doc, node, &next) )
1439 continue;
1440
1441 if ( InlineElementToCSS(doc, node, &next) )
1442 continue;
1443
1444 if ( Font2Span(doc, node, &next) )
1445 continue;
1446
1447 break;
1448 }
1449
1450 return next;
1451 }
1452
1453 /* Special case: if the current node is destroyed by
1454 ** CleanNode() lower in the tree, this node and its parent
1455 ** no longer exist. So we must jump back up the CleanTree()
1456 ** call stack until we have a valid node reference.
1457 */
1458
CleanTree(TidyDocImpl * doc,Node * node)1459 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1460 {
1461 if (node->content)
1462 {
1463 Node *child;
1464 for (child = node->content; child != NULL; child = child->next)
1465 {
1466 child = CleanTree( doc, child );
1467 if ( !child )
1468 break;
1469 }
1470 }
1471
1472 return CleanNode( doc, node );
1473 }
1474
/*
  Walks the subtree rooted at node depth-first and calls Style2Rule()
  on every node, children before their parent.
*/
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while ( kid != NULL )
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1490
TY_(CleanDocument)1491 void TY_(CleanDocument)( TidyDocImpl* doc )
1492 {
1493 /* placeholder. CleanTree()/CleanNode() will not
1494 ** zap root element
1495 */
1496 CleanTree( doc, &doc->root );
1497
1498 if ( cfgBool(doc, TidyMakeClean) )
1499 {
1500 DefineStyleRules( doc, &doc->root );
1501 CreateStyleElement( doc );
1502 }
1503 }
1504
1505 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1506 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1507 {
1508 Node *next;
1509
1510 while (node)
1511 {
1512 next = node->next;
1513
1514 if ( (nodeIsB(node) || nodeIsI(node))
1515 && node->parent && node->parent->tag == node->tag)
1516 {
1517 /* strip redundant inner element */
1518 DiscardContainer( doc, node, &next );
1519 node = next;
1520 continue;
1521 }
1522
1523 if ( node->content )
1524 TY_(NestedEmphasis)( doc, node->content );
1525
1526 node = next;
1527 }
1528 }
1529
1530
1531
1532 /* replace i by em and b by strong */
TY_(EmFromI)1533 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1534 {
1535 while (node)
1536 {
1537 if ( nodeIsI(node) )
1538 RenameElem( doc, node, TidyTag_EM );
1539 else if ( nodeIsB(node) )
1540 RenameElem( doc, node, TidyTag_STRONG );
1541
1542 if ( node->content )
1543 TY_(EmFromI)( doc, node->content );
1544
1545 node = node->next;
1546 }
1547 }
1548
HasOneChild(Node * node)1549 static Bool HasOneChild(Node *node)
1550 {
1551 return (node->content && node->content->next == NULL);
1552 }
1553
1554 /*
1555 Some people use dir or ul without an li
1556 to indent the content. The pattern to
1557 look for is a list with a single implicit
1558 li. This is recursively replaced by an
1559 implicit blockquote.
1560 */
TY_(List2BQ)1561 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1562 {
1563 while (node)
1564 {
1565 if (node->content)
1566 TY_(List2BQ)( doc, node->content );
1567
1568 if ( node->tag && node->tag->parser == TY_(ParseList) &&
1569 HasOneChild(node) && node->content->implicit )
1570 {
1571 StripOnlyChild( doc, node );
1572 RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1573 node->implicit = yes;
1574 }
1575
1576 node = node->next;
1577 }
1578 }
1579
1580
1581 /*
1582 Replace implicit blockquote by div with an indent
1583 taking care to reduce nested blockquotes to a single
1584 div with the indent set to match the nesting depth
1585 */
TY_(BQ2Div)1586 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1587 {
1588 tmbchar indent_buf[ 32 ];
1589 uint indent;
1590
1591 while (node)
1592 {
1593 if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1594 {
1595 indent = 1;
1596
1597 while( HasOneChild(node) &&
1598 nodeIsBLOCKQUOTE(node->content) &&
1599 node->implicit)
1600 {
1601 ++indent;
1602 StripOnlyChild( doc, node );
1603 }
1604
1605 if (node->content)
1606 TY_(BQ2Div)( doc, node->content );
1607
1608 TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1609 2*indent);
1610
1611 RenameElem( doc, node, TidyTag_DIV );
1612 TY_(AddStyleProperty)(doc, node, indent_buf );
1613 }
1614 else if (node->content)
1615 TY_(BQ2Div)( doc, node->content );
1616
1617 node = node->next;
1618 }
1619 }
1620
1621
/*
  Returns the nearest TD element found by walking from node (inclusive)
  up through its ancestors, or NULL when there is none.
*/
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cur = node;

    while ( cur != NULL && !nodeIsTD(cur) )
        cur = cur->parent;

    return cur;
}
1633
1634 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1635 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1636 {
1637 Lexer* lexer = doc->lexer;
1638
1639 for (;;)
1640 {
1641 if (node == NULL)
1642 return NULL;
1643
1644 ctmbstr lexbuf = lexer->lexbuf + node->start;
1645 if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1646 {
1647 Node* cell = FindEnclosingCell( doc, node );
1648 if ( cell )
1649 {
1650 /* Need to put into cell so it doesn't look weird
1651 */
1652 Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1653 assert( (byte)'\240' == (byte)160 );
1654 TY_(InsertNodeBeforeElement)( node, nbsp );
1655 }
1656 }
1657
1658 /* discard node and returns next, unless it is a text node */
1659 if ( node->type == TextNode )
1660 node = node->next;
1661 else
1662 node = TY_(DiscardElement)( doc, node );
1663
1664 if (node == NULL)
1665 return NULL;
1666
1667 if (node->type == SectionTag)
1668 {
1669 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1670 {
1671 node = PruneSection( doc, node );
1672 continue;
1673 }
1674
1675 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1676 {
1677 node = TY_(DiscardElement)( doc, node );
1678 break;
1679 }
1680 }
1681 }
1682
1683 return node;
1684 }
1685
TY_(DropSections)1686 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1687 {
1688 Lexer* lexer = doc->lexer;
1689 while (node)
1690 {
1691 if (node->type == SectionTag)
1692 {
1693 /* prune up to matching endif */
1694 if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1695 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1696 {
1697 node = PruneSection( doc, node );
1698 continue;
1699 }
1700
1701 /* discard others as well */
1702 node = TY_(DiscardElement)( doc, node );
1703 continue;
1704 }
1705
1706 if (node->content)
1707 TY_(DropSections)( doc, node->content );
1708
1709 node = node->next;
1710 }
1711 }
1712
/*
  Removes Word 2000 clutter from the attribute list of node.

  Dropped:  class values starting with "Mso" (except "Code"), style,
            lang, height/width on TD, TR and TH, and any attribute
            whose name starts with "x:".
  Kept:     everything else, including class="Code" (which denotes pre
            text) and user defined class names.
*/
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    /* link always addresses the pointer that refers to the attribute
    ** under inspection, so unlinking needs no special case for the head
    */
    AttVal **link = &node->attributes;

    while ( *link != NULL )
    {
        AttVal *attr = *link;
        Bool drop = no;

        if ( attrIsCLASS(attr) )
        {
            /* special check for class="Code" denoting pre text */
            /* Pass thru user defined styles as HTML class names */
            drop = !( AttrValueIs(attr, "Code") ||
                      TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 );
        }
        else if ( attrIsSTYLE(attr) || attrIsLANG(attr) )
        {
            drop = yes;
        }
        else if ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
                  (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) )
        {
            drop = yes;
        }
        else if ( attr->attribute &&
                  TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0 )
        {
            drop = yes;
        }

        if ( drop )
        {
            *link = attr->next;
            TY_(FreeAttribute)( doc, attr );
        }
        else
        {
            link = &attr->next;
        }
    }
}
1751
1752 /* Word2000 uses span excessively, so we strip span out */
StripSpan(TidyDocImpl * doc,Node * span)1753 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1754 {
1755 Node *node, *prev = NULL, *content;
1756
1757 /*
1758 deal with span elements that have content
1759 by splicing the content in place of the span
1760 after having processed it
1761 */
1762
1763 TY_(CleanWord2000)( doc, span->content );
1764 content = span->content;
1765
1766 if (span->prev)
1767 prev = span->prev;
1768 else if (content)
1769 {
1770 node = content;
1771 content = content->next;
1772 TY_(RemoveNode)(node);
1773 TY_(InsertNodeBeforeElement)(span, node);
1774 prev = node;
1775 }
1776
1777 while (content)
1778 {
1779 node = content;
1780 content = content->next;
1781 TY_(RemoveNode)(node);
1782 TY_(InsertNodeAfterElement)(prev, node);
1783 prev = node;
1784 }
1785
1786 if (span->next == NULL)
1787 span->parent->last = prev;
1788
1789 node = span->next;
1790 span->content = NULL;
1791 TY_(DiscardElement)( doc, span );
1792 return node;
1793 }
1794
1795 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1796 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1797 {
1798 while ( node )
1799 {
1800 if ( node->content )
1801 TY_(NormalizeSpaces)( lexer, node->content );
1802
1803 if (TY_(nodeIsText)(node))
1804 {
1805 uint i, c;
1806 tmbstr p = lexer->lexbuf + node->start;
1807
1808 for (i = node->start; i < node->end; ++i)
1809 {
1810 c = (byte) lexer->lexbuf[i];
1811
1812 /* look for UTF-8 multibyte character */
1813 if ( c > 0x7F )
1814 i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1815
1816 if ( c == 160 )
1817 c = ' ';
1818
1819 p = TY_(PutUTF8)(p, c);
1820 }
1821 node->end = p - lexer->lexbuf;
1822 }
1823
1824 node = node->next;
1825 }
1826 }
1827
1828 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1829 static Bool NoMargins(Node *node)
1830 {
1831 AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1832
1833 if ( !AttrHasValue(attval) )
1834 return no;
1835
1836 /* search for substring "margin-top: 0" */
1837 if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1838 return no;
1839
1840 /* search for substring "margin-bottom: 0" */
1841 if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1842 return no;
1843
1844 return yes;
1845 }
1846
1847 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1848 static Bool SingleSpace( Lexer* lexer, Node* node )
1849 {
1850 if ( node->content )
1851 {
1852 node = node->content;
1853
1854 if ( node->next != NULL )
1855 return no;
1856
1857 if ( node->type != TextNode )
1858 return no;
1859
1860 if ( (node->end - node->start) == 1 &&
1861 lexer->lexbuf[node->start] == ' ' )
1862 return yes;
1863
1864 if ( (node->end - node->start) == 2 )
1865 {
1866 uint c = 0;
1867 TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1868 if ( c == 160 )
1869 return yes;
1870 }
1871 }
1872
1873 return no;
1874 }
1875
1876 /*
1877 This is a major clean up to strip out all the extra stuff you get
1878 when you save as web page from Word 2000. It doesn't yet know what
1879 to do with VML tags, but these will appear as errors unless you
1880 declare them as new tags, such as o:p which needs to be declared
1881 as inline.
1882 */
TY_(CleanWord2000)1883 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1884 {
1885 /* used to a list from a sequence of bulletted p's */
1886 Lexer* lexer = doc->lexer;
1887 Node* list = NULL;
1888 AttVal *next_attr, *attval;
1889
1890 while ( node )
1891 {
1892 /* get rid of Word's xmlns attributes */
1893 if ( nodeIsHTML(node) )
1894 {
1895 /* check that it's a Word 2000 document */
1896 if ( !TY_(IsWord2000) (doc) ) /* Is. #896 */
1897 return;
1898
1899 /* Output proprietary attributes to maintain errout compatability
1900 * with traditional Tidy. This is a result of moving all of the
1901 * proprietary checks to near the end of the cleanup process,
1902 * meaning this result would not ordinarily be displayed.
1903 */
1904 attval = node->attributes;
1905 while ( attval ) {
1906 next_attr = attval->next;
1907
1908 /* Issue #591 - take care of a NULL attribute, too. */
1909 if ( !attval->attribute || ( strcmp(attval->attribute, "xmlns") != 0 ))
1910 TY_(ReportAttrError)(doc, node, attval, PROPRIETARY_ATTRIBUTE);
1911 attval = next_attr;
1912 }
1913
1914 TY_(FreeAttrs)( doc, node );
1915 }
1916
1917 /* fix up preformatted sections by looking for a
1918 ** sequence of paragraphs with zero top/bottom margin
1919 */
1920 if ( nodeIsP(node) )
1921 {
1922 if (NoMargins(node))
1923 {
1924 Node *pre, *next;
1925 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1926
1927 PurgeWord2000Attributes( doc, node );
1928
1929 if (node->content)
1930 TY_(CleanWord2000)( doc, node->content );
1931
1932 pre = node;
1933 node = node->next;
1934
1935 /* continue to strip p's */
1936
1937 while ( nodeIsP(node) && NoMargins(node) )
1938 {
1939 next = node->next;
1940 TY_(RemoveNode)(node);
1941 TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1942 TY_(InsertNodeAtEnd)(pre, node);
1943 StripSpan( doc, node );
1944 node = next;
1945 }
1946
1947 if (node == NULL)
1948 break;
1949 }
1950 }
1951
1952 if (node->tag && (node->tag->model & CM_BLOCK)
1953 && SingleSpace(lexer, node))
1954 {
1955 node = StripSpan( doc, node );
1956 continue;
1957 }
1958 /* discard Word's style verbiage */
1959 if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1960 node->type == CommentTag )
1961 {
1962 node = TY_(DiscardElement)( doc, node );
1963 continue;
1964 }
1965
1966 /* strip out all span and font tags Word scatters so liberally! */
1967 if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1968 {
1969 node = StripSpan( doc, node );
1970 continue;
1971 }
1972
1973 if ( nodeIsLINK(node) )
1974 {
1975 AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1976
1977 if (AttrValueIs(attr, "File-List"))
1978 {
1979 node = TY_(DiscardElement)( doc, node );
1980 continue;
1981 }
1982 }
1983
1984 /* discards <o:p> which encodes the paragraph mark */
1985 if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
1986 {
1987 /* Output proprietary elements to maintain errout compatability
1988 * with traditional Tidy. This is a result of moving all of the
1989 * proprietary checks to near the end of the cleanup process,
1990 * meaning this result would not ordinarily be displayed.
1991 */
1992 Node* next;
1993 TY_(Report)(doc, NULL, node, PROPRIETARY_ELEMENT);
1994 DiscardContainer( doc, node, &next );
1995 node = next;
1996 continue;
1997 }
1998
1999 /* discard empty paragraphs */
2000
2001 if ( node->content == NULL && nodeIsP(node) )
2002 {
2003 /* Use the existing function to ensure consistency */
2004 Node *next = TY_(TrimEmptyElement)( doc, node );
2005 node = next;
2006 continue;
2007 }
2008
2009 if ( nodeIsP(node) )
2010 {
2011 AttVal *attr, *atrStyle;
2012
2013 attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2014 atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2015 /*
2016 (JES) Sometimes Word marks a list item with the following hokie syntax
2017 <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2018 translate these into <li>
2019 */
2020 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2021 /* map <p class="MsoListNumber"> to <ol>...</ol> */
2022 if ( AttrValueIs(attr, "MsoListBullet") ||
2023 AttrValueIs(attr, "MsoListNumber") ||
2024 AttrContains(atrStyle, "mso-list:") )
2025 {
2026 TidyTagId listType = TidyTag_UL;
2027 if (AttrValueIs(attr, "MsoListNumber"))
2028 listType = TidyTag_OL;
2029
2030 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2031
2032 if ( !list || TagId(list) != listType )
2033 {
2034 const Dict* tag = TY_(LookupTagDef)( listType );
2035 list = TY_(InferredTag)(doc, tag->id);
2036 TY_(InsertNodeBeforeElement)(node, list);
2037 }
2038
2039 PurgeWord2000Attributes( doc, node );
2040
2041 if ( node->content )
2042 TY_(CleanWord2000)( doc, node->content );
2043
2044 /* remove node and append to contents of list */
2045 TY_(RemoveNode)(node);
2046 TY_(InsertNodeAtEnd)(list, node);
2047 node = list;
2048 }
2049 /* map sequence of <p class="Code"> to <pre>...</pre> */
2050 else if (AttrValueIs(attr, "Code"))
2051 {
2052 Node *br = TY_(NewLineNode)(lexer);
2053 TY_(NormalizeSpaces)(lexer, node->content);
2054
2055 if ( !list || TagId(list) != TidyTag_PRE )
2056 {
2057 list = TY_(InferredTag)(doc, TidyTag_PRE);
2058 TY_(InsertNodeBeforeElement)(node, list);
2059 }
2060
2061 /* remove node and append to contents of list */
2062 TY_(RemoveNode)(node);
2063 TY_(InsertNodeAtEnd)(list, node);
2064 StripSpan( doc, node );
2065 TY_(InsertNodeAtEnd)(list, br);
2066 node = list->next;
2067 }
2068 else
2069 list = NULL;
2070 }
2071 else
2072 list = NULL;
2073
2074 if (!node)
2075 return;
2076
2077 /* strip out style and class attributes */
2078 if (TY_(nodeIsElement)(node))
2079 PurgeWord2000Attributes( doc, node );
2080
2081 if (node->content)
2082 TY_(CleanWord2000)( doc, node->content );
2083
2084 node = node->next;
2085 }
2086 }
2087
TY_(IsWord2000)2088 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2089 {
2090 AttVal *attval;
2091 Node *node, *head;
2092 Node *html = TY_(FindHTML)( doc );
2093
2094 if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2095 return yes;
2096
2097 /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2098 head = TY_(FindHEAD)( doc );
2099
2100 if (head)
2101 {
2102 for (node = head->content; node; node = node->next)
2103 {
2104 if ( !nodeIsMETA(node) )
2105 continue;
2106
2107 attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2108
2109 if ( !AttrValueIs(attval, "generator") )
2110 continue;
2111
2112 attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2113
2114 if ( AttrContains(attval, "Microsoft") )
2115 return yes;
2116 }
2117 }
2118
2119 return no;
2120 }
2121
2122 /* where appropriate move object elements from head to body */
TY_(BumpObject)2123 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2124 {
2125 Node *node, *next, *head = NULL, *body = NULL;
2126
2127 if (!html)
2128 return;
2129
2130 for ( node = html->content; node != NULL; node = node->next )
2131 {
2132 if ( nodeIsHEAD(node) )
2133 head = node;
2134
2135 if ( nodeIsBODY(node) )
2136 body = node;
2137 }
2138
2139 if ( head != NULL && body != NULL )
2140 {
2141 for (node = head->content; node != NULL; node = next)
2142 {
2143 next = node->next;
2144
2145 if ( nodeIsOBJECT(node) )
2146 {
2147 Node *child;
2148 Bool bump = no;
2149
2150 for (child = node->content; child != NULL; child = child->next)
2151 {
2152 /* bump to body unless content is param */
2153 if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2154 || !nodeIsPARAM(child) )
2155 {
2156 bump = yes;
2157 break;
2158 }
2159 }
2160
2161 if ( bump )
2162 {
2163 TY_(RemoveNode)( node );
2164 TY_(InsertNodeAtStart)( body, node );
2165 }
2166 }
2167 }
2168 }
2169 }
2170
2171
2172 /*\
2173 * Issue #456 - Check meta charset
2174 * 1. if there is no meta charset, it adds one, according to doctype, no warning.
2175 * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
2176 * 3. if it doesn't match the output encoding, and fix. Naybe no warning?
2177 * 4. if there are duplicates, discard them, with warning.
2178 \*/
TY_(TidyMetaCharset)2179 Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
2180 {
2181 AttVal *charsetAttr;
2182 AttVal *contentAttr;
2183 AttVal *httpEquivAttr;
2184 Bool charsetFound = no;
2185 uint outenc = cfg(doc, TidyOutCharEncoding);
2186 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
2187 Node *currentNode;
2188 Node *head = TY_(FindHEAD)(doc);
2189 Node *metaTag;
2190 Node *prevNode;
2191 TidyBuffer buf;
2192 TidyBuffer charsetString;
2193 /* tmbstr httpEquivAttrValue; */
2194 /* tmbstr lcontent; */
2195 tmbstr newValue;
2196 Bool add_meta = cfgBool(doc, TidyMetaCharset);
2197
2198 /* We can't do anything we don't have a head or encoding is NULL */
2199 if (!head || !enc || !TY_(tmbstrlen)(enc))
2200 return no;
2201 if (outenc == RAW)
2202 return no;
2203 #ifndef NO_NATIVE_ISO2022_SUPPORT
2204 if (outenc == ISO2022)
2205 return no;
2206 #endif
2207 if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2208 return no; /* nothing to do here if showing body only */
2209
2210 tidyBufInit(&charsetString);
2211 /* Set up the content test 'charset=value' */
2212 tidyBufClear(&charsetString);
2213 tidyBufAppend(&charsetString, "charset=", 8);
2214 tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
2215 tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
2216 /* process the children of the head */
2217 /* Issue #656 - guard against 'currentNode' being set NULL in loop */
2218 for (currentNode = head->content; currentNode;
2219 currentNode = (currentNode ? currentNode->next : NULL))
2220 {
2221 if (!nodeIsMETA(currentNode))
2222 continue; /* not a meta node */
2223 charsetAttr = attrGetCHARSET(currentNode);
2224 httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
2225 if (!charsetAttr && !httpEquivAttr)
2226 continue; /* has no charset attribute */
2227 /*
2228 Meta charset comes in quite a few flavors:
2229 1. <meta charset="value"> - expected for (X)HTML5.
2230 */
2231 if (charsetAttr && !httpEquivAttr)
2232 {
2233 /* we already found one, so remove the rest. */
2234 if (charsetFound || !charsetAttr->value)
2235 {
2236 prevNode = currentNode->prev;
2237 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2238 TY_(DiscardElement)(doc, currentNode);
2239 currentNode = prevNode;
2240 continue;
2241 }
2242 charsetFound = yes;
2243 /* Fix mismatched attribute value */
2244 if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
2245 {
2246 newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
2247 TY_(tmbstrcpy)(newValue, enc);
2248 /* Note: previously http-equiv had been modified, without warning
2249 in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2250 */
2251 TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
2252 TidyDocFree(doc, charsetAttr->value); /* free current value */
2253 charsetAttr->value = newValue;
2254 }
2255 /* Make sure it's the first element. */
2256 if (currentNode != head->content->next) {
2257 TY_(RemoveNode)(currentNode);
2258 TY_(InsertNodeAtStart)(head, currentNode);
2259 }
2260 continue;
2261 }
2262 /*
2263 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
2264 expected for HTML4. This is normally ok - but can clash.
2265 */
2266 if (httpEquivAttr && !charsetAttr)
2267 {
2268 contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
2269 if (!contentAttr)
2270 continue; /* has no 'content' attribute */
2271 if (!httpEquivAttr->value)
2272 {
2273 prevNode = currentNode->prev;
2274 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2275 TY_(DiscardElement)(doc, currentNode);
2276 currentNode = prevNode;
2277 continue;
2278 }
2279 /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
2280 if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
2281 continue; /* is not 'content-type' */
2282 if (!contentAttr->value)
2283 {
2284 continue; /* has no 'content' attribute has NO VALUE! */
2285 }
2286 /* check encoding matches
2287 If a miss-match found here, fix it. previous silently done
2288 in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2289 lcontent = TY_(tmbstrtolower)(contentAttr->value);
2290 */
2291 if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
2292 {
2293 /* we already found one, so remove the rest. */
2294 if (charsetFound)
2295 {
2296 prevNode = currentNode->prev;
2297 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2298 TY_(DiscardElement)(doc, currentNode);
2299 currentNode = prevNode;
2300 continue;
2301 }
2302 charsetFound = yes;
2303 }
2304 else
2305 {
2306 /* fix a mis-match */
2307 if (charsetFound)
2308 {
2309 prevNode = currentNode->prev;
2310 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2311 TY_(DiscardElement)(doc, currentNode);
2312 currentNode = prevNode;
2313 }
2314 else
2315 {
2316 /* correct the content */
2317 newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
2318 TY_(tmbstrcpy)(newValue, "text/html; charset=");
2319 TY_(tmbstrcpy)(newValue + 19, enc);
2320 if (cfgBool(doc, TidyShowMetaChange)) /* Issue #456 - backward compatibility only */
2321 TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
2322 TidyDocFree(doc, contentAttr->value);
2323 contentAttr->value = newValue;
2324 charsetFound = yes;
2325 }
2326 }
2327 continue;
2328 }
2329 /*
2330 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
2331 This is generally bad. Discard and warn.
2332 */
2333 if (httpEquivAttr && charsetAttr)
2334 {
2335 /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
2336 prevNode = currentNode->prev;
2337 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2338 TY_(DiscardElement)(doc, currentNode);
2339 currentNode = prevNode;
2340 }
2341 }
2342
2343 /* completed head scan - add appropriate meta - if 'yes' and none exists */
2344 if (add_meta && !charsetFound)
2345 {
2346 /* add appropriate meta charset tag - no warning */
2347 metaTag = TY_(InferredTag)(doc, TidyTag_META);
2348 switch (TY_(HTMLVersion)(doc))
2349 {
2350 case HT50:
2351 case XH50:
2352 TY_(AddAttribute)(doc, metaTag, "charset", enc);
2353 break;
2354 default:
2355 tidyBufInit(&buf);
2356 tidyBufAppend(&buf, "text/html; ", 11);
2357 tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
2358 tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
2359 TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
2360 TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); /* add 'content="<enc>"' */
2361 tidyBufFree(&buf);
2362 }
2363 TY_(InsertNodeAtStart)(head, metaTag);
2364 TY_(Report)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
2365 }
2366 tidyBufFree(&charsetString);
2367 return yes;
2368 }
2369
2370
TY_(DropComments)2371 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2372 {
2373 Node* next;
2374
2375 while (node)
2376 {
2377 next = node->next;
2378
2379 if (node->type == CommentTag)
2380 {
2381 TY_(RemoveNode)(node);
2382 TY_(FreeNode)(doc, node);
2383 node = next;
2384 continue;
2385 }
2386
2387 if (node->content)
2388 TY_(DropComments)(doc, node->content);
2389
2390 node = next;
2391 }
2392 }
2393
TY_(DropFontElements)2394 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2395 {
2396 Node* next;
2397
2398 while (node)
2399 {
2400 next = node->next;
2401
2402 if (nodeIsFONT(node))
2403 {
2404 DiscardContainer(doc, node, &next);
2405 node = next;
2406 continue;
2407 }
2408
2409 if (node->content)
2410 TY_(DropFontElements)(doc, node->content, &next);
2411
2412 node = next;
2413 }
2414 }
2415
TY_(WbrToSpace)2416 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2417 {
2418 Node* next;
2419
2420 while (node)
2421 {
2422 next = node->next;
2423
2424 if (nodeIsWBR(node))
2425 {
2426 Node* text;
2427 text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2428 TY_(InsertNodeAfterElement)(node, text);
2429 TY_(RemoveNode)(node);
2430 TY_(FreeNode)(doc, node);
2431 node = next;
2432 continue;
2433 }
2434
2435 if (node->content)
2436 TY_(WbrToSpace)(doc, node->content);
2437
2438 node = next;
2439 }
2440 }
2441
2442 /*
2443 Filters from Word and PowerPoint often use smart
2444 quotes resulting in character codes between 128
2445 and 159. Unfortunately, the corresponding HTML 4.0
2446 entities for these are not widely supported. The
2447 following converts dashes and quotation marks to
2448 the nearest ASCII equivalent. My thanks to
2449 Andrzej Novosiolov for his help with this code.
2450
2451 Note: The old code in the pretty printer applied
2452 this to all node types and attribute values while
2453 this routine applies it only to text nodes. First,
2454 Microsoft Office products rarely put the relevant
2455 characters into these tokens, second support for
2456 them is much better now and last but not least, it
2457 can be harmful to replace these characters since
2458 US-ASCII quote marks are often used as syntax
2459 characters, a simple
2460
2461 <a onmouseover="alert('‘')">...</a>
2462
2463 would be broken if the U+2018 is replaced by "'".
2464 The old code would neither take care whether the
2465 quote mark is already used as delimiter,
2466
2467 <p title='‘'>...</p>
2468
2469 got
2470
2471 <p title='''>...</p>
2472
2473 Since browser support is much better nowadays and
2474 high-quality typography is better than ASCII it'd
2475 be probably a good idea to drop the feature...
2476 */
TY_(DowngradeTypography)2477 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2478 {
2479 Node* next;
2480 Lexer* lexer = doc->lexer;
2481
2482 while (node)
2483 {
2484 next = node->next;
2485
2486 if (TY_(nodeIsText)(node))
2487 {
2488 uint i, c;
2489 tmbstr p = lexer->lexbuf + node->start;
2490
2491 for (i = node->start; i < node->end; ++i)
2492 {
2493 c = (unsigned char) lexer->lexbuf[i];
2494
2495 if (c > 0x7F)
2496 i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2497
2498 if (c >= 0x2013 && c <= 0x201E)
2499 {
2500 switch (c)
2501 {
2502 case 0x2013: /* en dash */
2503 case 0x2014: /* em dash */
2504 c = '-';
2505 break;
2506 case 0x2018: /* left single quotation mark */
2507 case 0x2019: /* right single quotation mark */
2508 case 0x201A: /* single low-9 quotation mark */
2509 c = '\'';
2510 break;
2511 case 0x201C: /* left double quotation mark */
2512 case 0x201D: /* right double quotation mark */
2513 case 0x201E: /* double low-9 quotation mark */
2514 c = '"';
2515 break;
2516 }
2517 }
2518
2519 p = TY_(PutUTF8)(p, c);
2520 }
2521
2522 node->end = p - lexer->lexbuf;
2523 }
2524
2525 if (node->content)
2526 TY_(DowngradeTypography)(doc, node->content);
2527
2528 node = next;
2529 }
2530 }
2531
TY_(ReplacePreformattedSpaces)2532 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2533 {
2534 Node* next;
2535
2536 while (node)
2537 {
2538 next = node->next;
2539
2540 if (node->tag && node->tag->parser == TY_(ParsePre))
2541 {
2542 TY_(NormalizeSpaces)(doc->lexer, node->content);
2543 node = next;
2544 continue;
2545 }
2546
2547 if (node->content)
2548 TY_(ReplacePreformattedSpaces)(doc, node->content);
2549
2550 node = next;
2551 }
2552 }
2553
TY_(ConvertCDATANodes)2554 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2555 {
2556 Node* next;
2557
2558 while (node)
2559 {
2560 next = node->next;
2561
2562 if (node->type == CDATATag)
2563 node->type = TextNode;
2564
2565 if (node->content)
2566 TY_(ConvertCDATANodes)(doc, node->content);
2567
2568 node = next;
2569 }
2570 }
2571
2572 /*
2573 FixLanguageInformation ensures that the document contains (only)
2574 the attributes for language information desired by the output
2575 document type. For example, for XHTML 1.0 documents both
2576 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2577 is desired and for HTML 4.01 only 'lang' is desired.
2578 */
TY_(FixLanguageInformation)2579 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2580 {
2581 Node* next;
2582
2583 while (node)
2584 {
2585 next = node->next;
2586
2587 /* todo: report modifications made here to the report system */
2588
2589 if (TY_(nodeIsElement)(node))
2590 {
2591 AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2592 AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2593
2594 if (lang && xmlLang)
2595 {
2596 /*
2597 todo: check whether both attributes are in sync,
2598 here or elsewhere, where elsewhere is probably
2599 preferable.
2600 AD - March 2005: not mandatory according the standards.
2601 */
2602 }
2603 else if (lang && wantXmlLang)
2604 {
2605 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2606 & doc->lexer->versionEmitted)
2607 TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2608 }
2609 else if (xmlLang && wantLang)
2610 {
2611 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2612 & doc->lexer->versionEmitted)
2613 TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2614 }
2615
2616 if (lang && !wantLang)
2617 TY_(RemoveAttribute)(doc, node, lang);
2618
2619 if (xmlLang && !wantXmlLang)
2620 TY_(RemoveAttribute)(doc, node, xmlLang);
2621 }
2622
2623 if (node->content)
2624 TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2625
2626 node = next;
2627 }
2628 }
2629
2630 /*
2631 Set/fix/remove <html xmlns='...'>
2632 */
TY_(FixXhtmlNamespace)2633 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2634 {
2635 Node* html = TY_(FindHTML)(doc);
2636 AttVal* xmlns;
2637
2638 if (!html)
2639 return;
2640
2641 xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2642
2643 if (wantXmlns)
2644 {
2645 if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2646 TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2647 }
2648 else if (xmlns)
2649 {
2650 TY_(RemoveAttribute)(doc, html, xmlns);
2651 }
2652 }
2653
2654 /*
2655 ...
2656 */
TY_(FixAnchors)2657 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2658 {
2659 Node* next;
2660
2661 while (node)
2662 {
2663 next = node->next;
2664
2665 if (TY_(IsAnchorElement)(doc, node))
2666 {
2667 AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2668 AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2669 Bool hadName = name!=NULL;
2670 Bool hadId = id!=NULL;
2671 Bool IdEmitted = no;
2672 Bool NameEmitted = no;
2673
2674 /* todo: how are empty name/id attributes handled? */
2675
2676 if (name && id)
2677 {
2678 Bool NameHasValue = AttrHasValue(name);
2679 Bool IdHasValue = AttrHasValue(id);
2680 if ( (NameHasValue != IdHasValue) ||
2681 (NameHasValue && IdHasValue &&
2682 TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2683 TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2684 }
2685 else if (name && wantId)
2686 {
2687 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2688 & doc->lexer->versionEmitted)
2689 {
2690 if (TY_(IsValidHTMLID)(name->value))
2691 {
2692 TY_(RepairAttrValue)(doc, node, "id", name->value);
2693 IdEmitted = yes;
2694 }
2695 else
2696 TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2697 }
2698 }
2699 else if (id && wantName)
2700 {
2701 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2702 & doc->lexer->versionEmitted)
2703 {
2704 /* todo: do not assume id is valid */
2705 TY_(RepairAttrValue)(doc, node, "name", id->value);
2706 NameEmitted = yes;
2707 }
2708 }
2709
2710 if (id && !wantId
2711 /* make sure that Name has been emitted if requested */
2712 && (hadName || !wantName || NameEmitted) ) {
2713 if (!wantId && !wantName)
2714 TY_(RemoveAnchorByNode)(doc, id->value, node);
2715 TY_(RemoveAttribute)(doc, node, id);
2716 }
2717
2718 if (name && !wantName
2719 /* make sure that Id has been emitted if requested */
2720 && (hadId || !wantId || IdEmitted) ) {
2721 if (!wantId && !wantName)
2722 TY_(RemoveAnchorByNode)(doc, name->value, node);
2723 TY_(RemoveAttribute)(doc, node, name);
2724 }
2725 }
2726
2727 if (node->content)
2728 TY_(FixAnchors)(doc, node->content, wantName, wantId);
2729
2730 node = next;
2731 }
2732 }
2733
2734 /* Issue #567 - move style elements from body to head
2735 * ==================================================
2736 */
/*
  StyleToHead walks the sibling list starting at 'node'.  A STYLE
  element is either moved to the end of 'head' and reported as
  MOVED_STYLE_TO_HEAD (fix is yes), or left in place and reported
  as FOUND_STYLE_IN_BODY (fix is no).  The content of a STYLE
  element is not searched; any other node with content is
  processed recursively.

  'indent' is the recursion depth.  It is only passed along
  (plus one per level); nothing in this function reads it.
*/
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
{
    Node *following;

    for ( ; node != NULL; node = following )
    {
        /* taken now, because the node may be moved into the head */
        following = node->next;

        if (!nodeIsSTYLE(node))
        {
            if (node->content)
                StyleToHead(doc, head, node->content, fix, indent + 1);
            continue;
        }

        if (!fix)
        {
            TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
            continue;
        }

        TY_(RemoveNode)(node);                             /* unhook style node from body */
        TY_(InsertNodeAtEnd)(head, node);                  /* add to end of head */
        TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
    }
}
2764
2765
TY_(CleanStyle)2766 void TY_(CleanStyle)(TidyDocImpl* doc, Node *html)
2767 {
2768 Node *head = NULL, *body = NULL;
2769 Bool fix = cfgBool(doc, TidyStyleTags);
2770
2771 if (!html)
2772 return; /* oops, not given a start node */
2773
2774 head = TY_(FindHEAD)( doc );
2775 body = TY_(FindBody)( doc );
2776
2777 if ((head != NULL) && (body != NULL))
2778 {
2779 StyleToHead(doc, head, body, fix, 0); /* found head and body */
2780 }
2781 }
2782 /* ==================================================
2783 */
2784
2785 /*
2786 * CleanHead - clean the head node, if it exists, and we
2787 * are going to show it in the output.
2788 * Issue #692 - Remove multiple title elements
2789 */
TY_(CleanHead)2790 void TY_(CleanHead)(TidyDocImpl* doc)
2791 {
2792 Node *head, *node, *next;
2793 uint titles = 0;
2794 if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2795 return; /* not going to show head, so forget it */
2796 head = TY_(FindHEAD)(doc);
2797 if (!head)
2798 return;
2799 node = head->content;
2800 while (node)
2801 {
2802 next = node->next; /* get any 'next' */
2803 if (nodeIsTITLE(node))
2804 {
2805 titles++;
2806 if (titles > 1)
2807 {
2808 TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2809 TY_(DiscardElement)(doc, node); /* delete this node */
2810 }
2811 }
2812 node = next;
2813 }
2814 }
2815
2816 /*
2817 * local variables:
2818 * mode: c
2819 * indent-tabs-mode: nil
2820 * c-basic-offset: 4
2821 * eval: (c-set-offset 'substatement-open 0)
2822 * end:
2823 */
2824