1 /*
2 clean.c -- clean up misuse of presentation markup
3
4 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
6
7 CVS Info :
8
9 $Author: arnaud02 $
10 $Date: 2008/10/14 12:18:10 $
11 $Revision: 1.111 $
12
13 Filters from other formats such as Microsoft Word
14 often make excessive use of presentation markup such
15 as font tags, B, I, and the align attribute. By applying
16 a set of production rules, it is straightforward to
17 transform this to use CSS.
18
19 Some rules replace some of the children of an element by
20 style properties on the element, e.g.
21
22 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
24 Such rules are applied to the element's content and then
25 to the element itself until no more rules apply.
26 Having applied all the rules to an element, it will have
27 a style attribute with one or more properties.
28
29 Other rules strip the element they apply to, replacing
30 it by style properties on the contents, e.g.
31
32 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
33
34 These rules are applied to an element before processing
35 its content and replace the current element by the first
36 element in the exposed content.
37
38 After applying both sets of rules, you can replace the
39 style attribute by a class value and style rule in the
40 document head. To support this, an association of styles
41 and class names is built.
42
43 A naive approach is to rely on string matching to test
44 when two property lists are the same. A better approach
45 would be to first sort the properties before matching.
46
47 */
48
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52
53 #include "tidy-int.h"
54 #include "clean.h"
55 #include "lexer.h"
56 #include "parser.h"
57 #include "attrs.h"
58 #include "message.h"
59 #include "tmbstr.h"
60 #include "utf8.h"
61
62 static Node* CleanNode( TidyDocImpl* doc, Node *node );
63
/*
 Turn node into an element of type tid: the tag definition returned
 by LookupTagDef becomes node->tag and the stored element name is
 replaced by a fresh copy of that definition's name.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* tagdef = TY_(LookupTagDef)( tid );

    TidyDocFree( doc, node->element );
    node->tag = tagdef;
    node->element = TY_(tmbstrdup)( doc->allocator, tagdef->name );
}
71
/*
 Release every entry of a StyleProp list: the name string, the
 value string and the list cell itself. A NULL list is a no-op.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *cur, *following;

    for (cur = props; cur != NULL; cur = following)
    {
        /* remember the link before the cell is released */
        following = cur->next;

        TidyDocFree(doc, cur->name);
        TidyDocFree(doc, cur->value);
        TidyDocFree(doc, cur);
    }
}
85
/*
 Insert a copy of name/value into the list props, which is kept in
 ascending order of name as compared by tmbstrcmp.

 If an entry with the same name is already present the list is left
 untouched and the new value is dropped (first definition wins).

 Returns the head of the list, which changes when the new entry
 sorts before every existing one or the list was empty.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    /* link is the pointer slot (list head or some entry's next field)
       that will be made to point at the new entry */
    StyleProp **link = &props;
    StyleProp *entry;

    for ( ; *link != NULL; link = &(*link)->next )
    {
        int order = TY_(tmbstrcmp)( (*link)->name, name );

        if (order == 0)
            return props;   /* already defined, ignore new value */

        if (order > 0)
            break;          /* new entry goes in front of *link */
    }

    entry = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
    entry->name = TY_(tmbstrdup)(doc->allocator, name);
    entry->value = TY_(tmbstrdup)(doc->allocator, value);
    entry->next = *link;
    *link = entry;

    return props;
}
137
138 /*
139 Create sorted linked list of properties from style string
140 It temporarily places nulls in place of ':' and ';' to
141 delimit the strings for the property name and value.
142 Some systems don't allow you to NULL literal strings,
143 so to avoid this, a copy is made first.
144 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)145 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
146 {
147 tmbstr name, value = NULL, name_end, value_end, line;
148 Bool more;
149
150 line = TY_(tmbstrdup)(doc->allocator, style);
151 name = line;
152
153 while (*name)
154 {
155 while (*name == ' ')
156 ++name;
157
158 name_end = name;
159
160 while (*name_end)
161 {
162 if (*name_end == ':')
163 {
164 value = name_end + 1;
165 break;
166 }
167
168 ++name_end;
169 }
170
171 if (*name_end != ':')
172 break;
173
174 while ( value && *value == ' ')
175 ++value;
176
177 value_end = value;
178 more = no;
179
180 while (*value_end)
181 {
182 if (*value_end == ';')
183 {
184 more = yes;
185 break;
186 }
187
188 ++value_end;
189 }
190
191 *name_end = '\0';
192 *value_end = '\0';
193
194 prop = InsertProperty(doc, prop, name, value);
195 *name_end = ':';
196
197 if (more)
198 {
199 *value_end = ';';
200 name = value_end + 1;
201 continue;
202 }
203
204 break;
205 }
206
207 TidyDocFree(doc, line); /* free temporary copy */
208 return prop;
209 }
210
CreatePropString(TidyDocImpl * doc,StyleProp * props)211 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
212 {
213 tmbstr style, p, s;
214 uint len;
215 StyleProp *prop;
216
217 /* compute length */
218
219 for (len = 0, prop = props; prop; prop = prop->next)
220 {
221 len += TY_(tmbstrlen)(prop->name) + 2;
222 if (prop->value)
223 len += TY_(tmbstrlen)(prop->value) + 2;
224 }
225
226 style = (tmbstr) TidyDocAlloc(doc, len+1);
227 style[0] = '\0';
228
229 for (p = style, prop = props; prop; prop = prop->next)
230 {
231 s = prop->name;
232
233 while((*p++ = *s++))
234 continue;
235
236 if (prop->value)
237 {
238 *--p = ':';
239 *++p = ' ';
240 ++p;
241
242 s = prop->value;
243 while((*p++ = *s++))
244 continue;
245 }
246 if (prop->next == NULL)
247 break;
248
249 *--p = ';';
250 *++p = ' ';
251 ++p;
252 }
253
254 return style;
255 }
256
257 /*
258 create string with merged properties
259 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
260 {
261 tmbstr line;
262 StyleProp *prop;
263
264 prop = CreateProps(doc, NULL, style);
265 prop = CreateProps(doc, prop, property);
266 line = CreatePropString(doc, prop);
267 FreeStyleProps(doc, prop);
268 return line;
269 }
270 */
271
TY_(FreeStyles)272 void TY_(FreeStyles)( TidyDocImpl* doc )
273 {
274 Lexer* lexer = doc->lexer;
275 if ( lexer )
276 {
277 TagStyle *style, *next;
278 for ( style = lexer->styles; style; style = next )
279 {
280 next = style->next;
281 TidyDocFree( doc, style->tag );
282 TidyDocFree( doc, style->tag_class );
283 TidyDocFree( doc, style->properties );
284 TidyDocFree( doc, style );
285 }
286 }
287 }
288
GensymClass(TidyDocImpl * doc)289 static tmbstr GensymClass( TidyDocImpl* doc )
290 {
291 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
292 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
293 if ( pfx == NULL || *pfx == 0 )
294 pfx = "c";
295
296 TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
297 return TY_(tmbstrdup)(doc->allocator, buf);
298 }
299
/*
 Look up the class name associated with the pair (tag, properties)
 in the lexer's style list. Both strings are matched with
 tmbstrcmp. When no record matches, a new one with a freshly
 generated class name is pushed onto the front of the list.

 The returned string belongs to the style record; callers must not
 free it.
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0
            && TY_(tmbstrcmp)(entry->properties, properties) == 0)
        {
            return entry->tag_class;
        }
        entry = entry->next;
    }

    /* not seen before: record it under a new class name */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );

    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
320
321 /*
322 Add class="foo" to node
323 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)324 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
325 {
326 AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
327
328 /*
329 if there already is a class attribute
330 then append class name after a space.
331 */
332 if (classattr)
333 TY_(AppendToClassAttr)( doc, classattr, classname );
334 else /* create new class attribute */
335 TY_(AddAttribute)( doc, node, "class", classname );
336 }
337
TY_(AddStyleAsClass)338 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
339 {
340 ctmbstr classname;
341
342 classname = FindStyle( doc, node->element, stylevalue );
343 AddClass( doc, node, classname);
344 }
345
346 /*
347 Find style attribute in node, and replace it
348 by corresponding class attribute. Search for
349 class in style dictionary otherwise gensym
350 new class and add to dictionary.
351
352 Assumes that node doesn't have a class attribute
353 */
Style2Rule(TidyDocImpl * doc,Node * node)354 static void Style2Rule( TidyDocImpl* doc, Node *node)
355 {
356 AttVal *styleattr, *classattr;
357 ctmbstr classname;
358
359 styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
360
361 if (styleattr)
362 {
363 /* fix for http://tidy.sf.net/bug/850215 */
364 if (!styleattr->value)
365 {
366 TY_(RemoveAttribute)(doc, node, styleattr);
367 return;
368 }
369
370 classname = FindStyle( doc, node->element, styleattr->value );
371 classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
372
373 /*
374 if there already is a class attribute
375 then append class name after an underscore
376 */
377 if (classattr)
378 {
379 TY_(AppendToClassAttr)( doc, classattr, classname );
380 TY_(RemoveAttribute)( doc, node, styleattr );
381 }
382 else /* reuse style attribute for class attribute */
383 {
384 TidyDocFree(doc, styleattr->attribute);
385 TidyDocFree(doc, styleattr->value);
386 styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
387 styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
388 }
389 }
390 }
391
/*
 Append the rule "<selector> { color: <color> }" and a newline to
 the lexer buffer. Nothing is written when either string is NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
402
403 /*
404 move presentation attribs from body to style element
405
406 background="foo" -> body { background-image: url(foo) }
407 bgcolor="foo" -> body { background-color: foo }
408 text="foo" -> body { color: foo }
409 link="foo" -> :link { color: foo }
410 vlink="foo" -> :visited { color: foo }
411 alink="foo" -> :active { color: foo }
412 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)413 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
414 {
415 Lexer* lexer = doc->lexer;
416 tmbstr bgurl = NULL;
417 tmbstr bgcolor = NULL;
418 tmbstr color = NULL;
419 AttVal* attr;
420
421 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
422 {
423 bgurl = attr->value;
424 attr->value = NULL;
425 TY_(RemoveAttribute)( doc, body, attr );
426 }
427
428 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
429 {
430 bgcolor = attr->value;
431 attr->value = NULL;
432 TY_(RemoveAttribute)( doc, body, attr );
433 }
434
435 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
436 {
437 color = attr->value;
438 attr->value = NULL;
439 TY_(RemoveAttribute)( doc, body, attr );
440 }
441
442 if ( bgurl || bgcolor || color )
443 {
444 TY_(AddStringLiteral)(lexer, " body {\n");
445 if (bgurl)
446 {
447 TY_(AddStringLiteral)(lexer, " background-image: url(");
448 TY_(AddStringLiteral)(lexer, bgurl);
449 TY_(AddStringLiteral)(lexer, ");\n");
450 TidyDocFree(doc, bgurl);
451 }
452 if (bgcolor)
453 {
454 TY_(AddStringLiteral)(lexer, " background-color: ");
455 TY_(AddStringLiteral)(lexer, bgcolor);
456 TY_(AddStringLiteral)(lexer, ";\n");
457 TidyDocFree(doc, bgcolor);
458 }
459 if (color)
460 {
461 TY_(AddStringLiteral)(lexer, " color: ");
462 TY_(AddStringLiteral)(lexer, color);
463 TY_(AddStringLiteral)(lexer, ";\n");
464 TidyDocFree(doc, color);
465 }
466
467 TY_(AddStringLiteral)(lexer, " }\n");
468 }
469
470 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
471 {
472 AddColorRule(lexer, " :link", attr->value);
473 TY_(RemoveAttribute)( doc, body, attr );
474 }
475
476 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
477 {
478 AddColorRule(lexer, " :visited", attr->value);
479 TY_(RemoveAttribute)( doc, body, attr );
480 }
481
482 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
483 {
484 AddColorRule(lexer, " :active", attr->value);
485 TY_(RemoveAttribute)( doc, body, attr );
486 }
487 }
488
NiceBody(TidyDocImpl * doc)489 static Bool NiceBody( TidyDocImpl* doc )
490 {
491 Node* node = TY_(FindBody)(doc);
492 if (node)
493 {
494 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
495 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
496 TY_(AttrGetById)(node, TidyAttr_TEXT) ||
497 TY_(AttrGetById)(node, TidyAttr_LINK) ||
498 TY_(AttrGetById)(node, TidyAttr_VLINK) ||
499 TY_(AttrGetById)(node, TidyAttr_ALINK))
500 {
501 doc->badLayout |= USING_BODY;
502 return no;
503 }
504 }
505
506 return yes;
507 }
508
509 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)510 static void CreateStyleElement( TidyDocImpl* doc )
511 {
512 Lexer* lexer = doc->lexer;
513 Node *node, *head, *body;
514 TagStyle *style;
515 AttVal *av;
516
517 if ( lexer->styles == NULL && NiceBody(doc) )
518 return;
519
520 node = TY_(NewNode)( doc->allocator, lexer );
521 node->type = StartTag;
522 node->implicit = yes;
523 node->element = TY_(tmbstrdup)(doc->allocator, "style");
524 TY_(FindTag)( doc, node );
525
526 /* insert type attribute */
527 av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
528 TY_(InsertAttributeAtStart)( node, av );
529
530 body = TY_(FindBody)( doc );
531 lexer->txtstart = lexer->lexsize;
532 if ( body )
533 CleanBodyAttrs( doc, body );
534
535 for (style = lexer->styles; style; style = style->next)
536 {
537 TY_(AddCharToLexer)(lexer, ' ');
538 TY_(AddStringLiteral)(lexer, style->tag);
539 TY_(AddCharToLexer)(lexer, '.');
540 TY_(AddStringLiteral)(lexer, style->tag_class);
541 TY_(AddCharToLexer)(lexer, ' ');
542 TY_(AddCharToLexer)(lexer, '{');
543 TY_(AddStringLiteral)(lexer, style->properties);
544 TY_(AddCharToLexer)(lexer, '}');
545 TY_(AddCharToLexer)(lexer, '\n');
546 }
547
548 lexer->txtend = lexer->lexsize;
549
550 TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
551
552 /*
553 now insert style element into document head
554
555 doc is root node. search its children for html node
556 the head node should be first child of html node
557 */
558 if ( NULL != (head = TY_(FindHEAD)( doc )) )
559 TY_(InsertNodeAtEnd)( head, node );
560 }
561
562
563 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)564 void TY_(FixNodeLinks)(Node *node)
565 {
566 Node *child;
567
568 if (node->prev)
569 node->prev->next = node;
570 else
571 node->parent->content = node;
572
573 if (node->next)
574 node->next->prev = node;
575 else
576 node->parent->last = node;
577
578 for (child = node->content; child; child = child->next)
579 child->parent = node;
580 }
581
582 /*
583 used to strip child of node when
584 the node has one and only one child
585 */
StripOnlyChild(TidyDocImpl * doc,Node * node)586 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
587 {
588 Node *child;
589
590 child = node->content;
591 node->content = child->content;
592 node->last = child->last;
593 child->content = NULL;
594 TY_(FreeNode)(doc, child);
595
596 for (child = node->content; child; child = child->next)
597 child->parent = node;
598 }
599
600 /*
601 used to strip font start and end tags.
602 Extricate "element", replace it by its content and delete it.
603 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)604 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
605 {
606 if (element->content)
607 {
608 Node *node, *parent = element->parent;
609
610 element->last->next = element->next;
611
612 if (element->next)
613 {
614 element->next->prev = element->last;
615 }
616 else
617 parent->last = element->last;
618
619 if (element->prev)
620 {
621 element->content->prev = element->prev;
622 element->prev->next = element->content;
623 }
624 else
625 parent->content = element->content;
626
627 for (node = element->content; node; node = node->next)
628 node->parent = parent;
629
630 *pnode = element->content;
631
632 element->next = element->content = NULL;
633 TY_(FreeNode)(doc, element);
634 }
635 else
636 {
637 *pnode = TY_(DiscardElement)(doc, element);
638 }
639 }
640
641 /*
642 Create new string that consists of the
643 combined style properties in s1 and s2
644
645 To merge property lists, we build a linked
646 list of property/values and insert properties
647 into the list in order, merging values for
648 the same property name.
649 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)650 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
651 {
652 tmbstr s;
653 StyleProp *prop;
654
655 prop = CreateProps(doc, NULL, s1);
656 prop = CreateProps(doc, prop, s2);
657 s = CreatePropString(doc, prop);
658 FreeStyleProps(doc, prop);
659 return s;
660 }
661
662 /*
663 Add style property to element, creating style
664 attribute as needed and adding ; delimiter
665 */
TY_(AddStyleProperty)666 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
667 {
668 AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
669
670 /* if style attribute already exists then insert property */
671
672 if ( av )
673 {
674 if (av->value != NULL)
675 {
676 tmbstr s = MergeProperties( doc, av->value, property );
677 TidyDocFree( doc, av->value );
678 av->value = s;
679 }
680 else
681 {
682 av->value = TY_(tmbstrdup)( doc->allocator, property );
683 }
684 }
685 else /* else create new style attribute */
686 {
687 av = TY_(NewAttributeEx)( doc, "style", property, '"' );
688 TY_(InsertAttributeAtStart)( node, av );
689 }
690 }
691
/*
 Carry the class names of child over to node.

 Only the first class attribute of each element is considered.
 When both have a class value, node's value becomes
 "<node classes> <child classes>". When only the child has one, a
 new class attribute with the child's value is inserted at the
 start of node's attributes. Otherwise nothing changes. child is
 never modified.
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *fromChild, *onNode;
    tmbstr childNames, nodeNames;

    for (fromChild = child->attributes; fromChild; fromChild = fromChild->next)
    {
        if (attrIsCLASS(fromChild))
            break;
    }
    childNames = fromChild ? fromChild->value : NULL;

    for (onNode = node->attributes; onNode; onNode = onNode->next)
    {
        if (attrIsCLASS(onNode))
            break;
    }
    nodeNames = onNode ? onNode->value : NULL;

    if (childNames == NULL)
        return;                 /* nothing to bring across */

    if (nodeNames == NULL)
    {
        /* copy class names from child */
        AttVal *fresh = TY_(NewAttributeEx)( doc, "class", childNames, '"' );
        TY_(InsertAttributeAtStart)( node, fresh );
    }
    else
    {
        /* merge class names from both, separated by one space */
        uint nodeLen = TY_(tmbstrlen)(nodeNames);
        uint childLen = TY_(tmbstrlen)(childNames);
        tmbstr joined = (tmbstr) TidyDocAlloc(doc, nodeLen + childLen + 2);

        TY_(tmbstrcpy)(joined, nodeNames);
        joined[nodeLen] = ' ';
        TY_(tmbstrcpy)(joined + nodeLen + 1, childNames);

        TidyDocFree(doc, onNode->value);
        onNode->value = joined;
    }
}
736
/*
 Fold the class and style attributes of child into node.

 The child may have a class attribute used for attaching styles, so
 class names are merged first (MergeClasses). Then, looking at the
 first style attribute of each element: if both have a value,
 node's value is replaced by MergeProperties(node value, child
 value); if only the child has one, a new style attribute with that
 value is inserted at the start of node's attributes. child is
 never modified.
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *fromChild, *onNode;
    tmbstr childStyle, nodeStyle;

    MergeClasses(doc, node, child);

    for (fromChild = child->attributes; fromChild; fromChild = fromChild->next)
    {
        if (attrIsSTYLE(fromChild))
            break;
    }
    childStyle = fromChild ? fromChild->value : NULL;

    for (onNode = node->attributes; onNode; onNode = onNode->next)
    {
        if (attrIsSTYLE(onNode))
            break;
    }
    nodeStyle = onNode ? onNode->value : NULL;

    if (childStyle == NULL)
        return;                 /* child contributes no style */

    if (nodeStyle == NULL)
    {
        /* copy style of child */
        AttVal *fresh = TY_(NewAttributeEx)( doc, "style", childStyle, '"' );
        TY_(InsertAttributeAtStart)( node, fresh );
    }
    else
    {
        /* merge styles from both */
        tmbstr combined = MergeProperties(doc, nodeStyle, childStyle);
        TidyDocFree(doc, onNode->value);
        onNode->value = combined;
    }
}
782
/*
 Map the value of a font size attribute to a CSS font-size value.

 - ""                     -> NULL
 - first char '0'..'6'    -> entry of the absolute table (NULL for
                             '3', i.e. no property is wanted)
 - '-' then '0'..'6'      -> entry of the 0.8-step table
 - '-' then anything else -> "smaller"
 - anything else: the second character selects from the 1.2-step
   table when it is '0'..'6', otherwise the result is "larger"

 Only the first two characters of size are examined. The returned
 strings are static and must not be freed.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr sizes[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr minussizes[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr plussizes[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    if (size[0] == '\0')
        return NULL;

    if (size[0] >= '0' && size[0] <= '6')
        return sizes[size[0] - '0'];

    if (size[0] == '-')
    {
        return (size[1] >= '0' && size[1] <= '6')
               ? minussizes[size[1] - '0']
               : "smaller"; /*"70%"; */
    }

    return (size[1] >= '0' && size[1] <= '6')
           ? plussizes[size[1] - '0']
           : "larger"; /* "140%" */
}
832
/*
 Add "font-family: <face>" to the style of node. The property is
 formatted into a 256-character buffer by tmbsnprintf.
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    tmbchar property[256];

    TY_(tmbsnprintf)(property, sizeof(property), "font-family: %s", face );
    TY_(AddStyleProperty)( doc, node, property );
}
839
/*
 Apply a font size attribute value to node.

 A <p> with size "6", "5" or "4" is renamed to h1, h2 or h3
 respectively (the element name is replaced and FindTag is called
 on the node) and no style is added. In every other case the size
 is translated by FontSize2Name and, unless that yields NULL, added
 as a "font-size: ..." style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    static const struct
    {
        ctmbstr size;
        ctmbstr heading;
    }
    promote[3] =
    {
        { "6", "h1" },
        { "5", "h2" },
        { "4", "h3" }
    };
    ctmbstr css;

    if (nodeIsP(node))
    {
        uint i;

        for (i = 0; i < 3; ++i)
        {
            if (TY_(tmbstrcmp)(size, promote[i].size) == 0)
            {
                TidyDocFree(doc, node->element);
                node->element = TY_(tmbstrdup)(doc->allocator, promote[i].heading);
                TY_(FindTag)(doc, node);
                return;
            }
        }
    }

    css = FontSize2Name(size);

    if (css != NULL)
    {
        tmbchar property[64];
        TY_(tmbsnprintf)(property, sizeof(property), "font-size: %s", css);
        TY_(AddStyleProperty)( doc, node, property );
    }
}
871
/*
 Add "color: <color>" to the style of node. The property is
 formatted into a 128-character buffer by tmbsnprintf.
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar property[128];

    TY_(tmbsnprintf)(property, sizeof(property), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, property );
}
878
879 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)880 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
881 {
882 uint i;
883 tmbchar buf[128];
884
885 TY_(tmbstrcpy)( buf, "text-align: " );
886 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
887 {
888 if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
889 break;
890 }
891 buf[i] = '\0';
892 TY_(AddStyleProperty)( doc, node, buf );
893 }
894
895 /*
896 add style properties to node corresponding to
897 the font face, size and color attributes
898 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)899 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
900 {
901 while (av)
902 {
903 if (AttrHasValue(av))
904 {
905 if (attrIsFACE(av))
906 AddFontFace( doc, node, av->value );
907 else if (attrIsSIZE(av))
908 AddFontSize( doc, node, av->value );
909 else if (attrIsCOLOR(av))
910 AddFontColor( doc, node, av->value );
911 }
912 av = av->next;
913 }
914 }
915
916 /*
917 Symptom: <p align=center>
918 Action: <p style="text-align: center">
919 */
TextAlign(TidyDocImpl * doc,Node * node)920 static void TextAlign( TidyDocImpl* doc, Node* node )
921 {
922 AttVal *av, *prev;
923
924 prev = NULL;
925
926 for (av = node->attributes; av; av = av->next)
927 {
928 if (attrIsALIGN(av))
929 {
930 if (prev)
931 prev->next = av->next;
932 else
933 node->attributes = av->next;
934
935 if (av->value)
936 AddAlign( doc, node, av->value );
937
938 TY_(FreeAttribute)(doc, av);
939 break;
940 }
941
942 prev = av;
943 }
944 }
945
946 /*
947 Symptom: <table bgcolor="red">
948 Action: <table style="background-color: red">
949 */
TableBgColor(TidyDocImpl * doc,Node * node)950 static void TableBgColor( TidyDocImpl* doc, Node* node )
951 {
952 AttVal* attr;
953 tmbchar buf[256];
954
955 if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
956 {
957 TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
958 TY_(RemoveAttribute)( doc, node, attr );
959 TY_(AddStyleProperty)( doc, node, buf );
960 }
961 }
962
963 /*
964 The clean up rules use the pnode argument to return the
965 next node when the original node has been deleted
966 */
967
968 /*
969 Symptom: <dir> <li> where <li> is only child
970 Action: coerce <dir> <li> to <div> with indent.
971 */
972
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))973 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
974 {
975 Node *child;
976
977 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
978 {
979 child = node->content;
980
981 if (child == NULL)
982 return no;
983
984 /* check child has no peers */
985
986 if (child->next)
987 return no;
988
989 if ( !nodeIsLI(child) )
990 return no;
991
992 if ( !child->implicit )
993 return no;
994
995 /* coerce dir to div */
996 node->tag = TY_(LookupTagDef)( TidyTag_DIV );
997 TidyDocFree( doc, node->element );
998 node->element = TY_(tmbstrdup)(doc->allocator, "div");
999 TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
1000 StripOnlyChild( doc, node );
1001 return yes;
1002 }
1003
1004 return no;
1005 }
1006
1007 /*
1008 Symptom: <center>
1009 Action: replace <center> by <div style="text-align: center">
1010 */
1011
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1012 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1013 {
1014 if ( nodeIsCENTER(node) )
1015 {
1016 if ( cfgBool(doc, TidyDropFontTags) )
1017 {
1018 if (node->content)
1019 {
1020 Node *last = node->last;
1021 DiscardContainer( doc, node, pnode );
1022
1023 node = TY_(InferredTag)(doc, TidyTag_BR);
1024 TY_(InsertNodeAfterElement)(last, node);
1025 }
1026 else
1027 {
1028 Node *prev = node->prev, *next = node->next,
1029 *parent = node->parent;
1030 DiscardContainer( doc, node, pnode );
1031
1032 node = TY_(InferredTag)(doc, TidyTag_BR);
1033 if (next)
1034 TY_(InsertNodeBeforeElement)(next, node);
1035 else if (prev)
1036 TY_(InsertNodeAfterElement)(prev, node);
1037 else
1038 TY_(InsertNodeAtStart)(parent, node);
1039 }
1040
1041 return yes;
1042 }
1043
1044 RenameElem( doc, node, TidyTag_DIV );
1045 TY_(AddStyleProperty)( doc, node, "text-align: center" );
1046 return yes;
1047 }
1048
1049 return no;
1050 }
1051
1052 /* Copy child attributes to node. Duplicate attributes are overwritten.
1053 Unique attributes (such as ID) disable the action.
1054 Attributes style and class are not dealt with. A call to MergeStyles
1055 will do that.
1056 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1057 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1058 {
1059 AttVal *av1, *av2;
1060 TidyAttrId id;
1061
1062 /* Detect attributes that cannot be merged or overwritten. */
1063 if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1064 && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1065 return no;
1066
1067 /* Move child attributes to node. Attributes in node
1068 can be overwritten or merged. */
1069 for (av2 = child->attributes; av2; )
1070 {
1071 /* Dealt by MergeStyles. */
1072 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1073 {
1074 av2 = av2->next;
1075 continue;
1076 }
1077 /* Avoid duplicates in node */
1078 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1079 && (av1=TY_(AttrGetById)(node, id))!= NULL)
1080 TY_(RemoveAttribute)( doc, node, av1 );
1081
1082 /* Move attribute from child to node */
1083 TY_(DetachAttribute)( child, av2 );
1084 av1 = av2;
1085 av2 = av2->next;
1086 av1->next = NULL;
1087 TY_(InsertAttributeAtEnd)( node, av1 );
1088 }
1089
1090 return yes;
1091 }
1092
1093 /*
1094 Symptom <XX><XX>...</XX></XX>
1095 Action: merge the two XXs
1096
1097 For instance, this is useful after nested <dir>s used by Word
1098 for indenting have been converted to <div>s
1099
1100 If state is "no", no merging.
1101 If state is "yes", inner element is discarded. Only Style and Class
1102 attributes are merged using MergeStyles().
1103 If state is "auto", atttibutes are merged as described in CopyAttrs().
1104 Style and Class attributes are merged using MergeStyles().
1105 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1106 static Bool MergeNestedElements( TidyDocImpl* doc,
1107 TidyTagId Id, TidyTriState state, Node *node,
1108 Node **ARG_UNUSED(pnode))
1109 {
1110 Node *child;
1111
1112 if ( state == TidyNoState
1113 || !TagIsId(node, Id) )
1114 return no;
1115
1116 child = node->content;
1117
1118 if ( child == NULL
1119 || child->next != NULL
1120 || !TagIsId(child, Id) )
1121 return no;
1122
1123 if ( state == TidyAutoState
1124 && CopyAttrs(doc, node, child) == no )
1125 return no;
1126
1127 MergeStyles( doc, node, child );
1128 StripOnlyChild( doc, node );
1129 return yes;
1130 }
1131
1132 /*
1133 Symptom: <ul><li><ul>...</ul></li></ul>
1134 Action: discard outer list
1135 */
1136
NestedList(TidyDocImpl * doc,Node * node,Node ** pnode)1137 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1138 {
1139 Node *child, *list;
1140
1141 if ( nodeIsUL(node) || nodeIsOL(node) )
1142 {
1143 child = node->content;
1144
1145 if (child == NULL)
1146 return no;
1147
1148 /* check child has no peers */
1149
1150 if (child->next)
1151 return no;
1152
1153 list = child->content;
1154
1155 if (!list)
1156 return no;
1157
1158 if (list->tag != node->tag)
1159 return no;
1160
1161 /* check list has no peers */
1162 if (list->next)
1163 return no;
1164
1165 *pnode = list; /* Set node to resume iteration */
1166
1167 /* move inner list node into position of outer node */
1168 list->prev = node->prev;
1169 list->next = node->next;
1170 list->parent = node->parent;
1171 TY_(FixNodeLinks)(list);
1172
1173 /* get rid of outer ul and its li */
1174 child->content = NULL;
1175 TY_(FreeNode)( doc, child ); /* See test #427841. */
1176 child = NULL;
1177 node->content = NULL;
1178 node->next = NULL;
1179 TY_(FreeNode)( doc, node );
1180 node = NULL;
1181
1182 /*
1183 If prev node was a list the chances are this node
1184 should be appended to that list. Word has no way of
1185 recognizing nested lists and just uses indents
1186 */
1187
1188 if (list->prev)
1189 {
1190 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1191 && list->prev->last )
1192 {
1193 node = list;
1194 list = node->prev;
1195
1196 child = list->last; /* <li> */
1197
1198 list->next = node->next;
1199 TY_(FixNodeLinks)(list);
1200
1201 node->parent = child;
1202 node->next = NULL;
1203 node->prev = child->last;
1204 TY_(FixNodeLinks)(node);
1205 CleanNode( doc, node );
1206 }
1207 }
1208
1209 return yes;
1210 }
1211
1212 return no;
1213 }
1214
1215 /* Find CSS equivalent in a SPAN element */
1216 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1217 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1218 {
1219 struct
1220 {
1221 TidyTagId id;
1222 ctmbstr CSSeq;
1223 Bool deprecated;
1224 }
1225 const CSS_SpanEq[] =
1226 {
1227 { TidyTag_B, "font-weight: bold", no },
1228 { TidyTag_I, "font-style: italic", no },
1229 { TidyTag_S, "text-decoration: line-through", yes},
1230 { TidyTag_STRIKE, "text-decoration: line-through", yes},
1231 { TidyTag_U, "text-decoration: underline", yes},
1232 { TidyTag_UNKNOWN, NULL, no }
1233 };
1234 uint i;
1235
1236 for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1237 if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1238 && TagIsId(node, CSS_SpanEq[i].id) )
1239 {
1240 *s = CSS_SpanEq[i].CSSeq;
1241 return yes;
1242 }
1243 return no;
1244 }
1245
1246 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1247 static Bool CanApplyBlockStyle( Node *node )
1248 {
1249 if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1250 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1251 {
1252 return yes;
1253 }
1254 return no;
1255 }
1256
1257 /*
1258 Symptom: the only child of a block-level element is a
1259 presentation element such as B, I or FONT
1260
1261 Action: add style "font-weight: bold" to the block and
1262 strip the <b> element, leaving its children.
1263
1264 example:
1265
1266 <p>
1267 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1268 </p>
1269
1270 becomes:
1271
1272 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1273 Draft Recommended Practice
1274 </p>
1275
1276 This code also replaces the align attribute by a style attribute.
1277 However, to avoid CSS problems with Navigator 4, this isn't done
1278 for the elements: caption, tr and table
1279 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1280 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1281 {
1282 Node *child;
1283 ctmbstr CSSeq;
1284
1285 /* check for bgcolor */
1286 if ( nodeIsTABLE(node)
1287 || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1288 TableBgColor( doc, node );
1289
1290 if (CanApplyBlockStyle(node))
1291 {
1292 /* check for align attribute */
1293 if ( !nodeIsCAPTION(node) )
1294 TextAlign( doc, node );
1295
1296 child = node->content;
1297 if (child == NULL)
1298 return no;
1299
1300 /* check child has no peers */
1301 if (child->next)
1302 return no;
1303
1304 if ( FindCSSSpanEq(child, &CSSeq, no) )
1305 {
1306 MergeStyles( doc, node, child );
1307 TY_(AddStyleProperty)( doc, node, CSSeq );
1308 StripOnlyChild( doc, node );
1309 return yes;
1310 }
1311 else if ( nodeIsFONT(child) )
1312 {
1313 MergeStyles( doc, node, child );
1314 AddFontStyles( doc, node, child->attributes );
1315 StripOnlyChild( doc, node );
1316 return yes;
1317 }
1318 }
1319
1320 return no;
1321 }
1322
1323 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1324 static Bool CanApplyInlineStyle( Node *node )
1325 {
1326 return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1327 }
1328
1329 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1330 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1331 {
1332 Node *child;
1333 ctmbstr CSSeq;
1334
1335 if ( CanApplyInlineStyle(node) )
1336 {
1337 child = node->content;
1338
1339 if (child == NULL)
1340 return no;
1341
1342 /* check child has no peers */
1343
1344 if (child->next)
1345 return no;
1346
1347 if ( FindCSSSpanEq(child, &CSSeq, no) )
1348 {
1349 MergeStyles( doc, node, child );
1350 TY_(AddStyleProperty)( doc, node, CSSeq );
1351 StripOnlyChild( doc, node );
1352 return yes;
1353 }
1354 else if ( nodeIsFONT(child) )
1355 {
1356 MergeStyles( doc, node, child );
1357 AddFontStyles( doc, node, child->attributes );
1358 StripOnlyChild( doc, node );
1359 return yes;
1360 }
1361 }
1362
1363 return no;
1364 }
1365
1366 /*
1367 Transform element to equivalent CSS
1368 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1369 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1370 Node **ARG_UNUSED(pnode) )
1371 {
1372 ctmbstr CSSeq;
1373
1374 /* if node is the only child of parent element then leave alone
1375 Do so only if BlockStyle may be succesful. */
1376 if ( node->parent->content == node && node->next == NULL &&
1377 (CanApplyBlockStyle(node->parent)
1378 || CanApplyInlineStyle(node->parent)) )
1379 return no;
1380
1381 if ( FindCSSSpanEq(node, &CSSeq, yes) )
1382 {
1383 RenameElem( doc, node, TidyTag_SPAN );
1384 TY_(AddStyleProperty)( doc, node, CSSeq );
1385 return yes;
1386 }
1387 return no;
1388 }
1389
1390 /*
1391 Replace font elements by span elements, deleting
1392 the font element's attributes and replacing them
1393 by a single style attribute.
1394 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1395 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1396 {
1397 AttVal *av, *style, *next;
1398
1399 if ( nodeIsFONT(node) )
1400 {
1401 if ( cfgBool(doc, TidyDropFontTags) )
1402 {
1403 DiscardContainer( doc, node, pnode );
1404 return yes;
1405 }
1406
1407 /* if node is the only child of parent element then leave alone
1408 Do so only if BlockStyle may be succesful. */
1409 if ( node->parent->content == node && node->next == NULL &&
1410 CanApplyBlockStyle(node->parent) )
1411 return no;
1412
1413 AddFontStyles( doc, node, node->attributes );
1414
1415 /* extract style attribute and free the rest */
1416 av = node->attributes;
1417 style = NULL;
1418
1419 while (av)
1420 {
1421 next = av->next;
1422
1423 if (attrIsSTYLE(av))
1424 {
1425 av->next = NULL;
1426 style = av;
1427 }
1428 else
1429 {
1430 TY_(FreeAttribute)( doc, av );
1431 }
1432 av = next;
1433 }
1434
1435 node->attributes = style;
1436 RenameElem( doc, node, TidyTag_SPAN );
1437 return yes;
1438 }
1439
1440 return no;
1441 }
1442
1443 /*
1444 Applies all matching rules to a node.
1445 */
CleanNode(TidyDocImpl * doc,Node * node)1446 Node* CleanNode( TidyDocImpl* doc, Node *node )
1447 {
1448 Node *next = NULL;
1449 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1450 TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1451
1452 for (next = node; TY_(nodeIsElement)(node); node = next)
1453 {
1454 if ( Dir2Div(doc, node, &next) )
1455 continue;
1456
1457 /* Special case: true result means
1458 ** that arg node and its parent no longer exist.
1459 ** So we must jump back up the CreateStyleProperties()
1460 ** call stack until we have a valid node reference.
1461 */
1462 if ( NestedList(doc, node, &next) )
1463 return next;
1464
1465 if ( Center2Div(doc, node, &next) )
1466 continue;
1467
1468 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1469 continue;
1470
1471 if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1472 continue;
1473
1474 if ( BlockStyle(doc, node, &next) )
1475 continue;
1476
1477 if ( InlineStyle(doc, node, &next) )
1478 continue;
1479
1480 if ( InlineElementToCSS(doc, node, &next) )
1481 continue;
1482
1483 if ( Font2Span(doc, node, &next) )
1484 continue;
1485
1486 break;
1487 }
1488
1489 return next;
1490 }
1491
1492 /* Special case: if the current node is destroyed by
1493 ** CleanNode() lower in the tree, this node and its parent
1494 ** no longer exist. So we must jump back up the CleanTree()
1495 ** call stack until we have a valid node reference.
1496 */
1497
CleanTree(TidyDocImpl * doc,Node * node)1498 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1499 {
1500 if (node->content)
1501 {
1502 Node *child;
1503 for (child = node->content; child != NULL; child = child->next)
1504 {
1505 child = CleanTree( doc, child );
1506 if ( !child )
1507 break;
1508 }
1509 }
1510
1511 return CleanNode( doc, node );
1512 }
1513
/*
  Post-order walk over the subtree rooted at node: Style2Rule() is
  called on every child subtree first and on the node itself last.
*/
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while ( kid != NULL )
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1529
TY_(CleanDocument)1530 void TY_(CleanDocument)( TidyDocImpl* doc )
1531 {
1532 /* placeholder. CleanTree()/CleanNode() will not
1533 ** zap root element
1534 */
1535 CleanTree( doc, &doc->root );
1536
1537 if ( cfgBool(doc, TidyMakeClean) )
1538 {
1539 DefineStyleRules( doc, &doc->root );
1540 CreateStyleElement( doc );
1541 }
1542 }
1543
1544 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1545 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1546 {
1547 Node *next;
1548
1549 while (node)
1550 {
1551 next = node->next;
1552
1553 if ( (nodeIsB(node) || nodeIsI(node))
1554 && node->parent && node->parent->tag == node->tag)
1555 {
1556 /* strip redundant inner element */
1557 DiscardContainer( doc, node, &next );
1558 node = next;
1559 continue;
1560 }
1561
1562 if ( node->content )
1563 TY_(NestedEmphasis)( doc, node->content );
1564
1565 node = next;
1566 }
1567 }
1568
1569
1570
1571 /* replace i by em and b by strong */
TY_(EmFromI)1572 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1573 {
1574 while (node)
1575 {
1576 if ( nodeIsI(node) )
1577 RenameElem( doc, node, TidyTag_EM );
1578 else if ( nodeIsB(node) )
1579 RenameElem( doc, node, TidyTag_STRONG );
1580
1581 if ( node->content )
1582 TY_(EmFromI)( doc, node->content );
1583
1584 node = node->next;
1585 }
1586 }
1587
HasOneChild(Node * node)1588 static Bool HasOneChild(Node *node)
1589 {
1590 return (node->content && node->content->next == NULL);
1591 }
1592
1593 /*
1594 Some people use dir or ul without an li
1595 to indent the content. The pattern to
1596 look for is a list with a single implicit
1597 li. This is recursively replaced by an
1598 implicit blockquote.
1599 */
TY_(List2BQ)1600 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1601 {
1602 while (node)
1603 {
1604 if (node->content)
1605 TY_(List2BQ)( doc, node->content );
1606
1607 if ( node->tag && node->tag->parser == TY_(ParseList) &&
1608 HasOneChild(node) && node->content->implicit )
1609 {
1610 StripOnlyChild( doc, node );
1611 RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1612 node->implicit = yes;
1613 }
1614
1615 node = node->next;
1616 }
1617 }
1618
1619
1620 /*
1621 Replace implicit blockquote by div with an indent
1622 taking care to reduce nested blockquotes to a single
1623 div with the indent set to match the nesting depth
1624 */
TY_(BQ2Div)1625 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1626 {
1627 tmbchar indent_buf[ 32 ];
1628 uint indent;
1629
1630 while (node)
1631 {
1632 if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1633 {
1634 indent = 1;
1635
1636 while( HasOneChild(node) &&
1637 nodeIsBLOCKQUOTE(node->content) &&
1638 node->implicit)
1639 {
1640 ++indent;
1641 StripOnlyChild( doc, node );
1642 }
1643
1644 if (node->content)
1645 TY_(BQ2Div)( doc, node->content );
1646
1647 TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1648 2*indent);
1649
1650 RenameElem( doc, node, TidyTag_DIV );
1651 TY_(AddStyleProperty)(doc, node, indent_buf );
1652 }
1653 else if (node->content)
1654 TY_(BQ2Div)( doc, node->content );
1655
1656 node = node->next;
1657 }
1658 }
1659
1660
/*
  Starting at node itself and following parent links, return the
  first TD element encountered, or NULL when there is none.
  The doc argument is unused.
*/
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cursor = node;

    while ( cursor != NULL && !nodeIsTD(cursor) )
        cursor = cursor->parent;

    return cursor;
}
1672
1673 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1674 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1675 {
1676 Lexer* lexer = doc->lexer;
1677
1678 for (;;)
1679 {
1680 ctmbstr lexbuf = lexer->lexbuf + node->start;
1681 if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1682 {
1683 Node* cell = FindEnclosingCell( doc, node );
1684 if ( cell )
1685 {
1686 /* Need to put into cell so it doesn't look weird
1687 */
1688 Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1689 assert( (byte)'\240' == (byte)160 );
1690 TY_(InsertNodeBeforeElement)( node, nbsp );
1691 }
1692 }
1693
1694 /* discard node and returns next, unless it is a text node */
1695 if ( node->type == TextNode )
1696 node = node->next;
1697 else
1698 node = TY_(DiscardElement)( doc, node );
1699
1700 if (node == NULL)
1701 return NULL;
1702
1703 if (node->type == SectionTag)
1704 {
1705 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1706 {
1707 node = PruneSection( doc, node );
1708 continue;
1709 }
1710
1711 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1712 {
1713 node = TY_(DiscardElement)( doc, node );
1714 break;
1715 }
1716 }
1717 }
1718
1719 return node;
1720 }
1721
TY_(DropSections)1722 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1723 {
1724 Lexer* lexer = doc->lexer;
1725 while (node)
1726 {
1727 if (node->type == SectionTag)
1728 {
1729 /* prune up to matching endif */
1730 if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1731 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1732 {
1733 node = PruneSection( doc, node );
1734 continue;
1735 }
1736
1737 /* discard others as well */
1738 node = TY_(DiscardElement)( doc, node );
1739 continue;
1740 }
1741
1742 if (node->content)
1743 TY_(DropSections)( doc, node->content );
1744
1745 node = node->next;
1746 }
1747 }
1748
/*
  Remove the attributes Word 2000 scatters over an element.

  Kept:
    - class="Code" (denotes preformatted text);
    - any class whose value does not start with "Mso" (user defined
      styles are passed through as HTML class names).

  Removed and freed:
    - remaining class attributes, style and lang;
    - height and width on TD, TR and TH;
    - any attribute whose name starts with "x:".

  Everything else is kept.
*/
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    AttVal *cur, *following, *lastKept = NULL;

    for ( cur = node->attributes; cur != NULL; cur = following )
    {
        Bool drop;

        following = cur->next;

        if ( attrIsCLASS(cur) &&
             ( AttrValueIs(cur, "Code") ||
               TY_(tmbstrncmp)(cur->value, "Mso", 3) != 0 ) )
        {
            lastKept = cur;
            continue;
        }

        drop = ( attrIsCLASS(cur) ||
                 attrIsSTYLE(cur) ||
                 attrIsLANG(cur)  ||
                 ( (attrIsHEIGHT(cur) || attrIsWIDTH(cur)) &&
                   (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
                 (cur->attribute && TY_(tmbstrncmp)(cur->attribute, "x:", 2) == 0) );

        if ( !drop )
        {
            lastKept = cur;
            continue;
        }

        /* unlink from the singly linked list before freeing */
        if ( lastKept != NULL )
            lastKept->next = following;
        else
            node->attributes = following;

        TY_(FreeAttribute)( doc, cur );
    }
}
1787
1788 /* Word2000 uses span excessively, so we strip span out */
StripSpan(TidyDocImpl * doc,Node * span)1789 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1790 {
1791 Node *node, *prev = NULL, *content;
1792
1793 /*
1794 deal with span elements that have content
1795 by splicing the content in place of the span
1796 after having processed it
1797 */
1798
1799 TY_(CleanWord2000)( doc, span->content );
1800 content = span->content;
1801
1802 if (span->prev)
1803 prev = span->prev;
1804 else if (content)
1805 {
1806 node = content;
1807 content = content->next;
1808 TY_(RemoveNode)(node);
1809 TY_(InsertNodeBeforeElement)(span, node);
1810 prev = node;
1811 }
1812
1813 while (content)
1814 {
1815 node = content;
1816 content = content->next;
1817 TY_(RemoveNode)(node);
1818 TY_(InsertNodeAfterElement)(prev, node);
1819 prev = node;
1820 }
1821
1822 if (span->next == NULL)
1823 span->parent->last = prev;
1824
1825 node = span->next;
1826 span->content = NULL;
1827 TY_(DiscardElement)( doc, span );
1828 return node;
1829 }
1830
1831 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1832 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1833 {
1834 while ( node )
1835 {
1836 if ( node->content )
1837 TY_(NormalizeSpaces)( lexer, node->content );
1838
1839 if (TY_(nodeIsText)(node))
1840 {
1841 uint i, c;
1842 tmbstr p = lexer->lexbuf + node->start;
1843
1844 for (i = node->start; i < node->end; ++i)
1845 {
1846 c = (byte) lexer->lexbuf[i];
1847
1848 /* look for UTF-8 multibyte character */
1849 if ( c > 0x7F )
1850 i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1851
1852 if ( c == 160 )
1853 c = ' ';
1854
1855 p = TY_(PutUTF8)(p, c);
1856 }
1857 node->end = p - lexer->lexbuf;
1858 }
1859
1860 node = node->next;
1861 }
1862 }
1863
1864 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1865 static Bool NoMargins(Node *node)
1866 {
1867 AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1868
1869 if ( !AttrHasValue(attval) )
1870 return no;
1871
1872 /* search for substring "margin-top: 0" */
1873 if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1874 return no;
1875
1876 /* search for substring "margin-bottom: 0" */
1877 if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1878 return no;
1879
1880 return yes;
1881 }
1882
1883 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1884 static Bool SingleSpace( Lexer* lexer, Node* node )
1885 {
1886 if ( node->content )
1887 {
1888 node = node->content;
1889
1890 if ( node->next != NULL )
1891 return no;
1892
1893 if ( node->type != TextNode )
1894 return no;
1895
1896 if ( (node->end - node->start) == 1 &&
1897 lexer->lexbuf[node->start] == ' ' )
1898 return yes;
1899
1900 if ( (node->end - node->start) == 2 )
1901 {
1902 uint c = 0;
1903 TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1904 if ( c == 160 )
1905 return yes;
1906 }
1907 }
1908
1909 return no;
1910 }
1911
/*
  This is a major clean up to strip out all the extra stuff you get
  when you save as web page from Word 2000. It doesn't yet know what
  to do with VML tags, but these will appear as errors unless you
  declare them as new tags, such as o:p which needs to be declared
  as inline.

  The function walks node and its following siblings and recurses
  into their content. In order, for each node it:
    - on the HTML element, returns at once unless an xmlns:o
      attribute is present or TidyMakeBare is set; otherwise all
      attributes of the HTML element are freed;
    - turns a run of P elements accepted by NoMargins() into one PRE;
    - strips block elements whose content is a single space;
    - discards STYLE, META and comment nodes;
    - strips SPAN and FONT elements (keeping their content);
    - discards LINK elements with rel="File-List";
    - discards <o:p> containers and empty paragraphs;
    - turns Word list paragraphs into LI elements gathered in an
      inferred UL/OL, and runs of <p class="Code"> into an inferred
      PRE;
    - purges Word specific attributes from the remaining elements.
*/
void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
{
    /* `list` is the inferred container (UL, OL or PRE) currently
       being filled from a sequence of consecutive paragraphs; it is
       reset to NULL as soon as the sequence is interrupted */
    Lexer* lexer = doc->lexer;
    Node* list = NULL;

    while ( node )
    {
        /* get rid of Word's xmlns attributes */
        if ( nodeIsHTML(node) )
        {
            /* check that it's a Word 2000 document */
            if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
                 !cfgBool(doc, TidyMakeBare) )
                return;

            TY_(FreeAttrs)( doc, node );
        }

        /* fix up preformatted sections by looking for a
        ** sequence of paragraphs with zero top/bottom margin
        */
        if ( nodeIsP(node) )
        {
            if (NoMargins(node))
            {
                Node *pre, *next;
                TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);

                PurgeWord2000Attributes( doc, node );

                if (node->content)
                    TY_(CleanWord2000)( doc, node->content );

                pre = node;
                node = node->next;

                /* continue to strip p's: each following matching
                   paragraph is moved to the end of the PRE, preceded
                   by a line break node, and then replaced by its own
                   content */

                while ( nodeIsP(node) && NoMargins(node) )
                {
                    next = node->next;
                    TY_(RemoveNode)(node);
                    TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
                    TY_(InsertNodeAtEnd)(pre, node);
                    StripSpan( doc, node );
                    node = next;
                }

                /* the PRE was the last sibling: nothing left to do */
                if (node == NULL)
                    break;
            }
        }

        /* a block element holding only a single space is replaced
           by its content */
        if (node->tag && (node->tag->model & CM_BLOCK)
            && SingleSpace(lexer, node))
        {
            node = StripSpan( doc, node );
            continue;
        }
        /* discard Word's style verbiage */
        if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
             node->type == CommentTag )
        {
            node = TY_(DiscardElement)( doc, node );
            continue;
        }

        /* strip out all span and font tags Word scatters so liberally! */
        if ( nodeIsSPAN(node) || nodeIsFONT(node) )
        {
            node = StripSpan( doc, node );
            continue;
        }

        if ( nodeIsLINK(node) )
        {
            AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);

            if (AttrValueIs(attr, "File-List"))
            {
                node = TY_(DiscardElement)( doc, node );
                continue;
            }
        }

        /* discards <o:p> which encodes the paragraph mark */
        if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
        {
            Node* next;
            DiscardContainer( doc, node, &next );
            node = next;
            continue;
        }

        /* discard empty paragraphs */

        if ( node->content == NULL && nodeIsP(node) )
        {
            /*  Use the existing function to ensure consistency */
            Node *next = TY_(TrimEmptyElement)( doc, node );
            node = next;
            continue;
        }

        if ( nodeIsP(node) )
        {
            AttVal *attr, *atrStyle;

            attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
            atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
            /*
               (JES) Sometimes Word marks a list item with the following hokie syntax
               <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
                translate these into <li>
            */
            /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
            /* map <p class="MsoListNumber"> to <ol>...</ol> */
            if ( AttrValueIs(attr, "MsoListBullet") ||
                 AttrValueIs(attr, "MsoListNumber") ||
                 AttrContains(atrStyle, "mso-list:") )
            {
                /* OL only for MsoListNumber, UL in every other case */
                TidyTagId listType = TidyTag_UL;
                if (AttrValueIs(attr, "MsoListNumber"))
                    listType = TidyTag_OL;

                TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);

                /* start a new list unless one of the right type is
                   already being filled */
                if ( !list || TagId(list) != listType )
                {
                    const Dict* tag = TY_(LookupTagDef)( listType );
                    list = TY_(InferredTag)(doc, tag->id);
                    TY_(InsertNodeBeforeElement)(node, list);
                }

                PurgeWord2000Attributes( doc, node );

                if ( node->content )
                    TY_(CleanWord2000)( doc, node->content );

                /* remove node and append to contents of list */
                TY_(RemoveNode)(node);
                TY_(InsertNodeAtEnd)(list, node);
                /* continue from the list so that node->next below
                   yields the sibling following the list */
                node = list;
            }
            /* map sequence of <p class="Code"> to <pre>...</pre> */
            else if (AttrValueIs(attr, "Code"))
            {
                Node *br = TY_(NewLineNode)(lexer);
                TY_(NormalizeSpaces)(lexer, node->content);

                if ( !list || TagId(list) != TidyTag_PRE )
                {
                    list = TY_(InferredTag)(doc, TidyTag_PRE);
                    TY_(InsertNodeBeforeElement)(node, list);
                }

                /* remove node and append to contents of list */
                TY_(RemoveNode)(node);
                TY_(InsertNodeAtEnd)(list, node);
                StripSpan( doc, node );
                TY_(InsertNodeAtEnd)(list, br);
                /* the paragraph is gone; resume at the sibling
                   following the PRE (may be NULL, checked below) */
                node = list->next;
            }
            else
                list = NULL;
        }
        else
            list = NULL;

        if (!node)
            return;

        /* strip out style and class attributes */
        if (TY_(nodeIsElement)(node))
            PurgeWord2000Attributes( doc, node );

        if (node->content)
            TY_(CleanWord2000)( doc, node->content );

        node = node->next;
    }
}
2102
TY_(IsWord2000)2103 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2104 {
2105 AttVal *attval;
2106 Node *node, *head;
2107 Node *html = TY_(FindHTML)( doc );
2108
2109 if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2110 return yes;
2111
2112 /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2113 head = TY_(FindHEAD)( doc );
2114
2115 if (head)
2116 {
2117 for (node = head->content; node; node = node->next)
2118 {
2119 if ( !nodeIsMETA(node) )
2120 continue;
2121
2122 attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2123
2124 if ( !AttrValueIs(attval, "generator") )
2125 continue;
2126
2127 attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2128
2129 if ( AttrContains(attval, "Microsoft") )
2130 return yes;
2131 }
2132 }
2133
2134 return no;
2135 }
2136
2137 /* where appropriate move object elements from head to body */
TY_(BumpObject)2138 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2139 {
2140 Node *node, *next, *head = NULL, *body = NULL;
2141
2142 if (!html)
2143 return;
2144
2145 for ( node = html->content; node != NULL; node = node->next )
2146 {
2147 if ( nodeIsHEAD(node) )
2148 head = node;
2149
2150 if ( nodeIsBODY(node) )
2151 body = node;
2152 }
2153
2154 if ( head != NULL && body != NULL )
2155 {
2156 for (node = head->content; node != NULL; node = next)
2157 {
2158 next = node->next;
2159
2160 if ( nodeIsOBJECT(node) )
2161 {
2162 Node *child;
2163 Bool bump = no;
2164
2165 for (child = node->content; child != NULL; child = child->next)
2166 {
2167 /* bump to body unless content is param */
2168 if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2169 || !nodeIsPARAM(child) )
2170 {
2171 bump = yes;
2172 break;
2173 }
2174 }
2175
2176 if ( bump )
2177 {
2178 TY_(RemoveNode)( node );
2179 TY_(InsertNodeAtStart)( body, node );
2180 }
2181 }
2182 }
2183 }
2184 }
2185
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/*
  (Compiled out.) Processes the children of pParent recursively, then,
  for a block-level pParent, deals with trailing BR children: the
  first trailing BR without attributes is discarded, every other one
  is moved to just after pParent. Finally pParent is passed to
  TrimEmptyElement().
*/
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pNode;
    Bool bBRDeleted = no;   /* set once one BR has been discarded */

    if (NULL == pParent)
        return;

    /*  First, check the status of All My Children  */
    pNode = pParent->content;
    while (NULL != pNode )
    {
        /* The node may get trimmed, so save the next pointer, if any */
        Node *pNext = pNode->next;
        FixBrakes( pDoc, pNode );
        pNode = pNext;
    }


    /*  As long as my last child is a <br />, move it to my last peer  */
    if ( nodeCMIsBlock( pParent ))
    {
        /* pParent->last is re-read on every pass because the loop
           body removes the current last child */
        for ( pNode = pParent->last;
              NULL != pNode && nodeIsBR( pNode );
              pNode = pParent->last )
        {
            if ( NULL == pNode->attributes && no == bBRDeleted )
            {
                TY_(DiscardElement)( pDoc, pNode );
                bBRDeleted = yes;
            }
            else
            {
                TY_(RemoveNode)( pNode );
                TY_(InsertNodeAfterElement)( pParent, pNode );
            }
        }
        TY_(TrimEmptyElement)( pDoc, pParent );
    }
}
#endif
2229
TY_(VerifyHTTPEquiv)2230 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2231 {
2232 Node *pNode;
2233 StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2234 tmbstr s, pszBegin, pszEnd;
2235 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2236
2237 if (!enc)
2238 return;
2239
2240 if (!nodeIsHEAD(head))
2241 head = TY_(FindHEAD)(doc);
2242
2243 if (!head)
2244 return;
2245
2246 /* Find any <meta http-equiv='Content-Type' content='...' /> */
2247 for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2248 {
2249 AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2250 AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2251
2252 if ( !nodeIsMETA(pNode) || !metaContent ||
2253 !AttrValueIs(httpEquiv, "Content-Type") )
2254 continue;
2255
2256 pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2257 while (pszBegin && *pszBegin)
2258 {
2259 while (isspace( *pszBegin ))
2260 pszBegin++;
2261 pszEnd = pszBegin;
2262 while ('\0' != *pszEnd && ';' != *pszEnd)
2263 pszEnd++;
2264 if (';' == *pszEnd )
2265 *(pszEnd++) = '\0';
2266 if (pszEnd > pszBegin)
2267 {
2268 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2269 prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2270 prop->value = NULL;
2271 prop->next = NULL;
2272
2273 if (NULL != pLastProp)
2274 pLastProp->next = prop;
2275 else
2276 pFirstProp = prop;
2277
2278 pLastProp = prop;
2279 pszBegin = pszEnd;
2280 }
2281 }
2282 TidyDocFree( doc, s );
2283
2284 /* find the charset property */
2285 for (prop = pFirstProp; NULL != prop; prop = prop->next)
2286 {
2287 if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2288 continue;
2289
2290 TidyDocFree( doc, prop->name );
2291 prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2292 TY_(tmbstrcpy)(prop->name, "charset=");
2293 TY_(tmbstrcpy)(prop->name+8, enc);
2294 s = CreatePropString( doc, pFirstProp );
2295 TidyDocFree( doc, metaContent->value );
2296 metaContent->value = s;
2297 break;
2298 }
2299 /* #718127, prevent memory leakage */
2300 FreeStyleProps(doc, pFirstProp);
2301 pFirstProp = NULL;
2302 pLastProp = NULL;
2303 }
2304 }
2305
TY_(DropComments)2306 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2307 {
2308 Node* next;
2309
2310 while (node)
2311 {
2312 next = node->next;
2313
2314 if (node->type == CommentTag)
2315 {
2316 TY_(RemoveNode)(node);
2317 TY_(FreeNode)(doc, node);
2318 node = next;
2319 continue;
2320 }
2321
2322 if (node->content)
2323 TY_(DropComments)(doc, node->content);
2324
2325 node = next;
2326 }
2327 }
2328
TY_(DropFontElements)2329 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2330 {
2331 Node* next;
2332
2333 while (node)
2334 {
2335 next = node->next;
2336
2337 if (nodeIsFONT(node))
2338 {
2339 DiscardContainer(doc, node, &next);
2340 node = next;
2341 continue;
2342 }
2343
2344 if (node->content)
2345 TY_(DropFontElements)(doc, node->content, &next);
2346
2347 node = next;
2348 }
2349 }
2350
TY_(WbrToSpace)2351 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2352 {
2353 Node* next;
2354
2355 while (node)
2356 {
2357 next = node->next;
2358
2359 if (nodeIsWBR(node))
2360 {
2361 Node* text;
2362 text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2363 TY_(InsertNodeAfterElement)(node, text);
2364 TY_(RemoveNode)(node);
2365 TY_(FreeNode)(doc, node);
2366 node = next;
2367 continue;
2368 }
2369
2370 if (node->content)
2371 TY_(WbrToSpace)(doc, node->content);
2372
2373 node = next;
2374 }
2375 }
2376
2377 /*
2378 Filters from Word and PowerPoint often use smart
2379 quotes resulting in character codes between 128
2380 and 159. Unfortunately, the corresponding HTML 4.0
2381 entities for these are not widely supported. The
2382 following converts dashes and quotation marks to
2383 the nearest ASCII equivalent. My thanks to
2384 Andrzej Novosiolov for his help with this code.
2385
2386 Note: The old code in the pretty printer applied
2387 this to all node types and attribute values while
2388 this routine applies it only to text nodes. First,
2389 Microsoft Office products rarely put the relevant
2390 characters into these tokens, second support for
2391 them is much better now and last but not least, it
2392 can be harmful to replace these characters since
2393 US-ASCII quote marks are often used as syntax
2394 characters, a simple
2395
2396 <a onmouseover="alert('‘')">...</a>
2397
2398 would be broken if the U+2018 is replaced by "'".
2399 The old code would neither take care whether the
2400 quote mark is already used as delimiter,
2401
2402 <p title='‘'>...</p>
2403
2404 got
2405
2406 <p title='''>...</p>
2407
2408 Since browser support is much better nowadays and
2409 high-quality typography is better than ASCII it'd
2410 be probably a good idea to drop the feature...
2411 */
TY_(DowngradeTypography)2412 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2413 {
2414 Node* next;
2415 Lexer* lexer = doc->lexer;
2416
2417 while (node)
2418 {
2419 next = node->next;
2420
2421 if (TY_(nodeIsText)(node))
2422 {
2423 uint i, c;
2424 tmbstr p = lexer->lexbuf + node->start;
2425
2426 for (i = node->start; i < node->end; ++i)
2427 {
2428 c = (unsigned char) lexer->lexbuf[i];
2429
2430 if (c > 0x7F)
2431 i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2432
2433 if (c >= 0x2013 && c <= 0x201E)
2434 {
2435 switch (c)
2436 {
2437 case 0x2013: /* en dash */
2438 case 0x2014: /* em dash */
2439 c = '-';
2440 break;
2441 case 0x2018: /* left single quotation mark */
2442 case 0x2019: /* right single quotation mark */
2443 case 0x201A: /* single low-9 quotation mark */
2444 c = '\'';
2445 break;
2446 case 0x201C: /* left double quotation mark */
2447 case 0x201D: /* right double quotation mark */
2448 case 0x201E: /* double low-9 quotation mark */
2449 c = '"';
2450 break;
2451 }
2452 }
2453
2454 p = TY_(PutUTF8)(p, c);
2455 }
2456
2457 node->end = p - lexer->lexbuf;
2458 }
2459
2460 if (node->content)
2461 TY_(DowngradeTypography)(doc, node->content);
2462
2463 node = next;
2464 }
2465 }
2466
TY_(ReplacePreformattedSpaces)2467 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2468 {
2469 Node* next;
2470
2471 while (node)
2472 {
2473 next = node->next;
2474
2475 if (node->tag && node->tag->parser == TY_(ParsePre))
2476 {
2477 TY_(NormalizeSpaces)(doc->lexer, node->content);
2478 node = next;
2479 continue;
2480 }
2481
2482 if (node->content)
2483 TY_(ReplacePreformattedSpaces)(doc, node->content);
2484
2485 node = next;
2486 }
2487 }
2488
TY_(ConvertCDATANodes)2489 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2490 {
2491 Node* next;
2492
2493 while (node)
2494 {
2495 next = node->next;
2496
2497 if (node->type == CDATATag)
2498 node->type = TextNode;
2499
2500 if (node->content)
2501 TY_(ConvertCDATANodes)(doc, node->content);
2502
2503 node = next;
2504 }
2505 }
2506
2507 /*
2508 FixLanguageInformation ensures that the document contains (only)
2509 the attributes for language information desired by the output
2510 document type. For example, for XHTML 1.0 documents both
2511 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2512 is desired and for HTML 4.01 only 'lang' is desired.
2513 */
TY_(FixLanguageInformation)2514 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2515 {
2516 Node* next;
2517
2518 while (node)
2519 {
2520 next = node->next;
2521
2522 /* todo: report modifications made here to the report system */
2523
2524 if (TY_(nodeIsElement)(node))
2525 {
2526 AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2527 AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2528
2529 if (lang && xmlLang)
2530 {
2531 /*
2532 todo: check whether both attributes are in sync,
2533 here or elsewhere, where elsewhere is probably
2534 preferable.
2535 AD - March 2005: not mandatory according the standards.
2536 */
2537 }
2538 else if (lang && wantXmlLang)
2539 {
2540 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2541 & doc->lexer->versionEmitted)
2542 TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2543 }
2544 else if (xmlLang && wantLang)
2545 {
2546 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2547 & doc->lexer->versionEmitted)
2548 TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2549 }
2550
2551 if (lang && !wantLang)
2552 TY_(RemoveAttribute)(doc, node, lang);
2553
2554 if (xmlLang && !wantXmlLang)
2555 TY_(RemoveAttribute)(doc, node, xmlLang);
2556 }
2557
2558 if (node->content)
2559 TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2560
2561 node = next;
2562 }
2563 }
2564
2565 /*
2566 Set/fix/remove <html xmlns='...'>
2567 */
TY_(FixXhtmlNamespace)2568 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2569 {
2570 Node* html = TY_(FindHTML)(doc);
2571 AttVal* xmlns;
2572
2573 if (!html)
2574 return;
2575
2576 xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2577
2578 if (wantXmlns)
2579 {
2580 if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2581 TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2582 }
2583 else if (xmlns)
2584 {
2585 TY_(RemoveAttribute)(doc, html, xmlns);
2586 }
2587 }
2588
2589 /*
2590 ...
2591 */
TY_(FixAnchors)2592 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2593 {
2594 Node* next;
2595
2596 while (node)
2597 {
2598 next = node->next;
2599
2600 if (TY_(IsAnchorElement)(doc, node))
2601 {
2602 AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2603 AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2604 Bool hadName = name!=NULL;
2605 Bool hadId = id!=NULL;
2606 Bool IdEmitted = no;
2607 Bool NameEmitted = no;
2608
2609 /* todo: how are empty name/id attributes handled? */
2610
2611 if (name && id)
2612 {
2613 Bool NameHasValue = AttrHasValue(name);
2614 Bool IdHasValue = AttrHasValue(id);
2615 if ( (NameHasValue != IdHasValue) ||
2616 (NameHasValue && IdHasValue &&
2617 TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2618 TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2619 }
2620 else if (name && wantId)
2621 {
2622 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2623 & doc->lexer->versionEmitted)
2624 {
2625 if (TY_(IsValidHTMLID)(name->value))
2626 {
2627 TY_(RepairAttrValue)(doc, node, "id", name->value);
2628 IdEmitted = yes;
2629 }
2630 else
2631 TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2632 }
2633 }
2634 else if (id && wantName)
2635 {
2636 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2637 & doc->lexer->versionEmitted)
2638 {
2639 /* todo: do not assume id is valid */
2640 TY_(RepairAttrValue)(doc, node, "name", id->value);
2641 NameEmitted = yes;
2642 }
2643 }
2644
2645 if (id && !wantId
2646 /* make sure that Name has been emitted if requested */
2647 && (hadName || !wantName || NameEmitted) )
2648 TY_(RemoveAttribute)(doc, node, id);
2649
2650 if (name && !wantName
2651 /* make sure that Id has been emitted if requested */
2652 && (hadId || !wantId || IdEmitted) )
2653 TY_(RemoveAttribute)(doc, node, name);
2654
2655 if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2656 TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2657 TY_(RemoveAnchorByNode)(doc, node);
2658 }
2659
2660 if (node->content)
2661 TY_(FixAnchors)(doc, node->content, wantName, wantId);
2662
2663 node = next;
2664 }
2665 }
2666
2667 /*
2668 * local variables:
2669 * mode: c
2670 * indent-tabs-mode: nil
2671 * c-basic-offset: 4
2672 * eval: (c-set-offset 'substatement-open 0)
2673 * end:
2674 */
2675