1 /*
2 clean.c -- clean up misuse of presentation markup
3
4 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5 See tidyp.h for the copyright notice.
6
7 Filters from other formats such as Microsoft Word
8 often make excessive use of presentation markup such
9 as font tags, B, I, and the align attribute. By applying
10 a set of production rules, it is straightforward to
11 transform this to use CSS.
12
13 Some rules replace some of the children of an element by
14 style properties on the element, e.g.
15
16 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17
18 Such rules are applied to the element's content and then
19 to the element itself until no more rules apply.
20 Having applied all the rules to an element, it will have
21 a style attribute with one or more properties.
22
23 Other rules strip the element they apply to, replacing
24 it by style properties on the contents, e.g.
25
26 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
27
28 These rules are applied to an element before processing
29 its content and replace the current element by the first
30 element in the exposed content.
31
32 After applying both sets of rules, you can replace the
33 style attribute by a class value and style rule in the
34 document head. To support this, an association of styles
35 and class names is built.
36
37 A naive approach is to rely on string matching to test
38 when two property lists are the same. A better approach
39 would be to first sort the properties before matching.
40
41 */
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46
47 #include "tidy-int.h"
48 #include "clean.h"
49 #include "lexer.h"
50 #include "parser.h"
51 #include "attrs.h"
52 #include "message.h"
53 #include "tmbstr.h"
54 #include "utf8.h"
55
56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
57
/*
  Re-tag `node` as the element identified by `tid`.

  The tag definition is fetched with TY_(LookupTagDef); the node's old
  element name is released and replaced by a fresh copy of the
  definition's name, and node->tag is pointed at the definition.

  NOTE(review): the lookup result is dereferenced unchecked, so `tid`
  is assumed to name a known tag.
*/
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
{
    const Dict* newTag = TY_(LookupTagDef)( tid );

    /* drop the old name before installing the copy of the new one */
    TidyDocFree( doc, node->element );
    node->tag = newTag;
    node->element = TY_(tmbstrdup)( doc->allocator, newTag->name );
}
65
/*
  Release a whole StyleProp list: for every entry the name string, the
  value string and then the entry itself are handed to TidyDocFree.
  A NULL list is accepted and does nothing.
*/
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
{
    StyleProp *cur = props;

    while (cur != NULL)
    {
        /* remember the successor before the entry is released */
        StyleProp *following = cur->next;

        TidyDocFree(doc, cur->name);
        TidyDocFree(doc, cur->value);
        TidyDocFree(doc, cur);

        cur = following;
    }
}
79
/*
  Insert the property `name`/`value` into the list `props`, which is kept
  ordered by name as judged by TY_(tmbstrcmp).

  If an entry with the same name already exists the list is returned
  unchanged: the first value seen for a property wins.  Otherwise a new
  entry holding copies of `name` and `value` is linked in front of the
  first entry that compares greater, or at the end of the list.

  Returns the (possibly new) head of the list.
*/
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
{
    StyleProp *head = props;
    StyleProp *before = NULL;    /* entry the new one goes after, if any  */
    StyleProp *after = props;    /* entry the new one goes in front of    */
    StyleProp *fresh;

    /* locate the insertion point */
    for ( ; after != NULL; before = after, after = after->next )
    {
        int order = TY_(tmbstrcmp)(after->name, name);

        if (order == 0)
            return head;         /* already defined, keep the old value */

        if (order > 0)
            break;
    }

    fresh = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
    fresh->name = TY_(tmbstrdup)(doc->allocator, name);
    fresh->value = TY_(tmbstrdup)(doc->allocator, value);
    fresh->next = after;

    if (before != NULL)
        before->next = fresh;
    else
        head = fresh;

    return head;
}
131
132 /*
133 Create sorted linked list of properties from style string
134 It temporarily places nulls in place of ':' and ';' to
135 delimit the strings for the property name and value.
136 Some systems don't allow you to NULL literal strings,
137 so to avoid this, a copy is made first.
138 */
CreateProps(TidyDocImpl * doc,StyleProp * prop,ctmbstr style)139 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
140 {
141 tmbstr name, value = NULL, name_end, value_end, line;
142 Bool more;
143
144 line = TY_(tmbstrdup)(doc->allocator, style);
145 name = line;
146
147 while (*name)
148 {
149 while (*name == ' ')
150 ++name;
151
152 name_end = name;
153
154 while (*name_end)
155 {
156 if (*name_end == ':')
157 {
158 value = name_end + 1;
159 break;
160 }
161
162 ++name_end;
163 }
164
165 if (*name_end != ':')
166 break;
167
168 while ( value && *value == ' ')
169 ++value;
170
171 value_end = value;
172 more = no;
173
174 while (*value_end)
175 {
176 if (*value_end == ';')
177 {
178 more = yes;
179 break;
180 }
181
182 ++value_end;
183 }
184
185 *name_end = '\0';
186 *value_end = '\0';
187
188 prop = InsertProperty(doc, prop, name, value);
189 *name_end = ':';
190
191 if (more)
192 {
193 *value_end = ';';
194 name = value_end + 1;
195 continue;
196 }
197
198 break;
199 }
200
201 TidyDocFree(doc, line); /* free temporary copy */
202 return prop;
203 }
204
CreatePropString(TidyDocImpl * doc,StyleProp * props)205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
206 {
207 tmbstr style, p, s;
208 uint len;
209 StyleProp *prop;
210
211 /* compute length */
212
213 for (len = 0, prop = props; prop; prop = prop->next)
214 {
215 len += TY_(tmbstrlen)(prop->name) + 2;
216 if (prop->value)
217 len += TY_(tmbstrlen)(prop->value) + 2;
218 }
219
220 style = (tmbstr) TidyDocAlloc(doc, len+1);
221 style[0] = '\0';
222
223 for (p = style, prop = props; prop; prop = prop->next)
224 {
225 s = prop->name;
226
227 while((*p++ = *s++))
228 continue;
229
230 if (prop->value)
231 {
232 *--p = ':';
233 *++p = ' ';
234 ++p;
235
236 s = prop->value;
237 while((*p++ = *s++))
238 continue;
239 }
240 if (prop->next == NULL)
241 break;
242
243 *--p = ';';
244 *++p = ' ';
245 ++p;
246 }
247
248 return style;
249 }
250
251 /*
252 create string with merged properties
253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
254 {
255 tmbstr line;
256 StyleProp *prop;
257
258 prop = CreateProps(doc, NULL, style);
259 prop = CreateProps(doc, prop, property);
260 line = CreatePropString(doc, prop);
261 FreeStyleProps(doc, prop);
262 return line;
263 }
264 */
265
TY_(FreeStyles)266 void TY_(FreeStyles)( TidyDocImpl* doc )
267 {
268 Lexer* lexer = doc->lexer;
269 if ( lexer )
270 {
271 TagStyle *style, *next;
272 for ( style = lexer->styles; style; style = next )
273 {
274 next = style->next;
275 TidyDocFree( doc, style->tag );
276 TidyDocFree( doc, style->tag_class );
277 TidyDocFree( doc, style->properties );
278 TidyDocFree( doc, style );
279 }
280 }
281 }
282
GensymClass(TidyDocImpl * doc)283 static tmbstr GensymClass( TidyDocImpl* doc )
284 {
285 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
286 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287 if ( pfx == NULL || *pfx == 0 )
288 pfx = "c";
289
290 TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291 return TY_(tmbstrdup)(doc->allocator, buf);
292 }
293
/*
  Look up the class name associated with the pair (tag, properties) in
  the lexer's style list.  Both strings must match exactly as judged by
  TY_(tmbstrcmp).

  When no record matches, a new one is created with copies of `tag` and
  `properties` and a class name from GensymClass(), and is pushed onto
  the front of the list.

  Returns the tag_class string stored in the record.
*/
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
{
    Lexer* lexer = doc->lexer;
    TagStyle* entry = lexer->styles;

    while (entry != NULL)
    {
        if (TY_(tmbstrcmp)(entry->tag, tag) == 0 &&
            TY_(tmbstrcmp)(entry->properties, properties) == 0)
        {
            return entry->tag_class;
        }
        entry = entry->next;
    }

    /* not seen before: register a new rule at the head of the list */
    entry = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
    entry->tag = TY_(tmbstrdup)(doc->allocator, tag);
    entry->tag_class = GensymClass( doc );
    entry->properties = TY_(tmbstrdup)( doc->allocator, properties );
    entry->next = lexer->styles;
    lexer->styles = entry;

    return entry->tag_class;
}
314
315 /*
316 Add class="foo" to node
317 */
AddClass(TidyDocImpl * doc,Node * node,ctmbstr classname)318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
319 {
320 AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
321
322 /*
323 if there already is a class attribute
324 then append class name after a space.
325 */
326 if (classattr)
327 TY_(AppendToClassAttr)( doc, classattr, classname );
328 else /* create new class attribute */
329 TY_(AddAttribute)( doc, node, "class", classname );
330 }
331
TY_(AddStyleAsClass)332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
333 {
334 ctmbstr classname;
335
336 classname = FindStyle( doc, node->element, stylevalue );
337 AddClass( doc, node, classname);
338 }
339
340 /*
341 Find style attribute in node, and replace it
342 by corresponding class attribute. Search for
343 class in style dictionary otherwise gensym
344 new class and add to dictionary.
345
346 Assumes that node doesn't have a class attribute
347 */
Style2Rule(TidyDocImpl * doc,Node * node)348 static void Style2Rule( TidyDocImpl* doc, Node *node)
349 {
350 AttVal *styleattr, *classattr;
351 ctmbstr classname;
352
353 styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
354
355 if (styleattr)
356 {
357 /* fix for http://tidy.sf.net/bug/850215 */
358 if (!styleattr->value)
359 {
360 TY_(RemoveAttribute)(doc, node, styleattr);
361 return;
362 }
363
364 classname = FindStyle( doc, node->element, styleattr->value );
365 classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
366
367 /*
368 if there already is a class attribute
369 then append class name after an underscore
370 */
371 if (classattr)
372 {
373 TY_(AppendToClassAttr)( doc, classattr, classname );
374 TY_(RemoveAttribute)( doc, node, styleattr );
375 }
376 else /* reuse style attribute for class attribute */
377 {
378 TidyDocFree(doc, styleattr->attribute);
379 TidyDocFree(doc, styleattr->value);
380 styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381 styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
382 }
383 }
384 }
385
/*
  Append the rule "<selector> { color: <color> }" followed by a newline
  to the lexer buffer.  Nothing is written unless both `selector` and
  `color` are non-NULL.
*/
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
{
    if ( selector == NULL || color == NULL )
        return;

    TY_(AddStringLiteral)(lexer, selector);
    TY_(AddStringLiteral)(lexer, " { color: ");
    TY_(AddStringLiteral)(lexer, color);
    TY_(AddStringLiteral)(lexer, " }\n");
}
396
397 /*
398 move presentation attribs from body to style element
399
400 background="foo" -> body { background-image: url(foo) }
401 bgcolor="foo" -> body { background-color: foo }
402 text="foo" -> body { color: foo }
403 link="foo" -> :link { color: foo }
404 vlink="foo" -> :visited { color: foo }
405 alink="foo" -> :active { color: foo }
406 */
CleanBodyAttrs(TidyDocImpl * doc,Node * body)407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
408 {
409 Lexer* lexer = doc->lexer;
410 tmbstr bgurl = NULL;
411 tmbstr bgcolor = NULL;
412 tmbstr color = NULL;
413 AttVal* attr;
414
415 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
416 {
417 bgurl = attr->value;
418 attr->value = NULL;
419 TY_(RemoveAttribute)( doc, body, attr );
420 }
421
422 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
423 {
424 bgcolor = attr->value;
425 attr->value = NULL;
426 TY_(RemoveAttribute)( doc, body, attr );
427 }
428
429 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
430 {
431 color = attr->value;
432 attr->value = NULL;
433 TY_(RemoveAttribute)( doc, body, attr );
434 }
435
436 if ( bgurl || bgcolor || color )
437 {
438 TY_(AddStringLiteral)(lexer, " body {\n");
439 if (bgurl)
440 {
441 TY_(AddStringLiteral)(lexer, " background-image: url(");
442 TY_(AddStringLiteral)(lexer, bgurl);
443 TY_(AddStringLiteral)(lexer, ");\n");
444 TidyDocFree(doc, bgurl);
445 }
446 if (bgcolor)
447 {
448 TY_(AddStringLiteral)(lexer, " background-color: ");
449 TY_(AddStringLiteral)(lexer, bgcolor);
450 TY_(AddStringLiteral)(lexer, ";\n");
451 TidyDocFree(doc, bgcolor);
452 }
453 if (color)
454 {
455 TY_(AddStringLiteral)(lexer, " color: ");
456 TY_(AddStringLiteral)(lexer, color);
457 TY_(AddStringLiteral)(lexer, ";\n");
458 TidyDocFree(doc, color);
459 }
460
461 TY_(AddStringLiteral)(lexer, " }\n");
462 }
463
464 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
465 {
466 AddColorRule(lexer, " :link", attr->value);
467 TY_(RemoveAttribute)( doc, body, attr );
468 }
469
470 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
471 {
472 AddColorRule(lexer, " :visited", attr->value);
473 TY_(RemoveAttribute)( doc, body, attr );
474 }
475
476 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
477 {
478 AddColorRule(lexer, " :active", attr->value);
479 TY_(RemoveAttribute)( doc, body, attr );
480 }
481 }
482
NiceBody(TidyDocImpl * doc)483 static Bool NiceBody( TidyDocImpl* doc )
484 {
485 Node* const node = TY_(FindBody)(doc);
486 if (node) {
487 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
488 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
489 TY_(AttrGetById)(node, TidyAttr_TEXT) ||
490 TY_(AttrGetById)(node, TidyAttr_LINK) ||
491 TY_(AttrGetById)(node, TidyAttr_VLINK) ||
492 TY_(AttrGetById)(node, TidyAttr_ALINK))
493 {
494 doc->badLayout |= USING_BODY;
495 return no;
496 }
497 }
498
499 return yes;
500 }
501
502 /* create style element using rules from dictionary */
CreateStyleElement(TidyDocImpl * doc)503 static void CreateStyleElement( TidyDocImpl* doc )
504 {
505 Lexer* lexer = doc->lexer;
506 Node *node, *head, *body;
507 TagStyle *style;
508 AttVal *av;
509
510 if ( lexer->styles == NULL && NiceBody(doc) )
511 return;
512
513 node = TY_(NewNode)( doc->allocator, lexer );
514 node->type = StartTag;
515 node->implicit = yes;
516 node->element = TY_(tmbstrdup)(doc->allocator, "style");
517 TY_(FindTag)( doc, node );
518
519 /* insert type attribute */
520 av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
521 TY_(InsertAttributeAtStart)( node, av );
522
523 body = TY_(FindBody)( doc );
524 lexer->txtstart = lexer->lexsize;
525 if ( body )
526 CleanBodyAttrs( doc, body );
527
528 for (style = lexer->styles; style; style = style->next)
529 {
530 TY_(AddCharToLexer)(lexer, ' ');
531 TY_(AddStringLiteral)(lexer, style->tag);
532 TY_(AddCharToLexer)(lexer, '.');
533 TY_(AddStringLiteral)(lexer, style->tag_class);
534 TY_(AddCharToLexer)(lexer, ' ');
535 TY_(AddCharToLexer)(lexer, '{');
536 TY_(AddStringLiteral)(lexer, style->properties);
537 TY_(AddCharToLexer)(lexer, '}');
538 TY_(AddCharToLexer)(lexer, '\n');
539 }
540
541 lexer->txtend = lexer->lexsize;
542
543 TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
544
545 /*
546 now insert style element into document head
547
548 doc is root node. search its children for html node
549 the head node should be first child of html node
550 */
551 if ( NULL != (head = TY_(FindHEAD)( doc )) )
552 TY_(InsertNodeAtEnd)( head, node );
553 }
554
555
556 /* ensure bidirectional links are consistent */
TY_(FixNodeLinks)557 void TY_(FixNodeLinks)(Node *node)
558 {
559 Node *child;
560
561 if (node->prev)
562 node->prev->next = node;
563 else
564 node->parent->content = node;
565
566 if (node->next)
567 node->next->prev = node;
568 else
569 node->parent->last = node;
570
571 for (child = node->content; child; child = child->next)
572 child->parent = node;
573 }
574
575 /*
576 used to strip child of node when
577 the node has one and only one child
578 */
StripOnlyChild(TidyDocImpl * doc,Node * node)579 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
580 {
581 Node *child;
582
583 child = node->content;
584 node->content = child->content;
585 node->last = child->last;
586 child->content = NULL;
587 TY_(FreeNode)(doc, child);
588
589 for (child = node->content; child; child = child->next)
590 child->parent = node;
591 }
592
593 /*
594 used to strip font start and end tags.
595 Extricate "element", replace it by its content and delete it.
596 */
DiscardContainer(TidyDocImpl * doc,Node * element,Node ** pnode)597 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
598 {
599 if (element->content)
600 {
601 Node *node, *parent = element->parent;
602
603 element->last->next = element->next;
604
605 if (element->next)
606 {
607 element->next->prev = element->last;
608 }
609 else
610 parent->last = element->last;
611
612 if (element->prev)
613 {
614 element->content->prev = element->prev;
615 element->prev->next = element->content;
616 }
617 else
618 parent->content = element->content;
619
620 for (node = element->content; node; node = node->next)
621 node->parent = parent;
622
623 *pnode = element->content;
624
625 element->next = element->content = NULL;
626 TY_(FreeNode)(doc, element);
627 }
628 else
629 {
630 *pnode = TY_(DiscardElement)(doc, element);
631 }
632 }
633
634 /*
635 Create new string that consists of the
636 combined style properties in s1 and s2
637
638 To merge property lists, we build a linked
639 list of property/values and insert properties
640 into the list in order, merging values for
641 the same property name.
642 */
MergeProperties(TidyDocImpl * doc,ctmbstr s1,ctmbstr s2)643 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
644 {
645 tmbstr s;
646 StyleProp *prop;
647
648 prop = CreateProps(doc, NULL, s1);
649 prop = CreateProps(doc, prop, s2);
650 s = CreatePropString(doc, prop);
651 FreeStyleProps(doc, prop);
652 return s;
653 }
654
655 /*
656 Add style property to element, creating style
657 attribute as needed and adding ; delimiter
658 */
TY_(AddStyleProperty)659 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
660 {
661 AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
662
663 /* if style attribute already exists then insert property */
664
665 if ( av )
666 {
667 if (av->value != NULL)
668 {
669 tmbstr s = MergeProperties( doc, av->value, property );
670 TidyDocFree( doc, av->value );
671 av->value = s;
672 }
673 else
674 {
675 av->value = TY_(tmbstrdup)( doc->allocator, property );
676 }
677 }
678 else /* else create new style attribute */
679 {
680 av = TY_(NewAttributeEx)( doc, "style", property, '"' );
681 TY_(InsertAttributeAtStart)( node, av );
682 }
683 }
684
/*
  Bring the class names of `child` over to `node`.

  Only the first class attribute of each node is considered.  If the
  child has no class attribute, or one without a value, nothing changes.
  Otherwise:

  - node has no class attribute: a new one with a copy of the child's
    names is inserted at the start of node's attributes
  - node has a class attribute without a value: it receives a copy of
    the child's names (previously this case inserted a second class
    attribute, leaving the node with duplicate attributes)
  - node has a class value: it is replaced by "<node names> <child
    names>", joined by a single space
*/
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av;
    AttVal *nodeClass = NULL;    /* node's class attribute, if any */
    tmbstr childNames = NULL;

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            childNames = av->value;
            break;
        }
    }

    if (childNames == NULL)
        return;                  /* nothing to bring over */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsCLASS(av))
        {
            nodeClass = av;
            break;
        }
    }

    if (nodeClass == NULL)
    {
        /* copy class names from child */
        av = TY_(NewAttributeEx)( doc, "class", childNames, '"' );
        TY_(InsertAttributeAtStart)( node, av );
    }
    else if (nodeClass->value == NULL)
    {
        /* fill in the valueless attribute instead of adding a duplicate */
        nodeClass->value = TY_(tmbstrdup)( doc->allocator, childNames );
    }
    else
    {
        /* merge class names from both */
        uint l1 = TY_(tmbstrlen)(nodeClass->value);
        uint l2 = TY_(tmbstrlen)(childNames);
        tmbstr names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);

        TY_(tmbstrcpy)(names, nodeClass->value);
        names[l1] = ' ';
        TY_(tmbstrcpy)(names + l1 + 1, childNames);

        TidyDocFree(doc, nodeClass->value);
        nodeClass->value = names;
    }
}
729
/*
  Merge the class and style attributes of `child` into `node`.

  The child may have a class attribute used for attaching styles, so its
  class names are first copied to node's class by MergeClasses().

  Then the first style attribute of each node is considered.  If the
  child has no style value, nothing further changes.  Otherwise:

  - node has no style attribute: a new one with a copy of the child's
    style is inserted at the start of node's attributes
  - node has a style attribute without a value: it receives a copy of
    the child's style (previously this case inserted a second style
    attribute, leaving the node with duplicate attributes)
  - node has a style value: it is replaced by
    MergeProperties(node style, child style)
*/
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
{
    AttVal *av;
    AttVal *nodeStyle = NULL;    /* node's style attribute, if any */
    tmbstr childStyle = NULL;

    MergeClasses(doc, node, child);

    for (av = child->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            childStyle = av->value;
            break;
        }
    }

    if (childStyle == NULL)
        return;                  /* nothing to bring over */

    for (av = node->attributes; av; av = av->next)
    {
        if (attrIsSTYLE(av))
        {
            nodeStyle = av;
            break;
        }
    }

    if (nodeStyle == NULL)
    {
        /* copy style of child */
        av = TY_(NewAttributeEx)( doc, "style", childStyle, '"' );
        TY_(InsertAttributeAtStart)( node, av );
    }
    else if (nodeStyle->value == NULL)
    {
        /* fill in the valueless attribute instead of adding a duplicate */
        nodeStyle->value = TY_(tmbstrdup)( doc->allocator, childStyle );
    }
    else
    {
        /* merge styles from both */
        tmbstr style = MergeProperties(doc, nodeStyle->value, childStyle);

        TidyDocFree(doc, nodeStyle->value);
        nodeStyle->value = style;
    }
}
775
/*
  Translate the value of a font size attribute into a CSS font-size
  value.  Only the first one or two characters of `size` are examined.

  - empty string: NULL
  - first character '0'..'6': entry of the absolute table for that
    digit; the entry for '3' is NULL, so no size is produced for it
  - first character '-': if the second character is '0'..'6', the entry
    of the 0.8-step table for that digit, otherwise "smaller"
  - anything else: if the second character is '0'..'6', the entry of the
    1.2-step table for that digit, otherwise "larger"

  The returned strings are constants and must not be freed.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr absolute[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr shrink[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr grow[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    Bool negative;

    if (size[0] == '\0')
        return NULL;

    if (size[0] >= '0' && size[0] <= '6')
        return absolute[size[0] - '0'];

    negative = (size[0] == '-') ? yes : no;

    if (size[1] >= '0' && size[1] <= '6')
    {
        int step = size[1] - '0';
        return negative ? shrink[step] : grow[step];
    }

    return negative ? "smaller" /* "70%" */ : "larger" /* "140%" */;
}
825
/*
  Add "font-family: <face>" to the style of `node`.

  The property is formatted into a 256-character buffer whose size is
  passed to TY_(tmbsnprintf), so a very long face list is presumably
  cut short rather than overflowing.
*/
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
{
    tmbchar property[256];

    TY_(tmbsnprintf)(property, sizeof(property), "font-family: %s", face );
    TY_(AddStyleProperty)( doc, node, property );
}
832
/*
  Apply a font size attribute value to `node`.

  A <p> with size "6", "5" or "4" is turned into <h1>, <h2> or <h3>
  respectively: its element name is replaced and the tag is looked up
  again with TY_(FindTag).  In every other case the size is translated
  by FontSize2Name() and, if that yields a value, added to the node as a
  "font-size" style property.
*/
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
{
    static const struct
    {
        ctmbstr size;
        ctmbstr heading;
    }
    headings[] =
    {
        { "6", "h1" },
        { "5", "h2" },
        { "4", "h3" }
    };
    ctmbstr cssSize;

    if (nodeIsP(node))
    {
        uint i;

        for (i = 0; i < sizeof(headings)/sizeof(headings[0]); ++i)
        {
            if (TY_(tmbstrcmp)(size, headings[i].size) == 0)
            {
                TidyDocFree(doc, node->element);
                node->element = TY_(tmbstrdup)(doc->allocator, headings[i].heading);
                TY_(FindTag)(doc, node);
                return;
            }
        }
    }

    cssSize = FontSize2Name(size);

    if (cssSize != NULL)
    {
        tmbchar property[64];

        TY_(tmbsnprintf)(property, sizeof(property), "font-size: %s", cssSize);
        TY_(AddStyleProperty)( doc, node, property );
    }
}
864
/*
  Add "color: <color>" to the style of `node`.  The property is
  formatted into a 128-character buffer whose size is passed to
  TY_(tmbsnprintf).
*/
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
{
    tmbchar property[128];

    TY_(tmbsnprintf)(property, sizeof(property), "color: %s", color);
    TY_(AddStyleProperty)( doc, node, property );
}
871
872 /* force alignment value to lower case */
AddAlign(TidyDocImpl * doc,Node * node,ctmbstr align)873 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
874 {
875 uint i;
876 tmbchar buf[128];
877
878 TY_(tmbstrcpy)( buf, "text-align: " );
879 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
880 {
881 if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
882 break;
883 }
884 buf[i] = '\0';
885 TY_(AddStyleProperty)( doc, node, buf );
886 }
887
888 /*
889 add style properties to node corresponding to
890 the font face, size and color attributes
891 */
AddFontStyles(TidyDocImpl * doc,Node * node,AttVal * av)892 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
893 {
894 while (av)
895 {
896 if (AttrHasValue(av))
897 {
898 if (attrIsFACE(av))
899 AddFontFace( doc, node, av->value );
900 else if (attrIsSIZE(av))
901 AddFontSize( doc, node, av->value );
902 else if (attrIsCOLOR(av))
903 AddFontColor( doc, node, av->value );
904 }
905 av = av->next;
906 }
907 }
908
909 /*
910 Symptom: <p align=center>
911 Action: <p style="text-align: center">
912 */
TextAlign(TidyDocImpl * doc,Node * node)913 static void TextAlign( TidyDocImpl* doc, Node* node )
914 {
915 AttVal *av, *prev;
916
917 prev = NULL;
918
919 for (av = node->attributes; av; av = av->next)
920 {
921 if (attrIsALIGN(av))
922 {
923 if (prev)
924 prev->next = av->next;
925 else
926 node->attributes = av->next;
927
928 if (av->value)
929 AddAlign( doc, node, av->value );
930
931 TY_(FreeAttribute)(doc, av);
932 break;
933 }
934
935 prev = av;
936 }
937 }
938
939 /*
940 Symptom: <table bgcolor="red">
941 Action: <table style="background-color: red">
942 */
TableBgColor(TidyDocImpl * doc,Node * node)943 static void TableBgColor( TidyDocImpl* doc, Node* node )
944 {
945 AttVal* attr;
946 tmbchar buf[256];
947
948 if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
949 {
950 TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
951 TY_(RemoveAttribute)( doc, node, attr );
952 TY_(AddStyleProperty)( doc, node, buf );
953 }
954 }
955
956 /*
957 The clean up rules use the pnode argument to return the
958 next node when the original node has been deleted
959 */
960
961 /*
962 Symptom: <dir> <li> where <li> is only child
963 Action: coerce <dir> <li> to <div> with indent.
964 */
965
Dir2Div(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))966 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
967 {
968 Node *child;
969
970 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
971 {
972 child = node->content;
973
974 if (child == NULL)
975 return no;
976
977 /* check child has no peers */
978
979 if (child->next)
980 return no;
981
982 if ( !nodeIsLI(child) )
983 return no;
984
985 if ( !child->implicit )
986 return no;
987
988 /* coerce dir to div */
989 node->tag = TY_(LookupTagDef)( TidyTag_DIV );
990 TidyDocFree( doc, node->element );
991 node->element = TY_(tmbstrdup)(doc->allocator, "div");
992 TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
993 StripOnlyChild( doc, node );
994 return yes;
995 }
996
997 return no;
998 }
999
1000 /*
1001 Symptom: <center>
1002 Action: replace <center> by <div style="text-align: center">
1003 */
1004
Center2Div(TidyDocImpl * doc,Node * node,Node ** pnode)1005 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1006 {
1007 if ( nodeIsCENTER(node) )
1008 {
1009 if ( cfgBool(doc, TidyDropFontTags) )
1010 {
1011 if (node->content)
1012 {
1013 Node *last = node->last;
1014 DiscardContainer( doc, node, pnode );
1015
1016 node = TY_(InferredTag)(doc, TidyTag_BR);
1017 TY_(InsertNodeAfterElement)(last, node);
1018 }
1019 else
1020 {
1021 Node * const prev = node->prev;
1022 Node * const next = node->next;
1023 Node * const parent = node->parent;
1024
1025 DiscardContainer( doc, node, pnode );
1026
1027 node = TY_(InferredTag)(doc, TidyTag_BR);
1028 if (next)
1029 TY_(InsertNodeBeforeElement)(next, node);
1030 else if (prev)
1031 TY_(InsertNodeAfterElement)(prev, node);
1032 else
1033 TY_(InsertNodeAtStart)(parent, node);
1034 }
1035
1036 return yes;
1037 }
1038
1039 RenameElem( doc, node, TidyTag_DIV );
1040 TY_(AddStyleProperty)( doc, node, "text-align: center" );
1041 return yes;
1042 }
1043
1044 return no;
1045 }
1046
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048 Unique attributes (such as ID) disable the action.
1049 Attributes style and class are not dealt with. A call to MergeStyles
1050 will do that.
1051 */
CopyAttrs(TidyDocImpl * doc,Node * node,Node * child)1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1053 {
1054 AttVal *av1, *av2;
1055 TidyAttrId id;
1056
1057 /* Detect attributes that cannot be merged or overwritten. */
1058 if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1059 && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1060 return no;
1061
1062 /* Move child attributes to node. Attributes in node
1063 can be overwritten or merged. */
1064 for (av2 = child->attributes; av2; )
1065 {
1066 /* Dealt by MergeStyles. */
1067 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1068 {
1069 av2 = av2->next;
1070 continue;
1071 }
1072 /* Avoid duplicates in node */
1073 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074 && (av1=TY_(AttrGetById)(node, id))!= NULL)
1075 TY_(RemoveAttribute)( doc, node, av1 );
1076
1077 /* Move attribute from child to node */
1078 TY_(DetachAttribute)( child, av2 );
1079 av1 = av2;
1080 av2 = av2->next;
1081 av1->next = NULL;
1082 TY_(InsertAttributeAtEnd)( node, av1 );
1083 }
1084
1085 return yes;
1086 }
1087
1088 /*
1089 Symptom <XX><XX>...</XX></XX>
1090 Action: merge the two XXs
1091
1092 For instance, this is useful after nested <dir>s used by Word
1093 for indenting have been converted to <div>s
1094
1095 If state is "no", no merging.
1096 If state is "yes", inner element is discarded. Only Style and Class
1097 attributes are merged using MergeStyles().
1098 If state is "auto", atttibutes are merged as described in CopyAttrs().
1099 Style and Class attributes are merged using MergeStyles().
1100 */
MergeNestedElements(TidyDocImpl * doc,TidyTagId Id,TidyTriState state,Node * node,Node ** ARG_UNUSED (pnode))1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102 TidyTagId Id, TidyTriState state, Node *node,
1103 Node **ARG_UNUSED(pnode))
1104 {
1105 Node *child;
1106
1107 if ( state == TidyNoState
1108 || !TagIsId(node, Id) )
1109 return no;
1110
1111 child = node->content;
1112
1113 if ( child == NULL
1114 || child->next != NULL
1115 || !TagIsId(child, Id) )
1116 return no;
1117
1118 if ( state == TidyAutoState && !CopyAttrs(doc, node, child) )
1119 return no;
1120
1121 MergeStyles( doc, node, child );
1122 StripOnlyChild( doc, node );
1123 return yes;
1124 }
1125
1126 /*
1127 Symptom: <ul><li><ul>...</ul></li></ul>
1128 Action: discard outer list
1129 */
1130
NestedList(TidyDocImpl * doc,Node * node,Node ** pnode)1131 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1132 {
1133 Node *list;
1134
1135 if ( nodeIsUL(node) || nodeIsOL(node) )
1136 {
1137 Node *child = node->content;
1138
1139 if (child == NULL)
1140 return no;
1141
1142 /* check child has no peers */
1143
1144 if (child->next)
1145 return no;
1146
1147 list = child->content;
1148
1149 if (!list)
1150 return no;
1151
1152 if (list->tag != node->tag)
1153 return no;
1154
1155 /* check list has no peers */
1156 if (list->next)
1157 return no;
1158
1159 *pnode = list; /* Set node to resume iteration */
1160
1161 /* move inner list node into position of outer node */
1162 list->prev = node->prev;
1163 list->next = node->next;
1164 list->parent = node->parent;
1165 TY_(FixNodeLinks)(list);
1166
1167 /* get rid of outer ul and its li */
1168 child->content = NULL;
1169 TY_(FreeNode)( doc, child ); /* See test #427841. */
1170 child = NULL;
1171 node->content = NULL;
1172 node->next = NULL;
1173 TY_(FreeNode)( doc, node );
1174 node = NULL;
1175
1176 /*
1177 If prev node was a list the chances are this node
1178 should be appended to that list. Word has no way of
1179 recognizing nested lists and just uses indents
1180 */
1181
1182 if (list->prev)
1183 {
1184 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1185 && list->prev->last )
1186 {
1187 node = list;
1188 list = node->prev;
1189
1190 child = list->last; /* <li> */
1191
1192 list->next = node->next;
1193 TY_(FixNodeLinks)(list);
1194
1195 node->parent = child;
1196 node->next = NULL;
1197 node->prev = child->last;
1198 TY_(FixNodeLinks)(node);
1199 CleanNode( doc, node );
1200 }
1201 }
1202
1203 return yes;
1204 }
1205
1206 return no;
1207 }
1208
1209 /* Find CSS equivalent in a SPAN element */
1210 static
FindCSSSpanEq(Node * node,ctmbstr * s,Bool deprecatedOnly)1211 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1212 {
1213 struct
1214 {
1215 TidyTagId id;
1216 ctmbstr CSSeq;
1217 Bool deprecated;
1218 }
1219 const CSS_SpanEq[] =
1220 {
1221 { TidyTag_B, "font-weight: bold", no },
1222 { TidyTag_I, "font-style: italic", no },
1223 { TidyTag_S, "text-decoration: line-through", yes},
1224 { TidyTag_STRIKE, "text-decoration: line-through", yes},
1225 { TidyTag_U, "text-decoration: underline", yes},
1226 { TidyTag_UNKNOWN, NULL, no }
1227 };
1228 uint i;
1229
1230 for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1231 if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1232 && TagIsId(node, CSS_SpanEq[i].id) )
1233 {
1234 *s = CSS_SpanEq[i].CSSeq;
1235 return yes;
1236 }
1237 return no;
1238 }
1239
1240 /* Necessary conditions to apply BlockStyle(). */
CanApplyBlockStyle(Node * node)1241 static Bool CanApplyBlockStyle( Node *node )
1242 {
1243 if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1244 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1245 {
1246 return yes;
1247 }
1248 return no;
1249 }
1250
1251 /*
1252 Symptom: the only child of a block-level element is a
1253 presentation element such as B, I or FONT
1254
1255 Action: add style "font-weight: bold" to the block and
1256 strip the <b> element, leaving its children.
1257
1258 example:
1259
1260 <p>
1261 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1262 </p>
1263
1264 becomes:
1265
1266 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1267 Draft Recommended Practice
1268 </p>
1269
1270 This code also replaces the align attribute by a style attribute.
1271 However, to avoid CSS problems with Navigator 4, this isn't done
1272 for the elements: caption, tr and table
1273 */
BlockStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1274 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1275 {
1276 Node *child;
1277 ctmbstr CSSeq;
1278
1279 /* check for bgcolor */
1280 if ( nodeIsTABLE(node)
1281 || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1282 TableBgColor( doc, node );
1283
1284 if (CanApplyBlockStyle(node))
1285 {
1286 /* check for align attribute */
1287 if ( !nodeIsCAPTION(node) )
1288 TextAlign( doc, node );
1289
1290 child = node->content;
1291 if (child == NULL)
1292 return no;
1293
1294 /* check child has no peers */
1295 if (child->next)
1296 return no;
1297
1298 if ( FindCSSSpanEq(child, &CSSeq, no) )
1299 {
1300 MergeStyles( doc, node, child );
1301 TY_(AddStyleProperty)( doc, node, CSSeq );
1302 StripOnlyChild( doc, node );
1303 return yes;
1304 }
1305 else if ( nodeIsFONT(child) )
1306 {
1307 MergeStyles( doc, node, child );
1308 AddFontStyles( doc, node, child->attributes );
1309 StripOnlyChild( doc, node );
1310 return yes;
1311 }
1312 }
1313
1314 return no;
1315 }
1316
1317 /* Necessary conditions to apply InlineStyle(). */
CanApplyInlineStyle(Node * node)1318 static Bool CanApplyInlineStyle( Node *node )
1319 {
1320 return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1321 }
1322
1323 /* the only child of table cell or an inline element such as em */
InlineStyle(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1324 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1325 {
1326 Node *child;
1327 ctmbstr CSSeq;
1328
1329 if ( CanApplyInlineStyle(node) )
1330 {
1331 child = node->content;
1332
1333 if (child == NULL)
1334 return no;
1335
1336 /* check child has no peers */
1337
1338 if (child->next)
1339 return no;
1340
1341 if ( FindCSSSpanEq(child, &CSSeq, no) )
1342 {
1343 MergeStyles( doc, node, child );
1344 TY_(AddStyleProperty)( doc, node, CSSeq );
1345 StripOnlyChild( doc, node );
1346 return yes;
1347 }
1348 else if ( nodeIsFONT(child) )
1349 {
1350 MergeStyles( doc, node, child );
1351 AddFontStyles( doc, node, child->attributes );
1352 StripOnlyChild( doc, node );
1353 return yes;
1354 }
1355 }
1356
1357 return no;
1358 }
1359
1360 /*
1361 Transform element to equivalent CSS
1362 */
InlineElementToCSS(TidyDocImpl * doc,Node * node,Node ** ARG_UNUSED (pnode))1363 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1364 Node **ARG_UNUSED(pnode) )
1365 {
1366 ctmbstr CSSeq;
1367
1368 /* if node is the only child of parent element then leave alone
1369 Do so only if BlockStyle may be succesful. */
1370 if ( node->parent->content == node && node->next == NULL &&
1371 (CanApplyBlockStyle(node->parent)
1372 || CanApplyInlineStyle(node->parent)) )
1373 return no;
1374
1375 if ( FindCSSSpanEq(node, &CSSeq, yes) )
1376 {
1377 RenameElem( doc, node, TidyTag_SPAN );
1378 TY_(AddStyleProperty)( doc, node, CSSeq );
1379 return yes;
1380 }
1381 return no;
1382 }
1383
1384 /*
1385 Replace font elements by span elements, deleting
1386 the font element's attributes and replacing them
1387 by a single style attribute.
1388 */
Font2Span(TidyDocImpl * doc,Node * node,Node ** pnode)1389 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1390 {
1391 AttVal *av, *style, *next;
1392
1393 if ( nodeIsFONT(node) )
1394 {
1395 if ( cfgBool(doc, TidyDropFontTags) )
1396 {
1397 DiscardContainer( doc, node, pnode );
1398 return yes;
1399 }
1400
1401 /* if node is the only child of parent element then leave alone
1402 Do so only if BlockStyle may be succesful. */
1403 if ( node->parent->content == node && node->next == NULL &&
1404 CanApplyBlockStyle(node->parent) )
1405 return no;
1406
1407 AddFontStyles( doc, node, node->attributes );
1408
1409 /* extract style attribute and free the rest */
1410 av = node->attributes;
1411 style = NULL;
1412
1413 while (av)
1414 {
1415 next = av->next;
1416
1417 if (attrIsSTYLE(av))
1418 {
1419 av->next = NULL;
1420 style = av;
1421 }
1422 else
1423 {
1424 TY_(FreeAttribute)( doc, av );
1425 }
1426 av = next;
1427 }
1428
1429 node->attributes = style;
1430 RenameElem( doc, node, TidyTag_SPAN );
1431 return yes;
1432 }
1433
1434 return no;
1435 }
1436
1437 /*
1438 Applies all matching rules to a node.
1439 */
CleanNode(TidyDocImpl * doc,Node * node)1440 Node* CleanNode( TidyDocImpl* doc, Node *node )
1441 {
1442 Node *next = NULL;
1443 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1444 TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1445
1446 for (next = node; TY_(nodeIsElement)(node); node = next)
1447 {
1448 if ( Dir2Div(doc, node, &next) )
1449 continue;
1450
1451 /* Special case: true result means
1452 ** that arg node and its parent no longer exist.
1453 ** So we must jump back up the CreateStyleProperties()
1454 ** call stack until we have a valid node reference.
1455 */
1456 if ( NestedList(doc, node, &next) )
1457 return next;
1458
1459 if ( Center2Div(doc, node, &next) )
1460 continue;
1461
1462 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1463 continue;
1464
1465 if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1466 continue;
1467
1468 if ( BlockStyle(doc, node, &next) )
1469 continue;
1470
1471 if ( InlineStyle(doc, node, &next) )
1472 continue;
1473
1474 if ( InlineElementToCSS(doc, node, &next) )
1475 continue;
1476
1477 if ( Font2Span(doc, node, &next) )
1478 continue;
1479
1480 break;
1481 }
1482
1483 return next;
1484 }
1485
1486 /* Special case: if the current node is destroyed by
1487 ** CleanNode() lower in the tree, this node and its parent
1488 ** no longer exist. So we must jump back up the CleanTree()
1489 ** call stack until we have a valid node reference.
1490 */
1491
CleanTree(TidyDocImpl * doc,Node * node)1492 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1493 {
1494 if (node->content)
1495 {
1496 Node *child;
1497 for (child = node->content; child != NULL; child = child->next)
1498 {
1499 child = CleanTree( doc, child );
1500 if ( !child )
1501 break;
1502 }
1503 }
1504
1505 return CleanNode( doc, node );
1506 }
1507
/*
  Post-order walk: calls Style2Rule() on every descendant of node
  and finally on node itself.
*/
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
{
    Node *kid = node->content;

    while ( kid != NULL )
    {
        DefineStyleRules( doc, kid );
        kid = kid->next;
    }

    Style2Rule( doc, node );
}
1523
TY_(CleanDocument)1524 void TY_(CleanDocument)( TidyDocImpl* doc )
1525 {
1526 /* placeholder. CleanTree()/CleanNode() will not
1527 ** zap root element
1528 */
1529 CleanTree( doc, &doc->root );
1530
1531 if ( cfgBool(doc, TidyMakeClean) )
1532 {
1533 DefineStyleRules( doc, &doc->root );
1534 CreateStyleElement( doc );
1535 }
1536 }
1537
1538 /* simplifies <b><b> ... </b> ...</b> etc. */
TY_(NestedEmphasis)1539 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1540 {
1541 Node *next;
1542
1543 while (node)
1544 {
1545 next = node->next;
1546
1547 if ( (nodeIsB(node) || nodeIsI(node))
1548 && node->parent && node->parent->tag == node->tag)
1549 {
1550 /* strip redundant inner element */
1551 DiscardContainer( doc, node, &next );
1552 node = next;
1553 continue;
1554 }
1555
1556 if ( node->content )
1557 TY_(NestedEmphasis)( doc, node->content );
1558
1559 node = next;
1560 }
1561 }
1562
1563
1564
1565 /* replace i by em and b by strong */
TY_(EmFromI)1566 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1567 {
1568 while (node)
1569 {
1570 if ( nodeIsI(node) )
1571 RenameElem( doc, node, TidyTag_EM );
1572 else if ( nodeIsB(node) )
1573 RenameElem( doc, node, TidyTag_STRONG );
1574
1575 if ( node->content )
1576 TY_(EmFromI)( doc, node->content );
1577
1578 node = node->next;
1579 }
1580 }
1581
HasOneChild(Node * node)1582 static Bool HasOneChild(Node *node)
1583 {
1584 return (node->content && node->content->next == NULL);
1585 }
1586
1587 /*
1588 Some people use dir or ul without an li
1589 to indent the content. The pattern to
1590 look for is a list with a single implicit
1591 li. This is recursively replaced by an
1592 implicit blockquote.
1593 */
TY_(List2BQ)1594 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1595 {
1596 while (node)
1597 {
1598 if (node->content)
1599 TY_(List2BQ)( doc, node->content );
1600
1601 if ( node->tag && node->tag->parser == TY_(ParseList) &&
1602 HasOneChild(node) && node->content->implicit )
1603 {
1604 StripOnlyChild( doc, node );
1605 RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1606 node->implicit = yes;
1607 }
1608
1609 node = node->next;
1610 }
1611 }
1612
1613
1614 /*
1615 Replace implicit blockquote by div with an indent
1616 taking care to reduce nested blockquotes to a single
1617 div with the indent set to match the nesting depth
1618 */
TY_(BQ2Div)1619 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1620 {
1621 tmbchar indent_buf[ 32 ];
1622 uint indent;
1623
1624 while (node)
1625 {
1626 if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1627 {
1628 indent = 1;
1629
1630 while( HasOneChild(node) &&
1631 nodeIsBLOCKQUOTE(node->content) &&
1632 node->implicit)
1633 {
1634 ++indent;
1635 StripOnlyChild( doc, node );
1636 }
1637
1638 if (node->content)
1639 TY_(BQ2Div)( doc, node->content );
1640
1641 TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1642 2*indent);
1643
1644 RenameElem( doc, node, TidyTag_DIV );
1645 TY_(AddStyleProperty)(doc, node, indent_buf );
1646 }
1647 else if (node->content)
1648 TY_(BQ2Div)( doc, node->content );
1649
1650 node = node->next;
1651 }
1652 }
1653
1654
/*
  Returns node itself or its nearest ancestor that is a TD element,
  or NULL when there is none. doc is not used.
*/
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
{
    Node *cur = node;

    while ( cur != NULL && !nodeIsTD(cur) )
        cur = cur->parent;

    return cur;
}
1666
1667 /* node is <![if ...]> prune up to <![endif]> */
PruneSection(TidyDocImpl * doc,Node * node)1668 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1669 {
1670 Lexer* lexer = doc->lexer;
1671
1672 for (;;)
1673 {
1674 ctmbstr lexbuf = lexer->lexbuf + node->start;
1675 if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1676 {
1677 Node* cell = FindEnclosingCell( doc, node );
1678 if ( cell )
1679 {
1680 /* Need to put into cell so it doesn't look weird
1681 */
1682 Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1683 assert( (byte)'\240' == (byte)160 );
1684 TY_(InsertNodeBeforeElement)( node, nbsp );
1685 }
1686 }
1687
1688 /* discard node and returns next, unless it is a text node */
1689 if ( node->type == TextNode )
1690 node = node->next;
1691 else
1692 node = TY_(DiscardElement)( doc, node );
1693
1694 if (node == NULL)
1695 return NULL;
1696
1697 if (node->type == SectionTag)
1698 {
1699 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1700 {
1701 node = PruneSection( doc, node );
1702 continue;
1703 }
1704
1705 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1706 {
1707 node = TY_(DiscardElement)( doc, node );
1708 break;
1709 }
1710 }
1711 }
1712
1713 return node;
1714 }
1715
TY_(DropSections)1716 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1717 {
1718 Lexer* lexer = doc->lexer;
1719 while (node)
1720 {
1721 if (node->type == SectionTag)
1722 {
1723 /* prune up to matching endif */
1724 if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1725 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1726 {
1727 node = PruneSection( doc, node );
1728 continue;
1729 }
1730
1731 /* discard others as well */
1732 node = TY_(DiscardElement)( doc, node );
1733 continue;
1734 }
1735
1736 if (node->content)
1737 TY_(DropSections)( doc, node->content );
1738
1739 node = node->next;
1740 }
1741 }
1742
/*
  Removes Word 2000 specific attributes from node.

  Kept:    class attributes that have no value, whose value is
           "Code", or whose value does not start with "Mso".
  Removed: the remaining class attributes, style, lang, any
           attribute whose name starts with "x:", and height/width
           on TD, TR and TH elements.
  All other attributes are left in place.
*/
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
{
    AttVal *attr, *next, *prev = NULL;

    for ( attr = node->attributes; attr; attr = next )
    {
        next = attr->next;

        /* special check for class="Code" denoting pre text */
        /* Pass thru user defined styles as HTML class names */
        if (attrIsCLASS(attr))
        {
            /* A class attribute may have no value at all; check that
               first so a NULL value is never handed to tmbstrncmp. */
            if (!AttrHasValue(attr) ||
                AttrValueIs(attr, "Code") ||
                TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
            {
                prev = attr;
                continue;
            }
        }

        if (attrIsCLASS(attr) ||
            attrIsSTYLE(attr) ||
            attrIsLANG(attr)  ||
             ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
               (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
            (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
        {
            /* unlink attr from the list before freeing it */
            if (prev)
                prev->next = next;
            else
                node->attributes = next;

            TY_(FreeAttribute)( doc, attr );
        }
        else
            prev = attr;
    }
}
1781
/*
  Word2000 uses span excessively, so we strip span out.

  Cleans span's content with CleanWord2000(), moves the resulting
  children out of span so that they sit in front of it among its
  siblings, and then discards span itself.

  Returns the value span->next had just before span was discarded
  (NULL when span was the last sibling).

  Note: despite the name, callers in this file also pass P and
  other block-level elements, not only SPAN/FONT.
*/
static Node* StripSpan( TidyDocImpl* doc, Node* span )
{
    Node *node, *prev = NULL, *content;

    /*
     deal with span elements that have content
     by splicing the content in place of the span
     after having processed it
    */

    TY_(CleanWord2000)( doc, span->content );
    content = span->content;

    /* Find the node after which the content will be inserted: the
       sibling before span if there is one, otherwise the first
       content node, which is moved directly in front of span. */
    if (span->prev)
        prev = span->prev;
    else if (content)
    {
        node = content;
        content = content->next;
        TY_(RemoveNode)(node);
        TY_(InsertNodeBeforeElement)(span, node);
        prev = node;
    }

    /* move the remaining content nodes, one after the other,
       behind prev (the next pointer is saved before unlinking) */
    while (content)
    {
        node = content;
        content = content->next;
        TY_(RemoveNode)(node);
        TY_(InsertNodeAfterElement)(prev, node);
        prev = node;
    }

    /* span is the last sibling: record the last node placed in
       front of it as the parent's last child */
    if (span->next == NULL)
        span->parent->last = prev;

    node = span->next;
    /* content has been moved out; clear the pointer before span
       is discarded */
    span->content = NULL;
    TY_(DiscardElement)( doc, span );
    return node;
}
1824
1825 /* map non-breaking spaces to regular spaces */
TY_(NormalizeSpaces)1826 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1827 {
1828 while ( node )
1829 {
1830 if ( node->content )
1831 TY_(NormalizeSpaces)( lexer, node->content );
1832
1833 if (TY_(nodeIsText)(node))
1834 {
1835 uint i, c;
1836 tmbstr p = lexer->lexbuf + node->start;
1837
1838 for (i = node->start; i < node->end; ++i)
1839 {
1840 c = (byte) lexer->lexbuf[i];
1841
1842 /* look for UTF-8 multibyte character */
1843 if ( c > 0x7F )
1844 i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1845
1846 if ( c == 160 )
1847 c = ' ';
1848
1849 p = TY_(PutUTF8)(p, c);
1850 }
1851 node->end = p - lexer->lexbuf;
1852 }
1853
1854 node = node->next;
1855 }
1856 }
1857
1858 /* used to hunt for hidden preformatted sections */
NoMargins(Node * node)1859 static Bool NoMargins(Node *node)
1860 {
1861 AttVal * const attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1862
1863 if ( !AttrHasValue(attval) )
1864 return no;
1865
1866 /* search for substring "margin-top: 0" */
1867 if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1868 return no;
1869
1870 /* search for substring "margin-bottom: 0" */
1871 if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1872 return no;
1873
1874 return yes;
1875 }
1876
1877 /* does element have a single space as its content? */
SingleSpace(Lexer * lexer,Node * node)1878 static Bool SingleSpace( Lexer* lexer, Node* node )
1879 {
1880 if ( node->content )
1881 {
1882 node = node->content;
1883
1884 if ( node->next != NULL )
1885 return no;
1886
1887 if ( node->type != TextNode )
1888 return no;
1889
1890 if ( (node->end - node->start) == 1 &&
1891 lexer->lexbuf[node->start] == ' ' )
1892 return yes;
1893
1894 if ( (node->end - node->start) == 2 )
1895 {
1896 uint c = 0;
1897 TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1898 if ( c == 160 )
1899 return yes;
1900 }
1901 }
1902
1903 return no;
1904 }
1905
/*
  This is a major clean up to strip out all the extra stuff you get
  when you save as web page from Word 2000. It doesn't yet know what
  to do with VML tags, but these will appear as errors unless you
  declare them as new tags, such as o:p which needs to be declared
  as inline.

  Walks node and its following siblings, recursing into content.
  For each node, in this order:
    - on the HTML element: stop entirely unless the document has an
      xmlns:o attribute or TidyMakeBare is set, else free its attributes
    - turn runs of zero-margin paragraphs into a single PRE
    - strip block elements whose only content is a single space
    - discard STYLE, META and comment nodes
    - strip SPAN and FONT elements, keeping their content
    - discard LINK elements with rel="File-List"
    - discard o:p containers, keeping their content
    - trim empty paragraphs
    - gather Word list paragraphs into UL/OL and class="Code"
      paragraphs into PRE
    - purge Word specific attributes from whatever remains
*/
void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
{
    /* used to build a list from a sequence of bulleted p's */
    Lexer* lexer = doc->lexer;
    Node* list = NULL;

    while ( node )
    {
        /* get rid of Word's xmlns attributes */
        if ( nodeIsHTML(node) )
        {
            /* check that it's a Word 2000 document */
            if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
                 !cfgBool(doc, TidyMakeBare) )
                return;

            TY_(FreeAttrs)( doc, node );
        }

        /* fix up preformatted sections by looking for a
        ** sequence of paragraphs with zero top/bottom margin
        */
        if ( nodeIsP(node) )
        {
            if (NoMargins(node))
            {
                Node *pre, *next;
                /* the first such paragraph becomes the PRE element */
                TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);

                PurgeWord2000Attributes( doc, node );

                if (node->content)
                    TY_(CleanWord2000)( doc, node->content );

                pre = node;
                node = node->next;

                /* continue to strip p's */

                /* each following zero-margin paragraph is moved to the
                   end of pre after a line break node, then stripped
                   so only its content remains */
                while ( nodeIsP(node) && NoMargins(node) )
                {
                    next = node->next;
                    TY_(RemoveNode)(node);
                    TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
                    TY_(InsertNodeAtEnd)(pre, node);
                    StripSpan( doc, node );
                    node = next;
                }

                /* ran off the end of the sibling list */
                if (node == NULL)
                    break;
            }
        }

        /* block element holding just a single space: strip it */
        if (node->tag && (node->tag->model & CM_BLOCK)
            && SingleSpace(lexer, node))
        {
            node = StripSpan( doc, node );
            continue;
        }
        /* discard Word's style verbiage */
        if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
             node->type == CommentTag )
        {
            node = TY_(DiscardElement)( doc, node );
            continue;
        }

        /* strip out all span and font tags Word scatters so liberally! */
        if ( nodeIsSPAN(node) || nodeIsFONT(node) )
        {
            node = StripSpan( doc, node );
            continue;
        }

        if ( nodeIsLINK(node) )
        {
            AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);

            if (AttrValueIs(attr, "File-List"))
            {
                node = TY_(DiscardElement)( doc, node );
                continue;
            }
        }

        /* discards <o:p> which encodes the paragraph mark */
        if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
        {
            Node* next;
            DiscardContainer( doc, node, &next );
            node = next;
            continue;
        }

        /* discard empty paragraphs */

        if ( node->content == NULL && nodeIsP(node) )
        {
            /*  Use the existing function to ensure consistency */
            Node *next = TY_(TrimEmptyElement)( doc, node );
            node = next;
            continue;
        }

        if ( nodeIsP(node) )
        {
            AttVal *attr, *atrStyle;

            attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
            atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
            /*
               (JES) Sometimes Word marks a list item with the following hokie syntax
               <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
                translate these into <li>
            */
            /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
            /* map <p class="MsoListNumber"> to <ol>...</ol> */
            if ( AttrValueIs(attr, "MsoListBullet") ||
                 AttrValueIs(attr, "MsoListNumber") ||
                 AttrContains(atrStyle, "mso-list:") )
            {
                TidyTagId listType = TidyTag_UL;
                if (AttrValueIs(attr, "MsoListNumber"))
                    listType = TidyTag_OL;

                TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);

                /* start a new list unless the previous iteration left
                   an open list of the same type */
                if ( !list || TagId(list) != listType )
                {
                    const Dict* tag = TY_(LookupTagDef)( listType );
                    list = TY_(InferredTag)(doc, tag->id);
                    TY_(InsertNodeBeforeElement)(node, list);
                }

                PurgeWord2000Attributes( doc, node );

                if ( node->content )
                    TY_(CleanWord2000)( doc, node->content );

                /* remove node and append to contents of list */
                TY_(RemoveNode)(node);
                TY_(InsertNodeAtEnd)(list, node);
                /* continue from the list element; the common tail
                   below then advances to its next sibling */
                node = list;
            }
            /* map sequence of <p class="Code"> to <pre>...</pre> */
            else if (AttrValueIs(attr, "Code"))
            {
                Node *br = TY_(NewLineNode)(lexer);
                TY_(NormalizeSpaces)(lexer, node->content);

                /* start a new PRE unless one is already open */
                if ( !list || TagId(list) != TidyTag_PRE )
                {
                    list = TY_(InferredTag)(doc, TidyTag_PRE);
                    TY_(InsertNodeBeforeElement)(node, list);
                }

                /* remove node and append to contents of list */
                TY_(RemoveNode)(node);
                TY_(InsertNodeAtEnd)(list, node);
                StripSpan( doc, node );
                TY_(InsertNodeAtEnd)(list, br);
                /* resume with the sibling after the PRE; note that the
                   common tail below is applied to that sibling and
                   then steps past it */
                node = list->next;
            }
            else
                list = NULL;
        }
        else
            list = NULL;

        /* only the "Code" branch above can leave node NULL here */
        if (!node)
            return;

        /* strip out style and class attributes */
        if (TY_(nodeIsElement)(node))
            PurgeWord2000Attributes( doc, node );

        if (node->content)
            TY_(CleanWord2000)( doc, node->content );

        node = node->next;
    }
}
2096
TY_(IsWord2000)2097 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2098 {
2099 AttVal *attval;
2100 Node *node, *head;
2101 Node *html = TY_(FindHTML)( doc );
2102
2103 if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2104 return yes;
2105
2106 /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2107 head = TY_(FindHEAD)( doc );
2108
2109 if (head)
2110 {
2111 for (node = head->content; node; node = node->next)
2112 {
2113 if ( !nodeIsMETA(node) )
2114 continue;
2115
2116 attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2117
2118 if ( !AttrValueIs(attval, "generator") )
2119 continue;
2120
2121 attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2122
2123 if ( AttrContains(attval, "Microsoft") )
2124 return yes;
2125 }
2126 }
2127
2128 return no;
2129 }
2130
2131 /* where appropriate move object elements from head to body */
TY_(BumpObject)2132 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2133 {
2134 Node *node, *next, *head = NULL, *body = NULL;
2135
2136 if (!html)
2137 return;
2138
2139 for ( node = html->content; node != NULL; node = node->next )
2140 {
2141 if ( nodeIsHEAD(node) )
2142 head = node;
2143
2144 if ( nodeIsBODY(node) )
2145 body = node;
2146 }
2147
2148 if ( head != NULL && body != NULL )
2149 {
2150 for (node = head->content; node != NULL; node = next)
2151 {
2152 next = node->next;
2153
2154 if ( nodeIsOBJECT(node) )
2155 {
2156 Node *child;
2157 Bool bump = no;
2158
2159 for (child = node->content; child != NULL; child = child->next)
2160 {
2161 /* bump to body unless content is param */
2162 if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2163 || !nodeIsPARAM(child) )
2164 {
2165 bump = yes;
2166 break;
2167 }
2168 }
2169
2170 if ( bump )
2171 {
2172 TY_(RemoveNode)( node );
2173 TY_(InsertNodeAtStart)( body, node );
2174 }
2175 }
2176 }
2177 }
2178 }
2179
TY_(VerifyHTTPEquiv)2180 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2181 {
2182 Node *pNode;
2183 StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2184 tmbstr s, pszBegin, pszEnd;
2185 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2186
2187 if (!enc)
2188 return;
2189
2190 if (!nodeIsHEAD(head))
2191 head = TY_(FindHEAD)(doc);
2192
2193 if (!head)
2194 return;
2195
2196 /* Find any <meta http-equiv='Content-Type' content='...' /> */
2197 for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2198 {
2199 AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2200 AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2201
2202 if ( !nodeIsMETA(pNode) || !metaContent ||
2203 !AttrValueIs(httpEquiv, "Content-Type") )
2204 continue;
2205
2206 pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2207 while (pszBegin && *pszBegin)
2208 {
2209 while (isspace( *pszBegin ))
2210 pszBegin++;
2211 pszEnd = pszBegin;
2212 while ('\0' != *pszEnd && ';' != *pszEnd)
2213 pszEnd++;
2214 if (';' == *pszEnd )
2215 *(pszEnd++) = '\0';
2216 if (pszEnd > pszBegin)
2217 {
2218 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2219 prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2220 prop->value = NULL;
2221 prop->next = NULL;
2222
2223 if (NULL != pLastProp)
2224 pLastProp->next = prop;
2225 else
2226 pFirstProp = prop;
2227
2228 pLastProp = prop;
2229 pszBegin = pszEnd;
2230 }
2231 }
2232 TidyDocFree( doc, s );
2233
2234 /* find the charset property */
2235 for (prop = pFirstProp; NULL != prop; prop = prop->next)
2236 {
2237 if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2238 continue;
2239
2240 TidyDocFree( doc, prop->name );
2241 prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2242 TY_(tmbstrcpy)(prop->name, "charset=");
2243 TY_(tmbstrcpy)(prop->name+8, enc);
2244 s = CreatePropString( doc, pFirstProp );
2245 TidyDocFree( doc, metaContent->value );
2246 metaContent->value = s;
2247 break;
2248 }
2249 /* #718127, prevent memory leakage */
2250 FreeStyleProps(doc, pFirstProp);
2251 pFirstProp = NULL;
2252 pLastProp = NULL;
2253 }
2254 }
2255
TY_(DropComments)2256 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2257 {
2258 Node* next;
2259
2260 while (node)
2261 {
2262 next = node->next;
2263
2264 if (node->type == CommentTag)
2265 {
2266 TY_(RemoveNode)(node);
2267 TY_(FreeNode)(doc, node);
2268 node = next;
2269 continue;
2270 }
2271
2272 if (node->content)
2273 TY_(DropComments)(doc, node->content);
2274
2275 node = next;
2276 }
2277 }
2278
TY_(DropFontElements)2279 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2280 {
2281 Node* next;
2282
2283 while (node)
2284 {
2285 next = node->next;
2286
2287 if (nodeIsFONT(node))
2288 {
2289 DiscardContainer(doc, node, &next);
2290 node = next;
2291 continue;
2292 }
2293
2294 if (node->content)
2295 TY_(DropFontElements)(doc, node->content, &next);
2296
2297 node = next;
2298 }
2299 }
2300
TY_(WbrToSpace)2301 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2302 {
2303 Node* next;
2304
2305 while (node)
2306 {
2307 next = node->next;
2308
2309 if (nodeIsWBR(node))
2310 {
2311 Node* text;
2312 text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2313 TY_(InsertNodeAfterElement)(node, text);
2314 TY_(RemoveNode)(node);
2315 TY_(FreeNode)(doc, node);
2316 node = next;
2317 continue;
2318 }
2319
2320 if (node->content)
2321 TY_(WbrToSpace)(doc, node->content);
2322
2323 node = next;
2324 }
2325 }
2326
2327 /*
2328 Filters from Word and PowerPoint often use smart
2329 quotes resulting in character codes between 128
2330 and 159. Unfortunately, the corresponding HTML 4.0
2331 entities for these are not widely supported. The
2332 following converts dashes and quotation marks to
2333 the nearest ASCII equivalent. My thanks to
2334 Andrzej Novosiolov for his help with this code.
2335
2336 Note: The old code in the pretty printer applied
2337 this to all node types and attribute values while
2338 this routine applies it only to text nodes. First,
2339 Microsoft Office products rarely put the relevant
2340 characters into these tokens, second support for
2341 them is much better now and last but not least, it
2342 can be harmful to replace these characters since
2343 US-ASCII quote marks are often used as syntax
2344 characters, a simple
2345
2346 <a onmouseover="alert('‘')">...</a>
2347
2348 would be broken if the U+2018 is replaced by "'".
2349 The old code would neither take care whether the
2350 quote mark is already used as delimiter,
2351
2352 <p title='‘'>...</p>
2353
2354 got
2355
2356 <p title='''>...</p>
2357
2358 Since browser support is much better nowadays and
2359 high-quality typography is better than ASCII it'd
2360 be probably a good idea to drop the feature...
2361 */
TY_(DowngradeTypography)2362 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2363 {
2364 Node* next;
2365 Lexer* lexer = doc->lexer;
2366
2367 while (node)
2368 {
2369 next = node->next;
2370
2371 if (TY_(nodeIsText)(node))
2372 {
2373 uint i, c;
2374 tmbstr p = lexer->lexbuf + node->start;
2375
2376 for (i = node->start; i < node->end; ++i)
2377 {
2378 c = (unsigned char) lexer->lexbuf[i];
2379
2380 if (c > 0x7F)
2381 i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2382
2383 if (c >= 0x2013 && c <= 0x201E)
2384 {
2385 switch (c)
2386 {
2387 case 0x2013: /* en dash */
2388 case 0x2014: /* em dash */
2389 c = '-';
2390 break;
2391 case 0x2018: /* left single quotation mark */
2392 case 0x2019: /* right single quotation mark */
2393 case 0x201A: /* single low-9 quotation mark */
2394 c = '\'';
2395 break;
2396 case 0x201C: /* left double quotation mark */
2397 case 0x201D: /* right double quotation mark */
2398 case 0x201E: /* double low-9 quotation mark */
2399 c = '"';
2400 break;
2401 }
2402 }
2403
2404 p = TY_(PutUTF8)(p, c);
2405 }
2406
2407 node->end = p - lexer->lexbuf;
2408 }
2409
2410 if (node->content)
2411 TY_(DowngradeTypography)(doc, node->content);
2412
2413 node = next;
2414 }
2415 }
2416
TY_(ReplacePreformattedSpaces)2417 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2418 {
2419 Node* next;
2420
2421 while (node)
2422 {
2423 next = node->next;
2424
2425 if (node->tag && node->tag->parser == TY_(ParsePre))
2426 {
2427 TY_(NormalizeSpaces)(doc->lexer, node->content);
2428 node = next;
2429 continue;
2430 }
2431
2432 if (node->content)
2433 TY_(ReplacePreformattedSpaces)(doc, node->content);
2434
2435 node = next;
2436 }
2437 }
2438
TY_(ConvertCDATANodes)2439 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2440 {
2441 Node* next;
2442
2443 while (node)
2444 {
2445 next = node->next;
2446
2447 if (node->type == CDATATag)
2448 node->type = TextNode;
2449
2450 if (node->content)
2451 TY_(ConvertCDATANodes)(doc, node->content);
2452
2453 node = next;
2454 }
2455 }
2456
2457 /*
2458 FixLanguageInformation ensures that the document contains (only)
2459 the attributes for language information desired by the output
2460 document type. For example, for XHTML 1.0 documents both
2461 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2462 is desired and for HTML 4.01 only 'lang' is desired.
2463 */
TY_(FixLanguageInformation)2464 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2465 {
2466 Node* next;
2467
2468 while (node)
2469 {
2470 next = node->next;
2471
2472 /* todo: report modifications made here to the report system */
2473
2474 if (TY_(nodeIsElement)(node))
2475 {
2476 AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2477 AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2478
2479 if (lang && xmlLang)
2480 {
2481 /*
2482 todo: check whether both attributes are in sync,
2483 here or elsewhere, where elsewhere is probably
2484 preferable.
2485 AD - March 2005: not mandatory according the standards.
2486 */
2487 }
2488 else if (lang && wantXmlLang)
2489 {
2490 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2491 & doc->lexer->versionEmitted)
2492 TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2493 }
2494 else if (xmlLang && wantLang)
2495 {
2496 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2497 & doc->lexer->versionEmitted)
2498 TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2499 }
2500
2501 if (lang && !wantLang)
2502 TY_(RemoveAttribute)(doc, node, lang);
2503
2504 if (xmlLang && !wantXmlLang)
2505 TY_(RemoveAttribute)(doc, node, xmlLang);
2506 }
2507
2508 if (node->content)
2509 TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2510
2511 node = next;
2512 }
2513 }
2514
2515 /*
2516 Set/fix/remove <html xmlns='...'>
2517 */
TY_(FixXhtmlNamespace)2518 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2519 {
2520 Node* html = TY_(FindHTML)(doc);
2521 AttVal* xmlns;
2522
2523 if (!html)
2524 return;
2525
2526 xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2527
2528 if (wantXmlns)
2529 {
2530 if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2531 TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2532 }
2533 else if (xmlns)
2534 {
2535 TY_(RemoveAttribute)(doc, html, xmlns);
2536 }
2537 }
2538
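/*
  FixAnchors reconciles the 'name' and 'id' attributes of anchor
  elements: it adds whichever of the two is wanted but missing (when
  the emitted document version allows it), removes whichever is not
  wanted, and calls RemoveAnchorByNode for elements left with neither.
*/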
TY_(FixAnchors)2542 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2543 {
2544 Node* next;
2545
2546 while (node)
2547 {
2548 next = node->next;
2549
2550 if (TY_(IsAnchorElement)(doc, node))
2551 {
2552 AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2553 AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2554 Bool hadName = name!=NULL;
2555 Bool hadId = id!=NULL;
2556 Bool IdEmitted = no;
2557 Bool NameEmitted = no;
2558
2559 /* todo: how are empty name/id attributes handled? */
2560
2561 if (name && id)
2562 {
2563 Bool NameHasValue = AttrHasValue(name);
2564 Bool IdHasValue = AttrHasValue(id);
2565 if ( (NameHasValue != IdHasValue) ||
2566 (NameHasValue && IdHasValue &&
2567 TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2568 TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2569 }
2570 else if (name && wantId)
2571 {
2572 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2573 & doc->lexer->versionEmitted)
2574 {
2575 if (TY_(IsValidHTMLID)(name->value))
2576 {
2577 TY_(RepairAttrValue)(doc, node, "id", name->value);
2578 IdEmitted = yes;
2579 }
2580 else
2581 TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2582 }
2583 }
2584 else if (id && wantName)
2585 {
2586 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2587 & doc->lexer->versionEmitted)
2588 {
2589 /* todo: do not assume id is valid */
2590 TY_(RepairAttrValue)(doc, node, "name", id->value);
2591 NameEmitted = yes;
2592 }
2593 }
2594
2595 if (id && !wantId
2596 /* make sure that Name has been emitted if requested */
2597 && (hadName || !wantName || NameEmitted) )
2598 TY_(RemoveAttribute)(doc, node, id);
2599
2600 if (name && !wantName
2601 /* make sure that Id has been emitted if requested */
2602 && (hadId || !wantId || IdEmitted) )
2603 TY_(RemoveAttribute)(doc, node, name);
2604
2605 if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2606 TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2607 TY_(RemoveAnchorByNode)(doc, node);
2608 }
2609
2610 if (node->content)
2611 TY_(FixAnchors)(doc, node->content, wantName, wantId);
2612
2613 node = next;
2614 }
2615 }
2616
2617 /*
2618 * local variables:
2619 * mode: c
2620 * indent-tabs-mode: nil
2621 * c-basic-offset: 4
2622 * eval: (c-set-offset 'substatement-open 0)
2623 * end:
2624 */
2625