1 /*
2  * Copyright © 1997-2016 World Wide Web Consortium
3  * See http://www.w3.org/Consortium/Legal/copyright-software
4  *
5  * Author: Bert Bos <bert@w3.org>
6  * Created: 1997
7  **/
8 #include "config.h"
9 #include <assert.h>
10 #include <stdlib.h>
11 #if STDC_HEADERS
12 # include <string.h>
13 #else
14 # ifndef HAVE_STRCHR
15 #  define strchr index
16 #  define strrchr rindex
17 # endif
18 # ifndef HAVE_STRDUP
19 #  include "strdup.e"
20 # endif
21 #endif
22 #include <ctype.h>
23 #include <stdio.h>
24 #include <stdbool.h>
25 #include "export.h"
26 #include "heap.e"
27 #include "types.e"
28 #include "dtd.e"
29 #include "errexit.e"
30 #include "scan.e"
31 
32 EXPORT typedef enum {
33   Element, Text, Comment, Declaration, Procins, Root
34 } Nodetype;
35 
36 EXPORT typedef struct _node {
37   Nodetype tp;
38   string name;
39   pairlist attribs;
40   string text;
41   string url;
42   struct _node *parent;
43   struct _node *sister;
44   struct _node *children;
45 } Node, *Tree;
46 
47 
48 
49 /* create -- create an empty tree */
create(void)50 EXPORT Tree create(void)
51 {
52   Tree t = malloc(sizeof(*t));
53   assert(t != NULL);
54   t->tp = Root;
55   t->name = "";
56   t->sister = t->children = NULL;
57   return t;
58 }
59 
60 /* tree_delete -- recursively free the memory occupied by a tree */
tree_delete(Tree t)61 EXPORT void tree_delete(Tree t)
62 {
63   if (t != NULL) {
64     switch (t->tp) {
65       case Element:
66 	dispose(t->name);
67 	pairlist_delete(t->attribs);
68 	tree_delete(t->sister);
69 	tree_delete(t->children);
70 	break;
71       case Text:
72 	dispose(t->text);
73 	assert(t->children == NULL);
74 	tree_delete(t->sister);
75 	break;
76       case Comment:
77 	dispose(t->text);
78 	assert(t->children == NULL);
79 	tree_delete(t->sister);
80 	break;
81       case Declaration:
82 	dispose(t->name);
83 	dispose(t->text);
84 	dispose(t->url);
85 	assert(t->children == NULL);
86 	tree_delete(t->sister);
87 	break;
88       case Procins:
89 	dispose(t->text);
90 	assert(t->children == NULL);
91 	tree_delete(t->sister);
92 	break;
93       case Root:
94 	assert(t->sister == NULL);
95 	tree_delete(t->children);
96 	break;
97       default:
98 	assert(!"Cannot happen");
99     }
100     dispose(t);
101   }
102 }
103 
104 /* get_root -- return root of tree */
get_root(Tree t)105 EXPORT Tree get_root(Tree t)
106 {
107   while (t->tp != Root) t = t->parent;
108   return t;
109 }
110 
111 
112 /* get_attrib -- return a ptr to the value of a named attibute, or false */
get_attrib(const Node * e,const conststring attname)113 EXPORT conststring get_attrib(const Node *e, const conststring attname)
114 {
115   assert(e->tp == Element);
116   return pairlist_get(e->attribs, attname);
117 }
118 
119 
120 /* set_attrib -- set an attribute to a value */
set_attrib(Node * e,string name,conststring value)121 EXPORT void set_attrib(Node *e, string name, conststring value)
122 {
123   assert(e->tp == Element);
124   pairlist_set(&e->attribs, name, value);
125 }
126 
127 /* delete_attrib -- remove attribute from element, false if not found */
delete_attrib(Node * e,const conststring name)128 EXPORT bool delete_attrib(Node *e, const conststring name)
129 {
130   assert(e->tp == Element);
131   return pairlist_unset(&e->attribs, name);
132 }
133 
134 /* get_by_id -- recursively get the element node with the id in subtree n */
get_by_id(Node * n,const conststring id)135 static Tree get_by_id(Node *n, const conststring id)
136 {
137   conststring val;
138   Tree h, p;
139 
140   assert(n->tp == Element);
141   if ((val = get_attrib(n, "id")) && eq(val, id)) return n;
142   for (h = n->children; h; h = h->sister)
143     if (h->tp == Element && (p = get_by_id(h, id))) return p;
144   return NULL;
145 }
146 
147 /* get_elt_by_id -- get the element node with the ID attribute id */
get_elt_by_id(Node * n,const conststring id)148 EXPORT Tree get_elt_by_id(Node *n, const conststring id)
149 {
150   /* To do: should this use a hash table? */
151   Tree h, p;
152 
153   h = get_root(n);
154   for (h = h->children; h; h = h->sister)
155     if (h->tp == Element && (p = get_by_id(h, id))) return p;
156   return NULL;
157 }
158 
159 /* wrap_contents -- wrap contents of a node in an element, return new elt */
wrap_contents(Node * n,const string elem,pairlist attr)160 EXPORT Tree wrap_contents(Node *n, const string elem, pairlist attr)
161 {
162   Node *h, *k;
163 
164   new(h);
165   h->tp = Element;
166   h->name = newstring(elem);
167   h->attribs = attr;
168   h->sister = NULL;
169   h->parent = n;
170   h->children = n->children;
171   n->children = h;
172   for (k = h->children; k; k = k->sister) k->parent = h;
173   return h;
174 }
175 
176 /* wrap_elt -- wrap an element in a new element, return the new element */
wrap_elt(Node * n,const conststring elem,pairlist attr)177 EXPORT Tree wrap_elt(Node *n, const conststring elem, pairlist attr)
178 {
179   Node *h, *k;
180 
181   new(h);
182   h->tp = Element;
183   h->name = newstring(elem);
184   h->attribs = attr;
185   h->sister = n->sister;
186   h->parent = n->parent;
187   h->children = n;
188   n->sister = NULL;
189   n->parent = h;
190   if (h->parent->children == n) {
191     h->parent->children = h;
192   } else {
193     k = h->parent->children;
194     while (k->sister != n) {assert(k->sister->sister); k = k->sister;}
195     k->sister = h;
196   }
197   return h;
198 }
199 
200 /* rename_elt -- change the name of an element to elem */
rename_elt(Node * n,const string elem)201 EXPORT void rename_elt(Node *n, const string elem)
202 {
203   assert(n->tp == Element);
204   n->name = newstring(elem);
205 }
206 
207 /* push -- add a child node to the tree */
push(Tree t,Node * n)208 static Tree push(Tree t, Node *n)
209 {
210   if (t->children == NULL) {
211     t->children = n;
212   } else {
213     Tree h = t->children;
214     while (h->sister != NULL) h = h->sister;
215     h->sister = n;
216   }
217   n->parent = t;
218   return n;
219 }
220 
221 /* pop -- go up one level */
pop(Tree t)222 static Tree pop(Tree t)
223 {
224   assert(t != NULL);
225   assert(t->tp != Root);
226   return t->parent;
227 }
228 
229 /* append -- add at end of children */
append(Tree t,Node * n)230 static Tree append(Tree t, Node *n)
231 {
232   assert(t != NULL);
233   if (t->children == NULL) {
234     t->children = n;
235   } else {
236     Tree h = t->children;
237     while (h->sister != NULL) h = h->sister;
238     h->sister = n;
239   }
240   n->parent = t;
241   return t;
242 }
243 
244 /* lookup -- lookup info about an element case-insensitively */
lookup(const string e)245 static const ElementType *lookup(const string e)
246 {
247   char h[MAXNAMELEN+2];
248   strncpy(h, e, sizeof(h) - 1);
249   h[sizeof(h)-1] = '\0';
250   down(h);
251   return lookup_element(h, strlen(h));
252 }
253 
254 /* is_known -- true if the element is an HTML 4 element */
is_known(const string e)255 EXPORT bool is_known(const string e)
256 {
257   return lookup(e) != NULL;
258 }
259 
260 /* is_pre -- true if the element has preformatted content */
is_pre(const string e)261 EXPORT bool is_pre(const string e)
262 {
263   const ElementType *info = lookup(e);
264   return info && info->pre;
265 }
266 
267 /* need_stag -- true if the element's start tag is required */
need_stag(const string e)268 EXPORT bool need_stag(const string e)
269 {
270   const ElementType *info = lookup(e);
271   return !info || info->stag;
272 }
273 
274 /* need_etag -- true if the element's end tag is required */
need_etag(const string e)275 EXPORT bool need_etag(const string e)
276 {
277   const ElementType *info = lookup(e);
278   return !info || info->etag;
279 }
280 
281 /* is_empty -- true if element is empty */
is_empty(const string e)282 EXPORT bool is_empty(const string e)
283 {
284   const ElementType *info = lookup(e);
285   return info && info->empty;
286 }
287 
288 /* has_parent -- true if c accepts p as a parent */
has_parent(const string c,const string p)289 EXPORT bool has_parent(const string c, const string p)
290 {
291   const ElementType *info = lookup_element(c, strlen(c));
292   int i;
293   if (!info) return false;
294   for (i = 0; info->parents[i]; i++)
295     if (eq(info->parents[i], p)) return true;
296   return false;
297 }
298 
299 /* preferred_parent -- return first possible parent of e */
preferred_parent(const string e)300 static string preferred_parent(const string e)
301 {
302   const ElementType *info = lookup_element(e, strlen(e));
303   assert(info != NULL);				/* element is known */
304   assert(info->parents[0] != NULL);		/* element is not root */
305   return info->parents[0];
306 }
307 
308 /* is_root -- true if e has no possible parents */
is_root(const string e)309 static bool is_root(const string e)
310 {
311   const ElementType *info = lookup_element(e, strlen(e));
312   assert(info != NULL);				/* element is known */
313   return info->parents[0] == NULL;
314 }
315 
316 /* is_mixed -- true if e accepts text content */
is_mixed(const string e)317 EXPORT bool is_mixed(const string e)
318 {
319   const ElementType *info = lookup(e);
320   return !info || info->mixed;
321 }
322 
323 /* break_before -- true if element looks better with a newline before it */
break_before(const string e)324 EXPORT bool break_before(const string e)
325 {
326   const ElementType *info = lookup(e);
327   return info && info->break_before;
328 }
329 
330 /* break_after -- true if element looks better with a newline after it */
break_after(const string e)331 EXPORT bool break_after(const string e)
332 {
333   const ElementType *info = lookup(e);
334   return info && info->break_after;
335 }
336 
337 /* is_cdata_elt -- true if element has character data content */
is_cdata_elt(const string e)338 EXPORT bool is_cdata_elt(const string e)
339 {
340   const ElementType *info = lookup(e);
341   return info && info->cdata;
342 }
343 
344 /* build_path -- try to add omittable start tags to make elem acceptable */
build_path(Tree * t,string elem)345 static bool build_path(Tree *t, string elem)
346 {
347   const ElementType *info;
348   Node *n;
349   int i;
350 
351   assert(is_known(elem));
352   assert(is_known((*t)->name));
353 
354   /* Check if we are done */
355   if (has_parent(elem, (*t)->name)) return true;
356 
357   /* Try recursively if any possible parent can be a child of t */
358   info = lookup(elem);
359   for (i = 0; info->parents[i]; i++) {
360     if (!need_stag(info->parents[i]) && build_path(t, info->parents[i])) {
361       /* Success, so add this parent and return true */
362       n = malloc(sizeof(*n));
363       assert(n != NULL);
364       n->tp = Element;
365       n->name = newstring(info->parents[i]);
366       assert(islower(n->name[0]));
367       n->attribs = NULL;
368       n->sister = n->children = NULL;
369       *t = push(*t, n);
370       return true;
371     }
372   }
373   return false;
374 }
375 
376 /* tree_push -- add an element to the tree, without checking the DTD */
tree_push(Tree t,string elem,pairlist attr)377 EXPORT Tree tree_push(Tree t, string elem, pairlist attr)
378 {
379   Node *n;
380   new(n);
381   n->tp = Element;
382   n->name = newstring(elem);
383   n->attribs = attr;
384   n->sister = n->children = NULL;
385   return push(t, n);
386 }
387 
388 /* html_push -- add an element to the tree, open or close missing elements */
html_push(Tree t,string elem,pairlist attr)389 EXPORT Tree html_push(Tree t, string elem, pairlist attr)
390 {
391   pairlist a;
392   Node *h, *n = malloc(sizeof(*n));
393   assert(n != NULL);
394   n->tp = Element;
395   n->name = down(newstring(elem));
396   for (a = attr; a; a = a->next) down(a->name);
397   n->attribs = attr;
398   n->sister = n->children = NULL;
399 
400   /* Unknown elements are just pushed where they are */
401   if (!is_known(n->name)) return push(t, n);
402 
403   if (is_root(n->name)) {
404     while (t->tp != Root) t = pop(t);		/* Make sure root is at root */
405   } else if (is_known(t->name) && build_path(&t, n->name)) {
406     ;						/* Added missing start tags */
407   } else {
408     /* Check if there is a possible parent further up the tree */
409     for (h=t; h->tp!=Root && is_known(h->name) && !has_parent(n->name,h->name);
410 	 h = h->parent) ;
411     /* Close omitted end tags */
412     if (h->tp != Root) while (t != h) t = pop(t);
413     /* If no valid parent, fabricate one */
414     if (t->tp == Root || (is_known(t->name) && !has_parent(n->name, t->name)))
415       t = html_push(t, preferred_parent(n->name), NULL);
416   }
417   t = push(t, n);
418 
419   if (is_empty(n->name)) t = pop(t);
420   if (is_cdata_elt(n->name)) set_cdata_element(n->name); /* Change scanner */
421   return t;
422 }
423 
424 /* tree_pop -- close an open element, without checking the DTD */
tree_pop(Tree t,string elem)425 EXPORT Tree tree_pop(Tree t, string elem)
426 {
427   assert(t != NULL);
428   if (t->tp == Root) errexit("End tag </%s> without matching start tag\n", elem);
429   assert(t->tp == Element);
430   if (*elem == '\0') return pop(t); /* Empty end tag </> */
431   if (eq(t->name, elem)) return pop(t);
432   errexit("End tag </%s> doesn't match start tag <%s>\n", elem, t->name);
433   return NULL;			/* Keep lint happy */
434 }
435 
436 /* html_pop -- close an open element */
html_pop(Tree t,string elem)437 EXPORT Tree html_pop(Tree t, string elem)
438 {
439   Tree h = t;
440   assert(t != NULL);
441   down(elem);
442   if (*elem == '\0') {				/* </> */
443     if (t->tp != Root) t = pop(t);
444   } else {					/* </name> */
445     for (h = t; h->tp != Root && !eq(h->name, elem); h = h->parent) ;
446     if (h->tp != Root) {			/* Found open element */
447       while (t != h) t = pop(t);
448       t = pop(t);
449     }
450   }
451   return t;
452 }
453 
454 /* append_comment -- add a comment to the tree */
append_comment(Tree t,string comment)455 EXPORT Tree append_comment(Tree t, string comment)
456 {
457   Node *n = malloc(sizeof(*n));
458   assert(n != NULL);
459   n->tp = Comment;
460   n->text = comment;
461   n->sister = n->children = NULL;
462   return append(t, n);
463 }
464 
465 /* append_declaration -- add a declaration to the tree */
append_declaration(Tree t,string gi,string fpi,string url)466 EXPORT Tree append_declaration(Tree t, string gi,
467 			       string fpi, string url)
468 {
469   Node *n = malloc(sizeof(*n));
470   assert(n != NULL);
471   n->tp = Declaration;
472   n->name = down(gi);
473   n->text = fpi;
474   n->url = url;
475   n->sister = n->children = NULL;
476   return append(t, n);
477 }
478 
479 /* append_procins -- append a processing instruction */
append_procins(Tree t,string procins)480 EXPORT Tree append_procins(Tree t, string procins)
481 {
482   Node *n = malloc(sizeof(*n));
483   assert(n != NULL);
484   n->tp = Procins;
485   n->text = procins;
486   n->sister = n->children = NULL;
487   return append(t, n);
488 }
489 
490 /* tree_append_text -- append a text chunk, without checking the DTD */
tree_append_text(Tree t,string text)491 EXPORT Tree tree_append_text(Tree t, string text)
492 {
493   Node *n;
494 
495   assert(text);
496   new(n);
497   n->tp = Text;
498   n->text = text;
499   n->sister = n->children = NULL;
500   return append(t, n);
501 }
502 
503 /* append_text -- append a text chunk to the document tree */
append_text(Tree t,string text)504 EXPORT Tree append_text(Tree t, string text)
505 {
506   Node *n;
507   string new_parent;
508 
509   if (only_space(text) && (t->tp == Root || !is_mixed(t->name))) {
510     /* Drop text, since it is non-significant whitespace */
511     return t;
512   }
513   if (t->tp == Root || !is_mixed(t->name)) {
514     /* Need heuristics to make a valid tree */
515     new_parent = preferred_parent("%data");
516     /* Close omitted end tags until text or preferred parent fits */
517     while (t->tp != Root && !is_mixed(t->name) && !need_etag(t->name)
518 	   && !has_parent(new_parent, t->name))
519       t = pop(t);
520     /* Fabricate a parent if needed */
521     if (t->tp == Root || !is_mixed(t->name))
522       t = html_push(t, new_parent, NULL);
523   }
524   n = malloc(sizeof(*n));
525   assert(n != NULL);
526   n->tp = Text;
527   n->text = text;
528   assert(n->text != NULL);
529   n->sister = n->children = NULL;
530   return append(t, n);
531 }
532 
dump2(Tree n,FILE * f)533 static void dump2(Tree n, FILE *f)
534 {
535   pairlist h;
536   Tree l;
537 
538   switch (n->tp) {
539     case Text: fprintf(f, "%s", n->text); break;
540     case Comment: fprintf(f, "<!--%s-->", n->text); break;
541     case Declaration:
542       fprintf(f, "<!DOCTYPE %s", n->name);
543       if (n->text) fprintf(f, " PUBLIC \"%s\">", n->text);
544       if (n->url) fprintf(f, " %s\"%s\">", n->text ? "" : "SYSTEM ", n->url);
545       fprintf(f, ">");
546       break;
547     case Procins: fprintf(f, "<?%s>", n->text); break;
548     case Element:
549       fprintf(f, "<%s", n->name);
550       for (h = n->attribs; h != NULL; h = h->next) {
551 	fprintf(f, " %s", h->name);
552 	if (h->value != NULL) fprintf(f, "=\"%s\"", h->value);
553       }
554       if (is_empty(n->name)) {
555 	assert(n->children == NULL);
556 	fprintf(f, " />");
557       } else {
558 	fprintf(f, ">");
559 	for (l = n->children; l != NULL; l = l->sister) dump2(l, f);
560 	fprintf(f, "</%s>", n->name);
561       }
562       break;
563     default:
564       assert(!"Cannot happen");
565   }
566 }
567 
568 /* dumptree -- write out the tree below t (t's children, not t itself)*/
dumptree(Tree t,FILE * f)569 EXPORT void dumptree(Tree t, FILE *f)
570 {
571   Tree h;
572 
573   for (h = t->children; h != NULL; h = h->sister) dump2(h, f);
574 }
575