1 /*
2  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by Niels Provos.
16  * 4. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/queue.h>
33 #include <sys/time.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 
38 #include "config.h"
39 
40 #include <event.h>
41 
42 #include "http.h"
43 #include "html.h"
44 #include "util.h"
45 
/*
 * Search a NULL-terminated attribute vector for an attribute called `name'
 * (compared case-insensitively).  The vector is walked two slots at a time,
 * so only every other entry (index 0, 2, 4, ...) is compared against `name'.
 *
 * Returns a pointer to the matching slot.  If nothing matches, the returned
 * pointer addresses the terminating NULL slot, so callers can test the
 * result with `*result == NULL'.
 */
char **
html_attr_find(char **attr, char *name)
{
	char **slot;

	for (slot = attr; *slot != NULL; slot += 2) {
		if (strcasecmp(*slot, name) == 0)
			return (slot);
	}

	/* Not found: slot points at the NULL terminator. */
	return (slot);
}
58 
59 void
html_free_cb(struct html_cb * cb)60 html_free_cb(struct html_cb *cb)
61 {
62 	if (cb->name != NULL)
63 		free(cb->name);
64 
65 	free(cb);
66 }
67 
68 int
html_register_cb(struct html_parse * p,char * name,void (* callback)(void *,char *,char **))69 html_register_cb(struct html_parse *p, char *name,
70 		 void (*callback)(void *, char *, char **))
71 {
72 	struct html_cb *cb;
73 
74 	cb = malloc(sizeof (struct html_cb));
75 	if (cb == NULL)
76 		return (-1);
77 
78 	cb->cb = callback;
79 	if ((cb->name = strdup(name)) == NULL)
80 		goto out;
81 
82 	TAILQ_INSERT_TAIL(&p->cbqueue, cb, next);
83 
84 	return (0);
85 
86  out:
87 	html_free_cb(cb);
88 	return (-1);
89 }
90 
91 struct html_parse *
html_newparser(void)92 html_newparser(void)
93 {
94 	struct html_parse *p;
95 
96 	p = calloc(1, sizeof(struct html_parse));
97 	if (p == NULL)
98 		return (NULL);
99 
100 	TAILQ_INIT(&p->cbqueue);
101 
102 	return (p);
103 }
104 
105 void
html_freeparser(struct html_parse * p)106 html_freeparser(struct html_parse *p)
107 {
108 	struct html_cb *cb;
109 
110 	if (p->base != NULL)
111 		free(p->base);
112 
113 	for (cb = TAILQ_FIRST(&p->cbqueue); cb;
114 	     cb = TAILQ_FIRST(&p->cbqueue)) {
115 		TAILQ_REMOVE(&p->cbqueue, cb, next);
116 		html_free_cb(cb);
117 	}
118 
119 	free(p);
120 }
121 
122 int
html_parse_setbase(struct html_parse * p,char * base)123 html_parse_setbase(struct html_parse *p, char *base)
124 {
125 	if (p->base != NULL)
126 		free(p->base);
127 
128 	p->base = strdup(base);
129 
130 	return (p->base == NULL ? -1 : 0);
131 }
132 
133 void
tag_start(struct html_parse * p,char * el,char ** attr)134 tag_start(struct html_parse *p, char *el, char **attr)
135 {
136 	struct html_cb *cb;
137 	void *arg;
138 
139 	arg = p->data != NULL ? p->data : p;
140 
141 	TAILQ_FOREACH(cb, &p->cbqueue, next) {
142 		if (!strcasecmp(cb->name, el)) {
143 			cb->cb(arg, el, attr);
144 			break;
145 		}
146 	}
147 }
148 
/*
 * Hook invoked for end tags.  It currently does nothing: end tags are
 * recognised by the tag parser but not acted upon.
 */
void
tag_end(struct html_parse *parser, char *el)
{
	/* Intentionally empty; the parameters are unused. */
	(void)parser;
	(void)el;
}
154 
155 #define WHITESPACE	" \r\n\t"
156 #define WHITEEND	" \r\n\t>"
157 #define ATTRDELIM	" \r\n\t=>"
158 
159 void
html_parsetag(struct html_parse * parser,char * start,char * end)160 html_parsetag(struct html_parse *parser, char *start, char *end)
161 {
162 	char *element, *elend, *attr, *attrend;
163 	char **pattr, **pattrend;
164 	char *attrlist[MAXATTR*2 + 2];
165 	int endtag = 0, i;
166 	char quoted;
167 
168 	start = start + 1 + strspn(start + 1, WHITESPACE);
169 	elend = strpbrk(start, WHITEEND);
170 
171 	if (start >= end)
172 		return;
173 
174 	if (*start == '/') {
175 		endtag = 1;
176 		start++;
177 
178 		if (start >= end)
179 			return;
180 	}
181 
182 	if ((element = strdupend(start, elend)) == NULL)
183 		return;
184 
185 	memset(attrlist, 0, sizeof(attrlist));
186 
187 	if (endtag) {
188 		tag_end(parser, element);
189 		goto out;
190 	}
191 
192 	attr = elend;
193 	pattr = attrlist;
194 	pattrend = pattr + MAXATTR*2;
195 	while (attr < end && pattr < pattrend) {
196 		attr += strspn(attr, WHITESPACE);
197 		attrend = strpbrk(attr, ATTRDELIM);
198 
199 		if (attrend >= end)
200 			break;
201 
202 		*pattr = strdupend(attr, attrend);
203 		if (*pattr == NULL)
204 			goto out;
205 		pattr++;
206 
207 		attr = attrend + strspn(attrend, ATTRDELIM);
208 		if (*attr == '"' || *attr == '\'') {
209 			char delim[5];
210 
211 			quoted = *attr;
212 			attr++;
213 			sprintf(delim, "%c>\r\n", quoted);
214 			attrend = strpbrk(attr, delim);
215 		} else {
216 			quoted = '\0';
217 			attrend = strpbrk(attr, WHITEEND);
218 		}
219 
220 		if (attrend == NULL)
221 			goto out;
222 
223 		*pattr = strdupend(attr, attrend);
224 		if (*pattr == NULL)
225 			goto out;
226 		pattr++;
227 
228 		if (*attrend == quoted)
229 			attrend++;
230 
231 		attr = attrend;
232 	}
233 
234 	tag_start(parser, element, attrlist);
235 
236  out:
237 	for (i = 0; i < MAXATTR * 2; i++)
238 		if (attrlist[i] != NULL)
239 			free(attrlist[i]);
240 	free(element);
241 }
242 
/*
 * Scan `body' for tags and hand each one to html_parsetag().
 *
 * `body' must be NUL-terminated, because the scan uses the C string
 * functions.  Only the first `len' bytes are considered: scanning stops at
 * the first tag or comment that is not completely contained in
 * [body, body + len).  Comments ("<!-- ... -->") are skipped without being
 * reported.  Scanning also stops at the first unterminated tag or comment.
 *
 * Always returns 0.
 */
int
html_parser(struct html_parse *parser, char *body, size_t len)
{
	char *p, *end;

	p = body;
	end = p + len;

	while (p < end) {
		char *tagend;

		p = strchr(p, '<');
		if (p == NULL || p >= end)
			break;

		if (!strncmp(p, "<!--", 4)) {
			/* Skip comments */
			tagend = strstr(p, "-->");
			if (tagend == NULL || tagend + 3 > end)
				break;
			p = tagend + 3;
			continue;
		}

		tagend = strchr(p, '>');
		if (tagend == NULL || tagend >= end)
			break;

		html_parsetag(parser, p, tagend);
		p = tagend + 1;
	}

	return (0);
}
278 
279