1 /**
2 * HTML Parser
3 *
4 * Copyright (C) 2015-2016
5 * Jeffrey Fulmer - <jeff@joedog.org>, et al.
6 *
7 * This file is distributed as part of Siege
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 *--
23 */
24 #ifdef HAVE_CONFIG_H
25 # include <config.h>
26 #endif/*HAVE_CONFIG_H*/
27 #include <url.h>
28 #include <parser.h>
29 #include <util.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 #include <stdio.h>
34 #include <array.h>
35 #include <memory.h>
36 #include <joedog/defs.h>
37
38 #define CONTROL_TOKENS " ="
39 #define CONTROL_TOKENS_PLUS " =\"\'"
40 #define CONTROL_TOKENS_QUOTES " \"\'"
41
42 private void __parse_control(ARRAY array, URL base, char *html);
43 private void __add_url(ARRAY array, URL U);
44 private char * __strcasestr(const char *s, const char *find);
45 private char * __xstrip(const char * str, const char *pat);
46
47 #define BUFSZ 4096
48
49 BOOLEAN
html_parser(ARRAY array,URL base,char * page)50 html_parser(ARRAY array, URL base, char *page)
51 {
52 char *str;
53 char *ptr;
54 int i;
55 char tmp[BUFSZ];
56
57 memset(tmp, '\0', BUFSZ);
58 ptr = str = __xstrip(page, "\\");
59
60 if (page == NULL) return FALSE;
61 if (strlen(page) < 1) return FALSE;
62
63 while (*ptr != '\0') {
64 if (*ptr == '<') {
65 ptr++;
66 if (startswith("!--", ptr) == TRUE) {
67 ptr += 3;
68 while (*ptr!='\0') {
69 if (startswith("-->", ptr) == TRUE) {
70 ptr += 3;
71 break;
72 }
73 ptr++;
74 }
75 } else {
76 i = 0;
77 memset(tmp, '\0', sizeof(tmp));
78 while (*ptr != '\0' && *ptr != '>' && i < (BUFSZ-1)) {
79 tmp[i] = *ptr;
80 i++;
81 ptr++;
82 }
83 __parse_control(array, base, tmp);
84 }
85 }
86 ptr++;
87 }
88 xfree(str);
89 return TRUE;
90 }
91
92 private void
__add_url(ARRAY array,URL U)93 __add_url(ARRAY array, URL U)
94 {
95 int i = 0;
96 BOOLEAN found = FALSE;
97
98 if (U == NULL || url_get_hostname(U) == NULL || strlen(url_get_hostname(U)) < 2) {
99 return;
100 }
101
102 if (array != NULL) {
103 for (i = 0; i < (int)array_length(array); i++) {
104 URL url = (URL)array_get(array, i);
105 if (strmatch(url_get_absolute(U), url_get_absolute(url))) {
106 found = TRUE;
107 }
108 }
109 }
110 if (! found) {
111 array_npush(array, U, URLSIZE);
112 }
113 return;
114 }
115
116 /**
117 * The following code is based on parse_control from LinkCheck
118 * by Ken Jones <kbo@inter7.com>
119 *
120 * Copyright (C) 2000 Inter7 Internet Technologies, Inc.
121 * Copyright (C) 2013 Jeffrey Fulmer, et al
122 *
123 * This program is free software; you can redistribute it and/or modify
124 * it under the terms of the GNU General Public License as published by
125 * the Free Software Foundation; either version 2 of the License, or
126 * (at your option) any later version.
127 *
128 * This program is distributed in the hope that it will be useful,
129 * but WITHOUT ANY WARRANTY; without even the implied warranty of
130 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131 * GNU General Public License for more details.
132 *
133 * You should have received a copy of the GNU General Public License
134 * along with this program; if not, write to the Free Software
135 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
136 *
137 */
138 private void
__parse_control(ARRAY array,URL base,char * html)139 __parse_control(ARRAY array, URL base, char *html)
140 {
141 char * ptr = NULL;
142 char * aid;
143 char tmp[BUFSZ];
144 char * top;
145 BOOLEAN debug = FALSE;
146
147 ptr = top = strtok_r(html, CONTROL_TOKENS, &aid);
148 while (ptr != NULL) {
149 if (strncasecmp(ptr, "href", 4) == 0) {
150 ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
151 if (ptr != NULL) {
152 memset(tmp, '\0', BUFSZ);
153 strncpy(tmp, ptr, BUFSZ-1);
154 }
155 } else if (strncasecmp(ptr, "meta", 4) == 0) {
156 /* <meta http-equiv="refresh" content="0; url=http://example.com/" /> */
157 for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
158 if (strncasecmp(ptr, "content", 7) == 0) {
159 for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
160 if (__strcasestr(ptr, "url") != NULL) {
161 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
162 if (ptr != NULL) {
163 URL U = url_normalize(base, ptr);
164 url_set_redirect(U, TRUE);
165 if (debug) printf("1.) Adding: %s\n", url_get_absolute(U));
166 __add_url(array, U);
167 }
168 }
169 }
170 }
171 }
172 } else if (strncasecmp(ptr, "img", 3) == 0) {
173 ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
174 if (ptr != NULL && aid != NULL) {
175 if (! strncasecmp(aid, "\"\"", 2)) {
176 // empty string, i.e., img src=""
177 continue;
178 }
179 if (! strncasecmp(ptr, "src", 3)) {
180 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
181 if (ptr != NULL) {
182 if ( !strncasecmp(ptr, "data:image", 10) )
183 continue; //VL issue #1
184 URL U = url_normalize(base, ptr);
185 if (debug) printf("2.) Adding: %s\n", url_get_absolute(U));
186 if (! endswith("+", url_get_absolute(U))) {
187 __add_url(array, U);
188 }
189 }
190 } else {
191 for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
192 if ((ptr != NULL) && (strncasecmp(ptr, "src", 3) == 0)) {
193 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
194 if (ptr != NULL && strlen(ptr) > 1 && strncasecmp(ptr, "data:image", 10)) { //VL issue #1
195 URL U = url_normalize(base, ptr);
196 if (debug) printf("3.) Adding: %s\n", url_get_absolute(U));
197 __add_url(array, U);
198 }
199 }
200 }
201 }
202 }
203 } else if (strncasecmp(ptr, "link", 4) == 0) {
204 /*
205 <link rel="stylesheet" type="text/css" href="/wp-content/themes/joedog/style.css" />
206 <meta name="verify-v1" content="T3mz6whWX6gK4o2ptN99TNTakYMe7InrFRkBqqi/6XI=" />
207 <link href="https://plus.google.com/u/0/102619614955071602341" rel="author" />
208 */
209 BOOLEAN okay = FALSE;
210 char buf[2048]; //XXX: TEMP!!!!! make dynamic
211 for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
212 if (strncasecmp(ptr, "rel", 3) == 0) {
213 ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
214 if (ptr == NULL) {
215 continue;
216 }
217 if (strncasecmp(ptr, "stylesheet", 10) == 0) {
218 okay = TRUE;
219 }
220 if (strncasecmp(ptr, "next", 4) == 0) {
221 okay = FALSE;
222 }
223 if (strncasecmp(ptr, "alternate", 9) == 0) {
224 okay = FALSE;
225 }
226 }
227 if (strncasecmp(ptr, "href", 4) == 0) {
228 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
229 if (ptr != NULL) {
230 memset(buf, '\0', sizeof(buf));
231 strncpy(buf, ptr, strlen(ptr));
232 }
233 }
234 }
235 if (okay) {
236 URL U = url_normalize(base, buf);
237 if (debug) printf("4.) Adding: %s\n", url_get_absolute(U));
238 __add_url(array, U);
239 }
240 } else if (strncasecmp(ptr, "script", 6) == 0) {
241 for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
242 if (strncasecmp(ptr, "src", 3) == 0) {
243 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
244 if (ptr != NULL) {
245 if (startswith("+", ptr)) {
246 continue; // XXX: Kludge - probably an inline script
247 }
248 memset(tmp, 0, BUFSZ);
249 strncpy(tmp, ptr, BUFSZ-1);
250 URL U = url_normalize(base, tmp);
251 if (debug) printf("5.) Adding: %s\n", url_get_absolute(U));
252 __add_url(array, U);
253 }
254 }
255 }
256 } else if (strncasecmp(ptr, "location.href", 13) == 0) {
257 ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
258 if (ptr != NULL ) {
259 memset(tmp, '\0', BUFSZ);
260 strncpy(tmp, ptr, BUFSZ-1);
261 }
262 } else if (strncasecmp(ptr, "frame", 5) == 0) {
263 ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
264 while (ptr != NULL) {
265 if (strncasecmp(ptr, "src", 3) == 0) {
266 ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
267 if (ptr != NULL) {
268 memset(tmp, '\0', BUFSZ);
269 strncpy(tmp, ptr, BUFSZ-1);
270 }
271 }
272 ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
273 }
274 } else if (strncasecmp(ptr, "background", 10) == 0) {
275 ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
276 if (ptr != NULL && strmatch("body", top)) {
277 memset(tmp, 0, BUFSZ);
278 strncpy(tmp, ptr, BUFSZ-1);
279 URL U = url_normalize(base, tmp);
280 if (debug) printf("6.) Adding: %s\n", url_get_absolute(U));
281 __add_url(array, U);
282 }
283 }
284 ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
285 }
286 }
287
288 /*-
289 * Copyright (c) 1990, 1993
290 * The Regents of the University of California. All rights reserved.
291 *
292 * This code is derived from software contributed to Berkeley by
293 * Chris Torek.
294 *
295 * This code was altered by Jeffrey Fulmer. The original is here:
296 * http://opensource.apple.com/source/Libc/Libc-391.4.1/string/FreeBSD/strcasestr.c
297 *
298 * Redistribution and use in source and binary forms, with or without
299 * modification, are permitted provided that the following conditions
300 * are met:
301 * 1. Redistributions of source code must retain the above copyright
302 * notice, this list of conditions and the following disclaimer.
303 * 2. Redistributions in binary form must reproduce the above copyright
304 * notice, this list of conditions and the following disclaimer in the
305 * documentation and/or other materials provided with the distribution.
306 * 3. All advertising materials mentioning features or use of this software
307 * must display the following acknowledgement:
308 * This product includes software developed by the University of
309 * California, Berkeley and its contributors.
310 * 4. Neither the name of the University nor the names of its contributors
311 * may be used to endorse or promote products derived from this software
312 * without specific prior written permission.
313 *
314 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
315 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
316 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
317 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
318 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
319 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
320 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
321 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
322 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
323 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
324 * SUCH DAMAGE.
325 */
326
/**
 * Case-insensitive substring search.
 *
 * Returns a pointer to the first position in s at which find occurs,
 * ignoring case, or NULL when there is no such position. An empty find
 * matches at the start of s.
 */
private char *
__strcasestr(const char *s, const char *find)
{
  char   first;
  size_t rest;

  if (*find == '\0') {
    return (char *)s;
  }

  /* Compare the first byte by hand, then let strncasecmp check the tail. */
  first = (char)tolower((unsigned char)*find);
  rest  = strlen(find + 1);

  for (; *s != '\0'; s++) {
    if ((char)tolower((unsigned char)*s) != first) {
      continue;
    }
    if (strncasecmp(s + 1, find + 1, rest) == 0) {
      return (char *)s;
    }
  }
  return NULL;
}
346
347 /**
348 * http://rosettacode.org/wiki/Strip_a_set_of_characters_from_a_string#C
349 */
350 private char *
__xstrip(const char * str,const char * pat)351 __xstrip(const char * str, const char *pat)
352 {
353 int i = 0;
354 int tbl[128] = {0};
355 while (*pat != '\0')
356 tbl[(int)*(pat++)] = 1;
357
358 char *ret = xmalloc(strlen(str) + 1);
359 do {
360 if (!tbl[(int)*str])
361 ret[i++] = *str;
362 } while (*(str++) != '\0');
363
364 return xrealloc(ret, i);
365 }
366
367
368