1 /**
2  * HTML Parser
3  *
4  * Copyright (C) 2015-2016
5  * Jeffrey Fulmer - <jeff@joedog.org>, et al.
6  *
7  * This file is distributed as part of Siege
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License along
20  * with this program; if not, write to the Free Software Foundation, Inc.
21  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22  *--
23  */
24 #ifdef  HAVE_CONFIG_H
25 # include <config.h>
26 #endif/*HAVE_CONFIG_H*/
27 #include <url.h>
28 #include <parser.h>
29 #include <util.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 #include <stdio.h>
34 #include <array.h>
35 #include <memory.h>
36 #include <joedog/defs.h>
37 
/*
 * Delimiter sets handed to strtok_r() while the text of one tag is split
 * into tokens.
 */
/* space and '=' */
#define CONTROL_TOKENS      " ="
/* space, '=', double quote and single quote */
#define CONTROL_TOKENS_PLUS " =\"\'"
/* space and both quote characters; '=' is not a delimiter, so it stays inside the token */
#define CONTROL_TOKENS_QUOTES " \"\'"

/* Helpers private to this file; their definitions follow html_parser. */
private void    __parse_control(ARRAY array, URL base, char *html);
private void    __add_url(ARRAY array, URL U);
private char *  __strcasestr(const char *s, const char *find);
private char *  __xstrip(const char * str, const char *pat);

/* Size in bytes of the fixed buffers that hold the text of one tag or one copied attribute value. */
#define BUFSZ 4096
48 
49 BOOLEAN
html_parser(ARRAY array,URL base,char * page)50 html_parser(ARRAY array, URL base, char *page)
51 {
52   char *str;
53   char *ptr;
54   int  i;
55   char tmp[BUFSZ];
56 
57   memset(tmp, '\0', BUFSZ);
58   ptr = str = __xstrip(page, "\\");
59 
60   if (page == NULL) return FALSE;
61   if (strlen(page) < 1) return FALSE;
62 
63   while (*ptr != '\0') {
64     if (*ptr == '<') {
65       ptr++;
66       if (startswith("!--", ptr) == TRUE)  {
67         ptr += 3;
68         while (*ptr!='\0') {
69           if (startswith("-->", ptr) == TRUE) {
70             ptr += 3;
71             break;
72           }
73           ptr++;
74         }
75       } else {
76         i = 0;
77         memset(tmp, '\0', sizeof(tmp));
78         while (*ptr != '\0' && *ptr != '>' && i < (BUFSZ-1)) {
79           tmp[i] = *ptr;
80           i++;
81           ptr++;
82         }
83         __parse_control(array, base, tmp);
84       }
85     }
86     ptr++;
87   }
88   xfree(str);
89   return TRUE;
90 }
91 
92 private void
__add_url(ARRAY array,URL U)93 __add_url(ARRAY array, URL U)
94 {
95   int i = 0;
96   BOOLEAN found = FALSE;
97 
98   if (U == NULL || url_get_hostname(U) == NULL || strlen(url_get_hostname(U)) < 2) {
99     return;
100   }
101 
102   if (array != NULL) {
103     for (i = 0; i < (int)array_length(array); i++) {
104       URL     url   = (URL)array_get(array, i);
105       if (strmatch(url_get_absolute(U), url_get_absolute(url))) {
106         found = TRUE;
107       }
108     }
109   }
110   if (! found) {
111     array_npush(array, U, URLSIZE);
112   }
113   return;
114 }
115 
116 /**
117  * The following code is based on parse_control from LinkCheck
118  * by Ken Jones <kbo@inter7.com>
119  *
120  * Copyright (C) 2000 Inter7 Internet Technologies, Inc.
121  * Copyright (C) 2013 Jeffrey Fulmer, et al
122  *
123  * This program is free software; you can redistribute it and/or modify
124  * it under the terms of the GNU General Public License as published by
125  * the Free Software Foundation; either version 2 of the License, or
126  * (at your option) any later version.
127  *
128  * This program is distributed in the hope that it will be useful,
129  * but WITHOUT ANY WARRANTY; without even the implied warranty of
130  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
131  * GNU General Public License for more details.
132  *
133  * You should have received a copy of the GNU General Public License
134  * along with this program; if not, write to the Free Software
135  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
136  *
137  */
138 private void
__parse_control(ARRAY array,URL base,char * html)139 __parse_control(ARRAY array, URL base, char *html)
140 {
141   char  * ptr = NULL;
142   char  * aid;
143   char    tmp[BUFSZ];
144   char  * top;
145   BOOLEAN debug = FALSE;
146 
147   ptr = top = strtok_r(html, CONTROL_TOKENS, &aid);
148   while (ptr != NULL) {
149     if (strncasecmp(ptr, "href", 4) == 0) {
150       ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
151       if (ptr != NULL) {
152         memset(tmp, '\0', BUFSZ);
153         strncpy(tmp, ptr, BUFSZ-1);
154       }
155     } else if (strncasecmp(ptr, "meta", 4) == 0) {
156       /* <meta http-equiv="refresh" content="0; url=http://example.com/" /> */
157       for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
158         if (strncasecmp(ptr, "content", 7) == 0) {
159           for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
160             if (__strcasestr(ptr, "url") != NULL) {
161               ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
162               if (ptr != NULL) {
163                 URL U = url_normalize(base, ptr);
164                 url_set_redirect(U, TRUE);
165                 if (debug) printf("1.) Adding: %s\n", url_get_absolute(U));
166                 __add_url(array, U);
167               }
168             }
169           }
170         }
171       }
172     } else if (strncasecmp(ptr, "img", 3) == 0) {
173       ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
174       if (ptr != NULL && aid != NULL) {
175         if (! strncasecmp(aid, "\"\"", 2)) {
176           // empty string, i.e., img src=""
177           continue;
178         }
179         if (! strncasecmp(ptr, "src", 3)) {
180           ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
181           if (ptr != NULL) {
182 			if ( !strncasecmp(ptr, "data:image", 10) )
183 				continue;	//VL issue #1
184             URL U = url_normalize(base, ptr);
185             if (debug) printf("2.) Adding: %s\n", url_get_absolute(U));
186             if (! endswith("+", url_get_absolute(U))) {
187               __add_url(array, U);
188             }
189           }
190         } else {
191           for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
192             if ((ptr != NULL) && (strncasecmp(ptr, "src", 3) == 0)) {
193               ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
194               if (ptr != NULL && strlen(ptr) > 1 && strncasecmp(ptr, "data:image", 10)) { //VL issue #1
195                 URL U = url_normalize(base, ptr);
196                 if (debug) printf("3.) Adding: %s\n", url_get_absolute(U));
197                 __add_url(array, U);
198               }
199             }
200           }
201         }
202       }
203     } else if (strncasecmp(ptr, "link", 4) == 0) {
204       /*
205       <link rel="stylesheet" type="text/css" href="/wp-content/themes/joedog/style.css" />
206       <meta name="verify-v1" content="T3mz6whWX6gK4o2ptN99TNTakYMe7InrFRkBqqi/6XI=" />
207       <link href="https://plus.google.com/u/0/102619614955071602341" rel="author" />
208       */
209       BOOLEAN okay = FALSE;
210       char buf[2048]; //XXX: TEMP!!!!! make dynamic
211       for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
212         if (strncasecmp(ptr, "rel", 3) == 0) {
213           ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
214 	  if (ptr == NULL) {
215 	    continue;
216 	  }
217           if (strncasecmp(ptr, "stylesheet", 10) == 0) {
218             okay = TRUE;
219           }
220           if (strncasecmp(ptr, "next", 4) == 0) {
221             okay = FALSE;
222           }
223           if (strncasecmp(ptr, "alternate", 9) == 0) {
224             okay = FALSE;
225           }
226         }
227         if (strncasecmp(ptr, "href", 4) == 0) {
228           ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
229           if (ptr != NULL) {
230             memset(buf, '\0', sizeof(buf));
231             strncpy(buf, ptr, strlen(ptr));
232           }
233         }
234       }
235       if (okay) {
236         URL U = url_normalize(base, buf);
237         if (debug) printf("4.) Adding: %s\n", url_get_absolute(U));
238         __add_url(array, U);
239       }
240     } else if (strncasecmp(ptr, "script", 6) == 0) {
241       for (ptr = strtok_r(NULL, CONTROL_TOKENS, &aid); ptr != NULL; ptr = strtok_r(NULL, CONTROL_TOKENS, &aid)) {
242         if (strncasecmp(ptr, "src", 3) == 0) {
243           ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
244           if (ptr != NULL) {
245             if (startswith("+", ptr)) {
246               continue; // XXX: Kludge - probably an inline script
247             }
248             memset(tmp, 0, BUFSZ);
249             strncpy(tmp, ptr, BUFSZ-1);
250             URL U = url_normalize(base, tmp);
251             if (debug) printf("5.) Adding: %s\n", url_get_absolute(U));
252             __add_url(array, U);
253           }
254         }
255       }
256     } else if (strncasecmp(ptr, "location.href", 13) == 0) {
257       ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
258       if (ptr != NULL ) {
259         memset(tmp, '\0', BUFSZ);
260         strncpy(tmp, ptr, BUFSZ-1);
261       }
262     } else if (strncasecmp(ptr, "frame", 5) == 0) {
263       ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
264       while (ptr != NULL) {
265         if (strncasecmp(ptr, "src", 3) == 0) {
266           ptr = strtok_r(NULL, CONTROL_TOKENS_PLUS, &aid);
267           if (ptr != NULL) {
268             memset(tmp, '\0', BUFSZ);
269             strncpy(tmp, ptr, BUFSZ-1);
270           }
271         }
272         ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
273       }
274     } else if (strncasecmp(ptr, "background", 10) == 0) {
275       ptr = strtok_r(NULL, CONTROL_TOKENS_QUOTES, &aid);
276       if (ptr != NULL && strmatch("body", top)) {
277         memset(tmp, 0, BUFSZ);
278         strncpy(tmp, ptr, BUFSZ-1);
279         URL U = url_normalize(base, tmp);
280         if (debug) printf("6.) Adding: %s\n", url_get_absolute(U));
281         __add_url(array, U);
282       }
283     }
284     ptr = strtok_r(NULL, CONTROL_TOKENS, &aid);
285   }
286 }
287 
288 /*-
289  * Copyright (c) 1990, 1993
290  *	The Regents of the University of California.  All rights reserved.
291  *
292  * This code is derived from software contributed to Berkeley by
293  * Chris Torek.
294  *
295  * This code was altered by Jeffrey Fulmer. The original is here:
296  * http://opensource.apple.com/source/Libc/Libc-391.4.1/string/FreeBSD/strcasestr.c
297  *
298  * Redistribution and use in source and binary forms, with or without
299  * modification, are permitted provided that the following conditions
300  * are met:
301  * 1. Redistributions of source code must retain the above copyright
302  *    notice, this list of conditions and the following disclaimer.
303  * 2. Redistributions in binary form must reproduce the above copyright
304  *    notice, this list of conditions and the following disclaimer in the
305  *    documentation and/or other materials provided with the distribution.
306  * 3. All advertising materials mentioning features or use of this software
307  *    must display the following acknowledgement:
308  *	This product includes software developed by the University of
309  *	California, Berkeley and its contributors.
310  * 4. Neither the name of the University nor the names of its contributors
311  *    may be used to endorse or promote products derived from this software
312  *    without specific prior written permission.
313  *
314  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
315  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
316  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
317  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
318  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
319  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
320  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
321  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
322  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
323  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
324  * SUCH DAMAGE.
325  */
326 
/**
 * Case-insensitive substring search.
 *
 * @param s     NUL-terminated string to search
 * @param find  NUL-terminated string to look for
 * @return      pointer to the first occurrence of find inside s, s itself
 *              when find is empty, or NULL when there is no occurrence
 */
private char *
__strcasestr(const char *s, const char *find)
{
  char   first;
  size_t restlen;

  if (*find == '\0') {
    return ((char *)s);
  }

  /* compare the first character by hand, the remainder with strncasecmp */
  first   = tolower((unsigned char)*find);
  restlen = strlen(find + 1);

  for (; *s != '\0'; s++) {
    if ((char)tolower((unsigned char)*s) != first) {
      continue;
    }
    if (strncasecmp(s + 1, find + 1, restlen) == 0) {
      return ((char *)s);
    }
  }
  return (NULL);
}
346 
347 /**
348  * http://rosettacode.org/wiki/Strip_a_set_of_characters_from_a_string#C
349  */
350 private char *
__xstrip(const char * str,const char * pat)351 __xstrip(const char * str, const char *pat)
352 {
353   int i = 0;
354   int tbl[128] = {0};
355   while (*pat != '\0')
356     tbl[(int)*(pat++)] = 1;
357 
358   char *ret = xmalloc(strlen(str) + 1);
359   do {
360     if (!tbl[(int)*str])
361       ret[i++] = *str;
362   } while (*(str++) != '\0');
363 
364   return xrealloc(ret, i);
365 }
366 
367 
368