1 /*
2  * Copyright (c) 2012 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * xml parsing routines
22  *
23  * Changelog
24  * 22.06.2012  Tim Ruehsen  created, but needs definitely a rewrite
25  *
26  * This derives from an old source code that I wrote in 2001.
 * It is short, fast and has a low memory footprint, BUT it is a hack.
28  * It has to be replaced by e.g. libxml2 or something better.
29  *
30  * HTML parsing is (very) different from XML parsing, see here:
31  * https://html.spec.whatwg.org/multipage/syntax.html
32  * It is a PITA and should be handled by a specialized, external library !
33  *
34  */
35 
36 #include <config.h>
37 
38 #include <unistd.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include <fcntl.h>
42 #include <sys/stat.h>
43 #ifdef HAVE_MMAP
44 #include <sys/mman.h>
45 #endif
46 
47 #include <wget.h>
48 #include "private.h"
49 
50 typedef struct {
51 	const char
52 		*buf, // pointer to original start of buffer (0-terminated)
53 		*p, // pointer next char in buffer
54 		*token; // token buffer
55 	int
56 		hints; // XML_HINT...
57 	size_t
58 		token_size, // size of token buffer
59 		token_len; // used bytes of token buffer (not counting terminating 0 byte)
60 	void
61 		*user_ctx; // user context (not needed if we were using nested functions)
62 	wget_xml_callback
63 		*callback;
64 } xml_context;
65 
/* \cond _hide_internal_symbols */
// Locale-independent whitespace test: true for ' ' and for the codes 9..13 (\t \n \v \f \r).
// The argument is fully parenthesized, so any expression may be passed,
// but it is evaluated more than once and must not have side effects.
#define ascii_isspace(c) ((c) == ' ' || ((c) >= 9 && (c) <= 13))

// Locale-independent test for the letters a-z and A-Z (same evaluation rules as above).
// working only for consecutive alphabets, e.g. EBCDIC would not work
#define ascii_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
/* \endcond */
72 
73 // append a char to token buffer
74 
getToken(xml_context * context)75 static const char *getToken(xml_context *context)
76 {
77 	int c;
78 	const char *p;
79 
80 	// skip leading whitespace
81 	while ((c = *context->p) && ascii_isspace(c))
82 		context->p++;
83 	if (!c) return NULL; // eof
84 	context->token = context->p++;
85 
86 //	info_printf("a c=%c\n", c);
87 
88 	if (ascii_isalpha(c) || c == '_') {
89 		while ((c = *context->p) && !ascii_isspace(c) && c != '>' && c != '=')
90 			context->p++;
91 		if (!c) return NULL; // syntax error
92 
93 		context->token_len = context->p - context->token;
94 		return context->token;
95 	}
96 
97 	if (c == '/') {
98 		if (!(c = *context->p)) return NULL; // syntax error
99 		context->p++;
100 		if (c == '>') {
101 			context->token_len = 2;
102 			return context->token;
103 		} else return NULL; // syntax error
104 	}
105 
106 	if (c == '\"' || c == '\'') { // read in quoted value
107 		int quote = c;
108 
109 		context->token = context->p;
110 
111 		if (!(p = strchr(context->p, quote)))
112 			return NULL;
113 		context->p = p + 1;
114 
115 		context->token_len = context->p - context->token - 1;
116 		return context->token;
117 	}
118 
119 	if (c == '<') { // fetch specials, e.g. start of comments '<!--'
120 		if (!(c = *context->p)) return NULL; // syntax error
121 		context->p++;
122 		if (c == '?' || c == '/') {
123 			context->token_len = 2;
124 			return context->token;
125 		}
126 
127 		if (c == '!') {
128 			// left: <!--, <![CDATA[ and <!WHATEVER
129 			if (!(c = *context->p)) return NULL; // syntax error
130 			if (c == '-') {
131 				context->p++;
132 				if (!(c = *context->p)) return NULL; // syntax error
133 				context->p++;
134 				if (c == '-') {
135 					context->token_len = 4;
136 					return context->token;
137 				} else {
138 					context->p -= 2;
139 					context->token_len = 2;
140 					return context->token;
141 				}
142 			} else {
143 				context->token_len = 2;
144 				return context->token;
145 			}
146 		} else {
147 			context->p--;
148 			context->token_len = 1;
149 			return context->token;
150 		}
151 	}
152 
153 	if (c == '>' || c == '=') {
154 		context->token_len = 1;
155 		return context->token;
156 	}
157 
158 	if (c == '-') { // fetch specials, e.g. end of comments '-->'
159 		if (!(c = *context->p)) return NULL; // syntax error
160 		if (c != '-') {
161 			c = '-';  //???
162 		} else {
163 			context->p++;
164 			if (!(c = *context->p)) return NULL; // syntax error
165 			context->p++;
166 			if (c != '>') {
167 				context->p -= 2;
168 				c = '-';
169 			} else {
170 				context->token_len = 3;
171 				return context->token;
172 			}
173 		}
174 	}
175 
176 	if (c == '?') { // fetch specials, e.g. '?>'
177 		if (!(c = *context->p)) return NULL; // syntax error
178 		if (c != '>') {
179 			// c = '?';
180 		} else {
181 			context->p++;
182 			context->token_len = 2;
183 			return context->token;
184 		}
185 	}
186 
187 	while ((c = *context->p) && !ascii_isspace(c))
188 		context->p++;
189 
190 	if (c) {
191 		context->token_len = context->p - context->token;
192 		return context->token;
193 	}
194 
195 	return NULL;
196 }
197 
// Scan the value part of an attribute, to be called right after the attribute name.
//
// If the next non-space character is '=', the following token becomes the value
// (context->token / context->token_len as set by getToken()). Otherwise the attribute
// has no value and context->token_len is 0.
//
// Returns 1 on success, EOF at end of input or if no value token follows the '='.
static int getValue(xml_context *context)
{
	context->token_len = 0;
	context->token = context->p;

	// remove leading spaces
	while (*context->p && ascii_isspace(*context->p))
		context->p++;

	switch (*context->p) {
	case 0:
		return EOF;

	case '=':
		context->p++;
		return getToken(context) ? 1 : EOF; // 1: token valid, EOF: syntax error

	default:
		// attribute without value
		context->token = context->p;
		return 1;
	}
}
222 
223 // special HTML <script> content parsing
224 // see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
225 // see https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements
226 
getScriptContent(xml_context * context)227 static const char *getScriptContent(xml_context *context)
228 {
229 	int comment = 0, length_valid = 0;
230 	const char *p;
231 
232 	for (p = context->token = context->p; *p; p++) {
233 		if (comment) {
234 			if (*p == '-' && !strncmp(p, "-->", 3)) {
235 				p += 3 - 1;
236 				comment = 0;
237 			}
238 		} else {
239 			if (*p == '<' && !strncmp(p, "<!--", 4)) {
240 				p += 4 - 1;
241 				comment = 1;
242 			} else if (*p == '<' && !wget_strncasecmp_ascii(p, "</script", 8)) {
243 				context->token_len = p - context->token;
244 				length_valid = 1;
245 				for (p += 8; ascii_isspace(*p); p++);
246 				if (*p == '>') {
247 					p++;
248 					break; // found end of <script>
249 				} else if (!*p)
250 					break; // end of input
251 			}
252 		}
253 	}
254 	context->p = p;
255 
256 	if (!length_valid)
257 		context->token_len = p - context->token;
258 
259 	if (!*p && !context->token_len)
260 		return NULL;
261 
262 	if (context->callback)
263 		context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "script", NULL, context->token, context->token_len, context->token - context->buf);
264 
265 	return context->token;
266 }
267 
268 // special HTML <style> content parsing
269 // see https://html.spec.whatwg.org/multipage/semantics.html#the-style-element
getStyleContent(xml_context * context)270 static const char *getStyleContent(xml_context *context)
271 {
272 	int comment = 0, length_valid = 0;
273 	const char *p;
274 
275 	for (p = context->token = context->p; *p; p++) {
276 		if (comment) {
277 			if (p[0] == '*' && p[1] == '/') {
278 				p += 2 - 1;
279 				comment = 0;
280 			}
281 		} else {
282 			if (p[0] == '/' && p[1] == '*') {
283 				p += 2 - 1;
284 				comment = 1;
285 			} else if (*p == '<' && !wget_strncasecmp_ascii(p, "</style", 7)) {
286 				context->token_len = p - context->token;
287 				length_valid = 1;
288 				for (p += 7; ascii_isspace(*p); p++);
289 				if (*p == '>') {
290 					p++;
291 					break; // found end of <style>
292 				} else if (!*p)
293 					break; // end of input
294 			}
295 		}
296 	}
297 	context->p = p;
298 
299 	if (!length_valid)
300 		context->token_len = p - context->token;
301 
302 	if (!*p && !context->token_len)
303 		return NULL;
304 
305 	if (context->callback)
306 		context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "style", NULL, context->token, context->token_len, context->token - context->buf);
307 
308 	return context->token;
309 }
310 
// Consume raw input up to a terminator sequence and report it via the callback.
//
// context    scanner state; the section starts at context->p
// flags      XML_FLG_... value passed on to the callback
// end        terminator sequence; only its first len characters are compared
// len        length of the terminator (the callers in this file use 1, 2 and 3)
// directory  passed on to the callback unchanged
//
// context->token / context->token_len describe the text in front of the terminator.
// context->p is moved behind the terminator, or to the end of input if there is none.
// The section is reported as-is; XML_HINT_REMOVE_EMPTY_CONTENT is not evaluated here.
//
// Returns the start of the section, or NULL if the input ended without any text.
static const char *getUnparsed(xml_context *context, int flags, const char *end, size_t len, const char *directory)
{
	const char *cur = context->p;

	context->token = cur;

	for (; *cur; cur++) {
		if (*cur != *end)
			continue;

		if (len == 1)
			break; // single character terminator

		// check the second and, for len > 2, the third character of the terminator
		if (cur[1] == end[1] && (len == 2 || cur[2] == end[2]))
			break;
	}

	context->token_len = cur - context->token;

	if (*cur) {
		// stopped at the terminator, step over it
		context->p = cur + len;
	} else {
		context->p = cur;
		if (!context->token_len)
			return NULL;
	}

	if (context->callback)
		context->callback(context->user_ctx, flags, directory, NULL, context->token, context->token_len, context->token - context->buf);

	return context->token;
}
359 
// Consume the body of a comment up to the closing '-->' and report it with XML_FLG_COMMENT.
static const char *getComment(xml_context *context)
{
	static const char terminator[] = "-->";

	return getUnparsed(context, XML_FLG_COMMENT, terminator, sizeof(terminator) - 1, NULL);
}
364 
// Consume a processing instruction up to the closing '?>' and report it with XML_FLG_PROCESSING.
static const char *getProcessing(xml_context *context)
{
	static const char terminator[] = "?>";

	return getUnparsed(context, XML_FLG_PROCESSING, terminator, sizeof(terminator) - 1, NULL);
}
369 
// Consume a '<!...' section up to the next '>' and report it with XML_FLG_SPECIAL.
static const char *getSpecial(xml_context *context)
{
	static const char terminator[] = ">";

	return getUnparsed(context, XML_FLG_SPECIAL, terminator, sizeof(terminator) - 1, NULL);
}
374 
// Consume the text in front of the next '<' (or the end of input) and report it as content.
//
// context->token / context->token_len describe the text, context->p stops at the '<'.
// The callback is invoked with XML_FLG_CONTENT and directory, but only for non-empty text.
//
// Returns the start of the text, or NULL if the input ended without any text.
static const char *getContent(xml_context *context, const char *directory)
{
	const char *start = context->p;
	const char *cur = start;

	while (*cur && *cur != '<')
		cur++;

	context->token = start;
	context->p = cur;
	context->token_len = cur - start;

	if (!context->token_len)
		return *cur ? context->token : NULL; // nothing to report

	// debug_printf("content=%.*s\n", (int)context->token_len, context->token);
	if (context->callback)
		context->callback(context->user_ctx, XML_FLG_CONTENT, directory, NULL, context->token, context->token_len, context->token - context->buf);

	return context->token;
}
392 
// Parse one nesting level of the input and report the findings via context->callback.
//
// dir      path of the enclosing element. In XML mode it is the prefix of the element paths
//          handed to the callback. In HTML mode (XML_HINT_HTML) it is ignored and only the
//          plain tag name is reported.
// context  scanner state; context->p is advanced past everything that was consumed
//
// In XML mode the function calls itself for the content of each element that was opened
// with '>' and returns after the next closing tag (tag names are not compared).
// In HTML mode there is no recursion, all tags are handled by one flat loop; the content
// of <script> and <style> is scanned by dedicated functions.
//
// Element names and paths are truncated to fit into 255 bytes.
//
// Returns WGET_E_SUCCESS at the end of input resp. at the end of the level, or
// WGET_E_XML_PARSE_ERR if the tokenizer fails within a tag.
//
// NOTE(review): the result of the recursive call is ignored and the recursion depth is
// only limited by the nesting depth of the input.
static int parseXML(const char *dir, xml_context *context)
{
	const char *tok;
	char directory[256] = "";
	size_t pos = 0; // length of the parent path within directory (always 0 in HTML mode)

	if (!(context->hints & XML_HINT_HTML)) {
		pos = wget_strlcpy(directory, dir, sizeof(directory));
		if (pos >= sizeof(directory)) pos = sizeof(directory) - 1; // dir has been truncated
	}

	do {
		getContent(context, directory);
		if (context->token_len)
			debug_printf("%s='%.*s'\n", directory, (int)context->token_len, context->token);

		if (!(tok = getToken(context))) return WGET_E_SUCCESS;  //eof
		// debug_printf("A Token '%.*s' len=%zu tok='%s'\n", (int)context->token_len, context->token, context->token_len, tok);

		if (context->token_len == 1 && *tok == '<') {
			// get element name and add it to directory
			int flags = XML_FLG_BEGIN; // only the first callback of an element carries XML_FLG_BEGIN

			if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR; // syntax error

			// debug_printf("A2 Token '%.*s'\n", (int)context->token_len, context->token);

			if (!(context->hints & XML_HINT_HTML)) {
				// append the element name to the parent path, separated by exactly one '/'
				if (!pos || directory[pos - 1] != '/')
					wget_snprintf(&directory[pos], sizeof(directory) - pos, "/%.*s", (int)context->token_len, tok);
				else
					wget_snprintf(&directory[pos], sizeof(directory) - pos, "%.*s", (int)context->token_len, tok);
			} else {
				// HTML: directory is just the (possibly truncated) tag name
				if (context->token_len < sizeof(directory)) {
					memcpy(directory, tok, context->token_len);
					directory[context->token_len] = 0;
				} else {
					memcpy(directory, tok, sizeof(directory) - 1);
					directory[sizeof(directory) - 1] = 0;
				}
			}

			// scan attributes up to the end of the tag
			while ((tok = getToken(context))) {
				// debug_printf("C Token %.*s %zu %p %p dir=%s tok=%s\n", (int)context->token_len, context->token, context->token_len, context->token, context->p, directory, tok);
				if (context->token_len == 2 && !strncmp(tok, "/>", 2)) {
					if (context->callback)
						context->callback(context->user_ctx, flags | XML_FLG_END, directory, NULL, NULL, 0, 0);
					break; // stay in this level
				} else if (context->token_len == 1 && *tok == '>') {
					if (context->callback)
						context->callback(context->user_ctx, flags | XML_FLG_CLOSE, directory, NULL, NULL, 0, 0);
					if (context->hints & XML_HINT_HTML) {
						if (!wget_strcasecmp_ascii(directory, "script")) {
							// special HTML <script> content parsing
							// see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
							// 4.3.1.2 Restrictions for contents of script elements
							debug_printf("*** need special <script> handling\n");
							getScriptContent(context);
							if (context->token_len)
								debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token);
						}
						else if (!wget_strcasecmp_ascii(directory, "style")) {
							getStyleContent(context);
							if (context->token_len)
								debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token);
						}
					} else
						parseXML(directory, context); // descend one level
					break;
				} else {
					// the token is an attribute name, keep a 0-terminated (possibly truncated) copy
					char attribute[256];
					size_t attrlen = context->token_len >= sizeof(attribute) ? sizeof(attribute) - 1 : context->token_len;

					memcpy(attribute, tok, attrlen);
					attribute[attrlen] = 0;

					if (getValue(context) == EOF) return WGET_E_XML_PARSE_ERR; // syntax error

					if (context->token_len) {
						debug_printf("%s/@%s=%.*s\n", directory, attribute, (int)context->token_len, context->token);
						if (context->callback)
							context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, context->token, context->token_len, context->token - context->buf);
					} else {
						debug_printf("%s/@%s\n", directory, attribute);
						if (context->callback)
							context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, NULL, 0, 0);
					}
					flags = 0;
				}
			}
			directory[pos] = 0; // back to the path of this level
		} else if (context->token_len == 2) {
			if (!strncmp(tok, "</", 2)) {
				// ascend one level
				// cleanup - get name and '>'
				if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR;
				// debug_printf("X Token %s\n",tok);
				if (context->callback) {
					if (!(context->hints & XML_HINT_HTML))
						context->callback(context->user_ctx, XML_FLG_END, directory, NULL, NULL, 0, 0);
					else {
						// tok is not 0-terminated, so the callback gets a copy.
						// The copy has a fixed size: the name length comes from the input and
						// must not determine the size of a stack object. Overlong names are
						// truncated in the same way as the names of opening tags.
						char tag[sizeof(directory)];
						size_t taglen = context->token_len < sizeof(tag) ? context->token_len : sizeof(tag) - 1;

						memcpy(tag, tok, taglen);
						tag[taglen] = 0;
						context->callback(context->user_ctx, XML_FLG_END, tag, NULL, NULL, 0, 0);
					}
				}
				if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR;
				// debug_printf("Y Token %s\n",tok);
				if (!(context->hints & XML_HINT_HTML))
					return WGET_E_SUCCESS;
				else
					continue;
			} else if (!strncmp(tok, "<?", 2)) { // special info - ignore
				getProcessing(context);
				debug_printf("%s=<?%.*s?>\n", directory, (int)context->token_len, context->token);
				continue;
			} else if (!strncmp(tok, "<!", 2)) {
				getSpecial(context);
				debug_printf("%s=<!%.*s>\n", directory, (int)context->token_len, context->token);
			}
		} else if (context->token_len == 4 && !strncmp(tok, "<!--", 4)) { // comment - ignore
			getComment(context);
			debug_printf("%s=<!--%.*s-->\n", directory, (int)context->token_len, context->token);
			continue;
		}
	} while (tok);
	return WGET_E_SUCCESS;
}
523 
524 /**
525  * \file
526  * \brief XML parsing functions
527  * \defgroup libwget-xml XML parsing functions
528  * @{
529  */
530 
531 /**
532  * \param[in] buf Zero-terminated XML or HTML input data
533  * \param[in] callback Function called for each token scan result
534  * \param[in] user_ctx User-defined context variable, handed to \p callback
535  * \param[in] hints Flags to influence parsing
536  *
537  * This function scans the XML input from \p buf and calls \p callback for each token
538  * found. \p user_ctx is a user-defined context variable and given to each call of \p callback.
539  *
540  * \p hints may be 0 or any combination of %XML_HINT_REMOVE_EMPTY_CONTENT and %XML_HINT_HTML.
541  *
542  * %XML_HINT_REMOVE_EMPTY_CONTENT reduces the number of calls to \p callback by ignoring
543  * empty content and superfluous spaces.
544  *
545  * %XML_HINT_HTML turns on HTML scanning.
546  */
wget_xml_parse_buffer(const char * buf,wget_xml_callback * callback,void * user_ctx,int hints)547 int wget_xml_parse_buffer(
548 	const char *buf,
549 	wget_xml_callback *callback,
550 	void *user_ctx,
551 	int hints)
552 {
553 	xml_context context;
554 
555 	context.token = NULL;
556 	context.token_size = 0;
557 	context.token_len = 0;
558 	context.buf = buf;
559 	context.p = buf;
560 	context.user_ctx = user_ctx;
561 	context.callback = callback;
562 	context.hints = hints;
563 
564 	return parseXML ("/", &context);
565 }
566 
567 /**
568  * \param[in] buf Zero-terminated HTML input data
569  * \param[in] callback Function called for each token scan result
570  * \param[in] user_ctx User-defined context variable, handed to \p callback
571  * \param[in] hints Flags to influence parsing
572  *
573  * Convenience function that calls wget_xml_parse_buffer() with HTML parsing turned on.
574  */
wget_html_parse_buffer(const char * buf,wget_xml_callback * callback,void * user_ctx,int hints)575 void wget_html_parse_buffer(
576 	const char *buf,
577 	wget_xml_callback *callback,
578 	void *user_ctx,
579 	int hints)
580 {
581 	wget_xml_parse_buffer(buf, callback, user_ctx, hints | XML_HINT_HTML);
582 }
583 
584 /**
585  * \param[in] fname Name of XML or HTML input file
586  * \param[in] callback Function called for each token scan result
587  * \param[in] user_ctx User-defined context variable, handed to \p callback
588  * \param[in] hints Flags to influence parsing
589  *
590  * Convenience function that calls wget_xml_parse_buffer() with the file content.
591  *
592  * If \p fname is `-`, the data is read from stdin.
593  */
wget_xml_parse_file(const char * fname,wget_xml_callback * callback,void * user_ctx,int hints)594 void wget_xml_parse_file(
595 	const char *fname,
596 	wget_xml_callback *callback,
597 	void *user_ctx,
598 	int hints)
599 {
600 	if (strcmp(fname,"-")) {
601 		int fd;
602 
603 		if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
604 			struct stat st;
605 			if (fstat(fd, &st) == 0) {
606 #ifdef HAVE_MMAP
607 				size_t nread = st.st_size;
608 				char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
609 #else
610 				char *buf=wget_malloc(st.st_size + 1);
611 				size_t nread=read(fd, buf, st.st_size);
612 #endif
613 
614 				if (nread > 0) {
615 					buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system
616 					wget_xml_parse_buffer(buf, callback, user_ctx, hints);
617 				}
618 
619 #ifdef HAVE_MMAP
620 				munmap(buf, nread);
621 #else
622 				xfree(buf);
623 #endif
624 			}
625 			close(fd);
626 		} else
627 			error_printf(_("Failed to open %s\n"), fname);
628 	} else {
629 		// read data from STDIN.
630 		// maybe should use yy_scan_bytes instead of buffering into memory.
631 		char tmp[4096];
632 		ssize_t nbytes;
633 		wget_buffer buf;
634 
635 		wget_buffer_init(&buf, NULL, 4096);
636 
637 		while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
638 			wget_buffer_memcat(&buf, tmp, nbytes);
639 		}
640 
641 		if (buf.length)
642 			wget_xml_parse_buffer(buf.data, callback, user_ctx, hints);
643 
644 		wget_buffer_deinit(&buf);
645 	}
646 }
647 
648 /**
649  * \param[in] fname Name of XML or HTML input file
650  * \param[in] callback Function called for each token scan result
651  * \param[in] user_ctx User-defined context variable, handed to \p callback
652  * \param[in] hints Flags to influence parsing
653  *
654  * Convenience function that calls wget_xml_parse_file() with HTML parsing turned on.
655  *
656  * If \p fname is `-`, the data is read from stdin.
657  */
wget_html_parse_file(const char * fname,wget_xml_callback * callback,void * user_ctx,int hints)658 void wget_html_parse_file(
659 	const char *fname,
660 	wget_xml_callback *callback,
661 	void *user_ctx,
662 	int hints)
663 {
664 	wget_xml_parse_file(fname, callback, user_ctx, hints | XML_HINT_HTML);
665 }
666 
667 /** @} */
668