1 /*
2 * Copyright (c) 2012 Tim Ruehsen
3 * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4 *
5 * This file is part of libwget.
6 *
7 * Libwget is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * Libwget is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libwget. If not, see <https://www.gnu.org/licenses/>.
19 *
20 *
21 * xml parsing routines
22 *
23 * Changelog
 * 22.06.2012 Tim Ruehsen created, but definitely needs a rewrite
25 *
26 * This derives from an old source code that I wrote in 2001.
 * It is short, fast and has a low memory footprint, BUT it is a hack.
28 * It has to be replaced by e.g. libxml2 or something better.
29 *
30 * HTML parsing is (very) different from XML parsing, see here:
31 * https://html.spec.whatwg.org/multipage/syntax.html
32 * It is a PITA and should be handled by a specialized, external library !
33 *
34 */
35
36 #include <config.h>
37
38 #include <unistd.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include <fcntl.h>
42 #include <sys/stat.h>
43 #ifdef HAVE_MMAP
44 #include <sys/mman.h>
45 #endif
46
47 #include <wget.h>
48 #include "private.h"
49
50 typedef struct {
51 const char
52 *buf, // pointer to original start of buffer (0-terminated)
53 *p, // pointer next char in buffer
54 *token; // token buffer
55 int
56 hints; // XML_HINT...
57 size_t
58 token_size, // size of token buffer
59 token_len; // used bytes of token buffer (not counting terminating 0 byte)
60 void
61 *user_ctx; // user context (not needed if we were using nested functions)
62 wget_xml_callback
63 *callback;
64 } xml_context;
65
/* \cond _hide_internal_symbols */
// Locale-independent whitespace test: true for ' ' and for the codes 9..13
// (\t, \n, \v, \f, \r). The argument is evaluated more than once, so it must
// not have side effects.
#define ascii_isspace(c) ((c) == ' ' || ((c) >= 9 && (c) <= 13))

// Locale-independent test for the letters a-z and A-Z.
// working only for consecutive alphabets, e.g. EBCDIC would not work
// The argument is evaluated more than once, so it must not have side effects.
#define ascii_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
/* \endcond */
72
73 // append a char to token buffer
74
getToken(xml_context * context)75 static const char *getToken(xml_context *context)
76 {
77 int c;
78 const char *p;
79
80 // skip leading whitespace
81 while ((c = *context->p) && ascii_isspace(c))
82 context->p++;
83 if (!c) return NULL; // eof
84 context->token = context->p++;
85
86 // info_printf("a c=%c\n", c);
87
88 if (ascii_isalpha(c) || c == '_') {
89 while ((c = *context->p) && !ascii_isspace(c) && c != '>' && c != '=')
90 context->p++;
91 if (!c) return NULL; // syntax error
92
93 context->token_len = context->p - context->token;
94 return context->token;
95 }
96
97 if (c == '/') {
98 if (!(c = *context->p)) return NULL; // syntax error
99 context->p++;
100 if (c == '>') {
101 context->token_len = 2;
102 return context->token;
103 } else return NULL; // syntax error
104 }
105
106 if (c == '\"' || c == '\'') { // read in quoted value
107 int quote = c;
108
109 context->token = context->p;
110
111 if (!(p = strchr(context->p, quote)))
112 return NULL;
113 context->p = p + 1;
114
115 context->token_len = context->p - context->token - 1;
116 return context->token;
117 }
118
119 if (c == '<') { // fetch specials, e.g. start of comments '<!--'
120 if (!(c = *context->p)) return NULL; // syntax error
121 context->p++;
122 if (c == '?' || c == '/') {
123 context->token_len = 2;
124 return context->token;
125 }
126
127 if (c == '!') {
128 // left: <!--, <![CDATA[ and <!WHATEVER
129 if (!(c = *context->p)) return NULL; // syntax error
130 if (c == '-') {
131 context->p++;
132 if (!(c = *context->p)) return NULL; // syntax error
133 context->p++;
134 if (c == '-') {
135 context->token_len = 4;
136 return context->token;
137 } else {
138 context->p -= 2;
139 context->token_len = 2;
140 return context->token;
141 }
142 } else {
143 context->token_len = 2;
144 return context->token;
145 }
146 } else {
147 context->p--;
148 context->token_len = 1;
149 return context->token;
150 }
151 }
152
153 if (c == '>' || c == '=') {
154 context->token_len = 1;
155 return context->token;
156 }
157
158 if (c == '-') { // fetch specials, e.g. end of comments '-->'
159 if (!(c = *context->p)) return NULL; // syntax error
160 if (c != '-') {
161 c = '-'; //???
162 } else {
163 context->p++;
164 if (!(c = *context->p)) return NULL; // syntax error
165 context->p++;
166 if (c != '>') {
167 context->p -= 2;
168 c = '-';
169 } else {
170 context->token_len = 3;
171 return context->token;
172 }
173 }
174 }
175
176 if (c == '?') { // fetch specials, e.g. '?>'
177 if (!(c = *context->p)) return NULL; // syntax error
178 if (c != '>') {
179 // c = '?';
180 } else {
181 context->p++;
182 context->token_len = 2;
183 return context->token;
184 }
185 }
186
187 while ((c = *context->p) && !ascii_isspace(c))
188 context->p++;
189
190 if (c) {
191 context->token_len = context->p - context->token;
192 return context->token;
193 }
194
195 return NULL;
196 }
197
// Read the value of the attribute whose name has just been scanned.
//
// If the next non-space char is '=', the value is fetched with getToken() and
// left in context->token / context->token_len. Otherwise the attribute has no
// value and context->token_len stays 0.
// Returns 1 on success, EOF at end of input or if getToken() fails.
static int getValue(xml_context *context)
{
	int ch;

	context->token_len = 0;
	context->token = context->p;

	// remove leading spaces
	for (ch = *context->p; ch && ascii_isspace(ch); ch = *++context->p)
		;
	if (!ch)
		return EOF;

	if (ch != '=') {
		// attribute without value
		context->token = context->p;
		return 1;
	}

	context->p++;

	// NULL from getToken() means syntax error, else the token is valid
	return getToken(context) ? 1 : EOF;
}
222
223 // special HTML <script> content parsing
224 // see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
225 // see https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements
226
getScriptContent(xml_context * context)227 static const char *getScriptContent(xml_context *context)
228 {
229 int comment = 0, length_valid = 0;
230 const char *p;
231
232 for (p = context->token = context->p; *p; p++) {
233 if (comment) {
234 if (*p == '-' && !strncmp(p, "-->", 3)) {
235 p += 3 - 1;
236 comment = 0;
237 }
238 } else {
239 if (*p == '<' && !strncmp(p, "<!--", 4)) {
240 p += 4 - 1;
241 comment = 1;
242 } else if (*p == '<' && !wget_strncasecmp_ascii(p, "</script", 8)) {
243 context->token_len = p - context->token;
244 length_valid = 1;
245 for (p += 8; ascii_isspace(*p); p++);
246 if (*p == '>') {
247 p++;
248 break; // found end of <script>
249 } else if (!*p)
250 break; // end of input
251 }
252 }
253 }
254 context->p = p;
255
256 if (!length_valid)
257 context->token_len = p - context->token;
258
259 if (!*p && !context->token_len)
260 return NULL;
261
262 if (context->callback)
263 context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "script", NULL, context->token, context->token_len, context->token - context->buf);
264
265 return context->token;
266 }
267
268 // special HTML <style> content parsing
269 // see https://html.spec.whatwg.org/multipage/semantics.html#the-style-element
getStyleContent(xml_context * context)270 static const char *getStyleContent(xml_context *context)
271 {
272 int comment = 0, length_valid = 0;
273 const char *p;
274
275 for (p = context->token = context->p; *p; p++) {
276 if (comment) {
277 if (p[0] == '*' && p[1] == '/') {
278 p += 2 - 1;
279 comment = 0;
280 }
281 } else {
282 if (p[0] == '/' && p[1] == '*') {
283 p += 2 - 1;
284 comment = 1;
285 } else if (*p == '<' && !wget_strncasecmp_ascii(p, "</style", 7)) {
286 context->token_len = p - context->token;
287 length_valid = 1;
288 for (p += 7; ascii_isspace(*p); p++);
289 if (*p == '>') {
290 p++;
291 break; // found end of <style>
292 } else if (!*p)
293 break; // end of input
294 }
295 }
296 }
297 context->p = p;
298
299 if (!length_valid)
300 context->token_len = p - context->token;
301
302 if (!*p && !context->token_len)
303 return NULL;
304
305 if (context->callback)
306 context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "style", NULL, context->token, context->token_len, context->token - context->buf);
307
308 return context->token;
309 }
310
// Consume everything up to the terminator 'end' and report it via the callback.
//
// flags      XML_FLG... value handed to the callback
// end        terminator string; only its first 'len' (at most 3) chars are compared
// len        length of the terminator
// directory  handed to the callback unchanged
//
// The token excludes the terminator, context->p is moved behind it. If the input
// ends before the terminator is seen, the rest of the input is the token.
// Returns the start of the token, or NULL if the input ended and nothing was consumed.
// Note: context->hints is not evaluated here, empty content is reported as well.
static const char *getUnparsed(xml_context *context, int flags, const char *end, size_t len, const char *directory)
{
	const char *cur = context->p;
	int found;

	context->token = cur;

	for (; *cur; cur++) {
		if (*cur != end[0])
			continue;
		if (len == 1)
			break;
		if (cur[1] == end[1] && (len == 2 || cur[2] == end[2]))
			break;
	}

	found = *cur != 0;
	context->token_len = cur - context->token;
	context->p = found ? cur + len : cur;

	if (!found && !context->token_len)
		return NULL;

	if (context->callback)
		context->callback(context->user_ctx, flags, directory, NULL, context->token, context->token_len, context->token - context->buf);

	return context->token;
}
359
// Consume a comment body up to and including "-->", reported with XML_FLG_COMMENT.
static const char *getComment(xml_context *context)
{
	static const char terminator[] = "-->";

	return getUnparsed(context, XML_FLG_COMMENT, terminator, sizeof(terminator) - 1, NULL);
}
364
// Consume a processing instruction up to and including "?>", reported with XML_FLG_PROCESSING.
static const char *getProcessing(xml_context *context)
{
	static const char terminator[] = "?>";

	return getUnparsed(context, XML_FLG_PROCESSING, terminator, sizeof(terminator) - 1, NULL);
}
369
// Consume a "<!..." construct up to and including '>', reported with XML_FLG_SPECIAL.
static const char *getSpecial(xml_context *context)
{
	static const char terminator[] = ">";

	return getUnparsed(context, XML_FLG_SPECIAL, terminator, sizeof(terminator) - 1, NULL);
}
374
// Consume the text between the current position and the next '<' (or the end of
// input). Non-empty text is reported via the callback with XML_FLG_CONTENT and
// 'directory'. context->p is left on the '<' resp. on the terminating 0 byte.
// Returns the start of the text, or NULL if the input ended and there was no text.
static const char *getContent(xml_context *context, const char *directory)
{
	size_t length = strcspn(context->p, "<");

	context->token = context->p;
	context->token_len = length;
	context->p += length;

	if (!length) {
		if (!*context->p)
			return NULL;
		return context->token; // empty content is not reported
	}

	if (context->callback)
		context->callback(context->user_ctx, XML_FLG_CONTENT, directory, NULL, context->token, context->token_len, context->token - context->buf);

	return context->token;
}
392
// Parse one nesting level of the input and report the findings via context->callback.
//
// XML mode (XML_HINT_HTML not set): 'dir' is the path of the enclosing element. The
// name of each opened element is appended to it (truncated to the size of
// 'directory') and the function calls itself for the element's content. A closing
// tag ends the current level.
// HTML mode: no recursion takes place, 'directory' just holds the name of the tag
// that is currently scanned. <script> and <style> content is read by dedicated
// functions.
//
// Returns WGET_E_SUCCESS at the end of the input or of the level,
// WGET_E_XML_PARSE_ERR if a syntax error is detected at this level.
// NOTE(review): the return value of the recursive call is not checked.
static int parseXML(const char *dir, xml_context *context)
{
	const char *tok;
	char directory[256] = "";
	size_t pos = 0;

	if (!(context->hints & XML_HINT_HTML)) {
		pos = wget_strlcpy(directory, dir, sizeof(directory));
		if (pos >= sizeof(directory)) pos = sizeof(directory) - 1;
	}

	do {
		getContent(context, directory);
		if (context->token_len)
			debug_printf("%s='%.*s'\n", directory, (int)context->token_len, context->token);

		if (!(tok = getToken(context))) return WGET_E_SUCCESS; //eof

		if (context->token_len == 1 && *tok == '<') {
			// get element name and add it to directory
			int flags = XML_FLG_BEGIN;

			if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR; // syntax error

			if (!(context->hints & XML_HINT_HTML)) {
				if (!pos || directory[pos - 1] != '/')
					wget_snprintf(&directory[pos], sizeof(directory) - pos, "/%.*s", (int)context->token_len, tok);
				else
					wget_snprintf(&directory[pos], sizeof(directory) - pos, "%.*s", (int)context->token_len, tok);
			} else {
				// HTML: directory is just the (possibly truncated) tag name
				if (context->token_len < sizeof(directory)) {
					memcpy(directory, tok, context->token_len);
					directory[context->token_len] = 0;
				} else {
					memcpy(directory, tok, sizeof(directory) - 1);
					directory[sizeof(directory) - 1] = 0;
				}
			}

			// scan attributes until the tag is closed by '>' or '/>'
			while ((tok = getToken(context))) {
				if (context->token_len == 2 && !strncmp(tok, "/>", 2)) {
					if (context->callback)
						context->callback(context->user_ctx, flags | XML_FLG_END, directory, NULL, NULL, 0, 0);
					break; // stay in this level
				} else if (context->token_len == 1 && *tok == '>') {
					if (context->callback)
						context->callback(context->user_ctx, flags | XML_FLG_CLOSE, directory, NULL, NULL, 0, 0);
					if (context->hints & XML_HINT_HTML) {
						if (!wget_strcasecmp_ascii(directory, "script")) {
							// special HTML <script> content parsing
							// see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
							// 4.3.1.2 Restrictions for contents of script elements
							debug_printf("*** need special <script> handling\n");
							getScriptContent(context);
							if (context->token_len)
								debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token);
						}
						else if (!wget_strcasecmp_ascii(directory, "style")) {
							getStyleContent(context);
							if (context->token_len)
								debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token);
						}
					} else
						parseXML(directory, context); // descend one level
					break;
				} else {
					// attribute name, 0-terminated (and truncated if too long)
					char attribute[256];
					size_t attrlen = context->token_len >= sizeof(attribute) ? sizeof(attribute) - 1 : context->token_len;

					memcpy(attribute, tok, attrlen);
					attribute[attrlen] = 0;

					if (getValue(context) == EOF) return WGET_E_XML_PARSE_ERR; // syntax error

					if (context->token_len) {
						debug_printf("%s/@%s=%.*s\n", directory, attribute, (int)context->token_len, context->token);
						if (context->callback)
							context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, context->token, context->token_len, context->token - context->buf);
					} else {
						debug_printf("%s/@%s\n", directory, attribute);
						if (context->callback)
							context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, NULL, 0, 0);
					}
					flags = 0; // XML_FLG_BEGIN is only reported once per element
				}
			}
			directory[pos] = 0; // cut the element name off again
		} else if (context->token_len == 2) {
			if (!strncmp(tok, "</", 2)) {
				// ascend one level
				// cleanup - get name and '>'
				if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR;

				if (context->callback) {
					if (!(context->hints & XML_HINT_HTML))
						context->callback(context->user_ctx, XML_FLG_END, directory, NULL, NULL, 0, 0);
					else {
						// We need to \0 terminate tok. The length comes from the input,
						// so long names go to the heap instead of the stack.
						char tagbuf[128], *tag = tagbuf;

						if (context->token_len >= sizeof(tagbuf))
							tag = wget_malloc(context->token_len + 1);

						if (tag) {
							memcpy(tag, tok, context->token_len);
							tag[context->token_len] = 0;
							context->callback(context->user_ctx, XML_FLG_END, tag, NULL, NULL, 0, 0);
							if (tag != tagbuf)
								xfree(tag);
						}
					}
				}
				if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR;

				if (!(context->hints & XML_HINT_HTML))
					return WGET_E_SUCCESS;
				else
					continue;
			} else if (!strncmp(tok, "<?", 2)) { // special info - ignore
				getProcessing(context);
				debug_printf("%s=<?%.*s?>\n", directory, (int)context->token_len, context->token);
				continue;
			} else if (!strncmp(tok, "<!", 2)) {
				getSpecial(context);
				debug_printf("%s=<!%.*s>\n", directory, (int)context->token_len, context->token);
			}
		} else if (context->token_len == 4 && !strncmp(tok, "<!--", 4)) { // comment - ignore
			getComment(context);
			debug_printf("%s=<!--%.*s-->\n", directory, (int)context->token_len, context->token);
			continue;
		}
	} while (tok);
	return WGET_E_SUCCESS;
}
523
524 /**
525 * \file
526 * \brief XML parsing functions
527 * \defgroup libwget-xml XML parsing functions
528 * @{
529 */
530
531 /**
532 * \param[in] buf Zero-terminated XML or HTML input data
533 * \param[in] callback Function called for each token scan result
534 * \param[in] user_ctx User-defined context variable, handed to \p callback
535 * \param[in] hints Flags to influence parsing
536 *
537 * This function scans the XML input from \p buf and calls \p callback for each token
538 * found. \p user_ctx is a user-defined context variable and given to each call of \p callback.
539 *
540 * \p hints may be 0 or any combination of %XML_HINT_REMOVE_EMPTY_CONTENT and %XML_HINT_HTML.
541 *
542 * %XML_HINT_REMOVE_EMPTY_CONTENT reduces the number of calls to \p callback by ignoring
543 * empty content and superfluous spaces.
544 *
545 * %XML_HINT_HTML turns on HTML scanning.
546 */
wget_xml_parse_buffer(const char * buf,wget_xml_callback * callback,void * user_ctx,int hints)547 int wget_xml_parse_buffer(
548 const char *buf,
549 wget_xml_callback *callback,
550 void *user_ctx,
551 int hints)
552 {
553 xml_context context;
554
555 context.token = NULL;
556 context.token_size = 0;
557 context.token_len = 0;
558 context.buf = buf;
559 context.p = buf;
560 context.user_ctx = user_ctx;
561 context.callback = callback;
562 context.hints = hints;
563
564 return parseXML ("/", &context);
565 }
566
567 /**
568 * \param[in] buf Zero-terminated HTML input data
569 * \param[in] callback Function called for each token scan result
570 * \param[in] user_ctx User-defined context variable, handed to \p callback
571 * \param[in] hints Flags to influence parsing
572 *
573 * Convenience function that calls wget_xml_parse_buffer() with HTML parsing turned on.
574 */
wget_html_parse_buffer(const char * buf,wget_xml_callback * callback,void * user_ctx,int hints)575 void wget_html_parse_buffer(
576 const char *buf,
577 wget_xml_callback *callback,
578 void *user_ctx,
579 int hints)
580 {
581 wget_xml_parse_buffer(buf, callback, user_ctx, hints | XML_HINT_HTML);
582 }
583
584 /**
585 * \param[in] fname Name of XML or HTML input file
586 * \param[in] callback Function called for each token scan result
587 * \param[in] user_ctx User-defined context variable, handed to \p callback
588 * \param[in] hints Flags to influence parsing
589 *
590 * Convenience function that calls wget_xml_parse_buffer() with the file content.
591 *
592 * If \p fname is `-`, the data is read from stdin.
593 */
wget_xml_parse_file(const char * fname,wget_xml_callback * callback,void * user_ctx,int hints)594 void wget_xml_parse_file(
595 const char *fname,
596 wget_xml_callback *callback,
597 void *user_ctx,
598 int hints)
599 {
600 if (strcmp(fname,"-")) {
601 int fd;
602
603 if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
604 struct stat st;
605 if (fstat(fd, &st) == 0) {
606 #ifdef HAVE_MMAP
607 size_t nread = st.st_size;
608 char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
609 #else
610 char *buf=wget_malloc(st.st_size + 1);
611 size_t nread=read(fd, buf, st.st_size);
612 #endif
613
614 if (nread > 0) {
615 buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system
616 wget_xml_parse_buffer(buf, callback, user_ctx, hints);
617 }
618
619 #ifdef HAVE_MMAP
620 munmap(buf, nread);
621 #else
622 xfree(buf);
623 #endif
624 }
625 close(fd);
626 } else
627 error_printf(_("Failed to open %s\n"), fname);
628 } else {
629 // read data from STDIN.
630 // maybe should use yy_scan_bytes instead of buffering into memory.
631 char tmp[4096];
632 ssize_t nbytes;
633 wget_buffer buf;
634
635 wget_buffer_init(&buf, NULL, 4096);
636
637 while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
638 wget_buffer_memcat(&buf, tmp, nbytes);
639 }
640
641 if (buf.length)
642 wget_xml_parse_buffer(buf.data, callback, user_ctx, hints);
643
644 wget_buffer_deinit(&buf);
645 }
646 }
647
648 /**
649 * \param[in] fname Name of XML or HTML input file
650 * \param[in] callback Function called for each token scan result
651 * \param[in] user_ctx User-defined context variable, handed to \p callback
652 * \param[in] hints Flags to influence parsing
653 *
654 * Convenience function that calls wget_xml_parse_file() with HTML parsing turned on.
655 *
656 * If \p fname is `-`, the data is read from stdin.
657 */
wget_html_parse_file(const char * fname,wget_xml_callback * callback,void * user_ctx,int hints)658 void wget_html_parse_file(
659 const char *fname,
660 wget_xml_callback *callback,
661 void *user_ctx,
662 int hints)
663 {
664 wget_xml_parse_file(fname, callback, user_ctx, hints | XML_HINT_HTML);
665 }
666
667 /** @} */
668