/* * Copyright (c) 2012 Tim Ruehsen * Copyright (c) 2015-2021 Free Software Foundation, Inc. * * This file is part of libwget. * * Libwget is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Libwget is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with libwget. If not, see . * * * xml parsing routines * * Changelog * 22.06.2012 Tim Ruehsen created, but needs definitely a rewrite * * This derives from an old source code that I wrote in 2001. * It is short, fast and has a low memory print, BUT it is a hack. * It has to be replaced by e.g. libxml2 or something better. * * HTML parsing is (very) different from XML parsing, see here: * https://html.spec.whatwg.org/multipage/syntax.html * It is a PITA and should be handled by a specialized, external library ! * */ #include #include #include #include #include #include #ifdef HAVE_MMAP #include #endif #include #include "private.h" typedef struct { const char *buf, // pointer to original start of buffer (0-terminated) *p, // pointer next char in buffer *token; // token buffer int hints; // XML_HINT... size_t token_size, // size of token buffer token_len; // used bytes of token buffer (not counting terminating 0 byte) void *user_ctx; // user context (not needed if we were using nested functions) wget_xml_callback *callback; } xml_context; /* \cond _hide_internal_symbols */ #define ascii_isspace(c) (c == ' ' || (c >= 9 && c <= 13)) // working only for consecutive alphabets, e.g. EBCDIC would not work #define ascii_isalpha(c) ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) /* \endcond */ // append a char to token buffer static const char *getToken(xml_context *context) { int c; const char *p; // skip leading whitespace while ((c = *context->p) && ascii_isspace(c)) context->p++; if (!c) return NULL; // eof context->token = context->p++; // info_printf("a c=%c\n", c); if (ascii_isalpha(c) || c == '_') { while ((c = *context->p) && !ascii_isspace(c) && c != '>' && c != '=') context->p++; if (!c) return NULL; // syntax error context->token_len = context->p - context->token; return context->token; } if (c == '/') { if (!(c = *context->p)) return NULL; // syntax error context->p++; if (c == '>') { context->token_len = 2; return context->token; } else return NULL; // syntax error } if (c == '\"' || c == '\'') { // read in quoted value int quote = c; context->token = context->p; if (!(p = strchr(context->p, quote))) return NULL; context->p = p + 1; context->token_len = context->p - context->token - 1; return context->token; } if (c == '<') { // fetch specials, e.g. start of comments '' if (!(c = *context->p)) return NULL; // syntax error if (c != '-') { c = '-'; //??? } else { context->p++; if (!(c = *context->p)) return NULL; // syntax error context->p++; if (c != '>') { context->p -= 2; c = '-'; } else { context->token_len = 3; return context->token; } } } if (c == '?') { // fetch specials, e.g. '?>' if (!(c = *context->p)) return NULL; // syntax error if (c != '>') { // c = '?'; } else { context->p++; context->token_len = 2; return context->token; } } while ((c = *context->p) && !ascii_isspace(c)) context->p++; if (c) { context->token_len = context->p - context->token; return context->token; } return NULL; } static int getValue(xml_context *context) { int c; context->token_len = 0; context->token = context->p; // remove leading spaces while ((c = *context->p) && ascii_isspace(c)) context->p++; if (!c) return EOF; if (c == '=') { context->p++; if (!getToken(context)) return EOF; // syntax error else return 1; // token valid } // attribute without value context->token = context->p; return 1; } // special HTML