/*
* Copyright (c) 2012 Tim Ruehsen
* Copyright (c) 2015-2021 Free Software Foundation, Inc.
*
* This file is part of libwget.
*
* Libwget is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Libwget is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with libwget. If not, see <https://www.gnu.org/licenses/>.
*
*
* xml parsing routines
*
* Changelog
* 22.06.2012 Tim Ruehsen created, but needs definitely a rewrite
*
* This derives from an old source code that I wrote in 2001.
* It is short, fast and has a low memory footprint, BUT it is a hack.
* It has to be replaced by e.g. libxml2 or something better.
*
* HTML parsing is (very) different from XML parsing, see here:
* https://html.spec.whatwg.org/multipage/syntax.html
* It is a PITA and should be handled by a specialized, external library !
*
*/
#include
#include
#include
#include
#include
#include
#ifdef HAVE_MMAP
#include
#endif
#include
#include "private.h"
// Parser state that is handed to the tokenizer helpers in this file.
typedef struct {
	const char *buf;             // original start of the input buffer (0-terminated)
	const char *p;               // next char to be read from the buffer
	const char *token;           // current token (getToken() points it into the input)
	int hints;                   // XML_HINT...
	size_t token_size;           // size of token buffer
	size_t token_len;            // used bytes of token (not counting terminating 0 byte)
	void *user_ctx;              // user context (not needed if we were using nested functions)
	wget_xml_callback *callback; // callback of type wget_xml_callback
} xml_context;
/* \cond _hide_internal_symbols */
// Locale-independent whitespace test: space or one of the codes 9..13
// (\t \n \v \f \r in ASCII).
// The argument is parenthesized so that expressions like 'x & 0xff' work,
// but it is evaluated more than once: do not pass expressions with side effects.
#define ascii_isspace(c) ((c) == ' ' || ((c) >= 9 && (c) <= 13))
// Locale-independent test for a-z / A-Z.
// working only for consecutive alphabets, e.g. EBCDIC would not work
// Same multiple-evaluation caveat as ascii_isspace().
#define ascii_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
/* \endcond */
/*
 * Scan the next token from the input and record it in the context.
 *
 * Leading whitespace is skipped, then context->token and context->token_len
 * are set and context->p is advanced past the token. Nothing is copied:
 * the token points into the input and is not 0-terminated.
 *
 * Token forms handled here:
 *  - a name: starts with a letter or '_', ends before whitespace, '>' or '='
 *  - "/>" (length 2)
 *  - a quoted value ('...' or "..."): the token excludes both quotes
 *  - '<' directly followed by "->" (length 3)
 *  - "?>" (length 2)
 *  - anything else: everything up to the next whitespace
 *
 * Returns context->token on success. Returns NULL at end of input or on a
 * syntax error (input ends inside a token, missing closing quote, '/' that
 * is not followed by '>').
 */
static const char *getToken(xml_context *context)
{
int c;
const char *p;
// skip leading whitespace
while ((c = *context->p) && ascii_isspace(c))
context->p++;
if (!c) return NULL; // eof
// the token starts here; c holds its first char, p already points to the second
context->token = context->p++;
if (ascii_isalpha(c) || c == '_') {
// name: runs until whitespace, '>' or '=' (the delimiter is not consumed)
while ((c = *context->p) && !ascii_isspace(c) && c != '>' && c != '=')
context->p++;
if (!c) return NULL; // syntax error
context->token_len = context->p - context->token;
return context->token;
}
if (c == '/') {
// only "/>" is accepted here
if (!(c = *context->p)) return NULL; // syntax error
context->p++;
if (c == '>') {
context->token_len = 2;
return context->token;
} else return NULL; // syntax error
}
if (c == '\"' || c == '\'') { // read in quoted value
int quote = c;
// the token starts after the opening quote
context->token = context->p;
if (!(p = strchr(context->p, quote)))
return NULL; // no closing quote
// continue after the closing quote; the -1 below drops that quote from the length
context->p = p + 1;
context->token_len = context->p - context->token - 1;
return context->token;
}
// NOTE(review): the example in the original comment on this branch was empty,
// and the code below tests for "->" after the '<'. That looks like handling
// for the end of a comment rather than for '<' - this branch may be missing
// code; confirm against the upstream source.
if (c == '<') { // fetch specials
if (!(c = *context->p)) return NULL; // syntax error
if (c != '-') {
// c is reassigned by the generic scan below; '-' just cannot match the '?' test
c = '-'; //???
} else {
context->p++;
if (!(c = *context->p)) return NULL; // syntax error
context->p++;
if (c != '>') {
// no "->": undo both advances, p points right behind the '<' again
context->p -= 2;
c = '-';
} else {
context->token_len = 3;
return context->token;
}
}
}
if (c == '?') { // fetch specials, e.g. '?>'
if (!(c = *context->p)) return NULL; // syntax error
if (c != '>') {
// a '?' without '>' is handled by the generic scan below
} else {
context->p++;
context->token_len = 2;
return context->token;
}
}
// generic token: everything up to the next whitespace
while ((c = *context->p) && !ascii_isspace(c))
context->p++;
if (c) {
context->token_len = context->p - context->token;
return context->token;
}
// input ended before a whitespace was found
return NULL;
}
/*
 * Read the optional "= value" part that may follow an attribute name.
 *
 * The context token is first reset to an empty token at the current position.
 * After skipping whitespace:
 *  - at end of input, EOF is returned
 *  - if a '=' follows, it is consumed and the value is read with getToken();
 *    the result is 1 if a token was found, else EOF
 *  - otherwise the attribute has no value: the token stays empty
 *    (token_len 0), is moved to the current position and 1 is returned
 */
static int getValue(xml_context *context)
{
	int ch;

	// begin with an empty token at the current read position
	context->token = context->p;
	context->token_len = 0;

	// step over whitespace in front of a possible '='
	for (ch = *context->p; ch && ascii_isspace(ch); ch = *context->p)
		context->p++;

	if (ch == '\0')
		return EOF;

	if (ch != '=') {
		// attribute without value
		context->token = context->p;
		return 1;
	}

	// consume the '=' and read the value; a missing value is a syntax error
	context->p++;
	return getToken(context) ? 1 : EOF;
}
// special HTML token_len = p - context->token;
length_valid = 1;
for (p += 8; ascii_isspace(*p); p++);
if (*p == '>') {
p++;
break; // found end of