1 /* libxml2 - Library for parsing XML documents 2 * Copyright (C) 2006-2019 Free Software Foundation, Inc. 3 * 4 * This file is not part of the GNU gettext program, but is used with 5 * GNU gettext. 6 * 7 * The original copyright notice is as follows: 8 */ 9 10 /* 11 * Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved. 12 * 13 * Permission is hereby granted, free of charge, to any person obtaining a copy 14 * of this software and associated documentation files (the "Software"), to deal 15 * in the Software without restriction, including without limitation the rights 16 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 17 * copies of the Software, and to permit persons to whom the Software is fur- 18 * nished to do so, subject to the following conditions: 19 * 20 * The above copyright notice and this permission notice shall be included in 21 * all copies or substantial portions of the Software. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT- 25 * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 29 * THE SOFTWARE. 30 * 31 * Author: Daniel Veillard 32 */ 33 34 /* 35 * Summary: interface for an HTML 4.0 non-verifying parser 36 * Description: this module implements an HTML 4.0 non-verifying parser 37 * with API compatible with the XML parser ones. It should 38 * be able to parse "real world" HTML, even if severely 39 * broken from a specification point of view. 40 */ 41 42 #ifndef __HTML_PARSER_H__ 43 #define __HTML_PARSER_H__ 44 #include <libxml/xmlversion.h> 45 #include <libxml/parser.h> 46 47 #ifdef LIBXML_HTML_ENABLED 48 49 #ifdef __cplusplus 50 extern "C" { 51 #endif 52 53 /* 54 * Most of the back-end structures from XML and HTML are shared. 55 */ 56 typedef xmlParserCtxt htmlParserCtxt; 57 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 58 typedef xmlParserNodeInfo htmlParserNodeInfo; 59 typedef xmlSAXHandler htmlSAXHandler; 60 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 61 typedef xmlParserInput htmlParserInput; 62 typedef xmlParserInputPtr htmlParserInputPtr; 63 typedef xmlDocPtr htmlDocPtr; 64 typedef xmlNodePtr htmlNodePtr; 65 66 /* 67 * Internal description of an HTML element, representing HTML 4.01 68 * and XHTML 1.0 (which share the same structure). 69 */ 70 typedef struct _htmlElemDesc htmlElemDesc; 71 typedef htmlElemDesc *htmlElemDescPtr; 72 struct _htmlElemDesc { 73 const char *name; /* The tag name */ 74 char startTag; /* Whether the start tag can be implied */ 75 char endTag; /* Whether the end tag can be implied */ 76 char saveEndTag; /* Whether the end tag should be saved */ 77 char empty; /* Is this an empty element ? */ 78 char depr; /* Is this a deprecated element ? */ 79 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 80 char isinline; /* is this a block 0 or inline 1 element */ 81 const char *desc; /* the description */ 82 83 /* NRK Jan.2003 84 * New fields encapsulating HTML structure 85 * 86 * Bugs: 87 * This is a very limited representation. It fails to tell us when 88 * an element *requires* subelements (we only have whether they're 89 * allowed or not), and it doesn't tell us where CDATA and PCDATA 90 * are allowed. Some element relationships are not fully represented: 91 * these are flagged with the word MODIFIER 92 */ 93 const char** subelts; /* allowed sub-elements of this element */ 94 const char* defaultsubelt; /* subelement for suggested auto-repair 95 if necessary or NULL */ 96 const char** attrs_opt; /* Optional Attributes */ 97 const char** attrs_depr; /* Additional deprecated attributes */ 98 const char** attrs_req; /* Required attributes */ 99 }; 100 101 /* 102 * Internal description of an HTML entity. 103 */ 104 typedef struct _htmlEntityDesc htmlEntityDesc; 105 typedef htmlEntityDesc *htmlEntityDescPtr; 106 struct _htmlEntityDesc { 107 unsigned int value; /* the UNICODE value for the character */ 108 const char *name; /* The entity name */ 109 const char *desc; /* the description */ 110 }; 111 112 /* 113 * There is only few public functions. 114 */ 115 XMLPUBFUN const htmlElemDesc * XMLCALL 116 htmlTagLookup (const xmlChar *tag); 117 XMLPUBFUN const htmlEntityDesc * XMLCALL 118 htmlEntityLookup(const xmlChar *name); 119 XMLPUBFUN const htmlEntityDesc * XMLCALL 120 htmlEntityValueLookup(unsigned int value); 121 122 XMLPUBFUN int XMLCALL 123 htmlIsAutoClosed(htmlDocPtr doc, 124 htmlNodePtr elem); 125 XMLPUBFUN int XMLCALL 126 htmlAutoCloseTag(htmlDocPtr doc, 127 const xmlChar *name, 128 htmlNodePtr elem); 129 XMLPUBFUN const htmlEntityDesc * XMLCALL 130 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 131 const xmlChar **str); 132 XMLPUBFUN int XMLCALL 133 htmlParseCharRef(htmlParserCtxtPtr ctxt); 134 XMLPUBFUN void XMLCALL 135 htmlParseElement(htmlParserCtxtPtr ctxt); 136 137 XMLPUBFUN htmlParserCtxtPtr XMLCALL 138 htmlNewParserCtxt(void); 139 140 XMLPUBFUN htmlParserCtxtPtr XMLCALL 141 htmlCreateMemoryParserCtxt(const char *buffer, 142 int size); 143 144 XMLPUBFUN int XMLCALL 145 htmlParseDocument(htmlParserCtxtPtr ctxt); 146 XMLPUBFUN htmlDocPtr XMLCALL 147 htmlSAXParseDoc (const xmlChar *cur, 148 const char *encoding, 149 htmlSAXHandlerPtr sax, 150 void *userData); 151 XMLPUBFUN htmlDocPtr XMLCALL 152 htmlParseDoc (const xmlChar *cur, 153 const char *encoding); 154 XMLPUBFUN htmlDocPtr XMLCALL 155 htmlSAXParseFile(const char *filename, 156 const char *encoding, 157 htmlSAXHandlerPtr sax, 158 void *userData); 159 XMLPUBFUN htmlDocPtr XMLCALL 160 htmlParseFile (const char *filename, 161 const char *encoding); 162 XMLPUBFUN int XMLCALL 163 UTF8ToHtml (unsigned char *out, 164 int *outlen, 165 const unsigned char *in, 166 int *inlen); 167 XMLPUBFUN int XMLCALL 168 htmlEncodeEntities(unsigned char *out, 169 int *outlen, 170 const unsigned char *in, 171 int *inlen, int quoteChar); 172 XMLPUBFUN int XMLCALL 173 htmlIsScriptAttribute(const xmlChar *name); 174 XMLPUBFUN int XMLCALL 175 htmlHandleOmittedElem(int val); 176 177 #ifdef LIBXML_PUSH_ENABLED 178 /** 179 * Interfaces for the Push mode. 180 */ 181 XMLPUBFUN htmlParserCtxtPtr XMLCALL 182 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 183 void *user_data, 184 const char *chunk, 185 int size, 186 const char *filename, 187 xmlCharEncoding enc); 188 XMLPUBFUN int XMLCALL 189 htmlParseChunk (htmlParserCtxtPtr ctxt, 190 const char *chunk, 191 int size, 192 int terminate); 193 #endif /* LIBXML_PUSH_ENABLED */ 194 195 XMLPUBFUN void XMLCALL 196 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 197 198 /* 199 * New set of simpler/more flexible APIs 200 */ 201 /** 202 * xmlParserOption: 203 * 204 * This is the set of XML parser options that can be passed down 205 * to the xmlReadDoc() and similar calls. 206 */ 207 typedef enum { 208 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 209 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 210 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 211 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 212 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 213 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 214 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 215 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 216 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 217 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ 218 } htmlParserOption; 219 220 XMLPUBFUN void XMLCALL 221 htmlCtxtReset (htmlParserCtxtPtr ctxt); 222 XMLPUBFUN int XMLCALL 223 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 224 int options); 225 XMLPUBFUN htmlDocPtr XMLCALL 226 htmlReadDoc (const xmlChar *cur, 227 const char *URL, 228 const char *encoding, 229 int options); 230 XMLPUBFUN htmlDocPtr XMLCALL 231 htmlReadFile (const char *URL, 232 const char *encoding, 233 int options); 234 XMLPUBFUN htmlDocPtr XMLCALL 235 htmlReadMemory (const char *buffer, 236 int size, 237 const char *URL, 238 const char *encoding, 239 int options); 240 XMLPUBFUN htmlDocPtr XMLCALL 241 htmlReadFd (int fd, 242 const char *URL, 243 const char *encoding, 244 int options); 245 XMLPUBFUN htmlDocPtr XMLCALL 246 htmlReadIO (xmlInputReadCallback ioread, 247 xmlInputCloseCallback ioclose, 248 void *ioctx, 249 const char *URL, 250 const char *encoding, 251 int options); 252 XMLPUBFUN htmlDocPtr XMLCALL 253 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 254 const xmlChar *cur, 255 const char *URL, 256 const char *encoding, 257 int options); 258 XMLPUBFUN htmlDocPtr XMLCALL 259 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 260 const char *filename, 261 const char *encoding, 262 int options); 263 XMLPUBFUN htmlDocPtr XMLCALL 264 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 265 const char *buffer, 266 int size, 267 const char *URL, 268 const char *encoding, 269 int options); 270 XMLPUBFUN htmlDocPtr XMLCALL 271 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 272 int fd, 273 const char *URL, 274 const char *encoding, 275 int options); 276 XMLPUBFUN htmlDocPtr XMLCALL 277 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 278 xmlInputReadCallback ioread, 279 xmlInputCloseCallback ioclose, 280 void *ioctx, 281 const char *URL, 282 const char *encoding, 283 int options); 284 285 /* NRK/Jan2003: further knowledge of HTML structure 286 */ 287 typedef enum { 288 HTML_NA = 0 , /* something we don't check at all */ 289 HTML_INVALID = 0x1 , 290 HTML_DEPRECATED = 0x2 , 291 HTML_VALID = 0x4 , 292 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 293 } htmlStatus ; 294 295 /* Using htmlElemDesc rather than name here, to emphasise the fact 296 that otherwise there's a lookup overhead 297 */ 298 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 299 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 300 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 301 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; 302 /** 303 * htmlDefaultSubelement: 304 * @elt: HTML element 305 * 306 * Returns the default subelement for this element 307 */ 308 #define htmlDefaultSubelement(elt) elt->defaultsubelt 309 /** 310 * htmlElementAllowedHereDesc: 311 * @parent: HTML parent element 312 * @elt: HTML element 313 * 314 * Checks whether an HTML element description may be a 315 * direct child of the specified element. 316 * 317 * Returns 1 if allowed; 0 otherwise. 318 */ 319 #define htmlElementAllowedHereDesc(parent,elt) \ 320 htmlElementAllowedHere((parent), (elt)->name) 321 /** 322 * htmlRequiredAttrs: 323 * @elt: HTML element 324 * 325 * Returns the attributes required for the specified element. 326 */ 327 #define htmlRequiredAttrs(elt) (elt)->attrs_req 328 329 330 #ifdef __cplusplus 331 } 332 #endif 333 334 #endif /* LIBXML_HTML_ENABLED */ 335 #endif /* __HTML_PARSER_H__ */ 336