1 /*
2 * libxml2_htmlparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
 * Portion Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
8 */
9
10 #define IN_LIBXML
11 #include "xmlenglibxml.h"
12
13 #include <string.h>
14 #if defined(HAVE_CTYPE_H)
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
32
33
34 #include <libxml2_globals.h>
35 #include <libxml2_xmlmemory.h>
36 #include <libxml2_tree.h>
37 #include <libxml2_parser.h>
38 #include <libxml2_parserinternals.h>
39 #include <libxml2_xmlerror.h>
40 #include "libxml2_xmlerror2.h"
41 #include "libxml2_htmlparser.h"
42 #include "libxml2_htmltree.h"
43 #include "libxml2_entities.h"
44 #include <libxml2_encoding.h>
45 #include <libxml2_valid.h>
46 #include <libxml2_xmlio.h>
47 #include <libxml2_uri.h>
48
49 #define HTML_MAX_NAMELEN 1000
50 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
51 #define HTML_PARSER_BUFFER_SIZE 100
52
53 #ifdef LIBXML_HTML_ENABLED
54
55 /* #define DEBUG */
56 /* #define DEBUG_PUSH */
57
/*
 * NOTE(review): not referenced in the visible part of this file; judging by
 * its name it selects the default used for attributes written without a
 * value -- confirm against its users.
 */
static const int htmlOmittedDefaultValue = 1;

/*
 * Forward declarations. The definitions are not in the visible part of this
 * file; htmlDecodeEntities has external linkage, htmlParseComment is private.
 */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
63
64 /************************************************************************
65 * *
66 * Some factorized error routines *
67 * *
68 ************************************************************************/
69
/**
 * htmlErrMemory:
 * @param ctxt an HTML parser context
 * @param extra extra information (callers in this file pass NULL)
 *
 * Report a memory allocation failure; htmlnamePush calls it when xmlRealloc
 * returns NULL. Only the prototype lives here.
 */
void
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); // moved to XSLT-enabled part of this file
79
80 /**
81 * htmlParseErr:
82 * @param ctxt an HTML parser context
83 * @param error the error number
84 * @param msg the error message
85 * @param str1 string infor
86 * @param str2 string infor
87 *
88 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
89 */
90 static void
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)91 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
92 const char *msg, const xmlChar *str1, const xmlChar *str2)
93 {
94 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
95 (ctxt->instate == XML_PARSER_EOF))
96 return;
97 ctxt->errNo = error;
98 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
99 XML_ERR_ERROR, NULL, 0,
100 (const char *) str1, (const char *) str2,
101 NULL, 0, 0,
102 msg, str1, str2);
103 ctxt->wellFormed = 0;
104 }
105
106 /**
107 * htmlParseErrInt:
108 * @param ctxt an HTML parser context
109 * @param error the error number
110 * @param msg the error message
111 * @param val integer info
112 *
113 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
114 */
115 static void
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)116 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
117 const char *msg, int val)
118 {
119 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
120 (ctxt->instate == XML_PARSER_EOF))
121 return;
122 ctxt->errNo = error;
123 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
124 XML_ERR_ERROR, NULL, 0, NULL, NULL,
125 NULL, val, 0, msg, val);
126 ctxt->wellFormed = 0;
127 }
128
129 /************************************************************************
130 * *
131 * Parser stacks related functions and macros *
132 * *
133 ************************************************************************/
134
135 /**
136 * htmlnamePush:
137 * @param ctxt an HTML parser context
138 * @param value the element name
139 *
140 * Pushes a new element name on top of the name stack
141 *
142 * Returns 0 in case of error, the index in the stack otherwise
143 */
144 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)145 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
146 {
147 if (ctxt->nameNr >= ctxt->nameMax) {
148 void* allocTmp; // DONE: Fix xmlRealloc
149 allocTmp = xmlRealloc((xmlChar**)ctxt->nameTab,
150 ctxt->nameMax * 2 * sizeof(ctxt->nameTab[0]));
151 if (!allocTmp) {
152 htmlErrMemory(ctxt, NULL);
153 return (0);
154 }
155 ctxt->nameMax *= 2;
156 ctxt->nameTab = (const xmlChar**) allocTmp;
157 }
158 ctxt->nameTab[ctxt->nameNr] = value;
159 ctxt->name = value;
160 return (ctxt->nameNr++);
161 }
162 /**
163 * htmlnamePop:
164 * @param ctxt an HTML parser context
165 *
166 * Pops the top element name from the name stack
167 *
168 * Returns the name just removed
169 */
170 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)171 htmlnamePop(htmlParserCtxtPtr ctxt)
172 {
173 const xmlChar *ret;
174
175 if (ctxt->nameNr <= 0)
176 return (0);
177 ctxt->nameNr--;
178 if (ctxt->nameNr < 0)
179 return (0);
180 if (ctxt->nameNr > 0)
181 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
182 else
183 ctxt->name = NULL;
184 ret = ctxt->nameTab[ctxt->nameNr];
185 ctxt->nameTab[ctxt->nameNr] = 0;
186 return (ret);
187 }
188
189 /*
190 * Macros for accessing the content. Those should be used only by the parser,
191 * and not exported.
192 *
193 * Dirty macros, i.e. one need to make assumption on the context to use them
194 *
195 * CUR_PTR return the current pointer to the xmlChar to be parsed.
196 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
197 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
198 * in UNICODE mode. This should be used internally by the parser
199 * only to compare to ASCII values otherwise it would break when
200 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Like CUR, it should be used only
 *           to compare against ASCII-based substrings.
203 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
204 * it should be used only to compare on ASCII based substring.
205 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
206 * strings without newlines within the parser.
207 *
208 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
209 *
210 * CURRENT Returns the current char value, with the full decoding of
211 * UTF-8 if we are using this mode. It returns an int.
212 * NEXT Skip to the next character, this does the proper decoding
213 * in UTF-8 mode.
214 * NEXTL(l) Skip the current unicode character of l xmlChars long.
215 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
216 */
217
/*
 * All macros below expect a variable named `ctxt` (the parser context) to be
 * in scope at the point of use.
 */

/* Current input byte passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes, adding val to nbChars and to the column
 * counter; the line counter is not touched. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte located val positions after the current one (no bounds check). */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

/* Calls xmlParserInputShrink() once more than 2 * INPUT_CHUNK bytes lie
 * behind the cursor and fewer than 2 * INPUT_CHUNK remain ahead of it.
 * NOTE: expands to a bare `if`; an `else` written right after a use of this
 * macro would bind to it. */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

/* Calls xmlParserInputGrow() when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain ahead of the cursor.
 * NOTE: expands to a bare `if`, same caveat as SHRINK. */
#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (a raw byte, no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blanks via htmlSkipBlankChars(); evaluates to the number skipped. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* Current input byte as an int; same expansion as CURRENT. */
#define CUR ((int) (*ctxt->input->cur))
/* Advance to the next character through xmlNextChar(). */
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is non-zero, the current input byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are repeated here with the same replacement text as above. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/* Step over the current character, which is l bytes long: bump the line and
 * reset the column on '\n' (otherwise bump the column), clear ctxt->token,
 * move the cursor by l bytes and count one more character in nbChars.
 * NOTE: `l` is not parenthesised in the expansion; pass a simple expression. */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character via htmlCurrentChar(); its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Current character of string s via xmlStringCurrentChar(); length in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/* Append character v to buffer b at index i: a plain byte store when l is 1,
 * otherwise xmlCopyChar() writes it and i advances by its return value.
 * NOTE: expands to a bare if/else and evaluates its arguments unparenthesised. */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
270
271 /**
272 * htmlCurrentChar:
273 * @param ctxt the HTML parser context
274 * @param len pointer to the length of the char read
275 *
276 * The current char value, if using UTF-8 this may actually span multiple
277 * bytes in the input buffer. Implement the end of line normalization:
278 * 2.11 End-of-Line Handling
279 * If the encoding is unspecified, in the case we find an ISO-Latin-1
280 * char, then the encoding converter is plugged in automatically.
281 *
282 * Returns the current char value and its length
283 */
284
285 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)286 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
287 if (ctxt->instate == XML_PARSER_EOF)
288 return(0);
289
290 if (ctxt->token != 0) {
291 *len = 0;
292 return(ctxt->token);
293 }
294 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
295 /*
296 * We are supposed to handle UTF8, check it's valid
297 * From rfc2044: encoding of the Unicode values on UTF-8:
298 *
299 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
300 * 0000 0000-0000 007F 0xxxxxxx
301 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
302 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
303 *
304 * Check for the 0x110000 limit too
305 */
306 const unsigned char *cur = ctxt->input->cur;
307 unsigned char c;
308 unsigned int val;
309
310 c = *cur;
311 if (c & 0x80) {
312 if (cur[1] == 0)
313 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
314 if ((cur[1] & 0xc0) != 0x80)
315 goto encoding_error;
316 if ((c & 0xe0) == 0xe0) {
317
318 if (cur[2] == 0)
319 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
320 if ((cur[2] & 0xc0) != 0x80)
321 goto encoding_error;
322 if ((c & 0xf0) == 0xf0) {
323 if (cur[3] == 0)
324 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
325 if (((c & 0xf8) != 0xf0) ||
326 ((cur[3] & 0xc0) != 0x80))
327 goto encoding_error;
328 /* 4-byte code */
329 *len = 4;
330 val = (cur[0] & 0x7) << 18;
331 val |= (cur[1] & 0x3f) << 12;
332 val |= (cur[2] & 0x3f) << 6;
333 val |= cur[3] & 0x3f;
334 } else {
335 /* 3-byte code */
336 *len = 3;
337 val = (cur[0] & 0xf) << 12;
338 val |= (cur[1] & 0x3f) << 6;
339 val |= cur[2] & 0x3f;
340 }
341 } else {
342 /* 2-byte code */
343 *len = 2;
344 val = (cur[0] & 0x1f) << 6;
345 val |= cur[1] & 0x3f;
346 }
347 if (!IS_CHAR(val)) {
348 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
349 "Char 0x%X out of allowed range\n", val);
350 }
351 return(val);
352 } else {
353 /* 1-byte code */
354 *len = 1;
355 return((int) *ctxt->input->cur);
356 }
357 }
358 /*
359 * Assume it's a fixed length encoding (1) with
360 * a compatible encoding for the ASCII set, since
361 * XML constructs only use < 128 chars
362 */
363 *len = 1;
364 if ((int) *ctxt->input->cur < 0x80)
365 return((int) *ctxt->input->cur);
366
367 /*
368 * Humm this is bad, do an automatic flow conversion
369 */
370 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
371 ctxt->charset = XML_CHAR_ENCODING_UTF8;
372 return(xmlCurrentChar(ctxt, len));
373
374 encoding_error:
375 /*
376 * If we detect an UTF8 error that probably mean that the
377 * input encoding didn't get properly advertized in the
378 * declaration header. Report the error and switch the encoding
379 * to ISO-Latin-1 (if you don't like this policy, just declare the
380 * encoding !)
381 */
382 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
383 "Input is not proper UTF-8, indicate encoding !\n",
384 NULL, NULL);
385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
386 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
387 ctxt->input->cur[0], ctxt->input->cur[1],
388 ctxt->input->cur[2], ctxt->input->cur[3]);
389 }
390
391 ctxt->charset = XML_CHAR_ENCODING_8859_1;
392 *len = 1;
393 return((int) *ctxt->input->cur);
394 }
395
396 /**
397 * htmlSkipBlankChars:
398 * @param ctxt the HTML parser context
399 *
400 * skip all blanks character found at that point in the input streams.
401 *
402 * Returns the number of space chars skipped
403 */
404
405 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)406 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
407 int res = 0;
408
409 while (IS_BLANK_CH(*(ctxt->input->cur))) {
410 if ((*ctxt->input->cur == 0) &&
411 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
412 xmlPopInput(ctxt);
413 } else {
414 if (*(ctxt->input->cur) == '\n') {
415 ctxt->input->line++; ctxt->input->col = 1;
416 } else ctxt->input->col++;
417 ctxt->input->cur++;
418 ctxt->nbChars++;
419 if (*ctxt->input->cur == 0)
420 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
421 }
422 res++;
423 }
424 return(res);
425 }
426
427
428 #endif /* defined(LIBXML_HTML_ENABLED */
429
430 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
431
432 /************************************************************************
433 * *
434 * The list of HTML elements and their properties *
435 * *
436 ************************************************************************/
437
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
450
451 /* Definitions and a couple of vars for HTML Elements */
452
/*
 * Element-name lists used to build the content tables. Each macro expands
 * to a comma-separated run of string literals. Composite lists MUST put a
 * comma between their parts: two adjacent string literals are concatenated
 * by the compiler (e.g. "small" "em" becomes the single name "smallem").
 * PCDATA and MODIFIER expand to nothing and therefore take no comma.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL
465
// TO DO libxslt added 2nd const in between
/* NULL-terminated element-name lists expanded from the FLOW and INLINE macros. */
static const char* const html_flow [] = { FLOW, NULL } ;
static const char* const html_inline [] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
/* The list holds only the NULL terminator; html_cdata is an alias for it. */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
473
474
/*
 * ... and for HTML Attributes.
 *
 * Each macro expands to a comma-separated run of attribute names; ATTRS is
 * the union of the core, i18n and event groups. The arrays that follow are
 * the NULL-terminated lists shared by several element descriptions.
 */

#define COREATTRS   "id", "class", "style", "title"
#define I18N        "lang", "dir"
#define EVENTS      "onclick", "ondblclick", \
                    "onmousedown", "onmouseup", "onmouseover", "onmouseout", \
                    "onkeypress", "onkeydown", "onkeyup"
#define ATTRS       COREATTRS, I18N, EVENTS
#define CELLHALIGN  "align", "char", "charoff"
#define CELLVALIGN  "valign"

static const char* const html_attrs[]      = { ATTRS, NULL };
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL };
static const char* const core_attrs[]      = { COREATTRS, NULL };
static const char* const i18n_attrs[]      = { I18N, NULL };
488
489 /* Other declarations that should go inline ... */
490 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
491 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
492 "tabindex", "onfocus", "onblur", NULL } ;
493 static const char* const target_attr[] = { "target", NULL } ;
494 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
495 static const char* const alt_attr[] = { "alt", NULL } ;
496 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
497 static const char* const href_attrs[] = { "href", NULL } ;
498 static const char* const clear_attrs[] = { "clear", NULL } ;
499 static const char* const inline_p[] = { INLINE, "p", NULL } ;
500 static const char* const flow_param[] = { FLOW, "param", NULL } ;
501 static const char* const applet_attrs[] = { COREATTRS , "codebase",
502 "archive", "alt", "name", "height", "width", "align",
503 "hspace", "vspace", NULL } ;
504 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
505 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
506 static const char* const basefont_attrs[] =
507 { "id", "size", "color", "face", NULL } ;
508 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
509 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
510 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
511 static const char* const body_depr[] = { "background", "bgcolor", "text",
512 "link", "vlink", "alink", NULL } ;
513 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
514 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
515
516
517 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
518 static const char* const col_elt[] = { "col", NULL } ;
519 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
520 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
521 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
522 static const char* const compact_attr[] = { "compact", NULL } ;
523 static const char* const label_attr[] = { "label", NULL } ;
524 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
525 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
526 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
527 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
528 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
529 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
530 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
531 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
532 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
533 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
534 static const char* const version_attr[] = { "version", NULL } ;
535 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
536 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
537 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
538 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
539 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
540 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
541 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
542 static const char* const align_attr[] = { "align", NULL } ;
543 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
544 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
545 static const char* const name_attr[] = { "name", NULL } ;
546 static const char* const action_attr[] = { "action", NULL } ;
547 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
548 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
549 static const char* const content_attr[] = { "content", NULL } ;
550 static const char* const type_attr[] = { "type", NULL } ;
551 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
552 static const char* const object_contents[] = { FLOW, "param", NULL } ;
553 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
554 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
555 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
556 static const char* const option_elt[] = { "option", NULL } ;
557 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
558 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
559 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
560 static const char* const width_attr[] = { "width", NULL } ;
561 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
562 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
563 static const char* const language_attr[] = { "language", NULL } ;
564 static const char* const select_content[] = { "optgroup", "option", NULL } ;
565 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
566 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
567 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
568 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
569 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
570 static const char* const tr_elt[] = { "tr", NULL } ;
571 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
572 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
573 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
574 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
575 static const char* const tr_contents[] = { "th", "td", NULL } ;
576 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
577 static const char* const li_elt[] = { "li", NULL } ;
578 static const char* const ul_depr[] = { "type", "compact", NULL} ;
579 static const char* const dir_attr[] = { "dir", NULL} ;
580
/*
 * Cast written in front of the name tables inside the html40ElementTable
 * initializers; it drops the inner `const` of `const char* const[]`.
 * NOTE(review): presumably needed because the htmlElemDesc fields are
 * declared as `const char**` -- confirm against the struct definition.
 */
#define DECL (const char**)
582
583 static const htmlElemDesc html40ElementTable [] = {
584 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
585 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
586 },
587 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
588 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
589 },
590 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
591 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
592 },
593 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
594 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
595 },
596 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
597 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
598 },
599 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
600 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
601 },
602 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
603 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
604 },
605 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
606 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
607 },
608 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
609 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
610 },
611 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
612 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
613 },
614 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
615 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
616 },
617 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
618 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
619 },
620 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
621 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
622 },
623 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
624 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
625 },
626 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
627 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
628 },
629 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
630 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
631 },
632 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
633 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
634 },
635 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
636 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
637 },
638 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
639 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
640 },
641 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
642 EMPTY , NULL , DECL col_attrs , NULL, NULL
643 },
644 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
645 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
646 },
647 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
648 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
649 },
650 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
651 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
652 },
653 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
654 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
655 },
656 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
657 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
658 },
659 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
660 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
661 },
662 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
663 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
664 },
665 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
666 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
667 },
668 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
669 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
670 },
671 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
672 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
673 },
674 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
675 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
676 },
677 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
678 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
679 },
680 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
681 EMPTY, NULL, NULL, DECL frame_attrs, NULL
682 },
683 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
684 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
685 },
686 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
687 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
688 },
689 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
690 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
691 },
692 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
693 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
694 },
695 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
696 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
697 },
698 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
699 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
700 },
701 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
702 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
703 },
704 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
705 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
706 },
707 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
708 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
709 },
710 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
711 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
712 },
713 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
714 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
715 },
716 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
717 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
718 },
719 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
720 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
721 },
722 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
723 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
724 },
725 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
726 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
727 },
728 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
729 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
730 },
731 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733 },
734 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
735 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
736 },
737 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
738 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
739 },
740 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
741 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
742 },
743 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
744 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
745 },
746 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
747 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
748 },
749 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
750 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
751 },
752 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
753 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
754 },
755 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
756 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
757 },
758 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
759 DECL html_flow, "div", DECL html_attrs, NULL, NULL
760 },
761 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
762 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
763 },
764 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
765 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
766 },
767 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
768 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
769 },
770 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
771 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
772 },
773 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
774 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
775 },
776 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
777 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
778 },
779 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
780 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
781 },
782 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
783 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
784 },
785 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
786 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
787 },
788 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
789 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
790 },
791 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
792 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
793 },
794 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
795 DECL select_content, NULL, DECL select_attrs, NULL, NULL
796 },
797 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
798 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
799 },
800 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
801 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
802 },
803 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
804 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
805 },
806 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
807 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
808 },
809 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
810 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
811 },
812 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
813 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
814 },
815 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
816 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
817 },
818 { "table", 0, 0, 0, 0, 0, 0, 0, "",
819 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
820 },
821 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
822 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
823 },
824 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
825 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
826 },
827 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
828 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
829 },
830 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
831 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
832 },
833 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
834 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
835 },
836 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
837 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
838 },
839 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
840 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
841 },
842 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
843 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
844 },
845 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
846 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
847 },
848 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
849 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
850 },
851 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
852 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
853 },
854 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
855 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856 }
857 };
858
859 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
860
861 #ifdef LIBXML_HTML_ENABLED
862
/*
 * Auto-close table: start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first
 * string of a group is the name of a newly opened tag, the remaining
 * strings are the open elements that this start tag closes implicitly.
 * One extra NULL after the last group marks the end of the table.
 */
static const char * const htmlStartClose [] = {
    /* new tag       elements it closes */
    "form",          "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                     "dl", "ul", "ol", "menu", "dir", "address", "pre",
                     "listing", "xmp", "head",
                     NULL,
    "head",          "p",
                     NULL,
    "title",         "p",
                     NULL,
    "body",          "head", "style", "link", "title", "p",
                     NULL,
    "frameset",      "head", "style", "link", "title", "p",
                     NULL,
    "li",            "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl",
                     "address", "pre", "listing", "xmp", "head", "li",
                     NULL,
    "hr",            "p", "head",
                     NULL,
    "h1",            "p", "head",
                     NULL,
    "h2",            "p", "head",
                     NULL,
    "h3",            "p", "head",
                     NULL,
    "h4",            "p", "head",
                     NULL,
    "h5",            "p", "head",
                     NULL,
    "h6",            "p", "head",
                     NULL,
    "dir",           "p", "head",
                     NULL,
    "address",       "p", "head", "ul",
                     NULL,
    "pre",           "p", "head", "ul",
                     NULL,
    "listing",       "p", "head",
                     NULL,
    "xmp",           "p", "head",
                     NULL,
    "blockquote",    "p", "head",
                     NULL,
    "dl",            "p", "dt", "menu", "dir", "address", "pre", "listing",
                     "xmp", "head",
                     NULL,
    "dt",            "p", "menu", "dir", "address", "pre", "listing", "xmp",
                     "head", "dd",
                     NULL,
    "dd",            "p", "menu", "dir", "address", "pre", "listing", "xmp",
                     "head", "dt",
                     NULL,
    "ul",            "p", "head", "ol", "menu", "dir", "address", "pre",
                     "listing", "xmp",
                     NULL,
    "ol",            "p", "head", "ul",
                     NULL,
    "menu",          "p", "head", "ul",
                     NULL,
    "p",             "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                     NULL,
    "div",           "p", "head",
                     NULL,
    "noscript",      "p", "head",
                     NULL,
    "center",        "font", "b", "i", "p", "head",
                     NULL,
    "a",             "a",
                     NULL,
    "caption",       "p",
                     NULL,
    "colgroup",      "caption", "colgroup", "col", "p",
                     NULL,
    "col",           "caption", "col", "p",
                     NULL,
    "table",         "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                     "listing", "xmp", "a",
                     NULL,
    "th",            "th", "td", "p", "span", "font", "a", "b", "i", "u",
                     NULL,
    "td",            "th", "td", "p", "span", "font", "a", "b", "i", "u",
                     NULL,
    "tr",            "th", "td", "tr", "caption", "col", "colgroup", "p",
                     NULL,
    "thead",         "caption", "col", "colgroup",
                     NULL,
    "tfoot",         "th", "td", "tr", "caption", "col", "colgroup", "thead",
                     "tbody", "p",
                     NULL,
    "tbody",         "th", "td", "tr", "caption", "col", "colgroup", "thead",
                     "tfoot", "tbody", "p",
                     NULL,
    "optgroup",      "option",
                     NULL,
    "option",        "option",
                     NULL,
    "fieldset",      "legend", "p", "head", "h1", "h2", "h3", "h4", "h5",
                     "h6", "pre", "listing", "xmp", "a",
                     NULL,

    /* end of table */
    NULL
};
923
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly. htmlCheckParagraph() consults it: when
 * one of these is the current element, a "p" element is implied before
 * the characters are inserted.
 */
static const char * const htmlNoContentElements[] = { "html", "head", "body", NULL };
937
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no NULL terminator: users derive the number of entries
 * with sizeof, as htmlIsScriptAttribute() does.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char * const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
963
/*
 * End-tag priorities, used by the parser to cope with broken HTML pages.
 *
 * Every element gets a priority; a misplaced end tag may only close
 * elements whose priority is lower than or equal to its own. Elements
 * which are not listed share the priority of the last entry, whose
 * name is NULL and which also terminates the table.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority [] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
991
992 /************************************************************************
993 * *
994 * functions to handle HTML specific data *
995 * *
996 ************************************************************************/
997
998 /**
999 * htmlInitAutoClose:
1000 *
1001 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1002 * This is not reentrant. Call xmlInitParser() once before processing in
1003 * case of use in multithreaded programs.
1004 */
1005 void
htmlInitAutoClose(void)1006 htmlInitAutoClose(void) {
1007 int indx, i = 0;
1008
1009 if (htmlStartCloseIndexinitialized) return;
1010
1011 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1012 indx = 0;
1013 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1014 // libxslt port: (const char**) cast was added
1015 htmlStartCloseIndex[indx++] = (const char**)&htmlStartClose[i];
1016 while(htmlStartClose[i++]) {};
1017 i++;
1018 }
1019 htmlStartCloseIndexinitialized = 1;
1020 }
1021
1022 /**
1023 * htmlGetEndPriority:
1024 * @param name The name of the element to look up the priority for.
1025 *
1026 * Return value: The "endtag" priority.
1027 **/
1028 static int
htmlGetEndPriority(const xmlChar * name)1029 htmlGetEndPriority (const xmlChar *name) {
1030 int i = 0;
1031
1032 while ((htmlEndPriority[i].name != NULL) &&
1033 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1034 i++;
1035
1036 return(htmlEndPriority[i].priority);
1037 }
1038
1039
1040 /**
1041 * htmlCheckAutoClose:
1042 * @param newtag The new tag name
1043 * @param oldtag The old tag name
1044 *
1045 * Checks whether the new tag is one of the registered valid tags for
1046 * closing old.
1047 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1048 *
1049 * Returns 0 if no, 1 if yes.
1050 */
1051 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1052 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1053 {
1054 int i, indx;
1055 const char **closed = NULL;
1056
1057 if (htmlStartCloseIndexinitialized == 0)
1058 htmlInitAutoClose();
1059
1060 /* inefficient, but not a big deal */
1061 for (indx = 0; indx < 100; indx++) {
1062 closed = htmlStartCloseIndex[indx];
1063 if (closed == NULL)
1064 return (0);
1065 if (xmlStrEqual(BAD_CAST * closed, newtag))
1066 break;
1067 }
1068
1069 i = closed - htmlStartClose;
1070 i++;
1071 while (htmlStartClose[i] != NULL) {
1072 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1073 return (1);
1074 }
1075 i++;
1076 }
1077 return (0);
1078 }
1079
1080 /**
1081 * htmlAutoCloseOnClose:
1082 * @param ctxt an HTML parser context
1083 * @param newtag The new tag name
1084 * @param force force the tag closure
1085 *
1086 * The HTML DTD allows an ending tag to implicitly close other tags.
1087 */
1088 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1089 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1090 {
1091 const htmlElemDesc *info;
1092 int i, priority;
1093
1094 priority = htmlGetEndPriority(newtag);
1095
1096 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1097
1098 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1099 break;
1100 /*
1101 * A missplaced endtag can only close elements with lower
1102 * or equal priority, so if we find an element with higher
1103 * priority before we find an element with
1104 * matching name, we just ignore this endtag
1105 */
1106 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1107 return;
1108 }
1109 if (i < 0)
1110 return;
1111
1112 while (!xmlStrEqual(newtag, ctxt->name)) {
1113 info = htmlTagLookup(ctxt->name);
1114 if ((info != NULL) && (info->endTag == 3)) {
1115 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1116 "Opening and ending tag mismatch: %s and %s\n",
1117 newtag, ctxt->name);
1118 }
1119 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1120 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1121 htmlnamePop(ctxt);
1122 }
1123 }
1124
1125 /**
1126 * htmlAutoCloseOnEnd:
1127 * @param ctxt an HTML parser context
1128 *
1129 * Close all remaining tags at the end of the stream
1130 */
1131 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1132 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1133 {
1134 int i;
1135
1136 if (ctxt->nameNr == 0)
1137 return;
1138 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1139 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1140 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1141 htmlnamePop(ctxt);
1142 }
1143 }
1144
1145 /**
1146 * htmlAutoClose:
1147 * @param ctxt an HTML parser context
1148 * @param newtag The new tag name or NULL
1149 *
1150 * The HTML DTD allows a tag to implicitly close other tags.
1151 * The list is kept in htmlStartClose array. This function is
1152 * called when a new tag has been detected and generates the
1153 * appropriates closes if possible/needed.
1154 * If newtag is NULL this mean we are at the end of the resource
1155 * and we should check
1156 */
1157 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1158 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1159 {
1160 while ((newtag != NULL) && (ctxt->name != NULL) &&
1161 (htmlCheckAutoClose(newtag, ctxt->name))) {
1162 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1163 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1164 htmlnamePop(ctxt);
1165 }
1166 if (newtag == NULL) {
1167 htmlAutoCloseOnEnd(ctxt);
1168 return;
1169 }
1170 while ((newtag == NULL) && (ctxt->name != NULL) &&
1171 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1172 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1173 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1176 htmlnamePop(ctxt);
1177 }
1178 }
1179
1180 /**
1181 * htmlAutoCloseTag:
1182 * @param doc the HTML document
1183 * @param name The tag name
1184 * @param elem the HTML element
1185 *
1186 * The HTML DTD allows a tag to implicitly close other tags.
1187 * The list is kept in htmlStartClose array. This function checks
1188 * if the element or one of it's children would autoclose the
1189 * given tag.
1190 *
1191 * Returns 1 if autoclose, 0 otherwise
1192 */
1193 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1194 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1195 htmlNodePtr child;
1196
1197 if (elem == NULL) return(1);
1198 if (xmlStrEqual(name, elem->name)) return(0);
1199 if (htmlCheckAutoClose(elem->name, name)) return(1);
1200 child = elem->children;
1201 while (child != NULL) {
1202 if (htmlAutoCloseTag(doc, name, child)) return(1);
1203 child = child->next;
1204 }
1205 return(0);
1206 }
1207
1208 /**
1209 * htmlIsAutoClosed:
1210 * @param doc the HTML document
1211 * @param elem the HTML element
1212 *
1213 * The HTML DTD allows a tag to implicitly close other tags.
1214 * The list is kept in htmlStartClose array. This function checks
1215 * if a tag is autoclosed by one of it's child
1216 *
1217 * Returns 1 if autoclosed, 0 otherwise
1218 */
1219 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1220 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1221 htmlNodePtr child;
1222
1223 if (elem == NULL) return(1);
1224 child = elem->children;
1225 while (child != NULL) {
1226 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1227 child = child->next;
1228 }
1229 return(0);
1230 }
1231
1232 /**
1233 * htmlCheckImplied:
1234 * @param ctxt an HTML parser context
1235 * @param newtag The new tag name
1236 *
1237 * The HTML DTD allows a tag to exists only implicitly
1238 * called when a new tag has been detected and generates the
1239 * appropriates implicit tags if missing
1240 */
1241 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1242 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1243 if (!htmlOmittedDefaultValue)
1244 return;
1245 if (xmlStrEqual(newtag, BAD_CAST"html"))
1246 return;
1247 if (ctxt->nameNr <= 0) {
1248 htmlnamePush(ctxt, BAD_CAST"html");
1249 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1250 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1251 }
1252 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1253 return;
1254 if ((ctxt->nameNr <= 1) &&
1255 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1256 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1257 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1258 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1259 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1260 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1261 /*
1262 * dropped OBJECT ... i you put it first BODY will be
1263 * assumed !
1264 */
1265 htmlnamePush(ctxt, BAD_CAST"head");
1266 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1267 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1268 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1269 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1270 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1271 int i;
1272 for (i = 0;i < ctxt->nameNr;i++) {
1273 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1274 return;
1275 }
1276 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1277 return;
1278 }
1279 }
1280
1281 htmlnamePush(ctxt, BAD_CAST"body");
1282 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1283 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1284 }
1285 }
1286
1287 /**
1288 * htmlCheckParagraph
1289 * @param ctxt an HTML parser context
1290 *
1291 * Check whether a p element need to be implied before inserting
1292 * characters in the current element.
1293 *
1294 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1295 * in case of error.
1296 */
1297
1298 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1299 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1300 const xmlChar *tag;
1301 int i;
1302
1303 if (ctxt == NULL)
1304 return(-1);
1305 tag = ctxt->name;
1306 if (tag == NULL) {
1307 htmlAutoClose(ctxt, BAD_CAST"p");
1308 htmlCheckImplied(ctxt, BAD_CAST"p");
1309 htmlnamePush(ctxt, BAD_CAST"p");
1310 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1311 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1312 return(1);
1313 }
1314 if (!htmlOmittedDefaultValue)
1315 return(0);
1316 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1317 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1318 htmlAutoClose(ctxt, BAD_CAST"p");
1319 htmlCheckImplied(ctxt, BAD_CAST"p");
1320 htmlnamePush(ctxt, BAD_CAST"p");
1321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1323 return(1);
1324 }
1325 }
1326 return(0);
1327 }
1328
1329 /**
1330 * htmlIsScriptAttribute:
1331 * @param name an attribute name
1332 *
1333 * Check if an attribute is of content type Script
1334 *
1335 * Returns 1 is the attribute is a script 0 otherwise
1336 */
1337 int
htmlIsScriptAttribute(const xmlChar * name)1338 htmlIsScriptAttribute(const xmlChar *name) {
1339 unsigned int i;
1340
1341 if (name == NULL)
1342 return(0);
1343 /*
1344 * all script attributes start with 'on'
1345 */
1346 if ((name[0] != 'o') || (name[1] != 'n'))
1347 return(0);
1348 for (i = 0;
1349 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1350 i++) {
1351 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1352 return(1);
1353 }
1354 return(0);
1355 }
1356
1357 /************************************************************************
1358 * *
1359 * The list of HTML predefined entities *
1360 * *
1361 ************************************************************************/
1362
1363
1364 static const htmlEntityDesc html40EntitiesTable[] = {
1365 /*
1366 * the 4 absolute ones, plus apostrophe.
1367 */
1368 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1369 { 38, "amp", "ampersand, U+0026 ISOnum" },
1370 { 39, "apos", "single quote" },
1371 { 60, "lt", "less-than sign, U+003C ISOnum" },
1372 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1373
1374 /*
1375 * A bunch still in the 128-255 range
1376 * Replacing them depend really on the charset used.
1377 */
1378 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1379 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1380 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1381 { 163, "pound","pound sign, U+00A3 ISOnum" },
1382 { 164, "curren","currency sign, U+00A4 ISOnum" },
1383 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1384 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1385 { 167, "sect", "section sign, U+00A7 ISOnum" },
1386 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1387 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1388 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1389 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1390 { 172, "not", "not sign, U+00AC ISOnum" },
1391 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1392 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1393 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1394 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1395 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1396 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1397 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1398 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1399 { 181, "micro","micro sign, U+00B5 ISOnum" },
1400 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1401 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1402 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1403 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1404 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1405 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1406 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1407 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1408 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1409 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1410 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1411 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1412 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1413 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1414 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1415 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1416 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1417 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1418 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1419 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1420 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1421 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1422 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1423 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1424 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1425 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1426 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1427 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1428 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1429 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1430 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1431 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1432 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1433 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1434 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1435 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1436 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1437 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1438 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1439 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1440 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1441 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1442 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1443 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1444 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1445 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1446 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1447 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1448 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1449 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1450 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1451 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1452 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1453 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1454 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1455 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1456 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1457 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1458 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1459 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1460 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1461 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1462 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1463 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1464 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1465 { 247, "divide","division sign, U+00F7 ISOnum" },
1466 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1467 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1468 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1469 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1470 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1471 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1472 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1473 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1474
1475 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1476 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1477 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1478 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1479 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1480
1481 /*
1482 * Anything below should really be kept as entities references
1483 */
1484 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1485
1486 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1487 { 732, "tilde","small tilde, U+02DC ISOdia" },
1488
1489 { 913, "Alpha","greek capital letter alpha, U+0391" },
1490 { 914, "Beta", "greek capital letter beta, U+0392" },
1491 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1492 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1493 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1494 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1495 { 919, "Eta", "greek capital letter eta, U+0397" },
1496 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1497 { 921, "Iota", "greek capital letter iota, U+0399" },
1498 { 922, "Kappa","greek capital letter kappa, U+039A" },
1499 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1500 { 924, "Mu", "greek capital letter mu, U+039C" },
1501 { 925, "Nu", "greek capital letter nu, U+039D" },
1502 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1503 { 927, "Omicron","greek capital letter omicron, U+039F" },
1504 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1505 { 929, "Rho", "greek capital letter rho, U+03A1" },
1506 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1507 { 932, "Tau", "greek capital letter tau, U+03A4" },
1508 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1509 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1510 { 935, "Chi", "greek capital letter chi, U+03A7" },
1511 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1512 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1513
1514 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1515 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1516 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1517 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1518 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1519 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1520 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1521 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1522 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1523 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1524 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1525 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1526 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1527 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1528 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1529 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1530 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1531 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1532 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1533 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1534 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1535 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1536 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1537 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1538 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1539 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1540 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1541 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1542
1543 { 8194, "ensp", "en space, U+2002 ISOpub" },
1544 { 8195, "emsp", "em space, U+2003 ISOpub" },
1545 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1546 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1547 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1548 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1549 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1550 { 8211, "ndash","en dash, U+2013 ISOpub" },
1551 { 8212, "mdash","em dash, U+2014 ISOpub" },
1552 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1553 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1554 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1555 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1556 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1557 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1558 { 8224, "dagger","dagger, U+2020 ISOpub" },
1559 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1560
1561 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1562 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1563
1564 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1565
1566 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1567 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1568
1569 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1570 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1571
1572 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1573 { 8260, "frasl","fraction slash, U+2044 NEW" },
1574
1575 { 8364, "euro", "euro sign, U+20AC NEW" },
1576
1577 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1578 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1579 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1580 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1581 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1582 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1583 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1584 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1585 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1586 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1587 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1588 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1589 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1590 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1591 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1592 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1593
1594 { 8704, "forall","for all, U+2200 ISOtech" },
1595 { 8706, "part", "partial differential, U+2202 ISOtech" },
1596 { 8707, "exist","there exists, U+2203 ISOtech" },
1597 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1598 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1599 { 8712, "isin", "element of, U+2208 ISOtech" },
1600 { 8713, "notin","not an element of, U+2209 ISOtech" },
1601 { 8715, "ni", "contains as member, U+220B ISOtech" },
1602 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1603 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1604 { 8722, "minus","minus sign, U+2212 ISOtech" },
1605 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1606 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1607 { 8733, "prop", "proportional to, U+221D ISOtech" },
1608 { 8734, "infin","infinity, U+221E ISOtech" },
1609 { 8736, "ang", "angle, U+2220 ISOamso" },
1610 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1611 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1612 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1613 { 8746, "cup", "union = cup, U+222A ISOtech" },
1614 { 8747, "int", "integral, U+222B ISOtech" },
1615 { 8756, "there4","therefore, U+2234 ISOtech" },
1616 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1617 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1618 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1619 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1620 { 8801, "equiv","identical to, U+2261 ISOtech" },
1621 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1622 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1623 { 8834, "sub", "subset of, U+2282 ISOtech" },
1624 { 8835, "sup", "superset of, U+2283 ISOtech" },
1625 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1626 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1627 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1628 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1629 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1630 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1631 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1632 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1633 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1634 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1635 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1636 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1637 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1638 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1639
1640 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1641 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1642 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1643 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1644
1645 };
1646
1647 /************************************************************************
1648 * *
1649 * Commodity functions to handle entities *
1650 * *
1651 ************************************************************************/
1652
/*
 * growBuffer:
 * Statement macro that doubles the capacity of a heap buffer.
 *
 * For an argument named `buffer` the expansion uses, from the enclosing
 * function's scope: the pointer `buffer`, its capacity `buffer_size`
 * (counted in xmlChar units) and a parser context called `ctxt`.
 *
 * The result of xmlRealloc() is kept in a temporary so that, when the
 * reallocation fails, the old block can still be released with xmlFree().
 * In that case the error is reported through htmlErrMemory() and the
 * ENCLOSING FUNCTION returns NULL, so the macro is only usable inside
 * functions that return a pointer.
 *
 * The callers visible in this file save their write index before the macro
 * and rebuild their cursor from it afterwards, because `buffer` may have
 * been replaced by a new block.
 *
 * NOTE(review): the expansion is a bare brace block, not do { } while (0);
 * keep that in mind when using it as the body of an if/else.
 */
#define growBuffer(buffer) {						\
    void* allocTmp;							\
    buffer##_size *= 2;							\
    allocTmp = xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));	\
    if (!allocTmp) {							\
        xmlFree(buffer);						\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        return(NULL);							\
    }									\
    buffer = (xmlChar*) allocTmp;					\
}
1667
1668 /**
1669 * htmlEntityLookup:
1670 * @param name the entity name
1671 *
1672 * Lookup the given entity in EntitiesTable
1673 *
1674
1675 *
1676 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1677 */
1678 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1679 htmlEntityLookup(const xmlChar *name) {
1680 unsigned int i;
1681
1682 for (i = 0;i < (sizeof(html40EntitiesTable)/
1683 sizeof(html40EntitiesTable[0]));i++) {
1684 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1685 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1686 }
1687 }
1688 return(NULL);
1689 }
1690
1691 /**
1692 * htmlEntityValueLookup:
1693 * @param value the entity's unicode value
1694 *
1695 * Lookup the given entity in EntitiesTable
1696 *
1697
1698 *
1699 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1700 */
1701 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1702 htmlEntityValueLookup(unsigned int value) {
1703 unsigned int i;
1704
1705 for (i = 0;i < (sizeof(html40EntitiesTable)/
1706 sizeof(html40EntitiesTable[0]));i++) {
1707 if (html40EntitiesTable[i].value >= value) {
1708 if (html40EntitiesTable[i].value > value)
1709 break;
1710 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1711 }
1712 }
1713 return(NULL);
1714 }
1715
1716 /**
1717 * UTF8ToHtml:
1718 * @param out a pointer to an array of bytes to store the result
1719 * @param outlen the length of out
1720 * @param in a pointer to an array of UTF-8 chars
1721 * @param inlen the length of in
1722 *
1723 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1724 * plus HTML entities block of chars out.
1725 *
1726 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1727 * The value of inlen after return is the number of octets consumed
1728 * as the return value is positive, else unpredictable.
1729 * The value of outlen after return is the number of octets consumed.
1730 */
1731 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1732 UTF8ToHtml(unsigned char* out, int *outlen,
1733 const unsigned char* in, int *inlen) {
1734 const unsigned char* processed = in;
1735 const unsigned char* outend;
1736 const unsigned char* outstart = out;
1737 const unsigned char* instart = in;
1738 const unsigned char* inend;
1739 unsigned int c, d;
1740 int trailing;
1741
1742 if (in == NULL) {
1743 /*
1744 * initialization nothing to do
1745 */
1746 *outlen = 0;
1747 *inlen = 0;
1748 return(0);
1749 }
1750 inend = in + (*inlen);
1751 outend = out + (*outlen);
1752 while (in < inend) {
1753 d = *in++;
1754 if (d < 0x80) { c= d; trailing= 0; }
1755 else if (d < 0xC0) {
1756 /* trailing byte in leading position */
1757 *outlen = out - outstart;
1758 *inlen = processed - instart;
1759 return(-2);
1760 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1761 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1762 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1763 else {
1764 /* no chance for this in Ascii */
1765 *outlen = out - outstart;
1766 *inlen = processed - instart;
1767 return(-2);
1768 }
1769
1770 if (inend - in < trailing) {
1771 break;
1772 }
1773
1774 for ( ; trailing; trailing--) {
1775 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1776 break;
1777 c <<= 6;
1778 c |= d & 0x3F;
1779 }
1780
1781 /* assertion: c is a single UTF-4 value */
1782 if (c < 0x80) {
1783 if (out + 1 >= outend)
1784 break;
1785 *out++ = c;
1786 } else {
1787 int len;
1788 const htmlEntityDesc * ent;
1789
1790 /*
1791 * Try to lookup a predefined HTML entity for it
1792 */
1793
1794 ent = htmlEntityValueLookup(c);
1795 if (ent == NULL) {
1796 /* no chance for this in Ascii */
1797 *outlen = out - outstart;
1798 *inlen = processed - instart;
1799 return(-2);
1800 }
1801 len = strlen(ent->name);
1802 if (out + 2 + len >= outend)
1803 break;
1804 *out++ = '&';
1805 memcpy(out, ent->name, len);
1806 out += len;
1807 *out++ = ';';
1808 }
1809 processed = in;
1810 }
1811 *outlen = out - outstart;
1812 *inlen = processed - instart;
1813 return(0);
1814 }
1815
1816 /**
1817 * htmlEncodeEntities:
1818 * @param out a pointer to an array of bytes to store the result
1819 * @param outlen the length of out
1820 * @param in a pointer to an array of UTF-8 chars
1821 * @param inlen the length of in
1822 * @param quoteChar the quote character to escape (' or ") or zero.
1823 *
1824 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1825 * plus HTML entities block of chars out.
1826 *
1827 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1828 * The value of inlen after return is the number of octets consumed
1829 * as the return value is positive, else unpredictable.
1830 * The value of outlen after return is the number of octets consumed.
1831 */
1832 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)1833 htmlEncodeEntities(unsigned char* out, int *outlen,
1834 const unsigned char* in, int *inlen, int quoteChar) {
1835 const unsigned char* processed = in;
1836 const unsigned char* outend = out + (*outlen);
1837 const unsigned char* outstart = out;
1838 const unsigned char* instart = in;
1839 const unsigned char* inend = in + (*inlen);
1840 unsigned int c, d;
1841 int trailing;
1842
1843 while (in < inend) {
1844 d = *in++;
1845 if (d < 0x80) { c= d; trailing= 0; }
1846 else if (d < 0xC0) {
1847 /* trailing byte in leading position */
1848 *outlen = out - outstart;
1849 *inlen = processed - instart;
1850 return(-2);
1851 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1852 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1853 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1854 else {
1855 /* no chance for this in Ascii */
1856 *outlen = out - outstart;
1857 *inlen = processed - instart;
1858 return(-2);
1859 }
1860
1861 if (inend - in < trailing)
1862 break;
1863
1864 while (trailing--) {
1865 if (((d= *in++) & 0xC0) != 0x80) {
1866 *outlen = out - outstart;
1867 *inlen = processed - instart;
1868 return(-2);
1869 }
1870 c <<= 6;
1871 c |= d & 0x3F;
1872 }
1873
1874 /* assertion: c is a single UTF-4 value */
1875 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1876 (c != '&') && (c != '<') && (c != '>')) {
1877 if (out >= outend)
1878 break;
1879 *out++ = c;
1880 } else {
1881 const htmlEntityDesc * ent;
1882 const char *cp;
1883 char nbuf[16];
1884 int len;
1885
1886 /*
1887 * Try to lookup a predefined HTML entity for it
1888 */
1889 ent = htmlEntityValueLookup(c);
1890 if (ent == NULL) {
1891 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1892 cp = nbuf;
1893 }
1894 else
1895 cp = ent->name;
1896 len = strlen(cp);
1897 if (out + 2 + len > outend)
1898 break;
1899 *out++ = '&';
1900 memcpy(out, cp, len);
1901 out += len;
1902 *out++ = ';';
1903 }
1904 processed = in;
1905 }
1906 *outlen = out - outstart;
1907 *inlen = processed - instart;
1908 return(0);
1909 }
1910
1911 /************************************************************************
1912 * *
1913 * Commodity functions to handle streams *
1914 * *
1915 ************************************************************************/
1916
1917 /**
1918 * htmlNewInputStream:
1919 * @param ctxt an HTML parser context
1920 *
1921 * Create a new input stream structure
1922 * Returns the new input stream or NULL
1923 */
1924 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)1925 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1926 htmlParserInputPtr input;
1927
1928 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1929 if (input == NULL) {
1930 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1931 return(NULL);
1932 }
1933 memset(input, 0, sizeof(htmlParserInput));
1934 input->filename = NULL;
1935 input->directory = NULL;
1936 input->base = NULL;
1937 input->cur = NULL;
1938 input->buf = NULL;
1939 input->line = 1;
1940 input->col = 1;
1941 input->buf = NULL;
1942 input->free = NULL;
1943 input->version = NULL;
1944 input->consumed = 0;
1945 input->length = 0;
1946 return(input);
1947 }
1948
1949
1950 /************************************************************************
1951 * *
1952 * Commodity functions, cleanup needed ? *
1953 * *
1954 ************************************************************************/
/*
 * Names of all tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char * const allowPCData[] = {
    /* a */
    "a", "abbr", "acronym", "address", "applet",
    /* b */
    "b", "bdo", "big", "blockquote", "body", "button",
    /* c */
    "caption", "center", "cite", "code",
    /* d */
    "dd", "del", "dfn", "div", "dt",
    /* e - f */
    "em", "font", "form",
    /* h */
    "h1", "h2", "h3", "h4", "h5", "h6",
    /* i */
    "i", "iframe", "ins",
    /* k - l */
    "kbd", "label", "legend", "li",
    /* n - o */
    "noframes", "noscript", "object",
    /* p - q */
    "p", "pre", "q",
    /* s */
    "s", "samp", "small", "span", "strike", "strong",
    /* t */
    "td", "th", "tt",
    /* u - v */
    "u", "var"
};
1969
1970 /**
1971 * areBlanks:
1972 * @param ctxt an HTML parser context
1973 * @param str a xmlChar *
1974 * @param len the size of str
1975 *
1976 * Is this a sequence of blank chars that one can ignore ?
1977 *
1978 * Returns 1 if ignorable 0 otherwise.
1979 */
1980
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)1981 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1982 unsigned int i;
1983 int j;
1984 xmlNodePtr lastChild;
1985
1986 for (j = 0;j < len;j++)
1987 if (!(IS_BLANK_CH(str[j]))) return(0);
1988
1989 if (CUR == 0) return(1);
1990 if (CUR != '<') return(0);
1991 if (ctxt->name == NULL)
1992 return(1);
1993 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1994 return(1);
1995 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1996 return(1);
1997 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1998 return(1);
1999 if (ctxt->node == NULL) return(0);
2000 lastChild = xmlGetLastChild(ctxt->node);
2001 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2002 lastChild = lastChild->prev;
2003 if (lastChild == NULL) {
2004 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2005 (ctxt->node->content != NULL)) return(0);
2006 /* keep ws in constructs like ...<b> </b>...
2007 for all tags "b" allowing PCDATA */
2008 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2009 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2010 return(0);
2011 }
2012 }
2013 } else if (xmlNodeIsText(lastChild)) {
2014 return(0);
2015 } else {
2016 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2017 for all tags "p" allowing PCDATA */
2018 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2019 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2020 return(0);
2021 }
2022 }
2023 }
2024 return(1);
2025 }
2026 #endif /* defined(LIBXML_HTML_ENABLED */
2027
2028 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
2029
2030 /**
2031 * htmlErrMemory:
2032 * @param ctxt an HTML parser context
2033 * @param extra extra informations
2034 *
2035 * Handle a redefinition of attribute error
2036 */
2037 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)2038 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
2039 {
2040 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
2041 (ctxt->instate == XML_PARSER_EOF))
2042 return;
2043 if (ctxt != NULL) {
2044 ctxt->errNo = XML_ERR_NO_MEMORY;
2045 ctxt->instate = XML_PARSER_EOF;
2046 ctxt->disableSAX = 1;
2047 }
2048 if (extra)
2049 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
2050 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
2051 NULL, NULL, 0, 0,
2052 "Memory allocation failed : %s\n", extra);
2053 else
2054 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
2055 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
2056 NULL, NULL, 0, 0, "Memory allocation failed\n");
2057 }
2058
2059 /**
2060 * htmlNewDocNoDtD:
2061 * @param URI URI for the dtd, or NULL
2062 * @param ExternalID the external ID of the DTD, or NULL
2063 *
2064 * Creates a new HTML document without a DTD node if URI and ExternalID
2065 * are NULL
2066 *
2067 * Returns a new document, do not initialize the DTD if not provided
2068 */
2069 XMLPUBFUNEXPORT htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2070 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2071 xmlDocPtr cur;
2072
2073 /*
2074 * Allocate a new document and fill the fields.
2075 */
2076 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2077 if (cur == NULL) {
2078 htmlErrMemory(NULL, "HTML document creation failed\n");
2079 return(NULL);
2080 }
2081 memset(cur, 0, sizeof(xmlDoc));
2082
2083 cur->type = XML_HTML_DOCUMENT_NODE;
2084 #ifdef XE_ENABLE_GS_CACHING
2085 cur->cachedGs = xmlGetGlobalState();
2086 #endif
2087
2088 //cur->version = NULL;
2089 //cur->intSubset = NULL;
2090 cur->doc = cur;
2091 //cur->name = NULL;
2092 //cur->children = NULL;
2093 //cur->extSubset = NULL;
2094 //cur->oldNs = NULL;
2095 //cur->encoding = NULL;
2096 cur->standalone = 1;
2097 //cur->compression = 0;
2098 //cur->ids = NULL;
2099 //cur->refs = NULL;
2100 //cur->_private = NULL;
2101
2102 if (ExternalID || URI)
2103 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2104
2105 return(cur);
2106 }
2107
2108 /**
2109 * htmlNewDoc:
2110 * @param URI URI for the dtd, or NULL
2111 * @param ExternalID the external ID of the DTD, or NULL
2112 *
2113 * Creates a new HTML document
2114 *
2115 * Returns a new document
2116 */
2117 XMLPUBFUNEXPORT htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2118 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2119 if ((URI == NULL) && (ExternalID == NULL))
2120 return(htmlNewDocNoDtD(
2121 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2122 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2123
2124 return(htmlNewDocNoDtD(URI, ExternalID));
2125 }
2126
2127 /**
2128 * htmlTagLookup:
2129 * @param tag The tag name in lowercase
2130 *
2131 * Lookup the HTML tag in the ElementTable
2132 *
2133 * Returns the related htmlElemDescPtr or NULL if not found.
2134 */
2135 XMLPUBFUNEXPORT const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)2136 htmlTagLookup(const xmlChar *tag) {
2137 unsigned int i;
2138
2139 for (i = 0; i < (sizeof(html40ElementTable) /
2140 sizeof(html40ElementTable[0]));i++) {
2141 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
2142 return((htmlElemDescPtr) &html40ElementTable[i]);
2143 }
2144 return(NULL);
2145 }
2146
2147 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
2148
2149 #if defined(LIBXML_HTML_ENABLED)
2150
2151 /************************************************************************
2152 * *
2153 * The parser itself *
2154 * Relates to http://www.w3.org/TR/html40 *
2155 * *
2156 ************************************************************************/
2157
2158 /************************************************************************
2159 * *
2160 * The parser itself *
2161 * *
2162 ************************************************************************/
2163
2164 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2165
2166 /**
2167 * htmlParseHTMLName:
2168 * @param ctxt an HTML parser context
2169 *
2170 * parse an HTML tag or attribute name, note that we convert it to lowercase
2171 * since HTML names are not case-sensitive.
2172 *
2173 * Returns the Tag Name parsed or NULL
2174 */
2175
2176 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2177 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2178 int i = 0;
2179 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2180
2181 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
2182 (CUR != ':')) return(NULL);
2183
2184 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2185 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
2186 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2187 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2188 else loc[i] = CUR;
2189 i++;
2190
2191 NEXT;
2192 }
2193
2194 return(xmlDictLookup(ctxt->dict, loc, i));
2195 }
2196
2197 /**
2198 * htmlParseName:
2199 * @param ctxt an HTML parser context
2200 *
2201 * parse an HTML name, this routine is case sensitive.
2202 *
2203 * Returns the Name parsed or NULL
2204 */
2205
2206 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2207 htmlParseName(htmlParserCtxtPtr ctxt) {
2208 const xmlChar *in;
2209 const xmlChar *ret;
2210 int count = 0;
2211
2212 GROW;
2213
2214 /*
2215 * Accelerator for simple ASCII names
2216 */
2217 in = ctxt->input->cur;
2218 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2219 ((*in >= 0x41) && (*in <= 0x5A)) ||
2220 (*in == '_') || (*in == ':')) {
2221 in++;
2222 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2223 ((*in >= 0x41) && (*in <= 0x5A)) ||
2224 ((*in >= 0x30) && (*in <= 0x39)) ||
2225 (*in == '_') || (*in == '-') ||
2226 (*in == ':') || (*in == '.'))
2227 in++;
2228 if ((*in > 0) && (*in < 0x80)) {
2229 count = in - ctxt->input->cur;
2230 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2231 ctxt->input->cur = in;
2232 ctxt->nbChars += count;
2233 ctxt->input->col += count;
2234 return(ret);
2235 }
2236 }
2237 return(htmlParseNameComplex(ctxt));
2238 }
2239
2240 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2241 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2242 int len = 0, l;
2243 int c;
2244 int count = 0;
2245
2246 /*
2247 * Handler for more complex cases
2248 */
2249 GROW;
2250 c = CUR_CHAR(l);
2251 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2252 (!IS_LETTER(c) && (c != '_') &&
2253 (c != ':'))) {
2254 return(NULL);
2255 }
2256
2257 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2258 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2259 (c == '.') || (c == '-') ||
2260 (c == '_') || (c == ':') ||
2261 (IS_COMBINING(c)) ||
2262 (IS_EXTENDER(c)))) {
2263 if (count++ > 100) {
2264 count = 0;
2265 GROW;
2266 }
2267 len += l;
2268 NEXTL(l);
2269 c = CUR_CHAR(l);
2270 }
2271 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2272 }
2273
2274
2275 /**
2276 * htmlParseHTMLAttribute:
2277 * @param ctxt an HTML parser context
2278 * @param stop a char stop value
2279 *
2280 * parse an HTML attribute value till the stop (quote), if
2281 * stop is 0 then it stops at the first space
2282 *
2283 * Returns the attribute parsed or NULL
2284 */
2285
2286 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2287 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2288 xmlChar *buffer = NULL;
2289 int buffer_size = 0;
2290 xmlChar *out = NULL;
2291 const xmlChar *name = NULL;
2292 const xmlChar *cur = NULL;
2293 const htmlEntityDesc * ent;
2294
2295 /*
2296 * allocate a translation buffer.
2297 */
2298 buffer_size = HTML_PARSER_BUFFER_SIZE;
2299 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2300 if (buffer == NULL) {
2301 htmlErrMemory(ctxt, "buffer allocation failed\n");
2302 return(NULL);
2303 }
2304 out = buffer;
2305
2306 /*
2307 * Ok loop until we reach one of the ending chars
2308 */
2309 while ((CUR != 0) && (CUR != stop)) {
2310 if ((stop == 0) && (CUR == '>')) break;
2311 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2312 if (CUR == '&') {
2313 if (NXT(1) == '#') {
2314 unsigned int c;
2315 int bits;
2316
2317 c = htmlParseCharRef(ctxt);
2318 if (c < 0x80)
2319 { *out++ = c; bits= -6; }
2320 else if (c < 0x800)
2321 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2322 else if (c < 0x10000)
2323 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2324 else
2325 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2326
2327 for ( ; bits >= 0; bits-= 6) {
2328 *out++ = ((c >> bits) & 0x3F) | 0x80;
2329 }
2330
2331 if (out - buffer > buffer_size - 100) {
2332 int indx = out - buffer;
2333
2334 growBuffer(buffer);
2335 out = &buffer[indx];
2336 }
2337 } else {
2338 ent = htmlParseEntityRef(ctxt, &name);
2339 if (name == NULL) {
2340 *out++ = '&';
2341 if (out - buffer > buffer_size - 100) {
2342 int indx = out - buffer;
2343
2344 growBuffer(buffer);
2345 out = &buffer[indx];
2346 }
2347 } else if (ent == NULL) {
2348 *out++ = '&';
2349 cur = name;
2350 while (*cur != 0) {
2351 if (out - buffer > buffer_size - 100) {
2352 int indx = out - buffer;
2353
2354 growBuffer(buffer);
2355 out = &buffer[indx];
2356 }
2357 *out++ = *cur++;
2358 }
2359 } else {
2360 unsigned int c;
2361 int bits;
2362
2363 if (out - buffer > buffer_size - 100) {
2364 int indx = out - buffer;
2365
2366 growBuffer(buffer);
2367 out = &buffer[indx];
2368 }
2369 c = (xmlChar)ent->value;
2370 if (c < 0x80)
2371 { *out++ = c; bits= -6; }
2372 else if (c < 0x800)
2373 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2374 else if (c < 0x10000)
2375 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2376 else
2377 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2378
2379 for ( ; bits >= 0; bits-= 6) {
2380 *out++ = ((c >> bits) & 0x3F) | 0x80;
2381 }
2382 }
2383 }
2384 } else {
2385 unsigned int c;
2386 int bits, l;
2387
2388 if (out - buffer > buffer_size - 100) {
2389 int indx = out - buffer;
2390
2391 growBuffer(buffer);
2392 out = &buffer[indx];
2393 }
2394 c = CUR_CHAR(l);
2395 if (c < 0x80)
2396 { *out++ = c; bits= -6; }
2397 else if (c < 0x800)
2398 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2399 else if (c < 0x10000)
2400 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2401 else
2402 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2403
2404 for ( ; bits >= 0; bits-= 6) {
2405 *out++ = ((c >> bits) & 0x3F) | 0x80;
2406 }
2407 NEXT;
2408 }
2409 }
2410 *out++ = 0;
2411 return(buffer);
2412 }
2413
2414 /**
2415 * htmlParseEntityRef:
2416 * @param ctxt an HTML parser context
2417 * @param str location to store the entity name
2418 *
2419 * parse an HTML ENTITY references
2420 *
2421 * [68] EntityRef ::= '&' Name ';'
2422 *
2423 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2424 * if non-NULL *str will have to be freed by the caller.
2425 */
2426 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2427 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2428 const xmlChar *name;
2429 const htmlEntityDesc * ent = NULL;
2430 *str = NULL;
2431
2432 if (CUR == '&') {
2433 NEXT;
2434 name = htmlParseName(ctxt);
2435 if (name == NULL) {
2436 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2437 "htmlParseEntityRef: no name\n", NULL, NULL);
2438 } else {
2439 GROW;
2440 if (CUR == ';') {
2441 *str = name;
2442
2443 /*
2444 * Lookup the entity in the table.
2445 */
2446 ent = htmlEntityLookup(name);
2447 if (ent != NULL) /* OK that's ugly !!! */
2448 NEXT;
2449 } else {
2450 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2451 "htmlParseEntityRef: expecting ';'\n",
2452 NULL, NULL);
2453 *str = name;
2454 }
2455 }
2456 }
2457 return(ent);
2458 }
2459
2460 /**
2461 * htmlParseAttValue:
2462 * @param ctxt an HTML parser context
2463 *
2464 * parse a value for an attribute
2465 * Note: the parser won't do substitution of entities here, this
2466 * will be handled later in xmlStringGetNodeList, unless it was
2467 * asked for ctxt->replaceEntities != 0
2468 *
2469 * Returns the AttValue parsed or NULL.
2470 */
2471
2472 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2473 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2474 xmlChar *ret = NULL;
2475
2476 if (CUR == '"') {
2477 NEXT;
2478 ret = htmlParseHTMLAttribute(ctxt, '"');
2479 if (CUR != '"') {
2480 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2481 "AttValue: \" expected\n", NULL, NULL);
2482 } else
2483 NEXT;
2484 } else if (CUR == '\'') {
2485 NEXT;
2486 ret = htmlParseHTMLAttribute(ctxt, '\'');
2487 if (CUR != '\'') {
2488 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2489 "AttValue: ' expected\n", NULL, NULL);
2490 } else
2491 NEXT;
2492 } else {
2493 /*
2494 * That's an HTMLism, the attribute value may not be quoted
2495 */
2496 ret = htmlParseHTMLAttribute(ctxt, 0);
2497 if (ret == NULL) {
2498 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2499 "AttValue: no value found\n", NULL, NULL);
2500 }
2501 }
2502 return(ret);
2503 }
2504
2505 /**
2506 * htmlParseSystemLiteral:
2507 * @param ctxt an HTML parser context
2508 *
2509 * parse an HTML Literal
2510 *
2511 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2512 *
2513 * Returns the SystemLiteral parsed or NULL
2514 */
2515
2516 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2517 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2518 const xmlChar *q;
2519 xmlChar *ret = NULL;
2520
2521 if (CUR == '"') {
2522 NEXT;
2523 q = CUR_PTR;
2524 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2525 NEXT;
2526 if (!IS_CHAR_CH(CUR)) {
2527 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2528 "Unfinished SystemLiteral\n", NULL, NULL);
2529 } else {
2530 ret = xmlStrndup(q, CUR_PTR - q);
2531 NEXT;
2532 }
2533 } else if (CUR == '\'') {
2534 NEXT;
2535 q = CUR_PTR;
2536 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2537 NEXT;
2538 if (!IS_CHAR_CH(CUR)) {
2539 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2540 "Unfinished SystemLiteral\n", NULL, NULL);
2541 } else {
2542 ret = xmlStrndup(q, CUR_PTR - q);
2543 NEXT;
2544 }
2545 } else {
2546 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2547 " or ' expected\n", NULL, NULL);
2548 }
2549
2550 return(ret);
2551 }
2552
2553 /**
2554 * htmlParsePubidLiteral:
2555 * @param ctxt an HTML parser context
2556 *
2557 * parse an HTML public literal
2558 *
2559 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2560 *
2561 * Returns the PubidLiteral parsed or NULL.
2562 */
2563
2564 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2565 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2566 const xmlChar *q;
2567 xmlChar *ret = NULL;
2568 /*
2569 * Name ::= (Letter | '_') (NameChar)*
2570 */
2571 if (CUR == '"') {
2572 NEXT;
2573 q = CUR_PTR;
2574 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2575 if (CUR != '"') {
2576 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2577 "Unfinished PubidLiteral\n", NULL, NULL);
2578 } else {
2579 ret = xmlStrndup(q, CUR_PTR - q);
2580 NEXT;
2581 }
2582 } else if (CUR == '\'') {
2583 NEXT;
2584 q = CUR_PTR;
2585 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2586 NEXT;
2587 if (CUR != '\'') {
2588 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2589 "Unfinished PubidLiteral\n", NULL, NULL);
2590 } else {
2591 ret = xmlStrndup(q, CUR_PTR - q);
2592 NEXT;
2593 }
2594 } else {
2595 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2596 "PubidLiteral \" or ' expected\n", NULL, NULL);
2597 }
2598
2599 return(ret);
2600 }
2601
2602 /**
2603 * htmlParseScript:
2604 * @param ctxt an HTML parser context
2605 *
2606 * parse the content of an HTML SCRIPT or STYLE element
2607 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2608 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2609 * http://www.w3.org/TR/html4/types.html#type-script
2610 * http://www.w3.org/TR/html4/types.html#h-6.15
2611 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2612 *
2613 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2614 * element and the value of intrinsic event attributes. User agents must
2615 * not evaluate script data as HTML markup but instead must pass it on as
2616 * data to a script engine.
2617 * NOTES:
2618 * - The content is passed like CDATA
2619 * - the attributes for style and scripting "onXXX" are also described
2620 * as CDATA but SGML allows entities references in attributes so their
2621 * processing is identical as other attributes
2622 */
2623 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2624 htmlParseScript(htmlParserCtxtPtr ctxt) {
2625 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2626 int nbchar = 0;
2627 xmlChar cur;
2628
2629 SHRINK;
2630 cur = CUR;
2631 while (IS_CHAR_CH(cur)) {
2632 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2633 (NXT(3) == '-')) {
2634 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2635 if (ctxt->sax->cdataBlock!= NULL) {
2636 /*
2637 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2638 */
2639 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2640 } else if (ctxt->sax->characters != NULL) {
2641 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2642 }
2643 }
2644 nbchar = 0;
2645 htmlParseComment(ctxt);
2646 cur = CUR;
2647 continue;
2648 } else if ((cur == '<') && (NXT(1) == '/')) {
2649 /*
2650 * One should break here, the specification is clear:
2651 * Authors should therefore escape "</" within the content.
2652 * Escape mechanisms are specific to each scripting or
2653 * style sheet language.
2654 */
2655 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2656 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2657 break; /* while */
2658 }
2659 buf[nbchar++] = cur;
2660 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2661 if (ctxt->sax->cdataBlock!= NULL) {
2662 /*
2663 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2664 */
2665 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2666 } else if (ctxt->sax->characters != NULL) {
2667 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2668 }
2669 nbchar = 0;
2670 }
2671 NEXT;
2672 cur = CUR;
2673 }
2674 if (!(IS_CHAR_CH(cur))) {
2675 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2676 "Invalid char in CDATA 0x%X\n", cur);
2677 NEXT;
2678 }
2679
2680 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2681 if (ctxt->sax->cdataBlock!= NULL) {
2682 /*
2683 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2684 */
2685 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2686 } else if (ctxt->sax->characters != NULL) {
2687 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2688 }
2689 }
2690 }
2691
2692
2693 /**
2694 * htmlParseCharData:
2695 * @param ctxt an HTML parser context
2696 *
2697 * parse a CharData section.
2698 * if we are within a CDATA section ']]>' marks an end of section.
2699 *
2700 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2701 */
2702
2703 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)2704 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2705 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2706 int nbchar = 0;
2707 int cur, l;
2708
2709 SHRINK;
2710 cur = CUR_CHAR(l);
2711 while (((cur != '<') || (ctxt->token == '<')) &&
2712 ((cur != '&') || (ctxt->token == '&')) &&
2713 (IS_CHAR(cur))) {
2714 COPY_BUF(l,buf,nbchar,cur);
2715 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2716 /*
2717 * Ok the segment is to be consumed as chars.
2718 */
2719 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2720 if (areBlanks(ctxt, buf, nbchar)) {
2721 if (ctxt->sax->ignorableWhitespace != NULL)
2722 ctxt->sax->ignorableWhitespace(ctxt->userData,
2723 buf, nbchar);
2724 } else {
2725 htmlCheckParagraph(ctxt);
2726 if (ctxt->sax->characters != NULL)
2727 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2728 }
2729 }
2730 nbchar = 0;
2731 }
2732 NEXTL(l);
2733 cur = CUR_CHAR(l);
2734 if (cur == 0) {
2735 SHRINK;
2736 GROW;
2737 cur = CUR_CHAR(l);
2738 }
2739 }
2740 if (nbchar != 0) {
2741 /*
2742 * Ok the segment is to be consumed as chars.
2743 */
2744 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2745 if (areBlanks(ctxt, buf, nbchar)) {
2746 if (ctxt->sax->ignorableWhitespace != NULL)
2747 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2748 } else {
2749 htmlCheckParagraph(ctxt);
2750 if (ctxt->sax->characters != NULL)
2751 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2752 }
2753 }
2754 } else {
2755 /*
2756 * Loop detection
2757 */
2758 if (cur == 0)
2759 ctxt->instate = XML_PARSER_EOF;
2760 }
2761 }
2762
2763 /**
2764 * htmlParseExternalID:
2765 * @param ctxt an HTML parser context
2766 * @param publicID a xmlChar** receiving PubidLiteral
2767 *
2768 * Parse an External ID or a Public ID
2769 *
2770 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2771 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2772 *
2773 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2774 *
2775 * Returns the function returns SystemLiteral and in the second
2776 * case publicID receives PubidLiteral, is strict is off
2777 * it is possible to return NULL and have publicID set.
2778 */
2779
2780 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)2781 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2782 xmlChar *URI = NULL;
2783
2784 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2785 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2786 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2787 SKIP(6);
2788 if (!IS_BLANK_CH(CUR)) {
2789 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2790 "Space required after 'SYSTEM'\n", NULL, NULL);
2791 }
2792 SKIP_BLANKS;
2793 URI = htmlParseSystemLiteral(ctxt);
2794 if (URI == NULL) {
2795 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2796 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2797 }
2798 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2799 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2800 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2801 SKIP(6);
2802 if (!IS_BLANK_CH(CUR)) {
2803 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2804 "Space required after 'PUBLIC'\n", NULL, NULL);
2805 }
2806 SKIP_BLANKS;
2807 *publicID = htmlParsePubidLiteral(ctxt);
2808 if (*publicID == NULL) {
2809 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2810 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2811 NULL, NULL);
2812 }
2813 SKIP_BLANKS;
2814 if ((CUR == '"') || (CUR == '\'')) {
2815 URI = htmlParseSystemLiteral(ctxt);
2816 }
2817 }
2818 return(URI);
2819 }
2820
2821 /**
2822 * htmlParseComment:
2823 * @param ctxt an HTML parser context
2824 *
2825 * Parse an XML (SGML) comment <!-- .... -->
2826 *
2827 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2828 */
2829 static void
htmlParseComment(htmlParserCtxtPtr ctxt)2830 htmlParseComment(htmlParserCtxtPtr ctxt)
2831 {
2832 xmlChar* buf = NULL;
2833 int len;
2834 int size = HTML_PARSER_BUFFER_SIZE;
2835 int q, ql;
2836 int r, rl;
2837 int cur, l;
2838 xmlParserInputState state;
2839
2840 /*
2841 * Check that there is a comment right here.
2842 */
2843 if ((RAW != '<') || (NXT(1) != '!') ||
2844 (NXT(2) != '-') || (NXT(3) != '-')) return;
2845
2846 state = ctxt->instate;
2847 ctxt->instate = XML_PARSER_COMMENT;
2848 SHRINK;
2849 SKIP(4);
2850 buf = (xmlChar*) xmlMallocAtomic(size * sizeof(xmlChar));
2851 if (!buf)
2852 goto OOM_exit;
2853 // Now we must free 'buf' before returning
2854 q = CUR_CHAR(ql);
2855 NEXTL(ql);
2856 r = CUR_CHAR(rl);
2857 NEXTL(rl);
2858 cur = CUR_CHAR(l);
2859 len = 0;
2860 while (IS_CHAR(cur) &&
2861 ((cur != '>') || (r != '-') || (q != '-')))
2862 {
2863 if (len + 5 >= size)
2864 { // DONE: Fix xmlRealloc
2865 void* tmp;
2866 size *= 2;
2867 tmp = xmlRealloc(buf, size * sizeof(xmlChar));
2868 if (!tmp)
2869 {
2870 OOM:
2871 xmlFree(buf);
2872 OOM_exit:
2873 htmlErrMemory(ctxt, "buffer allocation failed\n");
2874 ctxt->instate = state;
2875 return;
2876 }
2877 buf = (xmlChar*) tmp;
2878 }
2879 COPY_BUF(ql,buf,len,q);
2880 q = r;
2881 ql = rl;
2882 r = cur;
2883 rl = l;
2884 NEXTL(l);
2885 cur = CUR_CHAR(l);
2886 if (cur == 0) {
2887 SHRINK;
2888 GROW;
2889 cur = CUR_CHAR(l);
2890 }
2891 } // end of "while good character and not the end of comment (-->)"
2892
2893 buf[len] = 0;
2894 if (!IS_CHAR(cur)) {
2895 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2896 "Comment not terminated \n<!--%.50s\n", buf, NULL);
2897 xmlFree(buf);
2898 } else {
2899 NEXT;
2900 if (ctxt->sax &&
2901 ctxt->sax->comment &&
2902 !ctxt->disableSAX)
2903 {
2904 ctxt->sax->comment(ctxt->userData, buf);
2905 }
2906 }
2907 xmlFree(buf);
2908 ctxt->instate = state;
2909 }
2910
2911 /**
2912 * htmlParseCharRef:
2913 * @param ctxt an HTML parser context
2914 *
2915 * parse Reference declarations
2916 *
2917 * [66] CharRef ::= '&#' [0-9]+ ';' |
2918 * '&#x' [0-9a-fA-F]+ ';'
2919 *
2920 * Returns the value parsed (as an int)
2921 */
2922 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)2923 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2924 int val = 0;
2925
2926 if ((CUR == '&') && (NXT(1) == '#') &&
2927 ((NXT(2) == 'x') || NXT(2) == 'X')) {
2928 SKIP(3);
2929 while (CUR != ';') {
2930 if ((CUR >= '0') && (CUR <= '9'))
2931 val = val * 16 + (CUR - '0');
2932 else if ((CUR >= 'a') && (CUR <= 'f'))
2933 val = val * 16 + (CUR - 'a') + 10;
2934 else if ((CUR >= 'A') && (CUR <= 'F'))
2935 val = val * 16 + (CUR - 'A') + 10;
2936 else {
2937 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2938 "htmlParseCharRef: invalid hexadecimal value\n",
2939 NULL, NULL);
2940 return(0);
2941 }
2942 NEXT;
2943 }
2944 if (CUR == ';')
2945 NEXT;
2946 } else if ((CUR == '&') && (NXT(1) == '#')) {
2947 SKIP(2);
2948 while (CUR != ';') {
2949 if ((CUR >= '0') && (CUR <= '9'))
2950 val = val * 10 + (CUR - '0');
2951 else {
2952 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2953 "htmlParseCharRef: invalid decimal value\n",
2954 NULL, NULL);
2955 return(0);
2956 }
2957 NEXT;
2958 }
2959 if (CUR == ';')
2960 NEXT;
2961 } else {
2962 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2963 "htmlParseCharRef: invalid value\n", NULL, NULL);
2964 }
2965 /*
2966 * Check the value IS_CHAR ...
2967 */
2968 if (IS_CHAR(val)) {
2969 return(val);
2970 } else {
2971 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2972 "htmlParseCharRef: invalid xmlChar value %d\n",
2973 val);
2974 }
2975 return(0);
2976 }
2977
2978
2979 /**
2980 * htmlParseDocTypeDecl:
2981 * @param ctxt an HTML parser context
2982 *
2983 * parse a DOCTYPE declaration
2984 *
2985 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2986 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2987 */
2988
2989 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)2990 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2991 const xmlChar *name;
2992 xmlChar *ExternalID = NULL;
2993 xmlChar *URI = NULL;
2994
2995 /*
2996 * We know that '<!DOCTYPE' has been detected.
2997 */
2998 SKIP(9);
2999
3000 SKIP_BLANKS;
3001
3002 /*
3003 * Parse the DOCTYPE name.
3004 */
3005 name = htmlParseName(ctxt);
3006 if (name == NULL) {
3007 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3008 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3009 NULL, NULL);
3010 }
3011 /*
3012 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3013 */
3014
3015 SKIP_BLANKS;
3016
3017 /*
3018 * Check for SystemID and ExternalID
3019 */
3020 URI = htmlParseExternalID(ctxt, &ExternalID);
3021 SKIP_BLANKS;
3022
3023 /*
3024 * We should be at the end of the DOCTYPE declaration.
3025 */
3026 if (CUR != '>') {
3027 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3028 "DOCTYPE improperly terminated\n", NULL, NULL);
3029 /* We shouldn't try to resynchronize ... */
3030 }
3031 NEXT;
3032
3033 /*
3034 * Create or update the document accordingly to the DOCTYPE
3035 */
3036 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3037 (!ctxt->disableSAX))
3038 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3039
3040 /*
3041 * Cleanup, since we don't use all those identifiers
3042 */
3043 if (URI != NULL) xmlFree(URI);
3044 if (ExternalID != NULL) xmlFree(ExternalID);
3045 }
3046
3047 /**
3048 * htmlParseAttribute:
3049 * @param ctxt an HTML parser context
3050 * @param value a xmlChar ** used to store the value of the attribute
3051 *
3052 * parse an attribute
3053 *
3054 * [41] Attribute ::= Name Eq AttValue
3055 *
3056 * [25] Eq ::= S? '=' S?
3057 *
3058 * With namespace:
3059 *
3060 * [NS 11] Attribute ::= QName Eq AttValue
3061 *
3062 * Also the case QName == xmlns:??? is handled independently as a namespace
3063 * definition.
3064 *
3065 * Returns the attribute name, and the value in *value.
3066 */
3067
3068 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3069 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3070 const xmlChar *name;
3071 xmlChar *val = NULL;
3072
3073 *value = NULL;
3074 name = htmlParseHTMLName(ctxt);
3075 if (name == NULL) {
3076 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3077 "error parsing attribute name\n", NULL, NULL);
3078 return(NULL);
3079 }
3080
3081 /*
3082 * read the value
3083 */
3084 SKIP_BLANKS;
3085 if (CUR == '=') {
3086 NEXT;
3087 SKIP_BLANKS;
3088 val = htmlParseAttValue(ctxt);
3089 /******
3090 } else {
3091
3092 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3093 ctxt->sax->warning(ctxt->userData,
3094 "No value for attribute %s\n", name); */
3095 }
3096
3097 *value = val;
3098 return(name);
3099 }
3100
3101 /**
3102 * htmlCheckEncoding:
3103 * @param ctxt an HTML parser context
3104 * @param attvalue the attribute value
3105 *
3106 * Checks an http-equiv attribute from a Meta tag to detect
3107 * the encoding
3108 * If a new encoding is detected the parser is switched to decode
3109 * it and pass UTF8
3110 */
3111 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3112 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3113 const xmlChar *encoding;
3114
3115 if ((ctxt == NULL) || (attvalue == NULL))
3116 return;
3117
3118 /* do not change encoding */
3119 if (ctxt->input->encoding != NULL)
3120 return;
3121
3122 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3123 if (encoding != NULL) {
3124 encoding += 8;
3125 } else {
3126 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3127 if (encoding != NULL)
3128 encoding += 9;
3129 }
3130 if (encoding != NULL) {
3131 xmlCharEncoding enc;
3132 xmlCharEncodingHandlerPtr handler;
3133
3134 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3135
3136 if (ctxt->input->encoding != NULL)
3137 xmlFree((xmlChar *) ctxt->input->encoding);
3138 ctxt->input->encoding = xmlStrdup(encoding);
3139
3140 enc = xmlParseCharEncoding((const char *) encoding);
3141 /*
3142 * registered set of known encodings
3143 */
3144 if (enc != XML_CHAR_ENCODING_ERROR) {
3145 xmlSwitchEncoding(ctxt, enc);
3146 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3147 } else {
3148 /*
3149 * fallback for unknown encodings
3150 */
3151 handler = xmlFindCharEncodingHandler((const char *) encoding);
3152 if (handler != NULL) {
3153 xmlSwitchToEncoding(ctxt, handler);
3154 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3155 } else {
3156 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3157 }
3158 }
3159
3160 if ((ctxt->input->buf != NULL) &&
3161 (ctxt->input->buf->encoder != NULL) &&
3162 (ctxt->input->buf->raw != NULL) &&
3163 (ctxt->input->buf->buffer != NULL)) {
3164 int nbchars;
3165 int processed;
3166
3167 /*
3168 * convert as much as possible to the parser reading buffer.
3169 */
3170 processed = ctxt->input->cur - ctxt->input->base;
3171 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3172 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3173 ctxt->input->buf->buffer,
3174 ctxt->input->buf->raw);
3175 if (nbchars < 0) {
3176 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3177 "htmlCheckEncoding: encoder error\n",
3178 NULL, NULL);
3179 }
3180 ctxt->input->base =
3181 ctxt->input->cur = ctxt->input->buf->buffer->content;
3182 }
3183 }
3184 }
3185
3186 /**
3187 * htmlCheckMeta:
3188 * @param ctxt an HTML parser context
3189 * @param atts the attributes values
3190 *
3191 * Checks an attributes from a Meta tag
3192 */
3193 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3194 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3195 int i;
3196 const xmlChar *att, *value;
3197 int http = 0;
3198 const xmlChar *content = NULL;
3199
3200 if ((ctxt == NULL) || (atts == NULL))
3201 return;
3202
3203 i = 0;
3204 att = atts[i++];
3205 while (att != NULL) {
3206 value = atts[i++];
3207 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3208 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3209 http = 1;
3210 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3211 content = value;
3212 att = atts[i++];
3213 }
3214 if ((http) && (content != NULL))
3215 htmlCheckEncoding(ctxt, content);
3216
3217 }
3218
3219 /**
3220 * htmlParseStartTag:
3221 * @param ctxt an HTML parser context
3222 *
3223 * parse a start of tag either for rule element or
3224 * EmptyElement. In both case we don't parse the tag closing chars.
3225 *
3226 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3227 *
3228 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3229 *
3230 * With namespace:
3231 *
3232 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3233 *
3234 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3235 *
3236 */
3237
3238 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3239 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3240 const xmlChar *name;
3241 const xmlChar *attname;
3242 xmlChar *attvalue;
3243 const xmlChar **atts = ctxt->atts;
3244 int nbatts = 0;
3245 int maxatts = ctxt->maxatts;
3246 int meta = 0;
3247 int i;
3248
3249 if (CUR != '<') return;
3250 NEXT;
3251
3252 GROW;
3253 name = htmlParseHTMLName(ctxt);
3254 if (name == NULL) {
3255 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3256 "htmlParseStartTag: invalid element name\n",
3257 NULL, NULL);
3258 /* Dump the bogus tag like browsers do */
3259 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3260 NEXT;
3261 return;
3262 }
3263 if (xmlStrEqual(name, BAD_CAST"meta"))
3264 meta = 1;
3265
3266 /*
3267 * Check for auto-closure of HTML elements.
3268 */
3269 htmlAutoClose(ctxt, name);
3270
3271 /*
3272 * Check for implied HTML elements.
3273 */
3274 htmlCheckImplied(ctxt, name);
3275
3276 /*
3277 * Avoid html at any level > 0, head at any level != 1
3278 * or any attempt to recurse body
3279 */
3280 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3281 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3282 "htmlParseStartTag: misplaced <html> tag\n",
3283 name, NULL);
3284 return;
3285 }
3286 if ((ctxt->nameNr != 1) &&
3287 (xmlStrEqual(name, BAD_CAST"head"))) {
3288 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3289 "htmlParseStartTag: misplaced <head> tag\n",
3290 name, NULL);
3291 return;
3292 }
3293 if (xmlStrEqual(name, BAD_CAST"body")) {
3294 int indx;
3295 for (indx = 0;indx < ctxt->nameNr;indx++) {
3296 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3297 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3298 "htmlParseStartTag: misplaced <body> tag\n",
3299 name, NULL);
3300 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3301 NEXT;
3302 return;
3303 }
3304 }
3305 }
3306
3307 /*
3308 * Now parse the attributes, it ends up with the ending
3309 *
3310 * (S Attribute)* S?
3311 */
3312 SKIP_BLANKS;
3313 while ((IS_CHAR_CH(CUR)) &&
3314 (CUR != '>') &&
3315 ((CUR != '/') || (NXT(1) != '>'))) {
3316 long cons = ctxt->nbChars;
3317
3318 GROW;
3319 attname = htmlParseAttribute(ctxt, &attvalue);
3320 if (attname != NULL) {
3321
3322 /*
3323 * Well formedness requires at most one declaration of an attribute
3324 */
3325 for (i = 0; i < nbatts;i += 2) {
3326 if (xmlStrEqual(atts[i], attname)) {
3327 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3328 "Attribute %s redefined\n", attname, NULL);
3329 if (attvalue != NULL)
3330 xmlFree(attvalue);
3331 goto failed;
3332 }
3333 }
3334
3335 /*
3336 * Add the pair to atts
3337 */
3338 if (atts == NULL) {
3339 maxatts = 22; /* allow for 10 attrs by default */
3340 atts = (const xmlChar **)
3341 xmlMalloc(maxatts * sizeof(xmlChar *));
3342 if (atts == NULL) {
3343 htmlErrMemory(ctxt, NULL);
3344 if (attvalue != NULL)
3345 xmlFree(attvalue);
3346 goto failed;
3347 }
3348 ctxt->atts = atts;
3349 ctxt->maxatts = maxatts;
3350 } else if (nbatts + 4 > maxatts) {
3351 const xmlChar **n;
3352
3353 maxatts *= 2;
3354 n = (const xmlChar **) xmlRealloc((void *) atts,
3355 maxatts * sizeof(const xmlChar *));
3356 if (n == NULL) {
3357 htmlErrMemory(ctxt, NULL);
3358 if (attvalue != NULL)
3359 xmlFree(attvalue);
3360 goto failed;
3361 }
3362 atts = n;
3363 ctxt->atts = atts;
3364 ctxt->maxatts = maxatts;
3365 }
3366 atts[nbatts++] = attname;
3367 atts[nbatts++] = attvalue;
3368 atts[nbatts] = NULL;
3369 atts[nbatts + 1] = NULL;
3370 }
3371 else {
3372 if (attvalue != NULL)
3373 xmlFree(attvalue);
3374 /* Dump the bogus attribute string up to the next blank or
3375 * the end of the tag. */
3376 while ((IS_CHAR_CH(CUR)) &&
3377 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3378 ((CUR != '/') || (NXT(1) != '>')))
3379 NEXT;
3380 }
3381
3382 failed:
3383 SKIP_BLANKS;
3384 if (cons == ctxt->nbChars) {
3385 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3386 "htmlParseStartTag: problem parsing attributes\n",
3387 NULL, NULL);
3388 break;
3389 }
3390 }
3391
3392 /*
3393 * Handle specific association to the META tag
3394 */
3395 if (meta)
3396 htmlCheckMeta(ctxt, atts);
3397
3398 /*
3399 * SAX: Start of Element !
3400 */
3401 htmlnamePush(ctxt, name);
3402 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3403 if (nbatts != 0)
3404 ctxt->sax->startElement(ctxt->userData, name, atts);
3405 else
3406 ctxt->sax->startElement(ctxt->userData, name, NULL);
3407 }
3408
3409 if (atts != NULL) {
3410 for (i = 1;i < nbatts;i += 2) {
3411 if (atts[i] != NULL)
3412 xmlFree((xmlChar *) atts[i]);
3413 }
3414 }
3415 }
3416
3417 /**
3418 * htmlParseEndTag:
3419 * @param ctxt an HTML parser context
3420 *
3421 * parse an end of tag
3422 *
3423 * [42] ETag ::= '</' Name S? '>'
3424 *
3425 * With namespace
3426 *
3427 * [NS 9] ETag ::= '</' QName S? '>'
3428 *
3429 * Returns 1 if the current level should be closed.
3430 */
3431
3432 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3433 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3434 {
3435 const xmlChar *name;
3436 const xmlChar *oldname;
3437 int i, ret;
3438
3439 if ((CUR != '<') || (NXT(1) != '/')) {
3440 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3441 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3442 return (0);
3443 }
3444 SKIP(2);
3445
3446 name = htmlParseHTMLName(ctxt);
3447 if (name == NULL)
3448 return (0);
3449
3450 /*
3451 * We should definitely be at the ending "S? '>'" part
3452 */
3453 SKIP_BLANKS;
3454 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3455 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3456 "End tag : expected '>'\n", NULL, NULL);
3457 } else
3458 NEXT;
3459
3460 /*
3461 * If the name read is not one of the element in the parsing stack
3462 * then return, it's just an error.
3463 */
3464 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3465 if (xmlStrEqual(name, ctxt->nameTab[i]))
3466 break;
3467 }
3468 if (i < 0) {
3469 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3470 "Unexpected end tag : %s\n", name, NULL);
3471 return (0);
3472 }
3473
3474
3475 /*
3476 * Check for auto-closure of HTML elements.
3477 */
3478
3479 htmlAutoCloseOnClose(ctxt, name);
3480
3481 /*
3482 * Well formedness constraints, opening and closing must match.
3483 * With the exception that the autoclose may have popped stuff out
3484 * of the stack.
3485 */
3486 if (!xmlStrEqual(name, ctxt->name)) {
3487 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3488 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3489 "Opening and ending tag mismatch: %s and %s\n",
3490 name, ctxt->name);
3491 }
3492 }
3493
3494 /*
3495 * SAX: End of Tag
3496 */
3497 oldname = ctxt->name;
3498 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3499 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3500 ctxt->sax->endElement(ctxt->userData, name);
3501 htmlnamePop(ctxt);
3502 ret = 1;
3503 } else {
3504 ret = 0;
3505 }
3506
3507 return (ret);
3508 }
3509
3510
3511 /**
3512 * htmlParseReference:
3513 * @param ctxt an HTML parser context
3514 *
3515 * parse and handle entity references in content,
3516 * this will end-up in a call to character() since this is either a
3517 * CharRef, or a predefined entity.
3518 */
3519 static void
htmlParseReference(htmlParserCtxtPtr ctxt)3520 htmlParseReference(htmlParserCtxtPtr ctxt) {
3521 const htmlEntityDesc * ent;
3522 xmlChar out[6];
3523 const xmlChar *name;
3524 if (CUR != '&') return;
3525
3526 if (NXT(1) == '#') {
3527 unsigned int c;
3528 int bits, i = 0;
3529
3530 c = htmlParseCharRef(ctxt);
3531 if (c == 0)
3532 return;
3533
3534 if (c < 0x80) { out[i++]= c; bits= -6; }
3535 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3536 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3537 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3538
3539 for ( ; bits >= 0; bits-= 6) {
3540 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3541 }
3542 out[i] = 0;
3543
3544 htmlCheckParagraph(ctxt);
3545 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3546 ctxt->sax->characters(ctxt->userData, out, i);
3547 } else {
3548 ent = htmlParseEntityRef(ctxt, &name);
3549 if (name == NULL) {
3550 htmlCheckParagraph(ctxt);
3551 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3552 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3553 return;
3554 }
3555 if ((ent == NULL) || !(ent->value > 0)) {
3556 htmlCheckParagraph(ctxt);
3557 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3558 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3559 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3560 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3561 }
3562 } else {
3563 unsigned int c;
3564 int bits, i = 0;
3565
3566 c = ent->value;
3567 if (c < 0x80)
3568 { out[i++]= c; bits= -6; }
3569 else if (c < 0x800)
3570 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3571 else if (c < 0x10000)
3572 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3573 else
3574 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3575
3576 for ( ; bits >= 0; bits-= 6) {
3577 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3578 }
3579 out[i] = 0;
3580
3581 htmlCheckParagraph(ctxt);
3582 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3583 ctxt->sax->characters(ctxt->userData, out, i);
3584 }
3585 }
3586 }
3587
3588 /**
3589 * htmlParseContent:
3590 * @param ctxt an HTML parser context
3591 * @param name the node name
3592 *
3593 * Parse a content: comment, sub-element, reference or text.
3594 *
3595 */
3596
3597 static void
htmlParseContent(htmlParserCtxtPtr ctxt)3598 htmlParseContent(htmlParserCtxtPtr ctxt) {
3599 xmlChar *currentNode;
3600 int depth;
3601
3602 currentNode = xmlStrdup(ctxt->name);
3603 depth = ctxt->nameNr;
3604 while (1) {
3605 long cons = ctxt->nbChars;
3606
3607 GROW;
3608 /*
3609 * Our tag or one of it's parent or children is ending.
3610 */
3611 if ((CUR == '<') && (NXT(1) == '/')) {
3612 if (htmlParseEndTag(ctxt) &&
3613 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3614 if (currentNode != NULL)
3615 xmlFree(currentNode);
3616 return;
3617 }
3618 continue; /* while */
3619 }
3620
3621 /*
3622 * Has this node been popped out during parsing of
3623 * the next element
3624 */
3625 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3626 (!xmlStrEqual(currentNode, ctxt->name)))
3627 {
3628 if (currentNode != NULL) xmlFree(currentNode);
3629 return;
3630 }
3631
3632 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3633 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3634 /*
3635 * Handle SCRIPT/STYLE separately
3636 */
3637 htmlParseScript(ctxt);
3638 } else {
3639 /*
3640 * Sometimes DOCTYPE arrives in the middle of the document
3641 */
3642 if ((CUR == '<') && (NXT(1) == '!') &&
3643 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3644 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3645 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3646 (UPP(8) == 'E')) {
3647 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3648 "Misplaced DOCTYPE declaration\n",
3649 BAD_CAST "DOCTYPE" , NULL);
3650 htmlParseDocTypeDecl(ctxt);
3651 }
3652
3653 /*
3654 * First case : a comment
3655 */
3656 if ((CUR == '<') && (NXT(1) == '!') &&
3657 (NXT(2) == '-') && (NXT(3) == '-')) {
3658 htmlParseComment(ctxt);
3659 }
3660
3661 /*
3662 * Second case : a sub-element.
3663 */
3664 else if (CUR == '<') {
3665 htmlParseElement(ctxt);
3666 }
3667
3668 /*
3669 * Third case : a reference. If if has not been resolved,
3670 * parsing returns it's Name, create the node
3671 */
3672 else if (CUR == '&') {
3673 htmlParseReference(ctxt);
3674 }
3675
3676 /*
3677 * Fourth : end of the resource
3678 */
3679 else if (CUR == 0) {
3680 htmlAutoCloseOnEnd(ctxt);
3681 break;
3682 }
3683
3684 /*
3685 * Last case, text. Note that References are handled directly.
3686 */
3687 else {
3688 htmlParseCharData(ctxt);
3689 }
3690
3691 if (cons == ctxt->nbChars) {
3692 if (ctxt->node != NULL) {
3693 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3694 "detected an error in element content\n",
3695 NULL, NULL);
3696 }
3697 break;
3698 }
3699 }
3700 GROW;
3701 }
3702 if (currentNode != NULL) xmlFree(currentNode);
3703 }
3704
3705 /**
3706 * htmlParseElement:
3707 * @param ctxt an HTML parser context
3708 *
3709 * parse an HTML element, this is highly recursive
3710 *
3711 * [39] element ::= EmptyElemTag | STag content ETag
3712 *
3713 * [41] Attribute ::= Name Eq AttValue
3714 */
3715
3716 void
htmlParseElement(htmlParserCtxtPtr ctxt)3717 htmlParseElement(htmlParserCtxtPtr ctxt) {
3718 const xmlChar *name;
3719 xmlChar *currentNode = NULL;
3720 const htmlElemDesc * info;
3721 htmlParserNodeInfo node_info;
3722 const xmlChar *oldname;
3723 int depth = ctxt->nameNr;
3724 const xmlChar *oldptr;
3725
3726 /* Capture start position */
3727 if (ctxt->record_info) {
3728 node_info.begin_pos = ctxt->input->consumed +
3729 (CUR_PTR - ctxt->input->base);
3730 node_info.begin_line = ctxt->input->line;
3731 }
3732
3733 oldname = ctxt->name;
3734 htmlParseStartTag(ctxt);
3735 name = ctxt->name;
3736 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3737 (name == NULL)) {
3738 if (CUR == '>')
3739 NEXT;
3740 return;
3741 }
3742
3743 /*
3744 * Lookup the info for that element.
3745 */
3746 info = htmlTagLookup(name);
3747 if (info == NULL) {
3748 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3749 "Tag %s invalid\n", name, NULL);
3750 }
3751
3752 /*
3753 * Check for an Empty Element labeled the XML/SGML way
3754 */
3755 if ((CUR == '/') && (NXT(1) == '>')) {
3756 SKIP(2);
3757 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3758 ctxt->sax->endElement(ctxt->userData, name);
3759 htmlnamePop(ctxt);
3760 return;
3761 }
3762
3763 if (CUR == '>') {
3764 NEXT;
3765 } else {
3766 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3767 "Couldn't find end of Start Tag %s\n", name, NULL);
3768
3769 /*
3770 * end of parsing of this node.
3771 */
3772 if (xmlStrEqual(name, ctxt->name)) {
3773 nodePop(ctxt);
3774 htmlnamePop(ctxt);
3775 }
3776
3777 /*
3778 * Capture end position and add node
3779 */
3780 if ( currentNode != NULL && ctxt->record_info ) {
3781 node_info.end_pos = ctxt->input->consumed +
3782 (CUR_PTR - ctxt->input->base);
3783 node_info.end_line = ctxt->input->line;
3784 node_info.node = ctxt->node;
3785 xmlParserAddNodeInfo(ctxt, &node_info);
3786 }
3787 return;
3788 }
3789
3790 /*
3791 * Check for an Empty Element from DTD definition
3792 */
3793 if ((info != NULL) && (info->empty)) {
3794 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3795 ctxt->sax->endElement(ctxt->userData, name);
3796 htmlnamePop(ctxt);
3797 return;
3798 }
3799
3800 /*
3801 * Parse the content of the element:
3802 */
3803 currentNode = xmlStrdup(ctxt->name);
3804 depth = ctxt->nameNr;
3805 while (IS_CHAR_CH(CUR)) {
3806 oldptr = ctxt->input->cur;
3807 htmlParseContent(ctxt);
3808 if (oldptr==ctxt->input->cur) break;
3809 if (ctxt->nameNr < depth) break;
3810 }
3811
3812 /*
3813 * Capture end position and add node
3814 */
3815 if ( currentNode != NULL && ctxt->record_info ) {
3816 node_info.end_pos = ctxt->input->consumed +
3817 (CUR_PTR - ctxt->input->base);
3818 node_info.end_line = ctxt->input->line;
3819 node_info.node = ctxt->node;
3820 xmlParserAddNodeInfo(ctxt, &node_info);
3821 }
3822 if (!IS_CHAR_CH(CUR)) {
3823 htmlAutoCloseOnEnd(ctxt);
3824 }
3825
3826 if (currentNode != NULL)
3827 xmlFree(currentNode);
3828 }
3829
3830 /**
3831 * htmlParseDocument:
3832 * @param ctxt an HTML parser context
3833 *
3834 * parse an HTML document (and build a tree if using the standard SAX
3835 * interface).
3836 *
3837 * Returns 0, -1 in case of error. the parser context is augmented
3838 * as a result of the parsing.
3839 */
3840
3841 int
htmlParseDocument(htmlParserCtxtPtr ctxt)3842 htmlParseDocument(htmlParserCtxtPtr ctxt) {
3843 xmlDtdPtr dtd;
3844
3845 xmlInitParser();
3846
3847 htmlDefaultSAXHandlerInit();
3848 ctxt->html = 1;
3849
3850 GROW;
3851 /*
3852 * SAX: beginning of the document processing.
3853 */
3854 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3855 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3856
3857 /*
3858 * Wipe out everything which is before the first '<'
3859 */
3860 SKIP_BLANKS;
3861 if (CUR == 0) {
3862 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3863 "Document is empty\n", NULL, NULL);
3864 }
3865
3866 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3867 ctxt->sax->startDocument(ctxt->userData);
3868
3869
3870 /*
3871 * Parse possible comments before any content
3872 */
3873 while ((CUR == '<') && (NXT(1) == '!') &&
3874 (NXT(2) == '-') && (NXT(3) == '-')) {
3875 htmlParseComment(ctxt);
3876 SKIP_BLANKS;
3877 }
3878
3879
3880 /*
3881 * Then possibly doc type declaration(s) and more Misc
3882 * (doctypedecl Misc*)?
3883 */
3884 if ((CUR == '<') && (NXT(1) == '!') &&
3885 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3886 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3887 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3888 (UPP(8) == 'E')) {
3889 htmlParseDocTypeDecl(ctxt);
3890 }
3891 SKIP_BLANKS;
3892
3893 /*
3894 * Parse possible comments before any content
3895 */
3896 while ((CUR == '<') && (NXT(1) == '!') &&
3897 (NXT(2) == '-') && (NXT(3) == '-')) {
3898 htmlParseComment(ctxt);
3899 SKIP_BLANKS;
3900 }
3901
3902 /*
3903 * Time to start parsing the tree itself
3904 */
3905 htmlParseContent(ctxt);
3906
3907 /*
3908 * autoclose
3909 */
3910 if (CUR == 0)
3911 htmlAutoCloseOnEnd(ctxt);
3912
3913
3914 /*
3915 * SAX: end of the document processing.
3916 */
3917 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3918 ctxt->sax->endDocument(ctxt->userData);
3919
3920 if (ctxt->myDoc != NULL) {
3921 dtd = xmlGetIntSubset(ctxt->myDoc);
3922 if (dtd == NULL)
3923 ctxt->myDoc->intSubset =
3924 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
3925 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3926 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3927 }
3928 if (! ctxt->wellFormed) return(-1);
3929 return(0);
3930 }
3931
3932
3933 /************************************************************************
3934 * *
3935 * Parser contexts handling *
3936 * *
3937 ************************************************************************/
3938
3939 /**
3940 * htmlInitParserCtxt:
3941 * @param ctxt an HTML parser context
3942 *
3943 * Initialize a parser context
3944 *
3945 * Returns 0 in case of success and -1 in case of error
3946 */
3947
3948 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)3949 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3950 {
3951 htmlSAXHandler *sax;
3952
3953 if (ctxt == NULL) return(-1);
3954 memset(ctxt, 0, sizeof(htmlParserCtxt));
3955 // NOTE: All assignments ctxt->XX = 0; were commented as unnecessary
3956 ctxt->dict = xmlDictCreate();
3957 if (ctxt->dict == NULL) {
3958 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3959 return(-1);
3960 }
3961 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3962 if (sax == NULL) {
3963 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3964 return(-1);
3965 }
3966 else
3967 memset(sax, 0, sizeof(htmlSAXHandler));
3968
3969 /* Allocate the Input stack */
3970 ctxt->inputTab = (htmlParserInputPtr *)
3971 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3972 if (ctxt->inputTab == NULL) {
3973 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3974 //ctxt->inputNr = 0;
3975 //ctxt->inputMax = 0;
3976 //ctxt->input = NULL;
3977 return(-1);
3978 }
3979 //ctxt->inputNr = 0;
3980 ctxt->inputMax = 5;
3981 //ctxt->input = NULL;
3982 //ctxt->version = NULL;
3983 //ctxt->encoding = NULL;
3984 ctxt->standalone = -1;
3985 ctxt->instate = XML_PARSER_START;
3986
3987 /* Allocate the Node stack */
3988 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3989 if (ctxt->nodeTab == NULL) {
3990 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3991 //ctxt->nodeNr = 0;
3992 //ctxt->nodeMax = 0;
3993 //ctxt->node = NULL;
3994 //ctxt->inputNr = 0;
3995 //ctxt->inputMax = 0;
3996 //ctxt->input = NULL;
3997 return(-1);
3998 }
3999 //ctxt->nodeNr = 0;
4000 ctxt->nodeMax = 10;
4001 //ctxt->node = NULL;
4002
4003 /* Allocate the Name stack */
4004 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4005 if (ctxt->nameTab == NULL) {
4006 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4007 //ctxt->nameNr = 0;
4008 ctxt->nameMax = 10;
4009 //ctxt->name = NULL;
4010 //ctxt->nodeNr = 0;
4011 //ctxt->nodeMax = 0;
4012 //ctxt->node = NULL;
4013 //ctxt->inputNr = 0;
4014 //ctxt->inputMax = 0;
4015 ctxt->input = NULL;
4016 return(-1);
4017 }
4018 //ctxt->nameNr = 0;
4019 ctxt->nameMax = 10;
4020 //ctxt->name = NULL;
4021
4022 if (sax == NULL)
4023 ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4024 else {
4025 ctxt->sax = sax;
4026 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4027 }
4028 ctxt->userData = ctxt;
4029 //ctxt->myDoc = NULL;
4030 ctxt->wellFormed = 1;
4031 //ctxt->replaceEntities = 0;
4032 #ifdef LIBXML_ENABLE_NODE_LINEINFO
4033 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4034 #endif
4035 ctxt->html = 1;
4036 ctxt->vctxt.userData = ctxt;
4037 ctxt->vctxt.error = xmlParserValidityError;
4038 ctxt->vctxt.warning = xmlParserValidityWarning;
4039 //ctxt->record_info = 0;
4040 //ctxt->validate = 0;
4041 //ctxt->nbChars = 0;
4042 //ctxt->checkIndex = 0;
4043 //ctxt->catalogs = NULL;
4044 xmlInitNodeInfoSeq(&ctxt->node_seq);
4045 return(0);
4046 }
4047
4048 /**
4049 * htmlFreeParserCtxt:
4050 * @param ctxt an HTML parser context
4051 *
4052 * Free all the memory used by a parser context. However the parsed
4053 * document in ctxt->myDoc is not freed.
4054 */
4055
4056 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4057 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4058 {
4059 xmlFreeParserCtxt(ctxt);
4060 }
4061
4062 /**
4063 * htmlNewParserCtxt:
4064 *
4065 * Allocate and initialize a new parser context.
4066 *
4067 * Returns the xmlParserCtxtPtr or NULL
4068 */
4069
4070 static htmlParserCtxtPtr
htmlNewParserCtxt(void)4071 htmlNewParserCtxt(void)
4072 {
4073 xmlParserCtxtPtr ctxt;
4074
4075 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4076 if (ctxt == NULL) {
4077 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4078 return(NULL);
4079 }
4080 memset(ctxt, 0, sizeof(xmlParserCtxt));
4081 #ifdef XE_ENABLE_GS_CACHING
4082 ctxt->cachedGs = xmlGetGlobalState();
4083 #endif
4084
4085 if (htmlInitParserCtxt(ctxt) < 0) {
4086 htmlFreeParserCtxt(ctxt);
4087 return(NULL);
4088 }
4089 return(ctxt);
4090 }
4091
4092 /**
4093 * htmlCreateMemoryParserCtxt:
4094 * @param buffer a pointer to a char array
4095 * @param size the size of the array
4096 *
4097 * Create a parser context for an HTML in-memory document.
4098 *
4099 * Returns the new parser context or NULL
4100 */
4101 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4102 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4103
4104 xmlParserCtxtPtr ctxt;
4105 xmlParserInputPtr input;
4106 xmlParserInputBufferPtr buf;
4107
4108 if (buffer == NULL)
4109 return(NULL);
4110 if (size <= 0)
4111 return(NULL);
4112
4113 ctxt = htmlNewParserCtxt();
4114 if (ctxt == NULL)
4115 return(NULL);
4116
4117 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4118 if (buf == NULL) return(NULL);
4119
4120 input = xmlNewInputStream(ctxt);
4121 if (input == NULL) {
4122 xmlFreeParserCtxt(ctxt);
4123 return(NULL);
4124 }
4125
4126 input->filename = NULL;
4127 input->buf = buf;
4128 input->base = input->buf->buffer->content;
4129 input->cur = input->buf->buffer->content;
4130 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4131
4132 inputPush(ctxt, input);
4133 return(ctxt);
4134 }
4135
4136 /**
4137 * htmlCreateDocParserCtxt:
4138 * @param cur a pointer to an array of xmlChar
4139 * @param encoding a free form C string describing the HTML document encoding, or NULL
4140 *
4141 * Create a parser context for an HTML document.
4142 *
4143
4144 *
4145 * Returns the new parser context or NULL
4146 */
4147 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(xmlChar * cur,const char * encoding ATTRIBUTE_UNUSED)4148 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
4149 int len;
4150 htmlParserCtxtPtr ctxt;
4151
4152 if (cur == NULL)
4153 return(NULL);
4154 len = xmlStrlen(cur);
4155 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4156
4157 if (encoding != NULL) {
4158 xmlCharEncoding enc;
4159 xmlCharEncodingHandlerPtr handler;
4160
4161 if (ctxt->input->encoding != NULL)
4162 xmlFree((xmlChar *) ctxt->input->encoding);
4163 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4164
4165 enc = xmlParseCharEncoding(encoding);
4166 /*
4167 * registered set of known encodings
4168 */
4169 if (enc != XML_CHAR_ENCODING_ERROR) {
4170 xmlSwitchEncoding(ctxt, enc);
4171 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4172 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4173 "Unsupported encoding %s\n",
4174 (const xmlChar *) encoding, NULL);
4175 }
4176 } else {
4177 /*
4178 * fallback for unknown encodings
4179 */
4180 handler = xmlFindCharEncodingHandler((const char *) encoding);
4181 if (handler != NULL) {
4182 xmlSwitchToEncoding(ctxt, handler);
4183 } else {
4184 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4185 "Unsupported encoding %s\n",
4186 (const xmlChar *) encoding, NULL);
4187 }
4188 }
4189 }
4190 return(ctxt);
4191 }
4192
4193 #ifdef LIBXML_PUSH_ENABLED
4194 /************************************************************************
4195 * *
4196 * Progressive parsing interfaces *
4197 * *
4198 ************************************************************************/
4199
4200 /**
4201 * htmlParseLookupSequence:
4202 * @param ctxt an HTML parser context
4203 * @param first the first char to lookup
4204 * @param next the next char to lookup or zero
4205 * @param third the next char to lookup or zero
4206 * @param comment flag to force checking inside comments
4207 *
4208 * Try to find if a sequence (first, next, third) or just (first next) or
4209 * (first) is available in the input stream.
4210 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4211 * to avoid rescanning sequences of bytes, it DOES change the state of the
4212 * parser, do not use liberally.
4213 * This is basically similar to xmlParseLookupSequence()
4214 *
4215 * Returns the index to the current parsing point if the full sequence
4216 * is available, -1 otherwise.
4217 */
4218 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int iscomment)4219 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4220 xmlChar next, xmlChar third, int iscomment) {
4221 int base, len;
4222 htmlParserInputPtr in;
4223 const xmlChar *buf;
4224 int incomment = 0;
4225
4226 in = ctxt->input;
4227 if (in == NULL) return(-1);
4228 base = in->cur - in->base;
4229 if (base < 0) return(-1);
4230 if (ctxt->checkIndex > base)
4231 base = ctxt->checkIndex;
4232 if (in->buf == NULL) {
4233 buf = in->base;
4234 len = in->length;
4235 } else {
4236 buf = in->buf->buffer->content;
4237 len = in->buf->buffer->use;
4238 }
4239 /* take into account the sequence length */
4240 if (third) len -= 2;
4241 else if (next) len --;
4242 for (;base < len;base++) {
4243 if (!incomment && (base + 4 < len) && !iscomment) {
4244 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4245 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4246 incomment = 1;
4247 /* do not increment past <! - some people use <!--> */
4248 base += 2;
4249 }
4250 }
4251 if (incomment) {
4252 if (base + 3 > len)
4253 return(-1);
4254 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4255 (buf[base + 2] == '>')) {
4256 incomment = 0;
4257 base += 2;
4258 }
4259 continue;
4260 }
4261 if (buf[base] == first) {
4262 if (third != 0) {
4263 if ((buf[base + 1] != next) ||
4264 (buf[base + 2] != third)) continue;
4265 } else if (next != 0) {
4266 if (buf[base + 1] != next) continue;
4267 }
4268 ctxt->checkIndex = 0;
4269 #ifdef DEBUG_PUSH
4270 if (next == 0)
4271 xmlGenericError(xmlGenericErrorContext,
4272 "HPP: lookup '%c' found at %d\n",
4273 first, base);
4274 else if (third == 0)
4275 xmlGenericError(xmlGenericErrorContext,
4276 "HPP: lookup '%c%c' found at %d\n",
4277 first, next, base);
4278 else
4279 xmlGenericError(xmlGenericErrorContext,
4280 "HPP: lookup '%c%c%c' found at %d\n",
4281 first, next, third, base);
4282 #endif
4283 return(base - (in->cur - in->base));
4284 }
4285 }
4286 ctxt->checkIndex = base;
4287 #ifdef DEBUG_PUSH
4288 if (next == 0)
4289 xmlGenericError(xmlGenericErrorContext,
4290 "HPP: lookup '%c' failed\n", first);
4291 else if (third == 0)
4292 xmlGenericError(xmlGenericErrorContext,
4293 "HPP: lookup '%c%c' failed\n", first, next);
4294 else
4295 xmlGenericError(xmlGenericErrorContext,
4296 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4297 #endif
4298 return(-1);
4299 }
4300
4301 /**
4302 * htmlParseTryOrFinish:
4303 * @param ctxt an HTML parser context
4304 * @param terminate last chunk indicator
4305 *
4306 * Try to progress on parsing
4307 *
4308 * Returns zero if no parsing was possible
4309 */
4310 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)4311 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4312 int ret = 0;
4313 htmlParserInputPtr in;
4314 int avail = 0;
4315 xmlChar cur, next;
4316
4317 #ifdef DEBUG_PUSH
4318 switch (ctxt->instate) {
4319 case XML_PARSER_EOF:
4320 xmlGenericError(xmlGenericErrorContext,
4321 "HPP: try EOF\n"); break;
4322 case XML_PARSER_START:
4323 xmlGenericError(xmlGenericErrorContext,
4324 "HPP: try START\n"); break;
4325 case XML_PARSER_MISC:
4326 xmlGenericError(xmlGenericErrorContext,
4327 "HPP: try MISC\n");break;
4328 case XML_PARSER_COMMENT:
4329 xmlGenericError(xmlGenericErrorContext,
4330 "HPP: try COMMENT\n");break;
4331 case XML_PARSER_PROLOG:
4332 xmlGenericError(xmlGenericErrorContext,
4333 "HPP: try PROLOG\n");break;
4334 case XML_PARSER_START_TAG:
4335 xmlGenericError(xmlGenericErrorContext,
4336 "HPP: try START_TAG\n");break;
4337 case XML_PARSER_CONTENT:
4338 xmlGenericError(xmlGenericErrorContext,
4339 "HPP: try CONTENT\n");break;
4340 case XML_PARSER_CDATA_SECTION:
4341 xmlGenericError(xmlGenericErrorContext,
4342 "HPP: try CDATA_SECTION\n");break;
4343 case XML_PARSER_END_TAG:
4344 xmlGenericError(xmlGenericErrorContext,
4345 "HPP: try END_TAG\n");break;
4346 case XML_PARSER_ENTITY_DECL:
4347 xmlGenericError(xmlGenericErrorContext,
4348 "HPP: try ENTITY_DECL\n");break;
4349 case XML_PARSER_ENTITY_VALUE:
4350 xmlGenericError(xmlGenericErrorContext,
4351 "HPP: try ENTITY_VALUE\n");break;
4352 case XML_PARSER_ATTRIBUTE_VALUE:
4353 xmlGenericError(xmlGenericErrorContext,
4354 "HPP: try ATTRIBUTE_VALUE\n");break;
4355 case XML_PARSER_DTD:
4356 xmlGenericError(xmlGenericErrorContext,
4357 "HPP: try DTD\n");break;
4358 case XML_PARSER_EPILOG:
4359 xmlGenericError(xmlGenericErrorContext,
4360 "HPP: try EPILOG\n");break;
4361 case XML_PARSER_PI:
4362 xmlGenericError(xmlGenericErrorContext,
4363 "HPP: try PI\n");break;
4364 case XML_PARSER_SYSTEM_LITERAL:
4365 xmlGenericError(xmlGenericErrorContext,
4366 "HPP: try SYSTEM_LITERAL\n");break;
4367 }
4368 #endif
4369
4370 while (1) {
4371
4372 in = ctxt->input;
4373 if (in == NULL) break;
4374 if (in->buf == NULL)
4375 avail = in->length - (in->cur - in->base);
4376 else
4377 avail = in->buf->buffer->use - (in->cur - in->base);
4378 if ((avail == 0) && (terminate)) {
4379 htmlAutoCloseOnEnd(ctxt);
4380 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4381 /*
4382 * SAX: end of the document processing.
4383 */
4384 ctxt->instate = XML_PARSER_EOF;
4385 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4386 ctxt->sax->endDocument(ctxt->userData);
4387 }
4388 }
4389 if (avail < 1)
4390 goto done;
4391 cur = in->cur[0];
4392 if (cur == 0) {
4393 SKIP(1);
4394 continue;
4395 }
4396
4397 switch (ctxt->instate) {
4398 case XML_PARSER_EOF:
4399 /*
4400 * Document parsing is done !
4401 */
4402 goto done;
4403 case XML_PARSER_START:
4404 /*
4405 * Very first chars read from the document flow.
4406 */
4407 cur = in->cur[0];
4408 if (IS_BLANK_CH(cur)) {
4409 SKIP_BLANKS;
4410 if (in->buf == NULL)
4411 avail = in->length - (in->cur - in->base);
4412 else
4413 avail = in->buf->buffer->use - (in->cur - in->base);
4414 }
4415 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4416 ctxt->sax->setDocumentLocator(ctxt->userData,
4417 &xmlDefaultSAXLocator);
4418 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4419 (!ctxt->disableSAX))
4420 ctxt->sax->startDocument(ctxt->userData);
4421
4422 cur = in->cur[0];
4423 next = in->cur[1];
4424 if ((cur == '<') && (next == '!') &&
4425 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4426 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4427 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4428 (UPP(8) == 'E')) {
4429 if ((!terminate) &&
4430 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4431 goto done;
4432 #ifdef DEBUG_PUSH
4433 xmlGenericError(xmlGenericErrorContext,
4434 "HPP: Parsing internal subset\n");
4435 #endif
4436 htmlParseDocTypeDecl(ctxt);
4437 ctxt->instate = XML_PARSER_PROLOG;
4438 #ifdef DEBUG_PUSH
4439 xmlGenericError(xmlGenericErrorContext,
4440 "HPP: entering PROLOG\n");
4441 #endif
4442 } else {
4443 ctxt->instate = XML_PARSER_MISC;
4444 }
4445 #ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: entering MISC\n");
4448 #endif
4449 break;
4450 case XML_PARSER_MISC:
4451 SKIP_BLANKS;
4452 if (in->buf == NULL)
4453 avail = in->length - (in->cur - in->base);
4454 else
4455 avail = in->buf->buffer->use - (in->cur - in->base);
4456 if (avail < 2)
4457 goto done;
4458 cur = in->cur[0];
4459 next = in->cur[1];
4460 if ((cur == '<') && (next == '!') &&
4461 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4462 if ((!terminate) &&
4463 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4464 goto done;
4465 #ifdef DEBUG_PUSH
4466 xmlGenericError(xmlGenericErrorContext,
4467 "HPP: Parsing Comment\n");
4468 #endif
4469 htmlParseComment(ctxt);
4470 ctxt->instate = XML_PARSER_MISC;
4471 } else if ((cur == '<') && (next == '!') &&
4472 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4473 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4474 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4475 (UPP(8) == 'E')) {
4476 if ((!terminate) &&
4477 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4478 goto done;
4479 #ifdef DEBUG_PUSH
4480 xmlGenericError(xmlGenericErrorContext,
4481 "HPP: Parsing internal subset\n");
4482 #endif
4483 htmlParseDocTypeDecl(ctxt);
4484 ctxt->instate = XML_PARSER_PROLOG;
4485 #ifdef DEBUG_PUSH
4486 xmlGenericError(xmlGenericErrorContext,
4487 "HPP: entering PROLOG\n");
4488 #endif
4489 } else if ((cur == '<') && (next == '!') &&
4490 (avail < 9)) {
4491 goto done;
4492 } else {
4493 ctxt->instate = XML_PARSER_START_TAG;
4494 #ifdef DEBUG_PUSH
4495 xmlGenericError(xmlGenericErrorContext,
4496 "HPP: entering START_TAG\n");
4497 #endif
4498 }
4499 break;
4500 case XML_PARSER_PROLOG:
4501 SKIP_BLANKS;
4502 if (in->buf == NULL)
4503 avail = in->length - (in->cur - in->base);
4504 else
4505 avail = in->buf->buffer->use - (in->cur - in->base);
4506 if (avail < 2)
4507 goto done;
4508 cur = in->cur[0];
4509 next = in->cur[1];
4510 if ((cur == '<') && (next == '!') &&
4511 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4512 if ((!terminate) &&
4513 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4514 goto done;
4515 #ifdef DEBUG_PUSH
4516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: Parsing Comment\n");
4518 #endif
4519 htmlParseComment(ctxt);
4520 ctxt->instate = XML_PARSER_PROLOG;
4521 } else if ((cur == '<') && (next == '!') &&
4522 (avail < 4)) {
4523 goto done;
4524 } else {
4525 ctxt->instate = XML_PARSER_START_TAG;
4526 #ifdef DEBUG_PUSH
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: entering START_TAG\n");
4529 #endif
4530 }
4531 break;
4532 case XML_PARSER_EPILOG:
4533 if (in->buf == NULL)
4534 avail = in->length - (in->cur - in->base);
4535 else
4536 avail = in->buf->buffer->use - (in->cur - in->base);
4537 if (avail < 1)
4538 goto done;
4539 cur = in->cur[0];
4540 if (IS_BLANK_CH(cur)) {
4541 htmlParseCharData(ctxt);
4542 goto done;
4543 }
4544 if (avail < 2)
4545 goto done;
4546 next = in->cur[1];
4547 if ((cur == '<') && (next == '!') &&
4548 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4549 if ((!terminate) &&
4550 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4551 goto done;
4552 #ifdef DEBUG_PUSH
4553 xmlGenericError(xmlGenericErrorContext,
4554 "HPP: Parsing Comment\n");
4555 #endif
4556 htmlParseComment(ctxt);
4557 ctxt->instate = XML_PARSER_EPILOG;
4558 } else if ((cur == '<') && (next == '!') &&
4559 (avail < 4)) {
4560 goto done;
4561 } else {
4562 ctxt->errNo = XML_ERR_DOCUMENT_END;
4563 ctxt->wellFormed = 0;
4564 ctxt->instate = XML_PARSER_EOF;
4565 #ifdef DEBUG_PUSH
4566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering EOF\n");
4568 #endif
4569 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4570 ctxt->sax->endDocument(ctxt->userData);
4571 goto done;
4572 }
4573 break;
4574 case XML_PARSER_START_TAG: {
4575 const xmlChar *name, *oldname;
4576 int depth = ctxt->nameNr;
4577 const htmlElemDesc * info;
4578
4579 if (avail < 2)
4580 goto done;
4581 cur = in->cur[0];
4582 if (cur != '<') {
4583 ctxt->instate = XML_PARSER_CONTENT;
4584 #ifdef DEBUG_PUSH
4585 xmlGenericError(xmlGenericErrorContext,
4586 "HPP: entering CONTENT\n");
4587 #endif
4588 break;
4589 }
4590 if (in->cur[1] == '/') {
4591 ctxt->instate = XML_PARSER_END_TAG;
4592 ctxt->checkIndex = 0;
4593 #ifdef DEBUG_PUSH
4594 xmlGenericError(xmlGenericErrorContext,
4595 "HPP: entering END_TAG\n");
4596 #endif
4597 break;
4598 }
4599 if ((!terminate) &&
4600 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4601 goto done;
4602
4603 oldname = ctxt->name;
4604 htmlParseStartTag(ctxt);
4605 name = ctxt->name;
4606 if (((depth == ctxt->nameNr) &&
4607 (xmlStrEqual(oldname, ctxt->name))) ||
4608 (name == NULL)) {
4609 if (CUR == '>')
4610 NEXT;
4611 break;
4612 }
4613
4614 /*
4615 * Lookup the info for that element.
4616 */
4617 info = htmlTagLookup(name);
4618 if (info == NULL) {
4619 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4620 "Tag %s invalid\n", name, NULL);
4621 }
4622
4623 /*
4624 * Check for an Empty Element labeled the XML/SGML way
4625 */
4626 if ((CUR == '/') && (NXT(1) == '>')) {
4627 SKIP(2);
4628 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4629 ctxt->sax->endElement(ctxt->userData, name);
4630 oldname = htmlnamePop(ctxt);
4631 ctxt->instate = XML_PARSER_CONTENT;
4632 #ifdef DEBUG_PUSH
4633 xmlGenericError(xmlGenericErrorContext,
4634 "HPP: entering CONTENT\n");
4635 #endif
4636 break;
4637 }
4638
4639 if (CUR == '>') {
4640 NEXT;
4641 } else {
4642 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4643 "Couldn't find end of Start Tag %s\n",
4644 name, NULL);
4645
4646 /*
4647 * end of parsing of this node.
4648 */
4649 if (xmlStrEqual(name, ctxt->name)) {
4650 nodePop(ctxt);
4651 oldname = htmlnamePop(ctxt);
4652 }
4653
4654 ctxt->instate = XML_PARSER_CONTENT;
4655 #ifdef DEBUG_PUSH
4656 xmlGenericError(xmlGenericErrorContext,
4657 "HPP: entering CONTENT\n");
4658 #endif
4659 break;
4660 }
4661
4662 /*
4663 * Check for an Empty Element from DTD definition
4664 */
4665 if ((info != NULL) && (info->empty)) {
4666 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4667 ctxt->sax->endElement(ctxt->userData, name);
4668 oldname = htmlnamePop(ctxt);
4669 }
4670 ctxt->instate = XML_PARSER_CONTENT;
4671 #ifdef DEBUG_PUSH
4672 xmlGenericError(xmlGenericErrorContext,
4673 "HPP: entering CONTENT\n");
4674 #endif
4675 break;
4676 }
4677 case XML_PARSER_CONTENT: {
4678 long cons;
4679 /*
4680 * Handle preparsed entities and charRef
4681 */
4682 if (ctxt->token != 0) {
4683 xmlChar chr[2] = { 0 , 0 } ;
4684
4685 chr[0] = (xmlChar) ctxt->token;
4686 htmlCheckParagraph(ctxt);
4687 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4688 ctxt->sax->characters(ctxt->userData, chr, 1);
4689 ctxt->token = 0;
4690 ctxt->checkIndex = 0;
4691 }
4692 if ((avail == 1) && (terminate)) {
4693 cur = in->cur[0];
4694 if ((cur != '<') && (cur != '&')) {
4695 if (ctxt->sax != NULL) {
4696 if (IS_BLANK_CH(cur)) {
4697 if (ctxt->sax->ignorableWhitespace != NULL)
4698 ctxt->sax->ignorableWhitespace(
4699 ctxt->userData, &cur, 1);
4700 } else {
4701 htmlCheckParagraph(ctxt);
4702 if (ctxt->sax->characters != NULL)
4703 ctxt->sax->characters(
4704 ctxt->userData, &cur, 1);
4705 }
4706 }
4707 ctxt->token = 0;
4708 ctxt->checkIndex = 0;
4709 in->cur++;
4710 break;
4711 }
4712 }
4713 if (avail < 2)
4714 goto done;
4715 cur = in->cur[0];
4716 next = in->cur[1];
4717 cons = ctxt->nbChars;
4718 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4719 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4720 /*
4721 * Handle SCRIPT/STYLE separately
4722 */
4723 if ((!terminate) &&
4724 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
4725 goto done;
4726 htmlParseScript(ctxt);
4727 if ((cur == '<') && (next == '/')) {
4728 ctxt->instate = XML_PARSER_END_TAG;
4729 ctxt->checkIndex = 0;
4730 #ifdef DEBUG_PUSH
4731 xmlGenericError(xmlGenericErrorContext,
4732 "HPP: entering END_TAG\n");
4733 #endif
4734 break;
4735 }
4736 } else {
4737 /*
4738 * Sometimes DOCTYPE arrives in the middle of the document
4739 */
4740 if ((cur == '<') && (next == '!') &&
4741 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4742 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4743 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4744 (UPP(8) == 'E')) {
4745 if ((!terminate) &&
4746 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4747 goto done;
4748 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4749 "Misplaced DOCTYPE declaration\n",
4750 BAD_CAST "DOCTYPE" , NULL);
4751 htmlParseDocTypeDecl(ctxt);
4752 } else if ((cur == '<') && (next == '!') &&
4753 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4754 if ((!terminate) &&
4755 (htmlParseLookupSequence(
4756 ctxt, '-', '-', '>', 1) < 0))
4757 goto done;
4758 #ifdef DEBUG_PUSH
4759 xmlGenericError(xmlGenericErrorContext,
4760 "HPP: Parsing Comment\n");
4761 #endif
4762 htmlParseComment(ctxt);
4763 ctxt->instate = XML_PARSER_CONTENT;
4764 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4765 goto done;
4766 } else if ((cur == '<') && (next == '/')) {
4767 ctxt->instate = XML_PARSER_END_TAG;
4768 ctxt->checkIndex = 0;
4769 #ifdef DEBUG_PUSH
4770 xmlGenericError(xmlGenericErrorContext,
4771 "HPP: entering END_TAG\n");
4772 #endif
4773 break;
4774 } else if (cur == '<') {
4775 ctxt->instate = XML_PARSER_START_TAG;
4776 ctxt->checkIndex = 0;
4777 #ifdef DEBUG_PUSH
4778 xmlGenericError(xmlGenericErrorContext,
4779 "HPP: entering START_TAG\n");
4780 #endif
4781 break;
4782 } else if (cur == '&') {
4783 if ((!terminate) &&
4784 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
4785 goto done;
4786 #ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext,
4788 "HPP: Parsing Reference\n");
4789 #endif
4790
4791 htmlParseReference(ctxt);
4792 } else {
4793 /*
4794 * check that the text sequence is complete
4795 * before handing out the data to the parser
4796 * to avoid problems with erroneous end of
4797 * data detection.
4798 */
4799 if ((!terminate) &&
4800 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4801 goto done;
4802 ctxt->checkIndex = 0;
4803 #ifdef DEBUG_PUSH
4804 xmlGenericError(xmlGenericErrorContext,
4805 "HPP: Parsing char data\n");
4806 #endif
4807 htmlParseCharData(ctxt);
4808 }
4809 }
4810 if (cons == ctxt->nbChars) {
4811 if (ctxt->node != NULL) {
4812 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4813 "detected an error in element content\n",
4814 NULL, NULL);
4815 }
4816 NEXT;
4817 break;
4818 }
4819
4820 break;
4821 }
4822 case XML_PARSER_END_TAG:
4823 if (avail < 2)
4824 goto done;
4825 if ((!terminate) &&
4826 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4827 goto done;
4828 htmlParseEndTag(ctxt);
4829 if (ctxt->nameNr == 0) {
4830 ctxt->instate = XML_PARSER_EPILOG;
4831 } else {
4832 ctxt->instate = XML_PARSER_CONTENT;
4833 }
4834 ctxt->checkIndex = 0;
4835 #ifdef DEBUG_PUSH
4836 xmlGenericError(xmlGenericErrorContext,
4837 "HPP: entering CONTENT\n");
4838 #endif
4839 break;
4840 case XML_PARSER_CDATA_SECTION:
4841 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842 "HPP: internal error, state == CDATA\n",
4843 NULL, NULL);
4844 ctxt->instate = XML_PARSER_CONTENT;
4845 ctxt->checkIndex = 0;
4846 #ifdef DEBUG_PUSH
4847 xmlGenericError(xmlGenericErrorContext,
4848 "HPP: entering CONTENT\n");
4849 #endif
4850 break;
4851 case XML_PARSER_DTD:
4852 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4853 "HPP: internal error, state == DTD\n",
4854 NULL, NULL);
4855 ctxt->instate = XML_PARSER_CONTENT;
4856 ctxt->checkIndex = 0;
4857 #ifdef DEBUG_PUSH
4858 xmlGenericError(xmlGenericErrorContext,
4859 "HPP: entering CONTENT\n");
4860 #endif
4861 break;
4862 case XML_PARSER_COMMENT:
4863 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4864 "HPP: internal error, state == COMMENT\n",
4865 NULL, NULL);
4866 ctxt->instate = XML_PARSER_CONTENT;
4867 ctxt->checkIndex = 0;
4868 #ifdef DEBUG_PUSH
4869 xmlGenericError(xmlGenericErrorContext,
4870 "HPP: entering CONTENT\n");
4871 #endif
4872 break;
4873 case XML_PARSER_PI:
4874 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4875 "HPP: internal error, state == PI\n",
4876 NULL, NULL);
4877 ctxt->instate = XML_PARSER_CONTENT;
4878 ctxt->checkIndex = 0;
4879 #ifdef DEBUG_PUSH
4880 xmlGenericError(xmlGenericErrorContext,
4881 "HPP: entering CONTENT\n");
4882 #endif
4883 break;
4884 case XML_PARSER_ENTITY_DECL:
4885 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4886 "HPP: internal error, state == ENTITY_DECL\n",
4887 NULL, NULL);
4888 ctxt->instate = XML_PARSER_CONTENT;
4889 ctxt->checkIndex = 0;
4890 #ifdef DEBUG_PUSH
4891 xmlGenericError(xmlGenericErrorContext,
4892 "HPP: entering CONTENT\n");
4893 #endif
4894 break;
4895 case XML_PARSER_ENTITY_VALUE:
4896 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4897 "HPP: internal error, state == ENTITY_VALUE\n",
4898 NULL, NULL);
4899 ctxt->instate = XML_PARSER_CONTENT;
4900 ctxt->checkIndex = 0;
4901 #ifdef DEBUG_PUSH
4902 xmlGenericError(xmlGenericErrorContext,
4903 "HPP: entering DTD\n");
4904 #endif
4905 break;
4906 case XML_PARSER_ATTRIBUTE_VALUE:
4907 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4908 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4909 NULL, NULL);
4910 ctxt->instate = XML_PARSER_START_TAG;
4911 ctxt->checkIndex = 0;
4912 #ifdef DEBUG_PUSH
4913 xmlGenericError(xmlGenericErrorContext,
4914 "HPP: entering START_TAG\n");
4915 #endif
4916 break;
4917 case XML_PARSER_SYSTEM_LITERAL:
4918 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4919 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4920 NULL, NULL);
4921 ctxt->instate = XML_PARSER_CONTENT;
4922 ctxt->checkIndex = 0;
4923 #ifdef DEBUG_PUSH
4924 xmlGenericError(xmlGenericErrorContext,
4925 "HPP: entering CONTENT\n");
4926 #endif
4927 break;
4928 case XML_PARSER_IGNORE:
4929 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4930 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4931 NULL, NULL);
4932 ctxt->instate = XML_PARSER_CONTENT;
4933 ctxt->checkIndex = 0;
4934 #ifdef DEBUG_PUSH
4935 xmlGenericError(xmlGenericErrorContext,
4936 "HPP: entering CONTENT\n");
4937 #endif
4938 break;
4939 case XML_PARSER_PUBLIC_LITERAL:
4940 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4941 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4942 NULL, NULL);
4943 ctxt->instate = XML_PARSER_CONTENT;
4944 ctxt->checkIndex = 0;
4945 #ifdef DEBUG_PUSH
4946 xmlGenericError(xmlGenericErrorContext,
4947 "HPP: entering CONTENT\n");
4948 #endif
4949 break;
4950
4951 }
4952 }
4953 done:
4954 if ((avail == 0) && (terminate)) {
4955 htmlAutoCloseOnEnd(ctxt);
4956 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4957 /*
4958 * SAX: end of the document processing.
4959 */
4960 ctxt->instate = XML_PARSER_EOF;
4961 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4962 ctxt->sax->endDocument(ctxt->userData);
4963 }
4964 }
4965 if ((ctxt->myDoc != NULL) &&
4966 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4967 (ctxt->instate == XML_PARSER_EPILOG))) {
4968 xmlDtdPtr dtd;
4969 dtd = xmlGetIntSubset(ctxt->myDoc);
4970 if (dtd == NULL)
4971 ctxt->myDoc->intSubset =
4972 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4973 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4974 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4975 }
4976 #ifdef DEBUG_PUSH
4977 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4978 #endif
4979 return(ret);
4980 }
4981
4982 /**
4983 * htmlParseChunk:
4984 * @param ctxt an HTML parser context
4985 * @param chunk an char array
4986 * @param size the size in byte of the chunk
4987 * @param terminate last chunk indicator
4988 *
4989 * Parse a Chunk of memory
4990 *
4991 * Returns zero if no error, the xmlParserErrors otherwise.
4992 */
4993 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)4994 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4995 int terminate) {
4996 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4997 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4998 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4999 int cur = ctxt->input->cur - ctxt->input->base;
5000
5001 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5002 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5003 ctxt->input->cur = ctxt->input->base + cur;
5004 #ifdef DEBUG_PUSH
5005 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5006 #endif
5007
5008 #if 0
5009 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5010 htmlParseTryOrFinish(ctxt, terminate);
5011 #endif
5012 } else if (ctxt->instate != XML_PARSER_EOF) {
5013 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5014 xmlParserInputBufferPtr in = ctxt->input->buf;
5015 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5016 (in->raw != NULL)) {
5017 int nbchars;
5018
5019 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5020 if (nbchars < 0) {
5021 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5022 "encoder error\n", NULL, NULL);
5023 return(XML_ERR_INVALID_ENCODING);
5024 }
5025 }
5026 }
5027 }
5028 htmlParseTryOrFinish(ctxt, terminate);
5029 if (terminate) {
5030 if ((ctxt->instate != XML_PARSER_EOF) &&
5031 (ctxt->instate != XML_PARSER_EPILOG) &&
5032 (ctxt->instate != XML_PARSER_MISC)) {
5033 ctxt->errNo = XML_ERR_DOCUMENT_END;
5034 ctxt->wellFormed = 0;
5035 }
5036 if (ctxt->instate != XML_PARSER_EOF) {
5037 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5038 ctxt->sax->endDocument(ctxt->userData);
5039 }
5040 ctxt->instate = XML_PARSER_EOF;
5041 }
5042 return((xmlParserErrors) ctxt->errNo);
5043 }
5044 #endif /* LIBXML_PUSH_ENABLED */
5045
5046 /************************************************************************
5047 * *
5048 * User entry points *
5049 * *
5050 ************************************************************************/
5051
5052 /**
5053 * htmlCreatePushParserCtxt:
5054 * @param sax a SAX handler
5055 * @param user_data The user data returned on SAX callbacks
5056 * @param chunk a pointer to an array of chars
5057 * @param size number of chars in the array
5058 * @param filename an optional file name or URI
5059 * @param enc an optional encoding
5060 *
5061 * Create a parser context for using the HTML parser in push mode
5062 * The value of filename is used for fetching external entities
5063 * and error/warning reports.
5064 *
5065 * Returns the new parser context or NULL
5066 */
5067 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5068 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5069 const char *chunk, int size, const char *filename,
5070 xmlCharEncoding enc) {
5071 htmlParserCtxtPtr ctxt;
5072 htmlParserInputPtr inputStream;
5073 xmlParserInputBufferPtr buf;
5074
5075 xmlInitParser();
5076
5077 buf = xmlAllocParserInputBuffer(enc);
5078 if (buf == NULL) return(NULL);
5079
5080 ctxt = htmlNewParserCtxt();
5081 if (ctxt == NULL) {
5082 xmlFreeParserInputBuffer(buf);
5083 return(NULL);
5084 }
5085 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5086 ctxt->charset=XML_CHAR_ENCODING_UTF8;
5087 if (sax != NULL) {
5088 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5089 xmlFree(ctxt->sax);
5090 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5091 if (ctxt->sax == NULL) {
5092 xmlFree(buf);
5093 xmlFree(ctxt);
5094 return(NULL);
5095 }
5096 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5097 if (user_data != NULL)
5098 ctxt->userData = user_data;
5099 }
5100 if (filename == NULL) {
5101 ctxt->directory = NULL;
5102 } else {
5103 ctxt->directory = xmlParserGetDirectory(filename);
5104 }
5105
5106 inputStream = htmlNewInputStream(ctxt);
5107 if (inputStream == NULL) {
5108 xmlFreeParserCtxt(ctxt);
5109 xmlFree(buf);
5110 return(NULL);
5111 }
5112
5113 if (filename == NULL)
5114 inputStream->filename = NULL;
5115 else
5116 inputStream->filename = (char *)
5117 xmlCanonicPath((const xmlChar *) filename);
5118 inputStream->buf = buf;
5119 inputStream->base = inputStream->buf->buffer->content;
5120 inputStream->cur = inputStream->buf->buffer->content;
5121 inputStream->end =
5122 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5123
5124 inputPush(ctxt, inputStream);
5125
5126 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5127 (ctxt->input->buf != NULL)) {
5128 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5129 int cur = ctxt->input->cur - ctxt->input->base;
5130
5131 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5132
5133 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5134 ctxt->input->cur = ctxt->input->base + cur;
5135 ctxt->input->end =
5136 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5137 #ifdef DEBUG_PUSH
5138 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5139 #endif
5140 }
5141
5142 return(ctxt);
5143 }
5144
5145 /**
5146 * htmlSAXParseDoc:
5147 * @param cur a pointer to an array of xmlChar
5148 * @param encoding a free form C string describing the HTML document encoding, or NULL
5149 * @param sax the SAX handler block
5150 * @param userData if using SAX, this pointer will be provided on callbacks.
5151 *
5152 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5153 * to handle parse events. If sax is NULL, fallback to the default DOM
5154 * behavior and return a tree.
5155 *
5156 * Returns the resulting document tree unless SAX is NULL or the document is
5157 * not well formed.
5158 */
5159
5160 htmlDocPtr
htmlSAXParseDoc(xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5161 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5162 htmlDocPtr ret;
5163 htmlParserCtxtPtr ctxt;
5164
5165 xmlInitParser();
5166
5167 if (cur == NULL) return(NULL);
5168
5169
5170 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5171 if (ctxt == NULL) return(NULL);
5172 if (sax != NULL) {
5173 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5174 ctxt->sax = sax;
5175 ctxt->userData = userData;
5176 }
5177
5178 htmlParseDocument(ctxt);
5179 ret = ctxt->myDoc;
5180 if (sax != NULL) {
5181 ctxt->sax = NULL;
5182 ctxt->userData = NULL;
5183 }
5184 htmlFreeParserCtxt(ctxt);
5185
5186 return(ret);
5187 }
5188
5189 /**
5190 * htmlParseDoc:
5191 * @param cur a pointer to an array of xmlChar
5192 * @param encoding a free form C string describing the HTML document encoding, or NULL
5193 *
5194 * parse an HTML in-memory document and build a tree.
5195 *
5196 * Returns the resulting document tree
5197 */
5198
5199 htmlDocPtr
htmlParseDoc(xmlChar * cur,const char * encoding)5200 htmlParseDoc(xmlChar *cur, const char *encoding) {
5201 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5202 }
5203
5204
5205 /**
5206 * htmlCreateFileParserCtxt:
5207 * @param filename the filename
5208 * @param encoding a free form C string describing the HTML document encoding, or NULL
5209 *
5210 * Create a parser context for a file content.
5211 * Automatic support for ZLIB/Compress compressed document is provided
5212 * by default if found at compile-time.
5213 *
5214 * Returns the new parser context or NULL
5215 */
5216 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5217 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5218 {
5219 htmlParserCtxtPtr ctxt;
5220 htmlParserInputPtr inputStream;
5221 char *canonicFilename;
5222 /* htmlCharEncoding enc; */
5223 xmlChar *content, *content_line = (xmlChar *) "charset=";
5224
5225 ctxt = htmlNewParserCtxt();
5226 if (ctxt == NULL) {
5227 return(NULL);
5228 }
5229 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5230 if (canonicFilename == NULL) {
5231 #ifdef LIBXML_SAX1_ENABLED
5232 if (xmlDefaultSAXHandler.error != NULL) {
5233 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5234 }
5235 #endif
5236 xmlFreeParserCtxt(ctxt);
5237 return(NULL);
5238 }
5239
5240 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5241 xmlFree(canonicFilename);
5242 if (inputStream == NULL) {
5243 xmlFreeParserCtxt(ctxt);
5244 return(NULL);
5245 }
5246
5247 inputPush(ctxt, inputStream);
5248
5249 /* set encoding */
5250 if (encoding) {
5251 content = (xmlChar*)xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5252 if (content) {
5253 strcpy ((char *)content, (char *)content_line);
5254 strcat ((char *)content, (char *)encoding);
5255 htmlCheckEncoding (ctxt, content);
5256 xmlFree (content);
5257 }
5258 }
5259
5260 return(ctxt);
5261 }
5262
5263 /**
5264 * htmlSAXParseFile:
5265 * @param filename the filename
5266 * @param encoding a free form C string describing the HTML document encoding, or NULL
5267 * @param sax the SAX handler block
5268 * @param userData if using SAX, this pointer will be provided on callbacks.
5269 *
5270 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5271 * compressed document is provided by default if found at compile-time.
5272 * It use the given SAX function block to handle the parsing callback.
5273 * If sax is NULL, fallback to the default DOM tree building routines.
5274 *
5275 * Returns the resulting document tree unless SAX is NULL or the document is
5276 * not well formed.
5277 */
5278
5279 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5280 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5281 void *userData) {
5282 htmlDocPtr ret;
5283 htmlParserCtxtPtr ctxt;
5284 htmlSAXHandlerPtr oldsax = NULL;
5285
5286 xmlInitParser();
5287
5288 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5289 if (ctxt == NULL) return(NULL);
5290 if (sax != NULL) {
5291 oldsax = ctxt->sax;
5292 ctxt->sax = sax;
5293 ctxt->userData = userData;
5294 }
5295
5296 htmlParseDocument(ctxt);
5297
5298 ret = ctxt->myDoc;
5299 if (sax != NULL) {
5300 ctxt->sax = oldsax;
5301 ctxt->userData = NULL;
5302 }
5303 htmlFreeParserCtxt(ctxt);
5304
5305 return(ret);
5306 }
5307
5308 /**
5309 * htmlParseFile:
5310 * @param filename the filename
5311 * @param encoding a free form C string describing the HTML document encoding, or NULL
5312 *
5313 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5314 * compressed document is provided by default if found at compile-time.
5315 *
5316 * Returns the resulting document tree
5317 */
5318
5319 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5320 htmlParseFile(const char *filename, const char *encoding) {
5321 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5322 }
5323
5324 /**
5325 * htmlHandleOmittedElem:
5326 * @param val int 0 or 1
5327 *
5328 * Set and return the previous value for handling HTML omitted tags.
5329 *
5330 * Returns the last value for 0 for no handling, 1 for auto insertion.
5331 */
5332
5333 int
htmlHandleOmittedElem(int val)5334 htmlHandleOmittedElem(int val) {
5335 int old = htmlOmittedDefaultValue;
5336
5337
5338 return(old);
5339 }
5340
5341 /**
5342 * htmlElementAllowedHere:
5343 * @param parent HTML parent element
5344 * @param elt HTML element
5345 *
5346 * Checks whether an HTML element may be a direct child of a parent element.
5347 * Note - doesn't check for deprecated elements
5348 *
5349 * Returns 1 if allowed; 0 otherwise.
5350 */
5351 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)5352 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5353 const char** p ;
5354
5355 if ( ! elt || ! parent || ! parent->subelts )
5356 return 0 ;
5357
5358 for ( p = parent->subelts; *p; ++p )
5359 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5360 return 1 ;
5361
5362 return 0 ;
5363 }
5364 /**
5365 * htmlElementStatusHere:
5366 * @param parent HTML parent element
5367 * @param elt HTML element
5368 *
5369 * Checks whether an HTML element may be a direct child of a parent element.
5370 * and if so whether it is valid or deprecated.
5371 *
5372 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5373 */
5374 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)5375 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5376 if ( ! parent || ! elt )
5377 return HTML_INVALID ;
5378 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5379 return HTML_INVALID ;
5380
5381 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5382 }
5383 /**
5384 * htmlAttrAllowed:
5385 * @param elt HTML element
5386 * @param attr HTML attribute
5387 * @param legacy whether to allow deprecated attributes
5388 *
5389 * Checks whether an attribute is valid for an element
5390 * Has full knowledge of Required and Deprecated attributes
5391 *
5392 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5393 */
5394 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)5395 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5396 const char** p ;
5397
5398 if ( !elt || ! attr )
5399 return HTML_INVALID ;
5400
5401 if ( elt->attrs_req )
5402 for ( p = elt->attrs_req; *p; ++p)
5403 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5404 return HTML_REQUIRED ;
5405
5406 if ( elt->attrs_opt )
5407 for ( p = elt->attrs_opt; *p; ++p)
5408 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5409 return HTML_VALID ;
5410
5411 if ( legacy && elt->attrs_depr )
5412 for ( p = elt->attrs_depr; *p; ++p)
5413 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5414 return HTML_DEPRECATED ;
5415
5416 return HTML_INVALID ;
5417 }
5418 /**
5419 * htmlNodeStatus:
5420 * @param node an htmlNodePtr in a tree
5421 * @param legacy whether to allow deprecated elements (YES is faster here
5422 * for Element nodes)
5423 *
5424 * Checks whether the tree node is valid. Experimental (the author
5425 * only uses the HTML enhancements in a SAX parser)
5426 *
5427 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5428 * legacy allowed) or htmlElementStatusHere (otherwise).
5429 * for Attribute nodes, a return from htmlAttrAllowed
5430 * for other nodes, HTML_NA (no checks performed)
5431 */
5432 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)5433 htmlNodeStatus(const htmlNodePtr node, int legacy) {
5434 if ( ! node )
5435 return HTML_INVALID ;
5436
5437 switch ( node->type ) {
5438 case XML_ELEMENT_NODE:
5439 return legacy
5440 ? ( htmlElementAllowedHere (
5441 htmlTagLookup(node->parent->name) , node->name
5442 ) ? HTML_VALID : HTML_INVALID )
5443 : htmlElementStatusHere(
5444 htmlTagLookup(node->parent->name) ,
5445 htmlTagLookup(node->name) )
5446 ;
5447 case XML_ATTRIBUTE_NODE:
5448 return htmlAttrAllowed(
5449 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5450 default: return HTML_NA ;
5451 }
5452 }
5453 /************************************************************************
5454 * *
5455 * New set (2.6.0) of simpler and more flexible APIs *
5456 * *
5457 ************************************************************************/
/**
 * DICT_FREE:
 * @param str  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 * Wrapped in do { } while (0) so that "DICT_FREE(x);" is a single
 * statement and can be used safely inside if/else.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5469
5470 /**
5471 * htmlCtxtReset:
5472 * @param ctxt an HTML parser context
5473 *
5474 * Reset a parser context
5475 */
5476 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5477 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5478 {
5479 xmlParserInputPtr input;
5480 xmlDictPtr dict = ctxt->dict;
5481
5482 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5483 xmlFreeInputStream(input);
5484 }
5485 ctxt->inputNr = 0;
5486 ctxt->input = NULL;
5487
5488 ctxt->spaceNr = 0;
5489 ctxt->spaceTab[0] = -1;
5490 ctxt->space = &ctxt->spaceTab[0];
5491
5492
5493 ctxt->nodeNr = 0;
5494 ctxt->node = NULL;
5495
5496 ctxt->nameNr = 0;
5497 ctxt->name = NULL;
5498
5499 DICT_FREE(ctxt->version);
5500 ctxt->version = NULL;
5501 DICT_FREE(ctxt->encoding);
5502 ctxt->encoding = NULL;
5503 DICT_FREE(ctxt->directory);
5504 ctxt->directory = NULL;
5505 DICT_FREE(ctxt->extSubURI);
5506 ctxt->extSubURI = NULL;
5507 DICT_FREE(ctxt->extSubSystem);
5508 ctxt->extSubSystem = NULL;
5509 if (ctxt->myDoc != NULL)
5510 xmlFreeDoc(ctxt->myDoc);
5511 ctxt->myDoc = NULL;
5512
5513 ctxt->standalone = -1;
5514 ctxt->hasExternalSubset = 0;
5515 ctxt->hasPErefs = 0;
5516 ctxt->html = 1;
5517 ctxt->external = 0;
5518 ctxt->instate = XML_PARSER_START;
5519 ctxt->token = 0;
5520
5521 ctxt->wellFormed = 1;
5522 ctxt->nsWellFormed = 1;
5523 ctxt->valid = 1;
5524 ctxt->vctxt.userData = ctxt;
5525 ctxt->vctxt.error = xmlParserValidityError;
5526 ctxt->vctxt.warning = xmlParserValidityWarning;
5527 ctxt->record_info = 0;
5528 ctxt->nbChars = 0;
5529 ctxt->checkIndex = 0;
5530 ctxt->inSubset = 0;
5531 ctxt->errNo = XML_ERR_OK;
5532 ctxt->depth = 0;
5533 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5534 ctxt->catalogs = NULL;
5535 xmlInitNodeInfoSeq(&ctxt->node_seq);
5536
5537 if (ctxt->attsDefault != NULL) {
5538 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5539 ctxt->attsDefault = NULL;
5540 }
5541 if (ctxt->attsSpecial != NULL) {
5542 xmlHashFree(ctxt->attsSpecial, NULL);
5543 ctxt->attsSpecial = NULL;
5544 }
5545 }
5546
5547 /**
5548 * htmlCtxtUseOptions:
5549 * @param ctxt an HTML parser context
5550 * @param options a combination of htmlParserOption(s)
5551 *
5552 * Applies the options to the parser context
5553 *
5554 * Returns 0 in case of success, the set of unknown or unimplemented options
5555 * in case of error.
5556 */
5557 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5558 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5559 {
5560 if (options & HTML_PARSE_NOWARNING) {
5561 ctxt->sax->warning = NULL;
5562 ctxt->vctxt.warning = NULL;
5563 options -= XML_PARSE_NOWARNING;
5564 ctxt->options |= XML_PARSE_NOWARNING;
5565 }
5566 if (options & HTML_PARSE_NOERROR) {
5567 ctxt->sax->error = NULL;
5568 ctxt->vctxt.error = NULL;
5569 ctxt->sax->fatalError = NULL;
5570 options -= XML_PARSE_NOERROR;
5571 ctxt->options |= XML_PARSE_NOERROR;
5572 }
5573 if (options & HTML_PARSE_PEDANTIC) {
5574 ctxt->pedantic = 1;
5575 options -= XML_PARSE_PEDANTIC;
5576 ctxt->options |= XML_PARSE_PEDANTIC;
5577 } else
5578 ctxt->pedantic = 0;
5579 if (options & XML_PARSE_NOBLANKS) {
5580 ctxt->keepBlanks = 0;
5581 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5582 options -= XML_PARSE_NOBLANKS;
5583 ctxt->options |= XML_PARSE_NOBLANKS;
5584 } else
5585 ctxt->keepBlanks = 1;
5586 ctxt->dictNames = 0;
5587 return (options);
5588 }
5589
5590 /**
5591 * htmlDoRead:
5592 * @param ctxt an HTML parser context
5593 * @param URL the base URL to use for the document
5594 * @param encoding the document encoding, or NULL
5595 * @param options a combination of htmlParserOption(s)
5596 * @param reuse keep the context for reuse
5597 *
5598 * Common front-end for the htmlRead functions
5599 *
5600 * Returns the resulting document tree or NULL
5601 */
5602 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)5603 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5604 int options, int reuse)
5605 {
5606 htmlDocPtr ret;
5607
5608 htmlCtxtUseOptions(ctxt, options);
5609 ctxt->html = 1;
5610 if (encoding != NULL) {
5611 xmlCharEncodingHandlerPtr hdlr;
5612
5613 hdlr = xmlFindCharEncodingHandler(encoding);
5614 if (hdlr != NULL)
5615 xmlSwitchToEncoding(ctxt, hdlr);
5616 }
5617 if ((URL != NULL) && (ctxt->input != NULL) &&
5618 (ctxt->input->filename == NULL))
5619 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5620 htmlParseDocument(ctxt);
5621 ret = ctxt->myDoc;
5622 ctxt->myDoc = NULL;
5623 if (!reuse) {
5624 if ((ctxt->dictNames) &&
5625 (ret != NULL) &&
5626 (ret->dict == ctxt->dict))
5627 ctxt->dict = NULL;
5628 xmlFreeParserCtxt(ctxt);
5629 }
5630 return (ret);
5631 }
5632
5633 /**
5634 * htmlReadDoc:
5635 * @param cur a pointer to a zero terminated string
5636 * @param URL the base URL to use for the document
5637 * @param encoding the document encoding, or NULL
5638 * @param options a combination of htmlParserOption(s)
5639 *
5640 * parse an XML in-memory document and build a tree.
5641 *
5642 * Returns the resulting document tree
5643 */
5644 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)5645 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5646 {
5647 htmlParserCtxtPtr ctxt;
5648
5649 if (cur == NULL)
5650 return (NULL);
5651
5652 ctxt = xmlCreateDocParserCtxt(cur, sizeof(cur));
5653 if (ctxt == NULL)
5654 return (NULL);
5655 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5656 }
5657
5658 /**
5659 * htmlReadFile:
5660 * @param filename a file or URL
5661 * @param encoding the document encoding, or NULL
5662 * @param options a combination of htmlParserOption(s)
5663 *
5664 * parse an XML file from the filesystem or the network.
5665 *
5666 * Returns the resulting document tree
5667 */
5668 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5669 htmlReadFile(const char *filename, const char *encoding, int options)
5670 {
5671 htmlParserCtxtPtr ctxt;
5672
5673 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5674 if (ctxt == NULL)
5675 return (NULL);
5676 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5677 }
5678
5679 /**
5680 * htmlReadMemory:
5681 * @param buffer a pointer to a char array
5682 * @param size the size of the array
5683 * @param URL the base URL to use for the document
5684 * @param encoding the document encoding, or NULL
5685 * @param options a combination of htmlParserOption(s)
5686 *
5687 * parse an XML in-memory document and build a tree.
5688 *
5689 * Returns the resulting document tree
5690 */
5691 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)5692 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5693 {
5694 htmlParserCtxtPtr ctxt;
5695
5696 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5697 if (ctxt == NULL)
5698 return (NULL);
5699 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5700 }
5701
5702 /**
5703 * htmlReadFd:
5704 * @param fd an open file descriptor
5705 * @param URL the base URL to use for the document
5706 * @param encoding the document encoding, or NULL
5707 * @param options a combination of htmlParserOption(s)
5708 *
5709 * parse an XML from a file descriptor and build a tree.
5710 *
5711 * Returns the resulting document tree
5712 */
5713 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)5714 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5715 {
5716 htmlParserCtxtPtr ctxt;
5717 xmlParserInputBufferPtr input;
5718 xmlParserInputPtr stream;
5719
5720 if (fd < 0)
5721 return (NULL);
5722
5723 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5724 if (input == NULL)
5725 return (NULL);
5726 ctxt = xmlNewParserCtxt();
5727 if (ctxt == NULL) {
5728 xmlFreeParserInputBuffer(input);
5729 return (NULL);
5730 }
5731 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5732 if (stream == NULL) {
5733 xmlFreeParserInputBuffer(input);
5734 xmlFreeParserCtxt(ctxt);
5735 return (NULL);
5736 }
5737 inputPush(ctxt, stream);
5738 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5739 }
5740
5741 /**
5742 * htmlReadIO:
5743 * @param ioread an I/O read function
5744 * @param ioclose an I/O close function
5745 * @param ioctx an I/O handler
5746 * @param URL the base URL to use for the document
5747 * @param encoding the document encoding, or NULL
5748 * @param options a combination of htmlParserOption(s)
5749 *
5750 * parse an HTML document from I/O functions and source and build a tree.
5751 *
5752 * Returns the resulting document tree
5753 */
5754 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)5755 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5756 void *ioctx, const char *URL, const char *encoding, int options)
5757 {
5758 htmlParserCtxtPtr ctxt;
5759 xmlParserInputBufferPtr input;
5760 xmlParserInputPtr stream;
5761
5762 if (ioread == NULL)
5763 return (NULL);
5764
5765 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5766 XML_CHAR_ENCODING_NONE);
5767 if (input == NULL)
5768 return (NULL);
5769 ctxt = xmlNewParserCtxt();
5770 if (ctxt == NULL) {
5771 xmlFreeParserInputBuffer(input);
5772 return (NULL);
5773 }
5774 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5775 if (stream == NULL) {
5776 xmlFreeParserInputBuffer(input);
5777 xmlFreeParserCtxt(ctxt);
5778 return (NULL);
5779 }
5780 inputPush(ctxt, stream);
5781 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5782 }
5783
5784 /**
5785 * htmlCtxtReadDoc:
5786 * @param ctxt an HTML parser context
5787 * @param cur a pointer to a zero terminated string
5788 * @param URL the base URL to use for the document
5789 * @param encoding the document encoding, or NULL
5790 * @param options a combination of htmlParserOption(s)
5791 *
5792 * parse an XML in-memory document and build a tree.
5793 * This reuses the existing ctxt parser context
5794 *
5795 * Returns the resulting document tree
5796 */
5797 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)5798 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5799 const char *URL, const char *encoding, int options)
5800 {
5801 xmlParserInputPtr stream;
5802
5803 if (cur == NULL)
5804 return (NULL);
5805 if (ctxt == NULL)
5806 return (NULL);
5807
5808 htmlCtxtReset(ctxt);
5809
5810 stream = xmlNewStringInputStream(ctxt, cur);
5811 if (stream == NULL) {
5812 return (NULL);
5813 }
5814 inputPush(ctxt, stream);
5815 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5816 }
5817
5818 /**
5819 * htmlCtxtReadFile:
5820 * @param ctxt an HTML parser context
5821 * @param filename a file or URL
5822 * @param encoding the document encoding, or NULL
5823 * @param options a combination of htmlParserOption(s)
5824 *
5825 * parse an XML file from the filesystem or the network.
5826 * This reuses the existing ctxt parser context
5827 *
5828 * Returns the resulting document tree
5829 */
5830 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)5831 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5832 const char *encoding, int options)
5833 {
5834 xmlParserInputPtr stream;
5835
5836 if (filename == NULL)
5837 return (NULL);
5838 if (ctxt == NULL)
5839 return (NULL);
5840
5841 htmlCtxtReset(ctxt);
5842
5843 stream = xmlNewInputFromFile(ctxt, filename);
5844 if (stream == NULL) {
5845 return (NULL);
5846 }
5847 inputPush(ctxt, stream);
5848 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5849 }
5850
5851 /**
5852 * htmlCtxtReadMemory:
5853 * @param ctxt an HTML parser context
5854 * @param buffer a pointer to a char array
5855 * @param size the size of the array
5856 * @param URL the base URL to use for the document
5857 * @param encoding the document encoding, or NULL
5858 * @param options a combination of htmlParserOption(s)
5859 *
5860 * parse an XML in-memory document and build a tree.
5861 * This reuses the existing ctxt parser context
5862 *
5863 * Returns the resulting document tree
5864 */
5865 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)5866 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5867 const char *URL, const char *encoding, int options)
5868 {
5869 xmlParserInputBufferPtr input;
5870 xmlParserInputPtr stream;
5871
5872 if (ctxt == NULL)
5873 return (NULL);
5874 if (buffer == NULL)
5875 return (NULL);
5876
5877 htmlCtxtReset(ctxt);
5878
5879 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5880 if (input == NULL) {
5881 return(NULL);
5882 }
5883
5884 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5885 if (stream == NULL) {
5886 xmlFreeParserInputBuffer(input);
5887 return(NULL);
5888 }
5889
5890 inputPush(ctxt, stream);
5891 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5892 }
5893
5894 /**
5895 * htmlCtxtReadFd:
5896 * @param ctxt an HTML parser context
5897 * @param fd an open file descriptor
5898 * @param URL the base URL to use for the document
5899 * @param encoding the document encoding, or NULL
5900 * @param options a combination of htmlParserOption(s)
5901 *
5902 * parse an XML from a file descriptor and build a tree.
5903 * This reuses the existing ctxt parser context
5904 *
5905 * Returns the resulting document tree
5906 */
5907 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)5908 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5909 const char *URL, const char *encoding, int options)
5910 {
5911 xmlParserInputBufferPtr input;
5912 xmlParserInputPtr stream;
5913
5914 if (fd < 0)
5915 return (NULL);
5916 if (ctxt == NULL)
5917 return (NULL);
5918
5919 htmlCtxtReset(ctxt);
5920
5921
5922 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5923 if (input == NULL)
5924 return (NULL);
5925 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5926 if (stream == NULL) {
5927 xmlFreeParserInputBuffer(input);
5928 return (NULL);
5929 }
5930 inputPush(ctxt, stream);
5931 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5932 }
5933
5934 /**
5935 * htmlCtxtReadIO:
5936 * @param ctxt an HTML parser context
5937 * @param ioread an I/O read function
5938 * @param ioclose an I/O close function
5939 * @param ioctx an I/O handler
5940 * @param URL the base URL to use for the document
5941 * @param encoding the document encoding, or NULL
5942 * @param options a combination of htmlParserOption(s)
5943 *
5944 * parse an HTML document from I/O functions and source and build a tree.
5945 * This reuses the existing ctxt parser context
5946 *
5947 * Returns the resulting document tree
5948 */
5949 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)5950 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5951 xmlInputCloseCallback ioclose, void *ioctx,
5952 const char *URL,
5953 const char *encoding, int options)
5954 {
5955 xmlParserInputBufferPtr input;
5956 xmlParserInputPtr stream;
5957
5958 if (ioread == NULL)
5959 return (NULL);
5960 if (ctxt == NULL)
5961 return (NULL);
5962
5963 htmlCtxtReset(ctxt);
5964
5965 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5966 XML_CHAR_ENCODING_NONE);
5967 if (input == NULL)
5968 return (NULL);
5969 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5970 if (stream == NULL) {
5971 xmlFreeParserInputBuffer(input);
5972 return (NULL);
5973 }
5974 inputPush(ctxt, stream);
5975 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5976 }
5977
5978 #endif /* LIBXML_HTML_ENABLED */
5979